I have a code that build scatter plot and display the linear regression trend line and the R square.
I calculate the R square manually by calculte the slope, intercept and the r_value as following:
#Try for Linear Regression Moddel- still couldn't display anything on any scatter plot.
x = merged_data['NDVI']
y = merged_data['nitrogen']
from scipy.stats import linregress
slope, intercept, r_value, p_value, std_err = linregress(x, y)
print('slope:',slope)
print('intercept:',intercept)
print('R:',r_value)
print('R^2:',(r_value**2))
## Create Figure (empty canvas)
fig = plt.figure()
##Add set of axes to figure
axes = fig.add_axes([1,1,1,1]) # left, bottom, width, height (range 0 to 1)
##plot
plt.scatter(x,y,alpha=0.5)
plt.title('NDVI vs Nitrogen 17/6/2019')
plt.xlabel('NDVI')
#here I insert the calculted value manually according to the print values
plt.figtext(1.8,1.6, "y=-7.269X+10.11")
plt.figtext(1.8,1.55, "R^2=-0.017")
plt.ylabel('Nitrogen')
plt.show()
I have many different databases which I want to check this for them and I don't want to manually change everytime the test in the plot, is ther any way I can tell python automatically take those values and put them in the right place?
Check this out:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import linregress
# generate dataset 1
x1 = np.random.normal(0,1,1000)
epsilon1 = np.random.normal(0,1,1000)
y1 = x1 + epsilon1
# generate dataset 2
x2 = np.random.normal(0,1,1000)
epsilon2 = np.random.normal(0,1,1000)
y2 = -x2 + epsilon2
def give_me_scatter(x, y, title, xlabel, ylabel):
slope, intercept, r_value, p_value, std_err = linregress(x, y)
print('slope:',slope)
print('intercept:',intercept)
print('R:',r_value)
print('R^2:',(r_value**2))
## Create Figure (empty canvas)
fig = plt.figure()
##Add set of axes to figure
axes = fig.add_axes([1,1,1,1]) # left, bottom, width, height (range 0 to 1)
##plot
plt.scatter(x,y,alpha=0.5)
plt.title(title)
plt.xlabel(xlabel)
#here I insert the calculted value manually according to the print values
plt.figtext(1.0,1.95, "y={0:.3}X+{1:.3}".format(slope, intercept))
plt.figtext(1.0,1.90, "R^2={0:.3}".format(r_value**2))
plt.ylabel(ylabel)
plt.show()
For dataset 1:
give_me_scatter(x1, y1, 'x1 vs y1 10/12/2019', 'x1', 'y1')
slope: 0.9505854192888193
intercept: -0.0499255665055585
R: 0.6949004149189184
R^2: 0.482886586654485
For dataset 2:
give_me_scatter(x2, y2, 'x2 vs y2 10/12/2019', 'x2', 'y2')
slope: -0.9288542869184935
intercept: -0.008475040216075778
R: -0.6781390024143394
R^2: 0.4598725065955155
Related
I have a dataset for curvature and I need to find the tangent to the curve but unfortunately, this is a bit far from the curve. Kindly guide me the issue solution related to the problem. Thank you!
My code is as follows:
fig, ax1 = plt.subplots()
chData_m = efficient.get('Car.Road.y')
x_fit = chData_m.timestamps
y_fit = chData_m.samples
fittedParameters = np.polyfit(x_fit[:],y_fit[:],1)
f = plt.figure(figsize=(800/100.0, 600/100.0), dpi=100)
axes = f.add_subplot(111)
# first the raw data as a scatter plot
axes.plot(x_fit, y_fit, 'D')
# create data for the fitted equation plot
xModel = np.linspace(min(x_fit), max(x_fit))
yModel = np.polyval(fittedParameters, xModel)
# now the model as a line plot
axes.plot(xModel, yModel)
axes.set_xlabel('X Data') # X axis data label
axes.set_ylabel('Y Data') # Y axis data label
# polynomial derivative from numpy
deriv = np.polyder(fittedParameters)
# for plotting
minX = min(x_fit)
maxX = max(x_fit)
# value of derivative (slope) at a specific X value, so
# that a straight line tangent can be plotted at the point
# you might place this code in a loop to animate
pointVal = 10.0 # example X value
y_value_at_point = np.polyval(fittedParameters, pointVal)
slope_at_point = np.polyval(deriv, pointVal)
ylow = (minX - pointVal) * slope_at_point + y_value_at_point
yhigh = (maxX - pointVal) * slope_at_point + y_value_at_point
# now the tangent as a line plot
axes.plot([minX, maxX], [ylow, yhigh])
plt.show()
plt.close('all') # clean up after using pyplot
and the output is:
I am not sure how you wanted to use numpy polyfit/polyval to determine the tangent formula. I describe here a different approach. The advantage of this approach is that it does not have any assumptions about the nature of the function. The disadvantage is that it will not work for vertical tangents.
To be on the safe side, I have considered both cases, i.e., that the evaluated x-value is a data point in your series and that it is not. Some problems may arise because I see that you mention timestamps in your question without specifying their nature by providing a toy dataset - so, this version may or may not work with the datetime objects or timestamps of your original data:
import matplotlib.pyplot as plt
import numpy as np
#generate fake data with unique random x-values between 0 and 70
def func(x, a=0, b=100, c=1, n=3.5):
return a + (b/(1+(c/x)**n))
np.random.seed(123)
x = np.random.choice(range(700000), 100)/10000
x.sort()
y = func(x, 1, 2, 15, 2.4)
#data point to evaluate
xVal = 29
#plot original data
fig, ax = plt.subplots()
ax.plot(x, y, c="blue", label="data")
#calculate gradient
slope = np.gradient(y, x)
#determine slope and intercept at xVal
ind1 = (np.abs(x - xVal)).argmin()
#case 1 the value is a data point
if xVal == x[ind1]:
yVal, slopeVal = y[ind1], slope[ind1]
#case 2 the value lies between to data points
#in which case we approximate linearly from the two nearest data points
else:
if xVal < x[ind1]:
ind1, ind2 = ind1-1, ind1
else:
ind1, ind2 = ind1, ind1+1
yVal = y[ind1] + (y[ind2]-y[ind1]) * (xVal-x[ind1]) / (x[ind2]-x[ind1])
slopeVal = slope[ind1] + (slope[ind2]-slope[ind1]) * (xVal-x[ind1]) / (x[ind2]-x[ind1])
intercVal = yVal - slopeVal * xVal
ax.plot([x.min(), x.max()], [slopeVal*x.min()+intercVal, slopeVal*x.max()+intercVal], color="green",
label=f"tangent\nat point [{xVal:.1f}, {yVal:.1f}]\nwith slope {slopeVal:.2f}\nand intercept {intercVal:.2f}" )
ax.set_ylim(0.8 * y.min(), 1.2 * y.max())
ax.legend()
plt.show()
I have a dataset that looks similar to the one simulated in the code below. There are two sets of observations, one for those at X=0 and another for those at X>0.
import numpy as np
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
X1 = np.random.normal(0, 1, 100)
X1 = X1 - np.min(X1)
Y1 = X1 + np.random.normal(0, 1, 100)
X0 = np.zeros(100)
Y0 = np.random.normal(0, 1.2, 100) + 2
X = np.concatenate((X1, X0))
Y = np.concatenate((Y1, Y0))
sns.distplot(Y0, color="orange")
plt.show()
sns.scatterplot(X, Y, hue = (X == 0), legend=False)
plt.show()
There are two plots: a histogram with KDE and a scatterplot.
I want to take the histogram with KDE, rotate it, and orient it appropriately with respect to the scatter plot. I would also like to add a trend line for each respective set of observations.
The ideal result would look something like this:
How do you do this in python, either using seaborn or matplotlib?
This can be done by combining plt.subplots with shared y-axis to keep the scale and sns plots. For trend line you need some additional computation, but you can use np for quick fitting. Here is an example how to achieve your goal, and here is jupyter notebook to play with.
import numpy as np
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
# Prepare some data
np.random.seed(2020)
mean_Y1 = 0
std_Y1 = 1
size_Y1 = 100
X1 = np.random.normal(mean_Y1, std_Y1, size_Y1)
X1 = X1 - np.min(X1)
Y1 = X1 + np.random.normal(mean_Y1, std_Y1, size_Y1)
# this for computing trend line
Z = np.polyfit(X1, Y1, 1)
Y_ = np.poly1d(Z)(X1)
mean_Y0 = 2
std_Y0 = 1.2
size_Y0 = 100
X0 = np.zeros(100)
Y0 = np.random.normal(mean_Y0, std_Y0, size_Y0)
X = np.concatenate((X1, X0))
Y = np.concatenate((Y1, Y0))
# Now time for plotting
fig, axs = plt.subplots(1, 2,
sharey=True,
figsize=(10, 5),
gridspec_kw={'width_ratios': (1, 2)}
)
# control space between plots
fig.subplots_adjust(wspace=0.1)
# set the ticks for y-axis:
axs[0].yaxis.set_tick_params(left=False, labelleft=False, labelright=True)
# if you wish you can rotate xticks on the histogram with:
axs[0].xaxis.set_tick_params(rotation=90)
# plot histogram
dist = sns.distplot(Y0, color="orange", vertical=True, ax=axs[0])
# now we need to get the coordinate of the peak, we need this for mean line
line_data = dist.get_lines()[0].get_data()
max_Y0 = np.max(line_data[0])
# plotting the mean line
axs[0].plot([0, max_Y0], [mean_Y0, mean_Y0], '--', c='orange')
# inverting xaxis
axs[0].invert_xaxis()
# Plotting scatterpot
sns.scatterplot(X, Y, hue = (X == 0), legend=False, ax=axs[1])
# Plotting trend line
sns.lineplot(X1, Y_, ax=axs[1])
# Plotting mean again
axs[1].plot([0, max(X1)], [mean_Y0, mean_Y0], '--', c='orange')
plt.show()
Out:
Trying to iterate through a for loop which runs 3 regressions over a pandas dataframe while printing a plot of the line for each variable.
year = crime_df.iloc[:,0]
violent_crime_rate = crime_df.iloc[:,3]
murder_rate = crime_df.iloc[:,5]
aggravated_assault_rate = crime_df.iloc[:,11]
x_axis = [violentcrimerate, murderrate, aggravatedassaultrate]
for x in x_axis:
slope, intercept, r_value, p_value, std_err = linregress(year, x)
fit = slope * year + intercept
fig, ax = plt.subplots()
fig.suptitle('x', fontsize=16, fontweight="bold")
ax.plot(year, x, linewidth=0, marker='o')
ax.plot(year, fit, 'b--')
plt.show()
Code produces 3 plots with title 'x' and distinct regression lines but I would like to know how to set relative titles (and labels) for each plot with respect to each variable within the loop. Unsure how to retrieve the variable names from the list I'm referencing. Tried str(x) in the suptitle line but that returned the values in the column rather than the list title.
something like this?
import numpy as np
import matplotlib.pyplot as plt
matrix = np.random.rand(4,12) # emulate some data
crime_df = pd.DataFrame(matrix)# emulate some data
year = crime_df.iloc[:,0]
violent_crime_rate = crime_df.iloc[:,3]
murder_rate = crime_df.iloc[:,5]
aggravated_assault_rate = crime_df.iloc[:,11]
names = ['violent_crime_rate','murder_rate','aggravated_assault_rate']
x_axis = [violent_crime_rate, murder_rate, aggravated_assault_rate]
def linregress(year,x): #emulate some data
return np.random.rand(5)
for ind, x in enumerate(x_axis):
slope, intercept, r_value, p_value, std_err = linregress(year, x)
fit = slope * year + intercept
fig, ax = plt.subplots()
fig.suptitle('x:'+str(names[ind]), fontsize=16, fontweight="bold")
ax.plot(year, x, linewidth=0, marker='o', label = names[ind] + ':1')
ax.plot(year, fit, 'b--', label = names[ind] + ':2')
ax.legend()
plt.show()
I'm trying to fit a second order polynomial to raw data and output the results using Matplotlib. There are about a million points in the data set that I'm trying to fit. It is supposed to be simple, with many examples available around the web. However for some reason I cannot get it right.
I get the following warning message:
RankWarning: Polyfit may be poorly conditioned
This is my output:
This is output using Excel:
See below for my code. What am I missing??
xData = df['X']
yData = df['Y']
xTitle = 'X'
yTitle = 'Y'
title = ''
minX = 100
maxX = 300
minY = 500
maxY = 2200
title_font = {'fontname':'Arial', 'size':'30', 'color':'black', 'weight':'normal',
'verticalalignment':'bottom'} # Bottom vertical alignment for more space
axis_font = {'fontname':'Arial', 'size':'18'}
#Poly fit
# calculate polynomial
z = np.polyfit(xData, yData, 2)
f = np.poly1d(z)
print(f)
# calculate new x's and y's
x_new = xData
y_new = f(x_new)
#Plot
plt.scatter(xData, yData,c='#002776',edgecolors='none')
plt.plot(x_new,y_new,c='#C60C30')
plt.ylim([minY,maxY])
plt.xlim([minX,maxX])
plt.xlabel(xTitle,**axis_font)
plt.ylabel(yTitle,**axis_font)
plt.title(title,**title_font)
plt.show()
The array to plot must be sorted. Here is a comparisson between plotting a sorted and an unsorted array. The plot in the unsorted case looks completely distorted, however, the fitted function is of course the same.
2
-3.496 x + 2.18 x + 17.26
import matplotlib.pyplot as plt
import numpy as np; np.random.seed(0)
x = (np.random.normal(size=300)+1)
fo = lambda x: -3*x**2+ 1.*x +20.
f = lambda x: fo(x) + (np.random.normal(size=len(x))-0.5)*4
y = f(x)
fig, (ax, ax2) = plt.subplots(1,2, figsize=(6,3))
ax.scatter(x,y)
ax2.scatter(x,y)
def fit(ax, x,y, sort=True):
z = np.polyfit(x, y, 2)
fit = np.poly1d(z)
print(fit)
ax.set_title("unsorted")
if sort:
x = np.sort(x)
ax.set_title("sorted")
ax.plot(x, fo(x), label="original func", color="k", alpha=0.6)
ax.plot(x, fit(x), label="fit func", color="C3", alpha=1, lw=2.5 )
ax.legend()
fit(ax, x,y, sort=False)
fit(ax2, x,y, sort=True)
plt.show()
The problem is probably using a power basis for data that is displaced some distance from zero along the x axis. If you use the Polynomial class from numpy.polynomial it will scale and shift the data before the fit, which will help, and also keep track of the scale and shift used. Note that if you want the coefficients in the normal form you will need to convert to that form.
Would somebody be able to explain to me how to use the location parameter with the gamma.fit function in Scipy?
It seems to me that a location parameter (μ) changes the support of the distribution from x ≥ 0 to y = ( x - μ ) ≥ 0. If μ is positive then aren't we losing all the data which doesn't satisfy x - μ ≥ 0?
Thanks!
The fit function takes all of the data into consideration when finding a fit. Adding noise to your data will alter the fit parameters and can give a distribution that does not represent the data very well. So we have to be a bit clever when we are using fit.
Below is some code that generates data, y1, with loc=2 and scale=1 using numpy. It also adds noise to the data over the range 0 to 10 to create y2. Fitting y1 yield excellent results, but attempting to fit the noisy y2 is problematic. The noise we added smears out the distribution. However, we can also hold 1 or more parameters constant when fitting the data. In this case we pass floc=2 to the fit, which forces the location to be held at 2 when performing the fit, returning much better results.
from scipy.stats import gamma
import numpy as np
import matplotlib.pyplot as plt
x = np.arange(0,10,.1)
y1 = np.random.gamma(shape=1, scale=1, size=1000) + 2 # sets loc = 2
y2 = np.hstack((y1, 10*np.random.rand(100))) # add noise from 0 to 10
# fit the distributions, get the PDF distribution using the parameters
shape1, loc1, scale1 = gamma.fit(y1)
g1 = gamma.pdf(x=x, a=shape1, loc=loc1, scale=scale1)
shape2, loc2, scale2 = gamma.fit(y2)
g2 = gamma.pdf(x=x, a=shape2, loc=loc2, scale=scale2)
# again fit the distribution, but force loc=2
shape3, loc3, scale3 = gamma.fit(y2, floc=2)
g3 = gamma.pdf(x=x, a=shape3, loc=loc3, scale=scale3)
And make some plots...
# plot the distributions and fits. to lazy to do iteration today
fig, axes = plt.subplots(1, 3, figsize=(13,4))
ax = axes[0]
ax.hist(y1, bins=40, normed=True);
ax.plot(x, g1, 'r-', linewidth=6, alpha=.6)
ax.annotate(s='shape = %.3f\nloc = %.3f\nscale = %.3f' %(shape1, loc1, scale1), xy=(6,.2))
ax.set_title('gamma fit')
ax = axes[1]
ax.hist(y2, bins=40, normed=True);
ax.plot(x, g2, 'r-', linewidth=6, alpha=.6)
ax.annotate(s='shape = %.3f\nloc = %.3f\nscale = %.3f' %(shape2, loc2, scale2), xy=(6,.2))
ax.set_title('gamma fit with noise')
ax = axes[2]
ax.hist(y2, bins=40, normed=True);
ax.plot(x, g3, 'r-', linewidth=6, alpha=.6)
ax.annotate(s='shape = %.3f\nloc = %.3f\nscale = %.3f' %(shape3, loc3, scale3), xy=(6,.2))
ax.set_title('gamma fit w/ noise, location forced')