Pandas plotting linear regression on scatter graph - python

I'm trying to plot a linear regression on a scatter graph.
def chart1(df, yr, listcols):
    """Draw a 2x2 grid of scatter plots for a single year of data.

    Each panel shows ``listcols[0]`` (x) against ``listcols[1]`` (y) for one
    (daytype, hourtype) combination.

    Args:
        df: DataFrame with 'YEAR', 'daytype' and 'hourtype' columns plus the
            columns named in ``listcols``.
        yr: Value of 'YEAR' to keep.
        listcols: Two column names, ``[x_column, y_column]``.

    Returns:
        The rows of ``df`` whose 'YEAR' equals ``yr``.
    """
    temp = df[df['YEAR'] == yr]
    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 12))
    # (daytype, hourtype, subplot row, subplot column) for each panel.
    panels = [['WD', 'pk_h', 0, 0], ['WD', 'of_h', 0, 1],
              ['SAT', 'of_h', 1, 0], ['SUN', 'of_h', 1, 1]]
    for daytype, hourtype, row, col in panels:
        mask = (temp['daytype'] == daytype) & (temp['hourtype'] == hourtype)
        # .loc replaces the .ix indexer, which was removed in pandas 1.0.
        temp.loc[mask, listcols].plot(
            kind='scatter',
            title=f'{yr} {daytype} {hourtype}',
            x=listcols[0],
            y=listcols[1],
            ax=axes[row, col],
        )
    fig.tight_layout()
    return temp


chartd = chart1(o2, 2017, ['PROD', 'option_exercise'])
I can't figure out how to make it possible in my loop.

It should work this way:
In your for loop, run a regression and store the results in 'res'. Manually calculate the predicted y ('yhat') using the stored coefficients. Then chart both x vs. y and x vs. yhat:
import pandas.stats.api
def chart4(df, yr, day, Y, sensi):
    """Scatter Y against each variable in ``sensi`` with its fitted line.

    For every variable a simple linear regression of ``Y`` on that variable
    is fitted; the observed points and the predicted values ('yhat') are
    drawn in the same subplot, with R-squared shown in the legend.

    Args:
        df: DataFrame with 'YEAR' and 'daytype' columns plus the columns
            named by ``Y`` and ``sensi``.
        yr: Value of 'YEAR' to keep.
        day: Value of 'daytype' to keep.
        Y: Name of the dependent-variable column.
        sensi: Iterable of explanatory-variable column names, one subplot each.

    Returns:
        None. The figure is created as a side effect.
    """
    # `ols` was never bound by `import pandas.stats.api`, and pandas.stats was
    # removed in pandas 0.20. linregress yields the same slope, intercept and
    # R-squared for a one-variable regression with an intercept.
    from scipy.stats import linregress

    temp = df[df['YEAR'] == yr]
    temp = temp[temp['daytype'] == day]
    sensi = list(sensi)
    # Keep the original 4x3 grid, but add rows when there are more than
    # 12 variables instead of failing in add_subplot.
    n_rows = max(4, -(-len(sensi) // 3))
    fig = plt.figure(figsize=(15, 13))
    for i, var in enumerate(sensi):
        # linregress does not skip missing values, so fit on complete pairs.
        pairs = temp[[var, Y]].dropna()
        res = linregress(pairs[var], pairs[Y])
        label = 'R2: ' + str(res.rvalue ** 2)
        # Kept as a local Series rather than a new column, so the filtered
        # slice of ``df`` is never written to.
        yhat = temp[var] * res.slope + res.intercept
        axis = fig.add_subplot(n_rows, 3, i + 1)
        temp.plot(ax=axis, kind='scatter', x=var, y=Y, title=var)
        axis.scatter(temp[var], yhat, color='grey', s=1, label=label)
        axis.legend()
        # NOTE(review): every subplot gets the literal x label 'alpha';
        # confirm this is intended rather than the variable name.
        axis.set_xlabel(r'alpha', fontsize=18)
    fig.tight_layout()
    return

Related

Plotting matplotlib subplots with functions

I am attempting to create a function to serve as a quick visual assessment for a normal distribution and to automate this for a whole dataframe. I want to create a (number of columns) x 2 grid of subplots (2 columns, with each column of the dataframe on its own row), with the left plot being a histogram and the right a probability plot. I have written functions for each of these plots and they work fine, and the ax argument I have added can successfully plot them in a specific subplot coordinate. When I try to call these functions in a final function, intended to apply them to each column in a dataframe, only the first histogram is returned and the rest of the plots are empty.
Not sure where I am going wrong. See code for functions below. Note, no errors are returned:
# Histogram for normality
def normal_dist_hist(data, ax):
    """Draw a density histogram of ``data`` on ``ax`` with a fitted normal curve.

    Args:
        data: pandas Series of values. Missing values are dropped from it
            in place before plotting.
        ax: Axes to draw on (the subplot coordinate to use).

    Returns:
        None. ``plt.show()`` is called before returning.
    """
    # Prepare the data for plotting.
    if data.isnull().values.any():
        data.dropna(inplace=True)
    if data.dtypes == 'float64':
        # The converted copy is not assigned, so ``data`` itself is unchanged.
        data.astype('int64')

    # Fit a normal distribution, then overlay its density on the histogram.
    mu, std = stats.norm.fit(data)
    ax.hist(data, bins=50, density=True, alpha=0.6, color='g')
    lower, upper = ax.get_xlim()
    grid = np.linspace(lower, upper, 100)
    ax.plot(grid, stats.norm.pdf(grid, mu, std), 'k', linewidth=2)
    ax.set_title("Fit results: mu = %.2f, std = %.2f" % (mu, std))
    plt.show()
# Probability plot
def normal_test_QQplots(data, ax):
    """Draw a normal probability plot of ``data`` on ``ax``.

    Args:
        data: pandas Series of values. Missing values are dropped from it
            in place before plotting.
        ax: Axes to draw on (the subplot coordinate to use).

    Returns:
        None. ``plt.show()`` is called before returning.
    """
    data.dropna(inplace=True)
    # The value returned by probplot is not needed; only the drawing on ``ax``.
    stats.probplot(data, dist='norm', fit=True, plot=ax)
    plt.show()
def normality_report(df):
    """Plot a histogram and a probability plot for each column of ``df``.

    The first column of ``df`` is skipped. Every remaining column gets one
    row of the figure: histogram on the left, probability plot on the right.

    Args:
        df: DataFrame whose columns (after the first) are plotted.

    Returns:
        None. The figure is created as a side effect.
    """
    columns = df.columns[1:]
    if len(columns) == 0:
        # Nothing to plot; plt.subplots cannot build a grid with zero rows.
        return
    # One row per plotted column. Sizing the grid from len(df.columns) left an
    # unused empty row, because the loop skips the first column.
    # squeeze=False keeps ``axes`` two-dimensional even with a single row.
    fig, axes = plt.subplots(nrows=len(columns), ncols=2,
                             figsize=(12, 50), squeeze=False)
    for row, col in enumerate(columns):
        normal_dist_hist(df[col], ax=axes[row, 0])
        normal_test_QQplots(df[col], ax=axes[row, 1])
Remove the plt.show() from your methods normal_dist_hist(...) and normal_test_QQplots(...). Add plt.show() at the end of your normality_report(...).
def normal_dist_hist(data, ax):
...
plt.show() # Remove this
#Probability plot
def normal_test_QQplots(data, ax):
...
plt.show() # Remove this
def normality_report(df):
...
for col in df.columns[1:]:
ax_x = 0
normal_dist_hist(df[col], ax=axes[ax_y, ax_x])
ax_x = 1
normal_test_QQplots(df[col], ax=axes[ax_y, ax_x])
ax_y += 1
plt.show() # Add it here.

Plot 2 images side by side for each for loop

I'm training a KNN model and I want to plot 2 images per loop iteration, as shown in the image below:
What I need
On the left, I plot the boundary visualization of my model for a given number of neighbours. On the right, I plot the confusion matrix.
To accomplish something along those lines I've written the following code:
# For each neighbour count, fit a KNN classifier and draw its decision
# boundary (left subplot) next to its confusion matrix (right subplot).
# NOTE(review): indentation was lost in this paste; the loop body presumably
# runs from the KNeighborsClassifier line through plt.show() - confirm.
# A single figure is created here, before the loop starts.
fig = plt.figure()
for i in range(1,3):
neigh = KNeighborsClassifier(n_neighbors=i)
neigh.fit(X, y)
y_pred = neigh.predict(X)
acc = accuracy_score(y_pred,y)
# Boundary
# Subplots are added to `fig`, the figure created before the loop.
ax1 = fig.add_subplot(1,2,1)
visualize_classifier(neigh, X, y, ax=ax1) # Defined by me
# Plot confusion matrix. Defined by sklearn.metrics
ax2 = fig.add_subplot(1,2,2)
plot_confusion_matrix(neigh, X, y, cmap=plt.cm.Blues, values_format = '.0f',ax=ax2)
# Both subplots receive the same title text.
ax1.set_title(f'Neighbors = {i}.\nAccuracy = {acc:.4f}',
fontsize = 14)
ax2.set_title(f'Neighbors = {i}.\nAccuracy = {acc:.4f}',
fontsize = 14)
plt.tight_layout()
# plt.figure(i) selects (or creates) figure number i; the drawing above went
# to `fig`, which is not necessarily that figure - TODO confirm intent.
plt.figure(i)
plt.show()
The visualize_classifier() function:
def visualize_classifier(model, X, y, ax=None, cmap='Dark2'):
    """Plot the training points and the model's predicted regions on ``ax``.

    Args:
        model: Fitted estimator exposing ``predict``; it is called on a
            two-column grid of points.
        X: Feature table indexed with ``.iloc``; the first two columns are
            used as the plot coordinates.
        y: Labels, used for point colours and for the colour limits.
        ax: Axes to draw on; the current axes are used when omitted.
        cmap: Colormap name shared by the points and the filled regions.

    Returns:
        None.
    """
    if not ax:
        ax = plt.gca()
    colour_limits = (y.min(), y.max())

    # Training points, drawn above the filled regions (higher zorder).
    ax.scatter(
        X.iloc[:, 0], X.iloc[:, 1],
        c=y, s=30, cmap=cmap, clim=colour_limits, zorder=3, alpha=0.5,
    )
    ax.axis('tight')
    ax.set_xlabel('x1')
    ax.set_ylabel('x2')

    # Remember the limits so they can be restored after the contour plot.
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()

    # Predict on a 200 x 200 grid spanning the current axes limits.
    grid_x, grid_y = np.meshgrid(
        np.linspace(*xlim, num=200),
        np.linspace(*ylim, num=200),
    )
    grid_points = np.c_[grid_x.ravel(), grid_y.ravel()]
    predicted = model.predict(grid_points).reshape(grid_x.shape)

    # Colour the predicted regions; levels sit at half-integers
    # (presumably labels are 0..n_classes-1 - verify against caller).
    n_classes = len(np.unique(y))
    ax.contourf(
        grid_x, grid_y, predicted,
        alpha=0.3,
        levels=np.arange(n_classes + 1) - 0.5,
        cmap=cmap, clim=colour_limits, zorder=1,
    )
    ax.set(xlim=xlim, ylim=ylim)
What I get
What I get. Continues...
As you can see, only the first iteration is plotted. The second one is not plotted, and I can't figure out why.
Furthermore, I have the same title for the plot at the right and at the left. I would like to have only one on top of both, how can this be accomplished?
Now, you might be wondering why do I need to do this and the answer is that I would like to see how the boundaries change depending on the number of neighbors. It's just to get a visual sense of KNN algorithm.
Any suggestion would be pretty much appreciated.
I was able to make it work. What I had wrong was the first line inside the for loop. I assigned plt.figure(i, figsize=(18, 8)) to the variable fig.
# Apply the seaborn settings once, before any figure exists, so that every
# figure (including the first) is created with the same settings.
sns.set(font_scale=2.0)  # Adjust to fit

# One figure per neighbour count: decision boundary on the left, confusion
# matrix on the right, and a single shared title above both.
for i in range(1, 30):
    fig = plt.figure(i, figsize=(18, 8))
    neigh = KNeighborsClassifier(n_neighbors=i)
    neigh.fit(X, y)
    y_pred = neigh.predict(X)
    # Conventional (y_true, y_pred) order; accuracy itself is symmetric.
    acc = accuracy_score(y, y_pred)
    # Boundary
    ax1 = fig.add_subplot(1, 2, 1)
    visualize_classifier(neigh, X, y, ax=ax1)  # Defined by me
    # Plot confusion matrix. Defined by sklearn.metrics
    ax2 = fig.add_subplot(1, 2, 2)
    plot_confusion_matrix(neigh, X, y, cmap=plt.cm.Blues,
                          values_format='.0f', ax=ax2)
    fig.suptitle(f'Neighbors = {i}. Accuracy = {acc:.4f}', y=1)
    plt.show()
For the title I used: fig.suptitle(f'Neighbors = {i}. Accuracy = {acc:.4f}',y=1)

Hist wrong binwidth with logarithmic x and y axis

I need to plot a histogram with both the y-axis and the x-axis logarithmic, but I'd also like the histogram's bins to be displayed with the same size.
How can I achieve this result with the following code (the x used is very long so I have intentionally avoided to insert it):
import math

import matplotlib.patches as mpatches
# pyplot, not the top-level matplotlib package, provides subplots(),
# xscale(), axvline(), xlabel(), ylabel() and show().
import matplotlib.pyplot as plt
import numpy as np

# `x`, `x_label` and `y_label` are expected to be defined beforehand
# (the data was intentionally left out of the question).
fig, ax1 = plt.subplots()
hist, bins, _ = ax1.hist(x, log=True, color="red", rwidth=0.5)
plt.xscale("log")

np_x = np.array(x)
print("np_x.mean() = " + str(np_x.mean()))
# NOTE(review): the line is drawn at 1.1 * mean while the label reports the
# mean itself - confirm the offset is intentional.
plt.axvline(np_x.mean() * 1.1, color='lime', linestyle='dashed', linewidth=3,
            label='Mean: {:.2f}'.format(np_x.mean()))

# Put a legend entry describing the bins in front of the existing entries.
handles, labels = ax1.get_legend_handles_labels()
binwidth = math.floor(bins[1] - bins[0])
mylabel = "Binwidth: {}".format(binwidth) + ", Bins: {}".format(len(hist))
red_patch = mpatches.Patch(color='red', label=mylabel)
handles = [red_patch] + handles
labels = [mylabel] + labels
ax1.legend(handles, labels)

plt.xlabel(x_label)
plt.ylabel(y_label)
plt.show()

Iterate through linear regression while outputting plots In Python (SciPy & MatPlotLib)

Trying to iterate through a for loop which runs 3 regressions over a pandas dataframe while printing a plot of the line for each variable.
# Regress each crime-rate column on the year and plot the data with its
# fitted line, one figure per column.
year = crime_df.iloc[:, 0]
violent_crime_rate = crime_df.iloc[:, 3]
murder_rate = crime_df.iloc[:, 5]
aggravated_assault_rate = crime_df.iloc[:, 11]
# Use the names assigned just above; the spellings without underscores were
# never defined and would raise NameError.
x_axis = [violent_crime_rate, murder_rate, aggravated_assault_rate]
for x in x_axis:
    slope, intercept, r_value, p_value, std_err = linregress(year, x)
    fit = slope * year + intercept
    fig, ax = plt.subplots()
    # The title is the literal string 'x', not the name of the variable.
    fig.suptitle('x', fontsize=16, fontweight="bold")
    ax.plot(year, x, linewidth=0, marker='o')
    ax.plot(year, fit, 'b--')
    plt.show()
Code produces 3 plots with title 'x' and distinct regression lines but I would like to know how to set relative titles (and labels) for each plot with respect to each variable within the loop. Unsure how to retrieve the variable names from the list I'm referencing. Tried str(x) in the suptitle line but that returned the values in the column rather than the list title.
something like this?
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd  # needed for pd.DataFrame below

matrix = np.random.rand(4, 12)  # emulate some data
crime_df = pd.DataFrame(matrix)  # emulate some data
year = crime_df.iloc[:, 0]
violent_crime_rate = crime_df.iloc[:, 3]
murder_rate = crime_df.iloc[:, 5]
aggravated_assault_rate = crime_df.iloc[:, 11]
# Keep the display names in a list parallel to the data so that each plot
# can be titled and labelled by name.
names = ['violent_crime_rate', 'murder_rate', 'aggravated_assault_rate']
x_axis = [violent_crime_rate, murder_rate, aggravated_assault_rate]


def linregress(year, x):  # emulate some data
    """Placeholder regression: return five random numbers.

    Stands in for a real regression routine so the example runs on its own;
    the inputs are ignored.
    """
    return np.random.rand(5)


for name, x in zip(names, x_axis):
    slope, intercept, r_value, p_value, std_err = linregress(year, x)
    fit = slope * year + intercept
    fig, ax = plt.subplots()
    fig.suptitle('x:' + name, fontsize=16, fontweight="bold")
    ax.plot(year, x, linewidth=0, marker='o', label=name + ':1')
    ax.plot(year, fit, 'b--', label=name + ':2')
    ax.legend()
    plt.show()

np.polyfit does not fit the data

I cannot find a curve that fits the data (lists 'chi' and 'm'). I used polyfit to generate the curve, but it was not enough to capture the behavior of the points.
The code below produces a plot that shows the discrepancy between the data and the fit.
import matplotlib.pyplot as plt
import numpy as np
chi = [159.227326193538,157.045536099339,154.874421083320,152.714227953804,150.565205206850,148.427603026261,146.301673283577,144.187669538078,142.085847036787,139.996462714462,137.919775193605,135.856044784456,133.805533484994,131.768504980940,129.745224645753,127.735959540633,125.740978414520,123.760551704092,121.794951533770,119.844451715712,117.909327749816,115.989856823722,114.086317812809,112.198991280194,110.328159476736,108.474106341033,106.637117499424,104.817480265986,103.015483642536,101.231418318633,99.4655766715733,97.7182527663948,95.9897423558747,94.2803428805298,92.5903534686167,90.9200749361326,89.2698097868135,87.6398622121363,86.0305380913169,84.4421449913117,82.8749921668166,81.3293905602669,79.8056528018393,78.3040932094484,76.8250277887500,75.3687742331392,73.9356519237512,72.5259819294609,71.1400870068830,69.7782916003724,68.4409218420233,67.1283055516702,65.8407722368873,64.5786530929887,63.3422810030283,62.1319905377998,60.9481179558368,59.7910012034130,58.6609799145416,57.5583954109757,56.4835907022086,55.4369104854728,54.4187011457414,53.4293107557267,52.4690890758814,51.5383875543978,50.6375593272080,49.7669592179839,48.9269437381375,48.1178710868206,47.3401011509247,46.5939955050811,45.8799174116612,45.1982318207762,44.5493053702771,43.9335063857545,43.3512048805394,42.8027725557022,42.2885828000534,41.8090106901432,41.3644329902617,40.9552281524389,40.5817763164445,40.2444593097885,39.9436606477201,39.6797655332288,39.4531608570438,39.2642351976343,39.1133788212092,39.0009836817171,38.9274434208471,38.8931533680273,38.8985105404262,38.9439136429520,39.0297630682529,39.1564608967166,39.3244108964711,39.5340185233838,39.7856909210623,40.0798369208539,40.4168670418459,40.7971934908652,41.2212301624788,41.6893926389935,42.2020981904556,42.7597657746519,43.3628160371087,44.0116713110920,44.7067556176079,45.4484946654022,46.2373158509606,47.0736482585089,47.9579226600125,48.8905715151762,49.8720289714460,50.9027308640062,51.9831147157818,53.1
136197374377,54.2946868273783,55.5267585717480,56.8102792444312,58.1456948070521,59.5334529089743,60.9740028873018,62.4677957668786,64.0152842602876,65.6169227678529,67.2731673776373,68.9844758654438,70.7513076948157,72.5741240170354,74.4533876711260,76.3895631838499,78.3831167697092,80.4345163309464,82.5442314575433,84.7127334272220,86.9404952054444,89.2279914454118,91.5756984880661,93.9840943620883,96.4536587839001,98.9848731576614,101.578220575274,104.234185816379,106.953255348357,109.735917326327,112.582661593151,115.493979679428,118.470364803498,121.512311871442,124.620317477080,127.794879901969,131.036499115411,134.345676774445,137.722916223849,141.168722496142,144.683602311584,148.268064078173,151.922617891649,155.647775535488,159.444050480909,163.311957886871,167.252014600072,171.264739154948,175.350651773679,179.510274366181,183.744130530113,188.052745550870,192.436646401591,196.896361743152,201.432421924170,206.045358981001,210.735706637743,215.504000306232,220.350777086043,225.276575764494,230.281936816639,235.367402405274,240.533516380936,245.780824281900,251.109873334181,256.521212451534,262.015392235454,267.592964975176,273.254484647676,279.000506917667,284.831589137604,290.748290347682,296.751171275834,302.840794337735,309.017723636798,315.282524964177,321.635765798766,328.078015307199,334.609844343848,341.231825450827,347.944532857988,354.748542482925,361.644431930971,368.632780495196]
m=[-1,-0.990000000000000,-0.980000000000000,-0.970000000000000,-0.960000000000000,-0.950000000000000,-0.940000000000000,-0.930000000000000,-0.920000000000000,-0.910000000000000,-0.900000000000000,-0.890000000000000,-0.880000000000000,-0.870000000000000,-0.860000000000000,-0.850000000000000,-0.840000000000000,-0.830000000000000,-0.820000000000000,-0.810000000000000,-0.800000000000000,-0.790000000000000,-0.780000000000000,-0.770000000000000,-0.760000000000000,-0.750000000000000,-0.740000000000000,-0.730000000000000,-0.720000000000000,-0.710000000000000,-0.700000000000000,-0.690000000000000,-0.680000000000000,-0.670000000000000,-0.660000000000000,-0.650000000000000,-0.640000000000000,-0.630000000000000,-0.620000000000000,-0.610000000000000,-0.600000000000000,-0.590000000000000,-0.580000000000000,-0.570000000000000,-0.560000000000000,-0.550000000000000,-0.540000000000000,-0.530000000000000,-0.520000000000000,-0.510000000000000,-0.500000000000000,-0.490000000000000,-0.480000000000000,-0.470000000000000,-0.460000000000000,-0.450000000000000,-0.440000000000000,-0.430000000000000,-0.420000000000000,-0.410000000000000,-0.400000000000000,-0.390000000000000,-0.380000000000000,-0.370000000000000,-0.360000000000000,-0.350000000000000,-0.340000000000000,-0.330000000000000,-0.320000000000000,-0.310000000000000,-0.300000000000000,-0.290000000000000,-0.280000000000000,-0.270000000000000,-0.260000000000000,-0.250000000000000,-0.240000000000000,-0.230000000000000,-0.220000000000000,-0.210000000000000,-0.200000000000000,-0.190000000000000,-0.180000000000000,-0.170000000000000,-0.160000000000000,-0.150000000000000,-0.140000000000000,-0.130000000000000,-0.120000000000000,-0.110000000000000,-0.100000000000000,-0.0900000000000000,-0.0800000000000000,-0.0700000000000000,-0.0599999999999999,-0.0499999999999999,-0.0400000000000000,-0.0300000000000000,-0.0200000000000000,-0.0100000000000000,0,0.0100000000000000,0.0200000000000000,0.0300000000000000,0.0400000000000000,0.0499999999999999,0.05999
99999999999,0.0700000000000000,0.0800000000000000,0.0900000000000000,0.100000000000000,0.110000000000000,0.120000000000000,0.130000000000000,0.140000000000000,0.150000000000000,0.160000000000000,0.170000000000000,0.180000000000000,0.190000000000000,0.200000000000000,0.210000000000000,0.220000000000000,0.230000000000000,0.240000000000000,0.250000000000000,0.260000000000000,0.270000000000000,0.280000000000000,0.290000000000000,0.300000000000000,0.310000000000000,0.320000000000000,0.330000000000000,0.340000000000000,0.350000000000000,0.360000000000000,0.370000000000000,0.380000000000000,0.390000000000000,0.400000000000000,0.410000000000000,0.420000000000000,0.430000000000000,0.440000000000000,0.450000000000000,0.460000000000000,0.470000000000000,0.480000000000000,0.490000000000000,0.500000000000000,0.510000000000000,0.520000000000000,0.530000000000000,0.540000000000000,0.550000000000000,0.560000000000000,0.570000000000000,0.580000000000000,0.590000000000000,0.600000000000000,0.610000000000000,0.620000000000000,0.630000000000000,0.640000000000000,0.650000000000000,0.660000000000000,0.670000000000000,0.680000000000000,0.690000000000000,0.700000000000000,0.710000000000000,0.720000000000000,0.730000000000000,0.740000000000000,0.750000000000000,0.760000000000000,0.770000000000000,0.780000000000000,0.790000000000000,0.800000000000000,0.810000000000000,0.820000000000000,0.830000000000000,0.840000000000000,0.850000000000000,0.860000000000000,0.870000000000000,0.880000000000000,0.890000000000000,0.900000000000000,0.910000000000000,0.920000000000000,0.930000000000000,0.940000000000000,0.950000000000000,0.960000000000000,0.970000000000000,0.980000000000000,0.990000000000000,1]
# Fit a degree-40 polynomial and plot it against the data.
# Here chi is passed to polyfit as the x values and m as the y values.
poly = np.polyfit(chi, m, deg=40)
fit_fn = np.poly1d(poly)

f = plt.figure()
ax = f.add_subplot(111)
ax.plot(m, chi, 'r-', label='data')
ax.plot(fit_fn(chi), chi, 'b-', label='adjust')
ax.set_xlabel('$m$')
# Raw string: '\c' is not a valid escape sequence in a normal string literal.
ax.set_ylabel(r'$\chi^2$')
plt.legend()
plt.show()
plt.close()
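The problem is that you mixed up the x and the y coordinates while fitting and plotting the fit. Since m is the x-coordinate (independent variable) and chi is the y-coordinate (dependent variable), pass them in the right order. Note that the polynomial degree is also reduced from 40 to 4: a degree-40 fit is poorly conditioned, and a low degree is enough for this smooth curve. The modified lines are marked with a # <----- comment.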
# Fit chi as a function of m (m is the independent variable) and plot the
# fitted curve over the data.
poly = np.polyfit(m, chi, deg=4)  # <-----
fit_fn = np.poly1d(poly)

f = plt.figure()
ax = f.add_subplot(111)
ax.plot(m, chi, 'rx', label='data')  # <---- Just used x to plot symbols
ax.plot(m, fit_fn(m), 'b-', lw=2, label='adjust')  # <-----
ax.set_xlabel('$m$')
# Raw string: '\c' is not a valid escape sequence in a normal string literal.
ax.set_ylabel(r'$\chi^2$')
plt.legend()
plt.show()
plt.close()

Categories

Resources