I am attemption to create a function to serve as a quick visual assessment for a normal distribution and to automate this for a whole dataframe. I want to create a no. of cols x 2 subplot (2 columns, each column of a dataframe a row) with the left plot being a histogram and the right a probability plot. I have written functions for each of these plots and they work fine, and the ax argument I have added can successfully plot them in a specific subplot coordinate. When I try to call these functions in a final function, intended to apply these to each column in a dataframe only the first histogram is returned and the rest of the plots empty.
Not sure where I am going wrong. See code for functions below. Note, no errors are returned:
#Histogram for normality
def normal_dist_hist(data, ax):
#Format data for plotting
#Included ax for subplot coordinate
if data.isnull().values.any() == True:
data.dropna(inplace=True)
if data.dtypes == 'float64':
data.astype('int64')
#Plot distribution with Gaussian overlay
mu, std = stats.norm.fit(data)
ax.hist(data, bins=50, density=True, alpha=0.6, color='g')
xmin, xmax = ax.get_xlim()
x = np.linspace(xmin, xmax, 100)
p = stats.norm.pdf(x, mu, std)
ax.plot(x, p, 'k', linewidth=2)
title = "Fit results: mu = %.2f, std = %.2f" % (mu, std)
ax.set_title(title)
plt.show()
#Probability plot
def normal_test_QQplots(data, ax):
#added ax argument for specifying subplot coordinate,
data.dropna(inplace=True)
probplt = stats.probplot(data,dist='norm',fit=True,plot=ax)
plt.show()
def normality_report(df):
fig, axes = plt.subplots(nrows=len(df.columns), ncols=2,figsize=(12,50))
ax_y = 0
for col in df.columns[1:]:
ax_x = 0
normal_dist_hist(df[col], ax=axes[ax_y, ax_x])
ax_x = 1
normal_test_QQplots(df[col], ax=axes[ax_y, ax_x])
ax_y += 1
Remove the plt.show() from your methods normal_dist_hist(...) and normal_test_QQplots(...). Add plt.show() at the end of your normality_report(...).
def normal_dist_hist(data, ax):
...
plt.show() # Remove this
#Probability plot
def normal_test_QQplots(data, ax):
...
plt.show() # Remove this
def normality_report(df):
...
for col in df.columns[1:]:
ax_x = 0
normal_dist_hist(df[col], ax=axes[ax_y, ax_x])
ax_x = 1
normal_test_QQplots(df[col], ax=axes[ax_y, ax_x])
ax_y += 1
plt.show() # Add it here.
Related
I am using seaborn's FacetGrid to do multiple histogram plots from a dataframe (plot_df) on the parameter - "xyz". But I want to do the following additional things too in those plots,
Create a vertical axes line at x-value = 0
Color all the bins that are equal to or lesser than 0 (on x-axis) with a different shade
Calculate the percentage area of the histogram for only those bins that are below 0 (on x-axis)
I am able to get lot of examples online but not with seaborn FacetGrid option
g = sns.FacetGrid(plot_df, col='xyz', height=5)```
g.map(plt.hist, "slack", bins=50)
You could loop through the generated axes (for xyz, ax in g.axes_dict.items(): ....) and call your plotting functions for each of those axes.
Or, you could call g.map_dataframe(...) with a custom function. That function will need to draw onto the "current ax".
Changing the x and y labels, needs to be done after the call to g.map_dataframe() because seaborn erases the x and y labels at the end of that functions.
You can call plt.setp(g.axes, xlabel='data', ylabel='frequency') to set the labels for all the subplots. Or g.set_ylabels('...') to only set the y labels for the "outer" subplots.
Here is some example code to get you started:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
def individual_plot(**kwargs):
ax = plt.gca() # get the current ax
data = kwargs['data']['slack'].values
xmin, xmax = data.min(), data.max()
bin_width = xmax / 50
# histogram part > 0
ax.hist(data, bins=np.arange(0.000001, xmax + 0.001, bin_width), color='tomato')
# histogram part < 0
ax.hist(data, bins=-np.arange(0, abs(xmin) + bin_width + 0.001, bin_width)[::-1], color='lime')
# line at x=0
ax.axvline(0, color='navy', ls='--')
# calculate and show part < 0
percent_under_zero = sum(data <= 0) / len(data) * 100
ax.text(0.5, 0.98, f'part < 0: {percent_under_zero:.1f} %',
color='k', ha='center', va='top', transform=ax.transAxes)
# first generate some test data
plot_df = pd.DataFrame({'xyz': np.repeat([*'xyz'], 1000),
'slack': np.random.randn(3000) * 10 + np.random.choice([10, 500], 3000, p=[0.9, 0.1])})
g = sns.FacetGrid(plot_df, col='xyz', height=5)
g.map_dataframe(individual_plot)
plt.setp(g.axes, xlabel='data', ylabel='frequency')
plt.tight_layout()
plt.show()
I have a graph with about 30 different curves and I want to show them all in the legend. But it gets too long and they don't all fit. I want to put them down the bottom, how do I do this?
fig = plt.figure()
for i in range(len(data)):
print(i)
x1 = data.loc[0][5:-4]
y1 = data.loc[i][5:-4]
y1.replace(' ',100.0,inplace=True)
x = list(reversed(x1))
y = list(reversed(y1))
report_num = data.loc[i,'Report No']
plt.plot(x, y, label = report_num)
plt.xscale('log')
plt.grid()
plt.yticks(yints)
plt.xticks(x,x,rotation=40)
plt.title('Particle Size Distribution Curve - %s'%(report_num))
plt.legend(loc=1)
I'm trying to generate a jointplot for data with linear x and log y. The ranges are -22, -13 for x and 1e-3, 1 for y. The plot seems ok, however the marginal histograms are not correct: at least the one for the x data:
Here's my code...
# Convert observed magnitude to Absolute ...
absMag, pop3Mag, nmAbsMag = compMags(dir,z)
pop3Fraction = haloData[dir][z]['1500A_P3']/haloData[dir][z]['1500A']
pop3Fraction[pop3Fraction < 1e-3] = 1e-3 # Map Pop 3 flux < 1e-3 to 1e-3
data = np.array((absMag,pop3Fraction)).T # data is list of (x,y) pairs...
df = pd.DataFrame(data, columns=["M", "f"])
x, y = data.T
# g = sns.jointplot(x="x", y="y", data=df)
g = sns.JointGrid(x='M', y='f', data=df, xlim=[-22,-13],ylim=[0.001,1])
g.plot_joint(plt.scatter)
g.ax_marg_x.set_xscale('linear')
g.ax_marg_y.set_yscale('log')
x_h = g.ax_marg_x.hist(df['M'], color='b', edgecolor='k', bins=magBins)
y_h = g.ax_marg_y.hist(df['f'], orientation="horizontal", color='r', edgecolor='k', bins=fracBins, log=True)
ax = g.ax_joint
ax.set_xscale('linear')
ax.set_yscale('log')
ax.set_xlim([-22,-13])
ax.set_xticks([-21,-19,-17,-15,-13,-11])
ax.set_ylim([1e-3,1])
I'm not sure why the top histogram is not aligned with the data... ???
Never-mind ... on closer inspection there really are more points near -13 than anywhere else... I really need a 2d histogram here to show these nuances.
If someone has a suggestion as to how to make that plot clearly with seaborn I'd appreciate it.
I'm trying to plot a linear regression on a scatter graph.
def chart1(df, yr, listcols):
temp = df[(df['YEAR']==yr)]
fig, axes = plt.subplots(nrows=2, ncols=2, figsize = (12,12))
for e in [['WD','pk_h',0,0],['WD','of_h',0,1],['SAT','of_h',1,0],['SUN','of_h',1,1]]:
temp.ix[(temp['daytype']==e[0])&(temp['hourtype']==e[1]),listcols].plot(kind='scatter', title=str(yr)+' '+e[0]+' '+e[1], x=listcols[0], y=listcols[1], ax=axes[e[2],e[3]])
fig.tight_layout()
return temp
chartd = chart1(o2, 2017,['PROD', 'option_exercise'])
I can't figure out how to make it possible in my loop.
It should work this way:
In your for loop run a regression and store the results in 'res'. Manually caclulate the predicted y ('yhat') using the stored coefficients. Then chart both x vs. y and x vs. yhat:
import pandas.stats.api
def chart4(df, yr, day, Y, sensi):
temp = df[(df['YEAR']==yr)]
temp = temp[(temp['daytype']==day)]
fig = plt.figure(figsize=(15,13))
for i, var in enumerate(sensi):
res = ols(y=temp[Y], x=temp[var])
label = 'R2: ' + str(res.r2)
temp['yhat'] = temp[var]*res.beta[0] + res.beta[1]
axis=fig.add_subplot(4,3,i+1)
temp.plot(ax=axis,kind='scatter', x=var, y=Y, title=var)
temp.plot(ax=axis, kind='scatter', x=var, y='yhat', color='grey', s=1, label=label)
axis.set_xlabel(r'alpha', fontsize=18)
fig.tight_layout()
return
I am pretty new to python and want to plot a dataset using a histogram and a heatmap below. However, I am a bit confused about
How to put a title above both plots and
How to insert some text into bots plots
How to reference the upper and the lower plot
For my first task I used the title instruction, which inserted a caption in between both plots instead of putting it above both plots
For my second task I used the figtext instruction. However, I could not see the text anywhere in the plot. I played a bit with the x, y and fontsize parameters without any success.
Here is my code:
def drawHeatmap(xDim, yDim, plot, threshold, verbose):
global heatmapList
stableCells = 0
print("\n[I] - Plotting Heatmaps ...")
for currentHeatmap in heatmapList:
if -1 in heatmapList[currentHeatmap]:
continue
print("[I] - Plotting heatmap for PUF instance", currentHeatmap,"(",len(heatmapList[currentHeatmap])," values)")
# Convert data to ndarray
#floatMap = list(map(float, currentHeatmap[1]))
myArray = np.array(heatmapList[currentHeatmap]).reshape(xDim,yDim)
# Setup two plots per page
fig, ax = plt.subplots(2)
# Histogram
weights = np.ones_like(heatmapList[currentHeatmap]) / len(heatmapList[currentHeatmap])
hist, bins = np.histogram(heatmapList[currentHeatmap], bins=50, weights=weights)
width = 0.7 * (bins[1] - bins[0])
center = (bins[:-1] + bins[1:]) / 2
ax[0].bar(center, hist, align='center', width=width)
stableCells = calcPercentageStable(threshold, verbose)
plt.figtext(100,100,"!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!", fontsize=40)
heatmap = ax[1].pcolor(myArray, cmap=plt.cm.Blues, alpha=0.8, vmin=0, vmax=1)
cbar = fig.colorbar(heatmap, shrink=0.8, aspect=10, fraction=.1,pad=.01)
#cbar.ax.tick_params(labelsize=40)
for y in range(myArray.shape[0]):
for x in range(myArray.shape[1]):
plt.text(x + 0.5, y + 0.5, '%.2f' % myArray[y, x],
horizontalalignment='center',
verticalalignment='center',
fontsize=(xDim/yDim)*5
)
#fig = plt.figure()
fig = matplotlib.pyplot.gcf()
fig.set_size_inches(60.5,55.5)
plt.savefig(dataDirectory+"/"+currentHeatmap+".pdf", dpi=800, papertype="a3", format="pdf")
#plt.title("Heatmap for PUF instance "+str(currentHeatmap[0][0])+" ("+str(numberOfMeasurements)+" measurements; "+str(sizeOfMeasurements)+" bytes)")
if plot:
plt.show()
print("\t[I] - Done ...")
And here is my current output:
Perhaps this example will make things easier to understand. Things to note are:
Use fig.suptitle to add a title to the top of a figure.
Use ax[i].text(x, y, str) to add text to an Axes object
Each Axes object, ax[i] in your case, holds all the information about a single plot. Use them instead of calling plt, which only really works well with one subplot per figure or to modify all subplots at once. For example, instead of calling plt.figtext, call ax[0].text to add text to the top plot.
Try following the example code below, or at least read through it to get a better idea how to use your ax list.
import numpy as np
import matplotlib.pyplot as plt
histogram_data = np.random.rand(1000)
heatmap_data = np.random.rand(10, 100)
# Set up figure and axes
fig = plt.figure()
fig.suptitle("These are my two plots")
top_ax = fig.add_subplot(211) #2 rows, 1 col, 1st plot
bot_ax = fig.add_subplot(212) #2 rows, 1 col, 2nd plot
# This is the same as doing 'fig, (top_ax, bot_ax) = plt.subplots(2)'
# Histogram
weights = np.ones_like(histogram_data) / histogram_data.shape[0]
hist, bins = np.histogram(histogram_data, bins=50, weights=weights)
width = 0.7 * (bins[1] - bins[0])
center = (bins[:-1] + bins[1:]) / 2
# Use top_ax to modify anything with the histogram plot
top_ax.bar(center, hist, align='center', width=width)
# ax.text(x, y, str). Make sure x,y are within your plot bounds ((0, 1), (0, .5))
top_ax.text(0.5, 0.5, "Here is text on the top plot", color='r')
# Heatmap
heatmap_params = {'cmap':plt.cm.Blues, 'alpha':0.8, 'vmin':0, 'vmax':1}
# Use bot_ax to modify anything with the heatmap plot
heatmap = bot_ax.pcolor(heatmap_data, **heatmap_params)
cbar = fig.colorbar(heatmap, shrink=0.8, aspect=10, fraction=.1,pad=.01)
# See how it looks
plt.show()