I have a data frame that contains average concentrations for 4 different sites based on season and year. The code I wrote produces a figure for each site, with four subplots for each season. Year is on the y-axis and concentration is on the x-axis.
Here's the link to my data: https://drive.google.com/file/d/1mVAsjRiFmMXaW0F8HBhadi1ZQPcUGIa7/view?usp=sharing
The issue is that the code automatically plots the subplots as
fall - spring
summer - winter
I want them to plot in chronological order, because that makes more sense than alphabetical order:
spring - summer
fall - winter
Here is my code:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.formula.api as smf
import scipy.stats
# Load the seasonal NOx averages and give the station columns readable names.
main_dataframe = pd.read_csv('NOx_sznl.csv')
main_dataframe.rename(columns={'NOx_3168':'Banning NOx', 'NOx_2199':'Palm Springs NOx', 'NOx_2551':'El Centro NOx', 'NOx_3135':'Calexico NOx'}, inplace=True)
# Every column except 'Year' and 'Season' is treated as a station to plot.
col = list(main_dataframe.columns)
col.remove('Year')
col.remove('Season')
# One figure per station, one subplot per season.
for ind,station in enumerate(col):
df_new = main_dataframe[['Season', 'Year', col[ind]]]
###here I tried to reorder the seasons in the dataframe
df_new = df_new.set_index('Season')
df_new = df_new.loc[['Spring', 'Summer', 'Fall', 'Winter'], :]
df_new = df_new.reset_index()
###but it didn't change the outcome
df_new = df_new.set_index('Year')
# df_new['Betty Jo Mcneece Receiving Home'].astype('float')
# NOTE(review): the next assignment copies the column onto itself.
df_new[col[ind]] = df_new[col[ind]]
# groupby is called without sort=False, so the group keys come back sorted
# rather than in the row order arranged above.
grouped = df_new.groupby('Season')
rowlength = grouped.ngroups/2 # fix up if odd number of groups
fig, axs = plt.subplots(figsize=(15,10),
nrows=2, ncols=int(rowlength), # fix as above
gridspec_kw=dict(hspace=0.4))#, sharex='col', sharey='row') # Much control of gridspec
# Pair each group key with one axes of the flattened grid.
targets = zip(grouped.groups.keys(), axs.flatten())
for i, (key, ax) in enumerate(targets):
ax.plot(grouped.get_group(key)[col[ind]], marker='o', color='orange')
ax.set_ylim(0,)
ax.set_yticks(ax.get_yticks(),size=12)
#ax.set_xlim(2009,2020)
ax.set_xticks(np.arange(2009,2020,1))
ax.set_xticklabels(ax.get_xticks(), rotation = 45, size=12)
fig.suptitle("%s"%col[ind], fontsize=30)
# ax.set_title('%s')
# Titles are hard-coded per grid position instead of being taken from the
# group key plotted on that axes.
plt.subplot(221)
plt.gca().set_title('Fall', fontsize=20)
plt.subplot(222)
plt.gca().set_title('Spring', fontsize=20)
plt.subplot(223)
plt.gca().set_title('Summer', fontsize=20)
plt.subplot(224)
plt.gca().set_title('Winter', fontsize=20)
plt.show()
I would appreciate any help rearranging the subplots.
The order of the subplots is given by grouped.groups.keys() in targets = zip(grouped.groups.keys(), axs.flatten()) but the problem is further upstream in grouped = df_new.groupby('Season') which is where grouped.groups.keys() comes from. df.groupby() automatically sorts alphabetically unless you do sort=False, so grouped = df_new.groupby('Season', sort=False) should follow the order you provided when you made df_new.
Here is what your code looks like on my end so you can have an exact copy.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.formula.api as smf
import scipy.stats
# Load the seasonal NOx averages and give the station columns readable names.
main_dataframe = pd.read_csv('NOx_sznl.csv')
main_dataframe.rename(columns={'NOx_3168': 'Banning NOx',
                               'NOx_2199': 'Palm Springs NOx',
                               'NOx_2551': 'El Centro NOx',
                               'NOx_3135': 'Calexico NOx'},
                      inplace=True)

# Every column except 'Year' and 'Season' is a station to plot.
col = list(main_dataframe.columns)
col.remove('Year')
col.remove('Season')

# Chronological order in which the season panels should appear.
season_order = ['Spring', 'Summer', 'Fall', 'Winter']
n_rows = 2

for station in col:
    # Arrange the rows season by season, then index by year so that the
    # year ends up on the x-axis of each line plot.
    df_new = main_dataframe[['Season', 'Year', station]].set_index('Season')
    df_new = df_new.loc[season_order, :].reset_index().set_index('Year')

    # sort=False keeps the groups in the row order arranged above instead
    # of sorting the season names alphabetically.
    grouped = df_new.groupby('Season', sort=False)

    # Round up so an odd number of groups still gets enough panels;
    # squeeze=False guarantees a 2-D array of axes whatever the grid size.
    n_cols = int(np.ceil(grouped.ngroups / n_rows))
    fig, axs = plt.subplots(figsize=(15, 10),
                            nrows=n_rows, ncols=n_cols, squeeze=False,
                            gridspec_kw=dict(hspace=0.4))
    panels = axs.flatten()

    # Iterating the groupby object yields (key, sub-frame) pairs in group
    # order, so each panel is titled with the season it actually shows.
    for (key, season_df), ax in zip(grouped, panels):
        ax.plot(season_df[station], marker='o', color='orange')
        ax.set_ylim(0,)
        # Text properties cannot be passed to set_yticks without labels;
        # tick_params is the supported way to size the tick labels.
        ax.tick_params(axis='y', labelsize=12)
        ax.set_xticks(np.arange(2009, 2020, 1))
        ax.set_xticklabels(ax.get_xticks(), rotation=45, size=12)
        ax.set_title(key)

    # Hide any panel left over when the grid has more cells than groups.
    for ax in panels[grouped.ngroups:]:
        ax.set_visible(False)

    fig.suptitle("%s" % station, fontsize=30)
    plt.show()
Related
I need to create multiple boxplots on the same graph. The sports are 3. I need to obtain 3 boxplots on the same graph of each sport, with a specific variable on the y-axis. I need to be able to change the variable. The variable for each student is registered various times and is given by the mean of the 3 largest numbers. I have 30 students identified with an ID (that goes from 1 to 30). Each student does only one sport. This is what I wrote but clearly it doesn't work. Can someone help? I hope my explanation made sense.
def boxplot(sport, variable):
    """Draw a box plot of per-student scores for one sport.

    For every student ID from 1 to 30 that has rows for ``sport`` in the
    module-level ``df``, the score is the mean of that student's three
    largest non-missing values of ``variable``. Students with no data for
    the sport are skipped, so they do not add NaN entries to the plot.

    Parameters
    ----------
    sport : value compared against ``df.sport``
    variable : name of the ``df`` column to summarise
    """
    scores = []
    for student_id in range(1, 31):
        mask = (df.ID == student_id) & (df.sport == sport)
        top_values = df.loc[mask, variable].dropna().nlargest(n=3)
        if top_values.empty:
            # This student does not play the sport (or has no readings).
            continue
        scores.append(top_values.mean())

    label = f'Boxplot for {variable} in {sport}'
    fig, ax = plt.subplots()
    # A list holding one sequence draws a single box.
    ax.boxplot([scores])
    ax.set_xticklabels([label])
    plt.show()
Here's one way to do it.
import plotly.express as px
# Load the sample "tips" dataset that ships with plotly express.
df = px.data.tips()
# Box plot of total_bill for each day, with boxes colored by the smoker column.
fig = px.box(df, x="day", y="total_bill", color="smoker")
fig.update_traces(quartilemethod="exclusive") # or "inclusive", or "linear" by default
fig.show()
If your data is not melted or stacked, you can change the layout like this.
https://pandas.pydata.org/docs/reference/api/pandas.melt.html
Finally, for Matplotlib, you can do it like this.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Long jump final results at two Olympic Games, one column per event
jump_results = {
    'London 2012 (Men)': [8.31, 8.16, 8.12, 8.11, 8.10, 8.07, 8.01, 7.93],
    'Rio 2016 (Men)': [8.38, 8.37, 8.29, 8.25, 8.17, 8.10, 8.06, 8.05],
    'London 2012 (Women)': [7.12, 7.07, 6.89, 6.88, 6.77, 6.76, 6.72, 6.67],
    'Rio 2016 (Women)': [7.17, 7.15, 7.08, 6.95, 6.81, 6.79, 6.74, 6.69],
}
data = pd.DataFrame(jump_results)

# Draw one box per column: the frame is handed over as a plain array and
# its column headings are reused as the tick labels
column_labels = list(data.columns)
bp = plt.boxplot(data.to_numpy(), labels=column_labels)

# Title, axis labels and tick styling
plt.title('Long Jump Finals')
plt.ylabel('Distance [m]')
plt.xlabel('Olympics')
plt.minorticks_on()
plt.tick_params(axis='x', which='minor', bottom=False)
plt.tick_params(axis='x', which='major', labelsize='small')
plt.show()
Here is one final update. Make sure the y-axis is numeric...
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plot
# Load the sample "tips" dataset that ships with plotly express.
df = px.data.tips()
df=pd.DataFrame(df)
print(type(df))
df.head()
# Assign the column names explicitly.
df.columns = ['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']
# Box plots for three columns on one axes; DataFrame.boxplot needs numeric data.
b_plot = df.boxplot(column = ['tip','size','total_bill'])
# NOTE(review): plot() is called with no data here — confirm it is needed.
b_plot.plot()
plot.show()
I have the following function:
Say hue="animals" has three categories (dog, bird, horse), and we have two dataframes, df_m and df_f, containing data for male animals and female animals only, respectively.
The function plots three distplot of y (e.g y="weight") one for each hue={dog,bird,horse}. In each subplot we plot df_m[y] and df_f[y] such that I can compare the weight of male dogs/female dogs, male birds/female birds, male horses/female horses.
If I set distkwargs={"hist":False} when calling the function, the legends ["F","M"] disappear for some reason. Setting distkwargs={"hist":True} shows the legends.
def plot_multi_kde_cat(self,dfs,y,hue,subkwargs={},distkwargs={},legends=[]):
    """
    Create a grid of subplots, one per category of `hue`, and overlay the
    distribution of `y` from every DataFrame in `dfs` on each subplot.

    dfs: List
        - DataFrames for each group e.g one for male and one for females
    y: string
        - column whose distribution is plotted
    hue: string
        - categorical column for which each category is plotted (in each
          subplot); the categories are read from dfs[0]
    subkwargs: dict
        - extra keyword arguments forwarded to plt.subplots
    distkwargs: dict
        - extra keyword arguments forwarded to sns.distplot
    legends: List
        - legend label for each DataFrame in dfs, in the same order

    Returns the figure and the flattened array of axes.
    """
    # The mutable defaults are kept for interface compatibility; they are
    # only read here, never modified.
    hues = dfs[0][hue].cat.categories
    n_hues = len(hues)
    if n_hues == 2:  # Only two categories: place them side by side
        n_rows, n_cols = 1, 2
    else:  # Otherwise use the smallest square grid that fits every category
        n_rows = n_cols = int(np.ceil(np.sqrt(n_hues)))

    # squeeze=False always returns a 2-D array of axes, so flatten() also
    # works when the grid is a single cell.
    fig, axes = plt.subplots(n_rows, n_cols, **{**subkwargs, "squeeze": False})
    axes = axes.flatten()

    # One label per DataFrame; pad with None when fewer labels are given so
    # that every DataFrame is still plotted.
    labels = list(legends)[:len(dfs)]
    labels += [None] * (len(dfs) - len(labels))
    has_labels = any(label is not None for label in labels)

    for ax, hu in zip(axes, hues):
        for df, label in zip(dfs, labels):
            # Passing the label to distplot attaches it to the drawn artist,
            # so the legend entry exists whether or not a histogram is drawn.
            # A "label" supplied in distkwargs takes precedence.
            sns.distplot(df.loc[df[hue]==hu, y], ax=ax, **{"label": label, **distkwargs})
        ax.set_title(f"Segment: {hu}")
        if has_labels:
            ax.legend()

    # Hide the unused axes of the grid.
    for ax in axes[n_hues:]:
        ax.set_visible(False)
    fig.tight_layout()
    return fig,axes
You can work around the problem by explicitly providing the label to the distplot. This forces a legend entry for each distplot. ax.legend() then already gets the correct labels.
Here is some minimal sample code to illustrate how everything works together:
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
def plot_multi_kde_cat(dfs, y, hue, subkwargs={}, distkwargs={}, legends=[]):
    """Plot one subplot per unique value of `hue`, overlaying `y` from each frame.

    dfs: list of DataFrames (e.g. one per sex); unique `hue` values are
        taken from dfs[0]
    y: column whose distribution is drawn
    hue: column whose unique values each get their own subplot
    subkwargs: extra keyword arguments forwarded to plt.subplots
    distkwargs: extra keyword arguments forwarded to sns.distplot
    legends: legend label for each DataFrame in dfs, in the same order
    """
    hues = np.unique(dfs[0][hue])
    # squeeze=False always returns a 2-D array of axes, so flatten() also
    # works when there is only one unique value.
    fig, axes = plt.subplots(1, len(hues), **{**subkwargs, "squeeze": False})
    axes = axes.flatten()

    # One label per DataFrame; pad with None so that every DataFrame is
    # plotted even when fewer labels (or none) are supplied.
    labels = list(legends)[:len(dfs)]
    labels += [None] * (len(dfs) - len(labels))
    has_labels = any(label is not None for label in labels)

    for ax, hu in zip(axes, hues):
        for df, legend_label in zip(dfs, labels):
            # The explicit label forces a legend entry for each distplot.
            sns.distplot(df.loc[df[hue] == hu, y], ax=ax, label=legend_label, **distkwargs)
        ax.set_title(f"Segment: {hu}")
        if has_labels:
            ax.legend()
# Build two small random samples (N rows each) with an 'animal' and a 'weight' column.
N = 20
df_m = pd.DataFrame({'animal': np.random.choice(['tiger', 'horse'], N), 'weight': np.random.uniform(100, 200, N)})
df_f = pd.DataFrame({'animal': np.random.choice(['tiger', 'horse'], N), 'weight': np.random.uniform(80, 160, N)})
# Plot weight per animal with the histogram turned off and one legend label per frame.
plot_multi_kde_cat([df_m, df_f], 'weight', 'animal',
subkwargs={}, distkwargs={'hist': False}, legends=['male', 'female'])
plt.show()
I can't get the legends to show on the subplots which show up just fine and take the other formatting I've applied. What am I missing?
If I do a plot for the dataframe alone, it shows the legend. If I add a label to the plot for the subplots, it assigns that label to all three lines.
Here is an image: plot vs subplot
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from functools import reduce
%matplotlib notebook
#Source for files
# Per Capita Personal Income
# Ann Arbor https://fred.stlouisfed.org/series/ANNA426PCPI
# MI https://fred.stlouisfed.org/series/MIPCPI
# USA https://fred.stlouisfed.org/series/A792RC0A052NBEA
# Read the three per-capita personal income series, replacing each file's
# header row with the column names 'Date' and 'PCPI'.
dfAnnArbor_PCPI = pd.read_csv('PerCapitaPersonalIncomeAnnArborMI.csv', skiprows=1, names=['Date', 'PCPI'])
dfMI_PCPI = pd.read_csv('PerCapitaPersonalIncomeMI.csv', skiprows=1, names=['Date', 'PCPI'])
dfUSA_PCPI = pd.read_csv('PerCapitaPersonalIncomeUSA.csv', skiprows=1, names=['Date', 'PCPI'])
# consolidate three df into one using Date
dfAll = [dfAnnArbor_PCPI, dfMI_PCPI, dfUSA_PCPI]
dfPCPI = reduce(lambda left, right: pd.merge(left, right, on='Date', how='outer'), dfAll)
dfPCPI = dfPCPI.dropna() # drop rows with NaN
dfPCPI.columns = ['Date', 'AnnArbor', 'MI', 'USA'] # rename columns
dfPCPI['Date'] = dfPCPI['Date'].str[:4] # select only year
dfPCPI = dfPCPI.set_index('Date')
# Express every column relative to its first value.
dfPCPI_Rel = dfPCPI.apply(lambda x: x / x[0])
# Same normalisation, restricted to the rows from position 8 onward.
dfPCPI_Small = dfPCPI.iloc[8:].copy()
dfPCPI_SmRel = dfPCPI_Small.apply(lambda x: x / x[0])
# DataFrame.plot on its own figure.
dfPCPI_SmRel.plot()
# Two side-by-side axes drawn with Axes.plot; each call receives a whole
# DataFrame and a single label string.
fig, ax = plt.subplots(1, 2)
ax0 = ax[0].plot(dfPCPI_Rel, '-', label='a')
ax1 = ax[1].plot(dfPCPI_SmRel, '-', label='test1')
# Only the first axes requests a legend here.
ax[0].legend()
# Rotate the x tick labels on every axes of the figure.
for x in fig.axes:
for label in x.get_xticklabels():
label.set_rotation(45)
ax[1].xaxis.set_major_locator(ticker.MultipleLocator(2))
plt.show()
The legend in pyplot refers to an axis instance. Therefore, if you want multiple plots to have their own legend, you need to call legend() for each axis. In your case
ax[0].legend()
ax[1].legend()
Additionally, as you are calling plot(), you may want to use the keyword label in each plot() call so as to have a label for each legend entry.
You should try fig.legend() instead of plt.legend()
I have a data set with three sets of data: class type, neighborhood, and visibility.
I'm trying to create a bar chart that is both stacked and unstacked -- stacked by visibility, lined up by neighborhood. So basically, I'm looking for a combination of the unstacked-ness of this chart:
# Group the rows by visibility, keeping the descending sort order of the keys.
# NOTE(review): nbvis and colors_hood are defined outside this snippet.
nbvis_gb = nbvis.sort_values(by=['visibility'],ascending=False).groupby(by='visibility',sort=False)
fig, ax = plt.subplots(nrows=1,ncols=2,figsize=(14,8),sharey=True)
# The loop variable `ax` rebinds the name that held the array of axes.
for (i, j), ax,color in zip(nbvis_gb,ax.flatten(),colors_hood):
print(j['class'].values)
title = str(i)
j.plot.bar(ax=ax,colors=colors_hood)
ax.set_title(title, fontsize=20)
#ax.set_ylim(0,1.05)
ax.tick_params(labelsize=16)
ax.set_xticklabels(j['class'].values)
# Remove the legend pandas attached to this axes.
ax.legend_.remove()
ax.legend(loc=8,fontsize=20,ncol=4,bbox_to_anchor=(0,-.45))
fig.tight_layout(h_pad=2)
fig.suptitle('Visibility of containers by class and neighborhood',y=1.03,fontsize=24)
and the stacked-ness of this chart:
nbvis.unstack()['Neighborhood 1'].plot.bar(stacked=True)
Any help would be greatly appreciated!
Cheers,
Elizabeth
Consider using melt and pivot_table on your dataframe to create a multi-index dataframe aligned to your graph dimensions. The code below outputs the graph to the screen and saves the figure as a PNG image in the same folder, using seaborn's color scheme. Of course, adjust the graph settings as needed.
Data
import numpy as np
import pandas as pd
from itertools import product
from matplotlib import pyplot as plt
import seaborn
# Fix the seed so the random example values are reproducible.
np.random.seed(444)
# One row per (class, visibility) pair — 7 classes x 2 visibility levels —
# plus four neighborhood columns of absolute standard-normal draws.
df = pd.DataFrame(list(product(['bucket (1)', 'flower pot (2)', 'tarp (3)', 'trash (6)', 'toy (7)',
'piping/tubing (9)', 'other (10)'],
['visible containers', 'partial or not visible containers'])),
columns=['class', 'visibility']).assign(Neighborhood1 = abs(np.random.randn(14)),
Neighborhood2 = abs(np.random.randn(14)),
Neighborhood3 = abs(np.random.randn(14)),
Neighborhood4 = abs(np.random.randn(14)))
Graphing
seaborn.set()
def runplot(pvtdf):
    """Draw one stacked bar chart per first-level index key of `pvtdf`.

    `pvtdf` is a frame with a two-level row index. Each key of the first
    level gets its own subplot showing the cross-section ``pvtdf.xs(key)``
    as stacked bars. The figure is saved to 'Output.png' and then shown.
    """
    # Take the panel keys from the frame itself (the level that xs() slices)
    # rather than from a module-level variable.
    neighborhoods = pvtdf.index.get_level_values(0).unique()

    # squeeze=False always returns a 2-D array of axes, so a frame with a
    # single first-level key is handled as well.
    fig, axes = plt.subplots(nrows=1, ncols=len(neighborhoods),
                             figsize=(20, 8), squeeze=False)

    for ax, n in zip(axes.flatten(), neighborhoods):
        pvtdf.xs(n).plot(ax=ax, kind='bar', stacked=True, edgecolor='w',
                         width=0.5, fontsize=12,
                         title='{} - Visibility of containers \n by class and neighborhood'.format(n))
        ax.title.set_size(16)

    plt.tight_layout()
    fig.savefig('Output.png')
    plt.show()
    plt.clf()
# MELT LONG: one row per (class, visibility, Neighborhood) with its value
mdf = pd.melt(df, id_vars = ['class', 'visibility'], var_name='Neighborhood')
# PIVOT WIDE: rows indexed by (Neighborhood, class), one column per visibility
pvtdf = mdf.pivot_table(index= ['Neighborhood', 'class'], columns='visibility', values='value')
# runplot takes only the pivoted frame; the extra argument `n` was never
# defined at this level and made the call fail.
runplot(pvtdf)
plt.close()
Output
here's one way you could do this. I used some dummy data:
# Dummy data: 7 classes listed twice (once per visibility level) with four
# columns of uniform random values, one per neighborhood.
df = pd.DataFrame({"class":['bucket', 'pot', 'tarp', 'trash', 'toy', 'tubing', 'other','bucket', 'pot', 'tarp', 'trash', 'toy', 'tubing', 'other',],
"visability":["visable", "visable","visable","visable","visable","visable","visable", "not visable","not visable","not visable","not visable","not visable","not visable","not visable",],
"n1":np.random.random(size=14),
"n2":np.random.random(size=14),
"n3":np.random.random(size=14),
"n4":np.random.random(size=14)})
I think the trick is to use bottom:
# Grouped + stacked bars: one group of bars per class, one bar per
# neighborhood inside the group, each bar stacked by visibility.
N = 7            # number of classes (bar groups)
width = 0.095    # width of a single bar
w = 0            # running horizontal offset of the current neighborhood
ind = np.arange(N) + .15
classes = ['bucket', 'pot', 'tarp', 'trash', 'toy', 'tubing', 'other']
neighborhoods = ['n1', 'n2', 'n3', 'n4']
fig, ax = plt.subplots()
top_colors = ['#ff9999', '#9999ff', '#e6b3ff', '#66ff66']
bottom_colors = ['#b30000', '#000066', '#7700b3', '#004d00']
for i, n in enumerate(neighborhoods):
    vis = df[(df.visability == "visable")][n]
    non_vis = df[df.visability == "not visable"][n]
    # First segment starts at zero; the second is stacked on top of it by
    # passing the first segment's heights as `bottom`.
    rect1 = ax.bar(ind+w, vis, float(width), color=top_colors[i])
    rect2 = ax.bar(ind+w, non_vis, width, color=bottom_colors[i], bottom=vis)
    # Shift the next neighborhood's bars to the right within each group.
    w += 0.15
extra_space = 0.05
# The original referenced the misspelled name `xtra_space`, which is undefined.
ax.set_xticks(ind+width+extra_space)
ax.set_xticklabels(classes)
ax.set_title('Visability of container types by class')
plt.show()
I've assigned the 365 days of a year to several clusters and I'm now trying to plot them on a heatmap.
My code works fine except that cbar.set_ticks(some_range) has no effects: the tick labels on my colorbar have the right text but the wrong position
Here is a MCVE
from datetime import date
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import matplotlib
import seaborn as sns
#create some random data
# One random cluster id in [0, n_cluster) for every day of the date range.
n_cluster = 4
index = pd.date_range('01/01/2016', end='31/12/2016', freq='1D')
df = pd.DataFrame(np.random.randint(0, n_cluster, len(index)),
index=index, columns=['cluster'])
# Reshape to a grid: rows keyed by day of week, columns by week of year.
pivot = df.pivot_table('cluster',
columns=[lambda x: x.weekofyear],
index= [lambda x: x.dayofweek])
#yticklabels of the heatmap
days = [date(2018, 1, d).strftime('%a')[:3] for d in range(1, 8)]
#get a discrete cmap
cmap = plt.cm.get_cmap('RdBu', n_cluster)
fig = plt.figure(figsize=(10,3))
# Wide axes for the heatmap and a narrow one (50:1) for the colorbar.
gs = matplotlib.gridspec.GridSpec(1, 2, width_ratios=[50,1])
ax = plt.subplot(gs[0])
cbar = plt.subplot(gs[1])
sns.heatmap(pivot, square=True, cmap=cmap,
yticklabels=days, ax=ax, cbar_ax=cbar)
#There is something wrong here
# The positions computed below are the midpoints of n_cluster equal
# slices of the interval [0, 1).
cbar.set_yticks([i + 1/(2.0*n_cluster) for i in np.arange(0, 1, 1.0/n_cluster)])
#This one is ok
cbar.set_yticklabels(range(0, n_cluster))
Thanks for your help
As a workaround, the following adds the correct labels in the correct place,
cbar.yaxis.set_ticks([0.125, 0.375, 0.625, 0.875])
which looks like,
EDIT:
Or the more general suggestion of mfitzp,
cbar.yaxis.set_ticks([i + 1/(2.0*n_cluster)
for i in np.arange(0, 1, 1.0/n_cluster)])