Plot boxplot and line from pandas - python

I am trying to reproduce this graph - a line plot with a boxplot at every point:
Imgur
However, the line plot is always starting at the origin instead of at the first x tick:
Imgur
I have collected my datastructure in a pandas file, with each column header the k_e (of the x axis), with the column being all of the datapoints.
I am plotting the mean of each column and the boxplot like so:
df = df.astype(float)
_, ax = plt.subplots()
df.mean().plot(ax = ax)
df.boxplot(showfliers=False, ax=ax)
plt.xlabel(r'$k_{e}$')
plt.ylabel('Test error rate')
plt.title(r'Accuracies with different $k_{e}$')
plt.show()
I have referred to the link below, and so am passing the 'ax' position but this does not help.
plot line over boxplot using pandas DateFrame
EDIT: Here is a minimal example:
test_errors_dict = dict()
np.random.seed(40)
test_errors_dict[2] = np.random.rand(20)
test_errors_dict[3] = np.random.rand(20)
test_errors_dict[5] = np.random.rand(20)
df = pd.DataFrame(data=test_errors_dict)
df = df.astype(float)
_, ax = plt.subplots()
df.mean().plot(ax=ax)
df.boxplot(showfliers=False, ax=ax)
plt.show()
Result:
Imgur
As shown in the above, the line plots do not align with the boxplot

The boxes are at positions 1,2,3, while the plot is at positions 2,3,5. You may reindex the mean Series to also use the positions 1,2,3.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
test_errors_dict = dict()
np.random.seed(40)
test_errors_dict[2] = np.random.rand(20)
test_errors_dict[3] = np.random.rand(20)
test_errors_dict[5] = np.random.rand(20)
df = pd.DataFrame(data=test_errors_dict)
df = df.astype(float)
mean = df.mean()
mean.index = np.arange(1,len(mean)+1)
_, ax = plt.subplots()
mean.plot(ax=ax)
df.boxplot(showfliers=False, ax=ax)
plt.show()

Related

Trying to make scatter plots in subplots using for-loops

I am trying to make subplots using for loop to go through my x variables in the dataframe. All plots would be a scatter plot.
X-variable: 'Protein', 'Fat', 'Sodium', 'Fiber', 'Carbo', 'Sugars'
y-variable: 'Cal'
This is where I am stuck
plt.subplot(2, 3, 2)
for i in range(3):
plt.scatter(i,sub['Cal'])
With this code:
import matplotlib.pyplot as plt
import pandas as pd
df = pd.read_csv('data.csv')
columns = list(df.columns)
columns.remove('Cal')
fig, ax = plt.subplots(1, len(columns), figsize = (20, 5))
for idx, col in enumerate(columns, 0):
ax[idx].plot(df['Cal'], df[col], 'o')
ax[idx].set_xlabel('Cal')
ax[idx].set_title(col)
plt.show()
I get this subplot of scatter plots:
However, maybe it is a better choice to use a single scatterplot and use marker color in order to distinguish data type. See this code:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
sns.set_style('darkgrid')
df = pd.read_csv('data.csv')
# df.drop(columns = ['Sodium'], inplace = True) # <--- removes 'Sodium' column
table = df.melt('Cal', var_name = 'Type')
fig, ax = plt.subplots(1, 1, figsize = (10, 10))
sns.scatterplot(data = table,
x = 'Cal',
y = 'value',
hue = 'Type',
s = 200,
alpha = 0.5)
plt.show()
that give this plot where all data are together:
The 'Sodium' values are different from others by far, so, if you remove this column with this line:
df.drop(columns = ['Sodium'], inplace = True)
you get a more readable plot:

Create Subplots in For Loop Matplotlib

i'm trying to make a 3,2 subplot using matplotlib and I'm not understanding how to do this after reading the documentation as it applies to my code as follows:
import pandas as pd
from sys import exit
import numpy as np
import matplotlib.pyplot as plt
import datetime
import xarray as xr
import cartopy.crs as ccrs
import calendar
list = [0,1,2,3,4,5]
now = datetime.datetime.now()
currm = now.month
import calendar
fig, axes = plt.subplots(nrows=3,ncols=2)
fig.subplots_adjust(hspace=0.5)
fig.suptitle('Teleconnection Pos+ Phases {} 2020'.format(calendar.month_name[currm-1]))
#for x in list:
#for ax, x in zip(axs.ravel(), list):
for x, ax in enumerate(axes.flatten()):
dam = DS.where(DS['time.year']==rmax.iloc[x,1]).groupby('time.month').mean()#iterate by index
of column "1" or the years
dam = dam.sel(month=3)#current month mean 500
dam = dam.sel(level=500)
damc = dam.to_array()
lats = damc['lat'].data
lons = damc['lon'].data
#plot data
ax = plt.axes(projection=ccrs.PlateCarree())
ax.coastlines(lw=1)
damc = damc.squeeze()
ax.contour(lons,lats,damc,cmap='jet')
ax.set_title(tindices[x])
plt.show()
#plt.clf()
I've tried multiple options some of which are above in the comments and I cannot get the subplots to show in the 3,2 subplot i'm expecting. I only get single plots. I've included the first plot in the for loop below as you can see it's not plotted inside the 3,2 subplot region:
[![enter image description here][1]][1]
The row with "ax.contour" may be the problem but i'm not sure. Thank you very much and here below is my target subplot region:
[![enter image description here][1]][1]
Without a reproducible sample data, below cannot be tested. However, your loop assigns a new ax and does not use the ax being iterating on. Additionally, plt.show() is placed within the loop. Consider below adjustment
for x, ax in enumerate(axes.flatten()):
...
ax = plt.axes(projection=ccrs.PlateCarree())
...
plt.show()
Consider placing projection in the plt.subplots and then index axes within loop:
fig, axes = plt.subplots(nrows=3, ncols=2, subplot_kw={'projection': ccrs.PlateCarree()})
fig.subplots_adjust(hspace=0.5)
fig.suptitle('Teleconnection Pos+ Phases {} 2020'.format(calendar.month_name[currm-1]))
axes = axes.flatten()
for x, ax in enumerate(axes):
dam = DS.where(DS['time.year']==rmax.iloc[x,1]).groupby('time.month').mean()
dam = dam.sel(month=3)#current month mean 500
dam = dam.sel(level=500)
damc = dam.to_array()
lats = damc['lat'].data
lons = damc['lon'].data
axes[x].coastlines(lw=1)
damc = damc.squeeze()
axes[x].contour(lons, lats, damc, cmap='jet')
axes[x].set_title(tindices[x])
plt.show()
plt.clf()

How to use pandas df.plot.scatter to make a figure with subplots

Hello how can i make a figure with scatter subplots using pandas? Its working with plot, but not with scatter.
Here an Example
import numpy as np
import pandas as pd
matrix = np.random.rand(200,5)
df = pd.DataFrame(matrix,columns=['index','A','B','C','D'])
#single plot, working with
df.plot(
kind='scatter',
x='index',
y='A',
s= 0.5
)
# not workig
df.plot(
subplots=True,
kind='scatter',
x='index',
y=['A','B','C'],
s= 0.5
)
Error
raise ValueError(self._kind + " requires an x and y column")
ValueError: scatter requires an x and y column
Edit:
Solution to make a figure with subplots with using df.plot
(Thanks to #Fourier)
import numpy as np
import pandas as pd
matrix = np.random.rand(200,5)#random data
df = pd.DataFrame(matrix,columns=['index','A','B','C','D']) #make df
#get a list for subplots
labels = list(df.columns)
labels.remove('index')
df.plot(
layout=(-1, 5),
kind="line",
x='index',
y=labels,
subplots = True,
sharex = True,
ls="none",
marker="o")
Would this work for you:
import pandas as pd
import numpy as np
df = pd.DataFrame({"index":np.arange(5),"A":np.random.rand(5),"B":np.random.rand(5),"C":np.random.rand(5)})
df.plot(kind="line", x="index", y=["A","B","C"], subplots=True, sharex=True, ls="none", marker="o")
Output
Note: This uses a line plot with invisible lines. For a scatter, I would go and loop over it.
for column in df.columns[:-1]: #[:-1] ignores the index column for my random sample
df.plot(kind="scatter", x="index", y=column)
EDIT
In order to add custom ylabels you can do the following:
axes = df.plot(kind='line', x="index", y=["A","B","C"], subplots=True, sharex=True, ls="none", marker="o", legend=False)
ylabels = ["foo","bar","baz"]
for ax, label in zip(axes, ylabels):
ax.set_ylabel(label)

plotting sub_plots from pivot_table using matplotlib/seaborn

I have a code from a dataframe
Y = df['label']
for col in categorical_cols:
tab = pd.crosstab(df[col],Y)
annot = x.div(x.sum(axis=1).astype('float64'),axis=0)
annot.plot(kind='bar',stacked=True)
plt.title('Distribution of %s'%col)
plt.xlabel('%s'%col,size='x-large')
plt.xticks(rotation=45)
plt.legend()
How can I plot these using different subplots in a single figure because this loops prints the last column's figure. So all figures are same.
Also: How can I produce the same using matplotlib/seaborn using matplotlib which shows me the % or absolute values.
You need to create the different subplots and then pass one axes object to each call of annot.plot via the ax keyword, something like this:
import math
import matplotlib.pyplot as plt
n = len(categorical_cols)
nrows = math.ceil(float(n) / 3.0)
fig, ax = plt.subplots(ncols=3, nrows=nrows, figsize=(9, nrows*3))
ax = ax.flatten()
Y = df['label']
for idx, col in enumerate(categorical_cols):
tab = pd.crosstab(df[col],Y)
annot = x.div(x.sum(axis=1).astype('float64'),axis=0)
annot.plot(kind='bar',stacked=True, ax=ax[idx])
ax[idx].title('Distribution of %s'%col)
ax[idx].set_xlabel('%s'%col,size='x-large')
ax.tick_params('x', labelrotation=45)
plt.legend()

Share axis and remove unused in matplotlib subplots

I want to plot a series of seaborn heatmaps in a grid. I know the number of subplots (which can be odd or even).
The heatmaps will show the mean "occupation ratio" by "day of week" (y axis) and "hour of day" (x axis), e.g. they all share the same x / y domains.
Here's my current code:
df2 = df[['name','openLots','occupationRatio','DoW','Hour']]
fig, axs = plt.subplots(figsize=(24,24), nrows=7, ncols=6)
axs = axs.flatten()
locations = df2['name'].sort_values().unique()
def occupation_heatmap (name, ax):
dfn = df2[df2['name'] == name]
dfn = dfn.groupby(['DoW', 'Hour']).mean()['occupationRatio'].unstack()
dfn = dfn.reindex(['Mon', 'Tue', 'Wed','Thu','Fri','Sat','Sun'])
sns.heatmap(data=dfn, cmap="coolwarm", vmin=0, vmax=1.0, ax= ax)
ax.set_title(name)
i = 0
for n in locations:
occupation_heatmap (n, axs[i])
i = i+1
plt.tight_layout()
It looks almost like what I want (last few rows):
However want I want:
Have the y axis labels (DoW) only once per row (leftmost plot)
Have the colormap legend only on the rightmost plot in each row (or leave it out completely, the colors are pretty self-explainatory)
remove the "empty plots" in the last row because of an odd total number
Many thanks for any hints
Have the y axis labels (DoW) only once per row (leftmost plot)
This can be done using sharey = True as argument to plt.subplots.
Have the colormap legend only on the rightmost plot in each row (or leave it out completely, the colors are pretty self-explainatory)
Use the cbar = False argument to seaborn.heatmap in order not to show a colorbar. This can be given as an input to the plotting function in dependence of the actual number of subplots.
remove the "empty plots" in the last row because of an odd total number
After the loop for creating the plots you may add another loop removing the unused axes.
for j in range(len(locations), ncols*nrows):
axs[j].axis("off")
Here is a complete example (where I borrowed the cod to generate a dataframe from #Robbie):
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
days = ['Mon','Tue','Wed','Thurs','Fri','Sat','Sun']
names = ["Parkhaus {:02}".format(i+1) for i in range(22)]
nItems = 1000
df = pd.DataFrame()
df['name'] = [names[i] for i in np.random.randint(0,len(names),nItems)]
df['openLots'] = np.random.randint(0,100,nItems)
df['occupationRatio'] = np.random.rand(nItems)
df['DoW'] = [days[i] for i in np.random.randint(0,7,nItems)]
df['Hour'] = np.random.randint(0,12,nItems)
df2 = df[['name','openLots','occupationRatio','DoW','Hour']]
nrows = 4; ncols=6
fig, axs = plt.subplots(nrows=nrows, ncols=ncols, figsize=(15,9), sharey=True)
axs = axs.flatten()
locations = df2['name'].sort_values().unique()
def occupation_heatmap (name, ax, cbar=False, ylabel=False):
dfn = df2[df2['name'] == name]
dfn = dfn.groupby(['DoW', 'Hour']).mean()['occupationRatio'].unstack()
dfn = dfn.reindex(['Mon', 'Tue', 'Wed','Thu','Fri','Sat','Sun'])
sns.heatmap(data=dfn, cmap="coolwarm", vmin=0, vmax=1.0, ax=ax, cbar=cbar)
ax.set_title(name)
plt.setp(ax.get_yticklabels(), rotation=0)
if not ylabel: ax.set_ylabel("")
for i, n in enumerate(locations):
occupation_heatmap (n, axs[i], cbar=i%ncols==ncols-1, ylabel=i%ncols==0)
for j in range(len(locations), ncols*nrows):
axs[j].axis("off")
plt.tight_layout()
plt.show()
You can be more flexible and just create an axis for each name present, something like this:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import string
days = ['Mon','Tue','Wed','Thurs','Fri','Sat','Sun']
names = [string.lowercase[i] for i in range(22)]
nItems = 1000
df = pd.DataFrame()
df['name'] = [names[i] for i in np.random.randint(0,len(names),nItems)]
df['openLots'] = np.random.randint(0,100,nItems)
df['occupationRatio'] = np.random.randint(0,100,nItems)
df['DoW'] = [days[i] for i in np.random.randint(0,7,nItems)]
df['Hour'] = np.random.randint(0,12,nItems)
fig = plt.figure(figsize=(12,12))
for index, name in enumerate(names):
ax = fig.add_subplot(4,6,index+1)
dfn = df.loc[df.name==name]
dfn = dfn.groupby(['DoW','Hour']).mean()['occupationRatio'].unstack()
dfn = dfn.reindex(days)
# Now we can operate on each plot axis individually
if index%6!=5: #i.e.
# Don't draw a colorbar
sns.heatmap(data = dfn, cmap='coolwarm', ax=ax, cbar=False)
else:
sns.heatmap(data = dfn, cmap='coolwarm', ax=ax)
if index%6!=0:
# Remove the y-axis label
ax.set_ylabel('')
ax.set_yticks(())
ax.set_title(name)
fig.tight_layout()
fig.show()
Results in:
You could also play around with the x-axes (for example remove labels and ticks except for the bottom row).

Categories

Resources