I have a dataframe df with 4 unique UID - 1001,1002,1003,1004.
I want to write a user-defined function in python that does the following:
Growth curve - plot Turbidity against Time for each unique UID. The Turbidity values are the ones in the Time1, Time2, Time3, Time4 & Time5 columns. For example, the graph for UID = 1003 will contain 4 curves (one per Gen/Type combination).
Add a legend to each graph such as M+L, F+L, M+R, and F+R (from columns Gen and Type)
Add a title to each graph. For example- UID:1003 + Site:FRX
Export the graphs as a pdf or jpeg or tiff file - 4 graphs per page
# The dataset
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np
# Sample data: one row per (Gen, Type, UID) combination, with the turbidity
# readings stored wide in Time1..Time5. The explicit ``columns`` list pins
# the column order.
df = pd.DataFrame(
    {
        'Gen': ['M'] * 4 + ['F'] * 4 + ['M'] * 4 + ['F'] * 4,
        'Site': ['FRX'] * 16,
        'Type': ['L'] * 8 + ['R'] * 8,
        'UID': [1001, 1002, 1003, 1004] * 4,
        'Time1': [100.78, 112.34, 108.52, 139.19, 149.02, 177.77, 79.18, 89.10,
                  106.78, 102.34, 128.52, 119.19, 129.02, 147.77, 169.18, 170.11],
        'Time2': [150.78, 162.34, 188.53, 197.69, 208.07, 217.76, 229.48, 139.51,
                  146.87, 182.54, 189.57, 199.97, 229.28, 244.73, 269.91, 249.19],
        'Time3': [250.78, 262.34, 288.53, 297.69, 308.07, 317.7, 329.81, 339.15,
                  346.87, 382.54, 369.59, 399.97, 329.28, 347.73, 369.91, 349.12],
        'Time4': [240.18, 232.14, 258.53, 276.69, 338.07, 307.74, 359.16, 339.25,
                  365.87, 392.48, 399.97, 410.75, 429.08, 448.39, 465.15, 469.33],
        'Time5': [270.84, 282.14, 298.53, 306.69, 318.73, 327.47, 369.63, 389.59,
                  398.75, 432.18, 449.78, 473.55, 494.85, 509.39, 515.52, 539.23],
    },
    columns=['Gen', 'Site', 'Type', 'UID', 'Time1', 'Time2', 'Time3', 'Time4', 'Time5'],
)
df
My attempt
# See below for my thoughts/attempt- I am open to other python libraries and approaches
def graph2pdf(inputdata):
    """Write one growth curve (Turbidity vs. Time) per UID to a PDF, 4 per page.

    Every column other than Gen, Site, Type and UID is treated as a time
    point. Each UID gets its own subplot holding one line per Gen+Type
    combination (e.g. ``M+L``), a legend, and a title of the form
    ``UID:1003 + Site:FRX``. Pages are laid out as a 2x2 grid and written to
    ``turbidityvstime_pdf.pdf`` in the working directory.

    Args:
        inputdata: wide DataFrame with the columns Gen, Site, Type, UID plus
            one column per time point.
    """
    # Imported here because the file-level imports do not provide PdfPages.
    from matplotlib.backends.backend_pdf import PdfPages

    #1. convert from wide to long (use the argument, not the global df)
    tidy = pd.melt(inputdata, id_vars=['Gen', 'Site', 'Type', 'UID'],
                   var_name='Time', value_name='Turbidity')
    tidy['label'] = tidy['Gen'] + '+' + tidy['Type']
    uids = tidy['UID'].unique()

    graphs_per_page = 4
    #4. export as pdf file i.e 4 graphs per page
    with PdfPages('turbidityvstime_pdf.pdf') as pdf:
        for start in range(0, len(uids), graphs_per_page):
            page_uids = uids[start:start + graphs_per_page]
            fig, axes = plt.subplots(2, 2, figsize=(11, 8.5))
            panels = axes.ravel()
            for ax, uid in zip(panels, page_uids):
                per_uid = tidy[tidy['UID'] == uid]
                # a line per Gen+Type group gives a growth curve rather
                # than a distribution (kde) plot
                for label, curve in per_uid.groupby('label', sort=False):
                    ax.plot(curve['Time'], curve['Turbidity'], marker='o', label=label)
                #2. add legend
                ax.legend(loc='upper left')
                #3. add title- 'UID number + Site name' to each of the graphs
                # NOTE: assumes a single Site per UID; the first one is shown.
                ax.set_title(f"UID:{uid} + Site:{per_uid['Site'].iloc[0]}")
                ax.set_xlabel('Time')
                ax.set_ylabel('Turbidity')
            # hide the unused panels on a partially filled last page
            for ax in panels[len(page_uids):]:
                ax.set_visible(False)
            fig.tight_layout()
            pdf.savefig(fig)  # saves this figure into a pdf page
            plt.close(fig)


# testing the user-defined function
graph2pdf(df)
I want the graph to look something like the figure below (turbidity instead of density on the y-axis and time on the x-axis). if possible, a white or clear background is preferred
Thanks
A line plot is usually not appropriate for discrete data, because the slope of the lines can imply trends that do not exist.
This is discrete because measurements are taken at discrete moments in time, not a continuous time series.
Discrete data is best visualized with a bar plot.
Use seaborn figure-level methods like sns.catplot or sns.relplot to create the figure with four subplots.
Tested in python 3.8.11, pandas 1.3.2, matplotlib 3.4.3, seaborn 0.11.2
import pandas as pd
import seaborn as sns
def graph2pdf(df):
    """Plot Turbidity against Time for each UID as bar and line facet grids.

    The wide frame is melted to long form, Gen and Type are combined into a
    single ``label`` column used for the hue, and one facet is drawn per UID
    (two facets per row). Each facet is titled ``UID:<uid>+<site>``. The bar
    version is saved to ``barplots.png`` and the line version to
    ``lineplots.png``.

    Args:
        df: wide DataFrame with the columns Gen, Site, Type, UID plus one
            column per time point.
    """
    # melt the dataframe; any column not a var or value, should be in id_vars.
    # Naming the id columns keeps this independent of the column order.
    data = df.melt(id_vars=['Gen', 'Site', 'Type', 'UID'], var_name='Time', value_name='Turbidity')
    # combine Gen and Type to create label, which can be used for hue
    data['label'] = data.Gen + '-' + data.Type
    # NOTE: assumes a single Site per UID; the first one is used in the title.
    site_by_uid = data.groupby('UID')['Site'].first()

    def _title_facets(grid):
        # give every facet its own UID/Site title instead of one figure-wide title
        for uid, ax in grid.axes_dict.items():
            ax.set_title(f'UID:{uid}+{site_by_uid[uid]}')

    # arguments shared by both figure-level plots
    shared = dict(data=data, x='Time', y='Turbidity', hue='label', col='UID', col_wrap=2, height=3.25)

    # plot a catplot for bars
    p1 = sns.catplot(kind='bar', **shared)
    _title_facets(p1)
    p1.savefig("barplots.png")

    # plot a relplot for lines
    p2 = sns.relplot(kind='line', marker='o', **shared)
    _title_facets(p2)
    p2.savefig("lineplots.png")


graph2pdf(df)
I am visualizing the results of a survey. The answers are long and I would like to fit them entirely into the graph. Therefore, I would be very grateful if you could point me to a way to have multi-line xticklabels, or include the xticklabels in a legend on the side as seen in this example:
Because otherwise I would have to make the graph very wide to fit the entire answer. My current code and the resulting plot look as follows:
import seaborn as sns
from textwrap import wrap
sns.set(style="dark")

# Wrap the long question text so it fits above the plot.
question_title = "\n".join(wrap("Question 1: Out of the three options, please choose the one you would prefer your fully autonomous car to choose, if you sat in it.", 90))

# Count plot of the answers in column '1'.
catp = sns.catplot(
    data=results,
    x='1',
    kind='count',
    hue_order=results.sort_values('1')['1'],
    palette='crest',
    height=3.3,
    aspect=17.4/7,
)
catp.set(xlabel=None, ylabel='Number of Participants', title=question_title)

plt.tight_layout()
catp.ax.set_yticks((0,10,20,30,40))

# Write each bar's share (out of 92) just above the bar, roughly centred.
for bar in catp.ax.patches:
    bar_height = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2 - 0.05
    label_y = bar.get_y() + bar_height + 0.3
    catp.ax.annotate('{:.1f}%'.format(100 * bar_height/92), (label_x, label_y), size = 12)
plt.show()
Best regards!
Edit: You can create a sample dataframe with this code:
import pandas as pd
import numpy as np
from itertools import chain
# Number of participants that chose each answer.
answer_counts = {
    'Brake and crash into the bus': 37,
    'Steer into the passing car on the left': 22,
    'Steer into the right hand sidewall': 39,
}
# One array per answer, repeated once per participant.
x = tuple(np.repeat(answer, count) for answer, count in answer_counts.items())
# Flatten the arrays into the single survey column '1'.
results = pd.DataFrame({'1': list(chain.from_iterable(x))})
Extract xticklabels and fix them with wrap as you did with the title
matplotlib 3.4.2 now comes with .bar_label to more easily annotate bars
See this answer for customizing the bar annotation labels.
The height and aspect of the figure will still require some adjusting depending on wrap width.
An alternate solution is to fix the values in the dataframe:
df['1'] = df['1'].apply(lambda row: '\n'.join(wrap(row, 30)))
Use for col in df.columns: df[col] = df[col].apply(lambda row: '\n'.join(wrap(row, 30))) to apply the same fix to all columns.
The list comprehension for labels uses an assignment expression (:=), which requires python >= 3.8. This can be rewritten as a standard for loop.
labels = [f'{v.get_height()/len(df)*100:0.1f}%' for v in c] works without an assignment expression, but doesn't check if the bar height is 0.
Tested in python 3.8.11, pandas 1.3.2, matplotlib 3.4.2, seaborn 0.11.2
import seaborn as sns
import matplotlib.pyplot as plt  # required for plt.tight_layout() below
from textwrap import wrap
from itertools import chain
import pandas as pd
import numpy as np

# sample dataframe
x = (np.repeat('Brake and crash into the bus, which will result in the killing of the children on the bus, but save your life', 37),
     np.repeat('Steer into the passing car on the left, pushing it into the wall, saving your life, but killing passengers in the other car', 22),
     np.repeat('Steer into the right hand sidewall, killing you but saving the lives of all other passengers', 39))
df = pd.DataFrame({'1': list(chain(*x))})

# plotting
sns.set(style="dark")
catp = (sns.catplot(data=df, x='1',
                    kind='count',
                    hue_order=df.sort_values('1')['1'],
                    palette='crest',
                    height=5,
                    aspect=17.4/7)
        .set(xlabel=None,
             ylabel='Number of Participants',
             title="\n".join(wrap("Question 1: Out of the three options, please choose the one you would prefer your fully autonomous car to choose, if you sat in it.", 90)))
        )
catp.ax.set_yticks((0,10,20,30,40))

for ax in catp.axes.ravel():
    # extract the tick labels and wrap each one to at most 30 characters per line
    wrapped_ticklabels = ['\n'.join(wrap(tick.get_text(), 30)) for tick in ax.get_xticklabels()]
    # set the new labels
    ax.set_xticklabels(wrapped_ticklabels)

    # annotate the bars
    for c in ax.containers:
        # create a custom annotation: percent of total (blank for empty bars)
        labels = [f'{w/len(df)*100:0.1f}%' if (w := v.get_height()) > 0 else '' for v in c]
        ax.bar_label(c, labels=labels, label_type='edge')

# run the layout last so it accounts for the multi-line tick labels
plt.tight_layout()
I'm trying to increase the number of xticks for each chart in the dataframe.
# Histogram styling shared by every column.
hist_style = dict(kind='hist', color=(0.2, 0.4, 0.6, 0.6), bins=30)
for c in df.columns:
    # open a new 10x5 figure, then draw this column's histogram
    fig = plt.figure(figsize=(10, 5))
    ax = df[c].plot(**hist_style)
I've tried:
# NOTE(review): matplotlib Axes objects have no ``xticks`` method (the Axes API is
# ``set_xticks``; ``xticks`` is a pyplot function), which matches the AttributeError
# reported below. ``min(c)`` also works on the column label rather than the column values.
ax.xticks(np.arange(min(c),max(x)+1,1));
Results in an AttributeError.
Thus are there any methods to increase the number of xticks without specifying the ticks explicitly but rather dynamically so it works for all the charts?
min(c) is applied to the column name c rather than to the values of that column (and max(x) should presumably be max(c) as well), so index the dataframe with df[c] instead.
it works this way:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
for c in df:
    # A new figure and axes per column: with a single figure created outside
    # the loop, every histogram would be drawn onto the same axes.
    fig, ax = plt.subplots(figsize=(10, 5))
    df[c].plot(kind='hist', color=(0.2, 0.4, 0.6, 0.6), bins=30, ax=ax)
    # One tick per unit across this column's range; np.arange excludes the
    # stop value, so add 1 to keep the tick at the maximum.
    ax.set_xticks(np.arange(df[c].min(), df[c].max() + 1, 1))
I have a situation with my data. I like the behaviour of .plot() over a data frame. But sometimes it doesn't work, because the frequency of the time index is not an integer.
But reproducing the plot in matplotlib is OK. Just ugly.
The part that bother me the most is the settings of the x axis. The tick frequency and the limits. Is there any easy way that I can reproduce this behaviour in matplotlib?
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Create Data
def f(x):
    """Return sin(0.1*x) plus scaled standard-normal noise, as a (1, len(x)) array."""
    noise = 0.1*np.random.randn(1, x.shape[0])
    return np.sin(0.1*x) + noise


x = np.arange(0, 217, 0.001)
y = f(x)

# Create DataFrame: one 'dp' column, indexed by millisecond timestamps 't'
data = pd.DataFrame(y.T, columns=['dp'], index=None)
data['t'] = pd.date_range('2021-01-01 14:32:09', periods=len(data['dp']), freq='ms')
data = data.set_index('t')

# Pandas plot()
data.plot()

# Matplotlib plot (ugly x-axis)
plt.plot(data.index, data['dp'])
EDIT: Basically, what I want to achieve is a similar spacing of the xtick labels and the same tight margins around the values. Legends and axis titles I can do myself.
Pandas output
Matplotlib output
Thanks
You can use some matplotlib date utilities:
Figure.autofmt_xdate() to unrotate and center the date labels
Axis.set_major_locator() to change the interval to 1 min
Axis.set_major_formatter() to reformat as %H:%M
import matplotlib.dates as mdates

fig, ax = plt.subplots()
ax.plot(data.index, data['dp'])

# keep the date labels unrotated and centred under their ticks
fig.autofmt_xdate(rotation=0, ha='center')

# one major tick per minute, shown as hour:minute
time_axis = ax.xaxis
time_axis.set_major_locator(mdates.MinuteLocator(interval=1))
time_axis.set_major_formatter(mdates.DateFormatter('%H:%M'))

# uncomment to remove the first `xtick`
# ax.set_xticks(ax.get_xticks()[1:])
so I am plotting error bar of pandas dataframe. Now the error bar has a weird arrow at the top, but what I want is a horizontal line. For example, a figure like this:
But now my error bar ends with an arrow instead of a horizontal line.
Here is the code i used to generate it:
# Keyword arguments that pandas passes through for the error bars.
error_style = dict(ecolor="black", elinewidth=0.5, lolims=True, marker="o")

# Bar chart of the means with the standard deviations as error bars.
plot = meansum.plot(
    kind="bar", yerr=stdsum, error_kw=error_style,
    colormap="OrRd_r", edgecolor="black",
    width=0.8, position=0.45,
    grid=False, figsize=(8, 2), ax=ax,
)
So what should I change to make the error become the one I want. Thx.
Using plt.errorbar from matplotlib makes it easier as it returns several objects including the caplines which contain the marker you want to change (the arrow which is automatically used when lolims is set to True, see docs).
Using pandas, you just need to dig the correct line in the children of plot and change its marker:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

fig, ax = plt.subplots()
df = pd.DataFrame({"val": [1, 2, 3, 4], "error": [.4, .3, .6, .9]})
meansum = df["val"]
stdsum = df["error"]
plot = meansum.plot(
    kind='bar', yerr=stdsum, colormap='OrRd_r', edgecolor='black', grid=False,
    figsize=(8, 2),  # must be a tuple; the unparenthesised `figsize=8,2)` is a syntax error
    ax=ax, position=0.45,
    error_kw=dict(ecolor='black', elinewidth=0.5, lolims=True),
    width=0.8,
)
for ch in plot.get_children():
    # this is silly, but it appears that the first Line in the children are the caplines...
    if isinstance(ch, Line2D):
        ch.set_marker('_')  # horizontal dash instead of the arrow drawn for lolims
        ch.set_markersize(10)  # to change its size
        break
plt.show()
The result looks like:
Just don't set lolims=True and you are good to go; here is an example with sample data:
import pandas as pd
import matplotlib.pyplot as plt
# sample data: bar heights and their error magnitudes
df = pd.DataFrame({"val": [1, 2, 3, 4], "error": [.4, .3, .6, .9]})
meansum, stdsum = df["val"], df["error"]

fig, ax = plt.subplots()
# no lolims here, so the error bars are drawn without arrow markers
error_style = dict(ecolor='black', elinewidth=0.5)
plot = meansum.plot(
    kind='bar', yerr=stdsum, error_kw=error_style,
    colormap='OrRd_r', edgecolor='black',
    width=0.8, position=0.45,
    grid=False, figsize=(8, 2), ax=ax,
)
plt.show()