I have the following dataset:
df = pd.DataFrame ({"a": [1,2,3,4,5,6,7,8,9,1,11,12,13,14,15,16,17,18,19,20],
'b':[1,2,3,4,50,60,70,8,9,10,110,120,130,140,150,16,17,18,19,20],
'c':[np.nan,2.2,3.4,np.nan,40.9,60.2,np.nan,8.2,8.9,10.1,np.nan,120.2,
130.07,140.23,np.nan,16.054,17.20,18.1,np.nan,20.1],
'd': [100, np.nan,np.nan, 500,np.nan, np.nan,500,
np.nan,np.nan,np.nan,100, np.nan,np.nan, np.nan,500,
np.nan,np.nan, np.nan,100,np.nan ]}
)
I am trying to plot the data based on the following conditions:
Between 100 to the next 100 in column 'd' I want to have one
plot having column 'a' in the x axis, and scatterplot of column 'b' and line plot of 'c' in the y axis.
That is I will be having 3 different plots. First one from index 0 to 10, second one from index 10 to index 18, third one from 18 to 20. (I can generate this using for loop)
Within each plot I want a segmented line plot based on the location of the 500 values in column 'd', i.e., for the first plot, one line from index 0-3, another from index 3-6, and another from index 6-10. (I can't make the segmented line plot.)
I am using the following codes:
index = index + [len(df)]
index1 = index1 + [len(df)]
for k in range (len(index)-1):
x = df['a'][index[k] + 1:index[k+1]]
y = df['c'][index[k]+ 1:index[k+1]]
y1 = df['b'][index[k]+ 1:index[k+1]]
plt.scatter(x, y)
plt.plot(x, y1)
plt.savefig('plot'+ str(k+1000) +'.png')
plt.clf()
My first plot looks like this, but I want three segmented line plots
rather than one continuous line (that is, the line from index 0-3 should not be connected with the one from 3-6, and so on).
Sorry for the rather long question and thx:)
The expected output is unclear, but here is a general strategy to split your dataset in groups with help of groupby:
option 1: independent figures
group = df['d'].eq(100).cumsum()
for name, g in df.groupby(group):
f,ax = plt.subplots()
ax.scatter(g['a'], g['c'])
ax.plot(g['a'], g['b'])
f.savefig(f'figure_{name}.png')
option 2
ax = plt.subplot()
group = df['d'].eq(100).cumsum()
for name, g in df.groupby(group):
ax.scatter(g['a'], g['c'])
ax.plot(g['a'], g['b'], label=name)
ax.legend()
option 3
ax = plt.subplot()
group = df['d'].eq(100).cumsum()
for name, g in df.groupby(group):
g = g.reset_index()
ax.scatter(g.index+1, g['c'])
ax.plot(g.index+1, g['b'])
Related
I have this dataframe and I want to create a subplot for each exercise, depicting the sum of sets for each date in a trend line format.
This is the jupyter notebook : https://github.com/TheoPallis/Portfolio/blob/Site/Workout_Project/Workout.ipynb.
I have tried the following code :
grp = df.groupby('Date').sum()
melted = pd.melt(grp, var_name="Exercise", ignore_index=False)
print(melted)
px.line(melted,
y='value',
facet_col='Exercise',
facet_col_spacing=0.1)
but I get this inelegant output.
Ideally, the subplots would be arranged in a grid format:
First row : 3 subplots
Second row : 3 subplots
Third row : 2 subplots (and 1 blank?)
# Sum the sets per date, then reshape to long form: one row per (Date, Exercise)
grp = data.groupby('Date').sum()
melted = pd.melt(grp, var_name="Exercise", ignore_index=False)
# define exercises list
# unique() keeps first-appearance order, so the grid layout is stable between
# runs (iterating a set would place the exercises in an arbitrary order)
exercises = melted['Exercise'].unique()
# define plot grid
fig, axs = plt.subplots(nrows=3, ncols=3, figsize=(15, 12))
plt.subplots_adjust(hspace=0.5)
# loop over grid and exercises
# (a distinct loop name avoids shadowing the `exercises` collection)
for exercise, ax in zip(exercises, axs.ravel()):
    melted[melted["Exercise"] == exercise].plot(ax=ax, title=str(exercise))
# hide the grid cells that did not receive an exercise (e.g. the 9th of 8)
for ax in axs.ravel()[len(exercises):]:
    ax.set_visible(False)
I've got a 3 level index. I'm holding two levels in control at all times and plotting data over the third. The plots look nice, but all three levels show as x-axis labels.
gb_zone_month_mean = df.groupby(by=["hemisphere", "climate_zone", "month"]).mean()
zones = [0, 1, 2, 3] # Climate zone (level 2 of index)
varis = variables[3:] # The 10 variables I care about.
idx = pd.IndexSlice
fig, ax = plt.subplots(4, 10, figsize=(20, 10))
for z_i in zones:
for i, v in zip(range(len(varis)), varis):
gb_zone_month_mean.loc[idx[1, z_i, :], v].plot(kind="bar", ax=ax[z_i][i])
plt.tight_layout()
plt.show()
As you can see, there is only one level of the multi-index that is varying in any given plot. That's the month.
How can I choose which level of the multi-index is shown in the x-axis labels?
You can reset_index with drop=True before plot. Also, use groupby would be much faster:
# sample data
df = pd.DataFrame(np.random.randint(0,10,(1000,4)), columns = ['a','b','c','d'])
# aggregation
groups = df.groupby(['a','b','c']).mean()
zones = [0,1,2,3]
varis = [3,4,5,6]
# create the axes, change to number you want
fig, axes = plt.subplots(4,10, figsize=(10,10))
# let say you want to plot on level `a`, `b`
# change to level name you want
# since `varis`, i.e. level `b` is selective, we query before groupby
# (the `@` prefix tells query() to look up the local Python variable `varis`)
for data,ax in zip(groups.query('b in @varis').groupby(['a','b']), axes.ravel()):
    (zone, var), d = data
    # drop the two fixed levels so only level `c` is left as x-axis labels
    d.reset_index(level=['a','b'],drop=True)['d'].plot.bar(ax=ax)
Output:
Another option is seaborn's FacetGrid and barplot
import seaborn as sns
# the `@` prefix tells query() to look up the local Python variable `varis`
plot_data = groups.query('b in @varis').reset_index()
# one facet per (b, a) pair; bars of `d` over the remaining level `c`
g = sns.FacetGrid(data=plot_data, row='b', col='a')
g.map(sns.barplot, 'c', 'd', order=plot_data['c'].unique())
You get:
Say I have a dataframe structured like so:
Name x y
Joe 0,1,5 0,3,8
Sue 0,2,8 1,9,5
...
Harold 0,5,6 0,7,2
I'd like to plot the values in the x and y axis on a line plot based on row. In reality, there are many x and y values, but there is always one x value for every y value in these columns. The name of the plot would be the value in "name".
I've tried to do this by first converting x and y to lists in their own separate columns like so:
# Split each comma-separated string into a list of string tokens
df['xval'] = df['x'].str.split(',')
df['yval'] = df['y'].str.split(',')
And then passing them to seaborn:
ax = sns.lineplot(x=df['xval'], y=df['yval'], data=df)
However, this does not work because 1) I receive an error, which I presume is due to attempting to pass a list from a dataframe, claiming:
TypeError: unhashable type: 'list'
And 2) I cannot specify the value for df['name'] for the specific line plot. What's the best way to go about solving this problem?
Data and imports:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
df = pd.DataFrame({
'name': ['joe', 'sue', 'mike'],
'x': ['0,1,5', '0,2,8', '0,4'],
'y': ['0,3,8', '1,9,5', '1,6']
})
We should convert df into a usable format for plotting. This makes all plotting easier. We can take advantage of the fact that x and y have a 1-to-1 relationship. Notice I've added a third name with 2 xy values as opposed to 3 to show this method will work for varied amounts of x and y per name as long as each row has equal numbers of x and y values.
Creating the plot_df:
# Grab Name Column to Start Plot DF with
# (copy so the column assignments below do not target a slice of df)
plot_df = df.loc[:, ['name']].copy()
# Split X column
plot_df['x'] = df['x'].str.split(',')
# Explode X into Rows
plot_df = plot_df.explode('x').reset_index(drop=True)
# Split and Series Explode y in one step
# This works IF AND ONLY IF a 1-to-1 relationship for x and y
plot_df['y'] = df['y'].str.split(',').explode().reset_index(drop=True)
# These need to be numeric to plot correctly
# Rebind the frame with DataFrame.astype: assigning through
# `.loc[:, cols] = ...` writes in place on pandas 2.x, which would leave the
# columns as object dtype instead of int.
plot_df = plot_df.astype({'x': int, 'y': int})
plot_df:
name x y
0 joe 0 0
1 joe 1 3
2 joe 5 8
3 sue 0 1
4 sue 2 9
5 sue 8 5
6 mike 0 1
7 mike 4 6
References to the methods used in creating plot_df:
DataFrame.loc to subset the dataframe
Series.str.split to split the comma separated values into a list
DataFrame.explode to upscale the DataFrame based on the iterable in x
DataFrame.reset_index to make index unique again after exploding
Series.explode to upscale the lists in the Series y.
Series.reset_index to make index unique again after exploding
DataFrame.astype since the values are initially strings just splitting and exploding is not enough. Will need to convert to a numeric type for them to plot correctly
Plotting (Option 1)
# Plot with hue set to name.
sns.lineplot(data=plot_df, x='x', y='y', hue='name')
plt.show()
References for plotting separate lines:
sns.lineplot to plot. Note the hue argument to create separate lines based on name.
pyplot.show to display.
Plotting (Option 2.a) Subplots:
sns.relplot(data=plot_df, x='x', y='y', col='name', kind='line')
plt.tight_layout()
plt.show()
Plotting (Option 2.b) Subplots:
# Use Grouper From plot_df
grouper = plot_df.groupby('name')
# Create Subplots based on the number of groups (ngroups)
fig, axes = plt.subplots(nrows=grouper.ngroups)
# Iterate over axes and groups
for ax, (grp_name, grp) in zip(axes, grouper):
# Plot from each grp DataFrame on ax from axes
sns.lineplot(data=grp, x='x', y='y', ax=ax, label=grp_name)
plt.show()
References for plotting subplots:
2.a
relplot the row or col parameter can be used to create subplots in a similar way to how hue creates multiple lines. This will return a seaborn.FacetGrid so post processing will be different than lineplot which returns matplotlib.axes.Axes
2.b
groupby to create iterable that can be used to plot subplots.
pyplot.subplots to create subplots to plot on.
groupby.ngroups to count number of groups.
zip to iterate over axes and groups simultaneously.
sns.lineplot to plot. Note label is needed to have legends. grp_name contains the current key that is common in the current grp DataFrame.
pyplot.show to display.
Plotting option 3 (separate plots):
# Plot from each grp DataFrame in it's own plot
for grp_name, grp in plot_df.groupby('name'):
fig, ax = plt.subplots()
sns.lineplot(data=grp, x='x', y='y', ax=ax)
ax.set_title(grp_name)
fig.show()
joe plot
mike plot
sue plot
References for plotting separate plots:
groupby to create iterable that can be used to plot each name separately.
pyplot.subplots to create separate plot to plot on.
sns.lineplot to plot each group on its own axes. grp_name contains the current key that is common in the current grp DataFrame and is used as the plot title via ax.set_title.
pyplot.show to display.
From what I understood this is what you want.
df = pd.DataFrame()
df['name'] = ['joe', 'sue']
df['x'] = ['0,1,5', '0,2,8']
df['y'] = ['0,3,8', '1,9,5']
# Split the strings and convert every token to int; left as strings the
# values would be plotted as categories in order of appearance instead of
# on a numeric scale.
df['newx'] = df['x'].str.split(',').apply(lambda parts: [int(p) for p in parts])
df['newy'] = df['y'].str.split(',').apply(lambda parts: [int(p) for p in parts])
# One line per row, labelled with that row's name
for name, xs, ys in zip(df['name'], df['newx'], df['newy']):
    sns.lineplot(x=xs, y=ys, label=name)
plt.legend()
I have a dict of dataframes that I want to use to populate subplots.
Each dict has two columns of data for x and y axis, and two categorical columns for hue.
Pseudo code:
for df in dict of dataframes:
for cat in categories:
plot(x=col_0, y=col_1, hue=cat)
Data for example:
dict_dfs = dict()
for i in range(5):
dict_dfs['df_{}'.format(i)] = pd.DataFrame({'col_1':np.random.randn(10), # first column with data = x axis
'col_2':np.random.randn(10), # second column with data = y axis
'cat_0': ('Home '*5 + 'Car '*5).split(), # first category = hue of plots on the left
'cat_1': ('kitchen '*3 + 'Bedroom '*2 + 'Care '*5).split() # second category = hue of plots on the right
})
IN:
fig, axes = plt.subplots(len(dict_dfs.keys()), 2, figsize=(15,10*len(dict_dfs.keys())))
for i, (name, df) in enumerate(dict_dfs.items()):
for j, cat in enumerate(['cat_0', 'cat_1']):
sns.scatterplot(
x="col_1", y="col_2", hue=cat, data=df, ax=axes[i,j], alpha=0.6)
axes[i,j].set_title('df: {}, cat: {}'.format(name, cat), fontsize = 25, pad = 35, fontweight = 'bold')
axes[i,j].set_xlabel('col_1', fontsize = 26, fontweight = 'bold')
axes[i,j].set_ylabel('col_2', fontsize = 26, fontweight = 'bold')
plt.show()
OUT:
the 10 subplots are created correctly (5 dfs * 2 categories), but only the first one (axes[0, 0]) gets populated. I am used to create subplots with one loop, but it's the first time I use two. I have checked the code without finding the issue. Anyone can help ?
The plt.show() is within the scope of the for-loops, so the figure plot gets shown after the initialization of the first subplot. If you move it out of the loops (un-indent it to the beginning of the line), the plot should correctly be shown with all subplots.
I'm fairly new at coding (completely self taught), and have started using it at my job as a research assistant in a cancer lab. I need some help setting up a few line graphs in matplotlib.
I have a dataset that includes nextgen sequencing data for about 80 patients. on each patient, we have different timepoints of analysis, different genes detected (out of 40), and the associated %mutation for the gene.
My goal is to write two scripts, one that will generate a "by patient" plot, that will be a linegraph with y-%mutation, x-time of measurement, and will have a different color line for all lines made by each of the patient's associated genes. The second plot will be a "by gene", where I will have one plot contain different color lines that represent each of the different patient's x/y values for that specific gene.
Here is an example dataframe for 1 genenumber for the above script:
gene yaxis xaxis pt# gene#
ASXL1-3 34 1 3 1
ASXL1-3 0 98 3 1
IDH1-3 24 1 3 11
IDH1-3 0 98 3 11
RUNX1-3 38 1 3 21
RUNX1-3 0 98 3 21
U2AF1-3 33 1 3 26
U2AF1-3 0 98 3 26
I have setup a groupby script that when I iterate over it, gives me a dataframe for every gene-timepoint for each patient.
grouped = df.groupby('pt #')
for groupObject in grouped:
group = groupObject[1]
For patient 1, this gives the following output:
y x gene patientnumber patientgene genenumber dxtotransplant \
0 40.0 1712 ASXL1 1 ASXL1-1 1 1857
1 26.0 1835 ASXL1 1 ASXL1-1 1 1857
302 7.0 1835 RUNX1 1 RUNX1-1 21 1857
I need help writing a script that will create either of the plots described above. using the bypatient example, my general idea is that I need to create a different subplot for every gene a patient has, where each subplot is the line graph represented by that one gene.
Using matplotlib this is about as far as I have gotten:
plt.figure()
grouped = df.groupby('patient number')
for groupObject in grouped:
group = groupObject[1]
df = group #may need to remove this
for element in range(len(group)):
xs = np.array(df[df.columns[1]]) #"x" column
ys= np.array(df[df.columns[0]]) #"y" column
gene = np.array(df[df.columns[2]])[element] #"gene" column
plt.subplot(1,1,1)
plt.scatter(xs,ys, label=gene)
plt.plot(xs,ys, label=gene)
plt.legend()
plt.show()
This produces the following output:
In this output, the circled line is not supposed to be connected to the other 2 points. In this case, this is patient 1, who has the following datapoint:
x y gene
1712 40 ASXL1
1835 26 ASXL1
1835 7 RUNX1
Using seaborn I have gotten close to my desired graph using this code:
grouped = df.groupby(['patientnumber'])
for groupObject in grouped:
group = groupObject[1]
g = sns.FacetGrid(group, col="patientgene", col_wrap=4, size=4, ylim=(0,100))
g = g.map(plt.scatter, "x", "y", alpha=0.5)
g = g.map(plt.plot, "x", "y", alpha=0.5)
plt.title= "gene:%s"%element
Using this code I get the following:
If I adjust the line:
g = sns.FacetGrid(group, col="patientnumber", col_wrap=4, size=4, ylim=(0,100))
I get the following result:
As you can see in the 2nd example, the plot is treating every point on my plot as if they are from the same line (but they are actually 4 separate lines).
How I can tweak my iterations so that each patient-gene is treated as a separate line on the same graph?
I wrote a subplot function that may give you a hand. I modified the data a tad to help illustrate the plotting functionality.
gene,yaxis,xaxis,pt #,gene #
ASXL1-3,34,1,3,1
ASXL1-3,3,98,3,1
IDH1-3,24,1,3,11
IDH1-3,7,98,3,11
RUNX1-3,38,1,3,21
RUNX1-3,2,98,3,21
U2AF1-3,33,1,3,26
U2AF1-3,0,98,3,26
ASXL1-3,39,1,4,1
ASXL1-3,8,62,4,1
ASXL1-3,0,119,4,1
IDH1-3,27,1,4,11
IDH1-3,12,62,4,11
IDH1-3,1,119,4,11
RUNX1-3,42,1,4,21
RUNX1-3,3,62,4,21
RUNX1-3,1,119,4,21
U2AF1-3,16,1,4,26
U2AF1-3,1,62,4,26
U2AF1-3,0,119,4,26
This is the subplotting function...with some extra bells and whistles :)
def plotByGroup(df, group, xCol, yCol, title = "", xLabel = "", yLabel = "", lineColors = ["red", "orange", "yellow", "green", "blue", "purple"], lineWidth = 2, lineOpacity = 0.7, plotStyle = 'ggplot', showLegend = False):
    """
    Plot one line per group of a Pandas Data Frame on a single set of axes,
    using DataFrame.groupby() and MatPlotLib PyPlot.

    #params
    df - Required - Data Frame - Pandas Data Frame
    group - Required - String - Column name to group on
    xCol - Required - String - Column name for X axis data
    yCol - Required - String - Column name for y axis data
    title - Optional - String - Plot Title
    xLabel - Optional - String - X axis label
    yLabel - Optional - String - Y axis label
    lineColors - Optional - List - Colors to plot multiple lines; cycled when
        there are more groups than colors. If empty, the style's own color
        cycle is used.
    lineWidth - Optional - Integer - Width of lines to plot
    lineOpacity - Optional - Float - Alpha of lines to plot
    plotStyle - Optional - String - MatPlotLib plot style
    showLegend - Optional - Boolean - Show legend

    #return
    The matplotlib.pyplot module, with the new figure as the current figure
    (so callers can chain plt-level calls such as axvline() or show()).
    """
    # Import MatPlotLib Plotting Function & Set Style
    from matplotlib import pyplot as plt
    # Only pyplot is imported here, so reach the style API through it
    # (a bare `matplotlib.style` would raise NameError).
    plt.style.use(plotStyle)

    figure = plt.figure() # Initialize Figure
    # Create the single Axes once, up front. Calling add_subplot(1,1,1) again
    # after plotting would stack a new, empty Axes over the lines on
    # Matplotlib >= 3.4.
    axis = figure.add_subplot(1, 1, 1)

    grouped = df.groupby(group) # Set Group
    for i, (idx, grp) in enumerate(grouped):
        # Cycle through the palette; None defers to the style's color cycle
        lineColor = lineColors[i % len(lineColors)] if lineColors else None
        lineLabel = grp[group].values[0] # Get a group label from first position
        axis.plot(grp[xCol], grp[yCol], label = lineLabel, color = lineColor, lw = lineWidth, alpha = lineOpacity)

    # Plot legend once, after every line has been added
    if showLegend:
        axis.legend()

    # Set title & Labels
    axis.set_title(title)
    axis.set_xlabel(xLabel)
    axis.set_ylabel(yLabel)

    # Return plot for saving, showing, etc.
    return plt
And to use it...
import pandas
# Load the Data into Pandas
df = pandas.read_csv('data.csv')
#
# Plotting - by Patient
#
# Create Patient Grouping
patientGroup = df.groupby('pt #')
# Iterate Over Groups (one figure per patient)
for idx, patientDf in patientGroup:
    # Let's give them specific titles
    plotTitle = "Gene Frequency over Time by Gene (Patient %s)" % str(patientDf['pt #'].values[0])
    # Call the subplot function
    plot = plotByGroup(patientDf, 'gene', 'xaxis', 'yaxis', title = plotTitle, xLabel = "Days", yLabel = "Gene Frequency")
    # Add Vertical Lines at Assay Timepoints (set() removes repeated timepoints)
    for timepoint in set(patientDf.xaxis.values):
        plot.axvline(x = timepoint, linewidth = 1, linestyle = "dashed", color='gray', alpha = 0.4)
    # Let's see it
    plot.show()
And of course, we can do the same by gene.
#
# Plotting - by Gene
#
# Create Gene Grouping
geneGroup = df.groupby('gene')
# Generate Plots for Groups (one figure per gene, one line per patient)
for idx, geneDf in geneGroup:
    plotTitle = "%s Gene Frequency over Time by Patient" % str(geneDf['gene'].values[0])
    # plotByGroup's axis-label keywords are xLabel / yLabel
    plot = plotByGroup(geneDf, 'pt #', 'xaxis', 'yaxis', title = plotTitle, xLabel = "Days", yLabel = "Frequency")
    plot.show()
If this isn't what you're looking for, provide a clarification and I'll take another crack at it.