I managed to create multiple subplots by looping over the data frame df below, but I cannot export all of them into one PDF. Any idea how to generate the PDF? Thanks
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Sample data: two groups, each scored on an 'old' and a 'new' exam.
d = {
    'index': ['index1', 'index1', 'index2', 'index2'],
    'group': ['gr1', 'gr1', 'gr2', 'gr2'],
    'targetscore': [15, 15, 10, 10],
    'exam': ['old', 'new', 'old', 'new'],
    'score': [5, 6, 7, 8],
}
df = pd.DataFrame(data=d)

# Draw one figure per distinct group. Looping over the rows instead would
# redraw the same group once for every row that belongs to it.
# sort=False keeps the groups in order of first appearance.
for _, subdf in df.groupby('group', sort=False):
    sns.catplot(y='score', x='group', data=subdf, hue='exam', kind='bar',
                row='index', col='exam', col_order=['old', 'new'],
                height=3, aspect=2)

# plt.show has to be called; a bare `plt.show` only names the function.
plt.show()
You can try something like this:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from matplotlib.backends.backend_pdf import PdfPages
# Sample data: two groups, each scored on an 'old' and a 'new' exam.
d = {
    'index': ['index1', 'index1', 'index2', 'index2'],
    'group': ['gr1', 'gr1', 'gr2', 'gr2'],
    'targetscore': [15, 15, 10, 10],
    'exam': ['old', 'new', 'old', 'new'],
    'score': [5, 6, 7, 8],
}
df = pd.DataFrame(data=d)

# The context manager closes (and thereby finalises) the PDF even when one
# of the plots raises part-way through.
with PdfPages('yourpath/foo.pdf') as pp:  # create the pdf named 'foo.pdf'
    # One page per distinct group; looping over rows would write the same
    # group once per row. sort=False keeps first-appearance order.
    for _, subdf in df.groupby('group', sort=False):
        sns.catplot(y='score', x='group', data=subdf, hue='exam', kind='bar',
                    row='index', col='exam', col_order=['old', 'new'],
                    height=3, aspect=2)
        # Same plt.gcf() lookup as before: the figure that was just drawn.
        fig = plt.gcf()
        pp.savefig(fig)  # Save each figure in pdf
        # Close the figure once saved so open figures do not pile up.
        plt.close(fig)
Related
I have a dataframe like this in Python:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.figure_factory as ff
# Reproducible 10x4 table of standard-normal draws plus a 1-based row id.
np.random.seed(1234)
df = pd.DataFrame(
    np.random.randn(10, 4),
    columns=['Col1', 'Col2', 'Col3', 'Col4'],
)
df['id'] = range(1, len(df.index) + 1)
df

# making a long dataframe
# sorting the dataframe by value (i.e. randomly)
melted = df.melt(id_vars=['id'], var_name='type', value_name='value')
long_df = melted.sort_values(by='value')
# Re-number the rows 1..N in their sorted order.
long_df['id'] = range(1, len(long_df.index) + 1)
long_df.head()

# Discard the rows whose new id is below 10, leaving unequal group sizes.
low_id_labels = long_df.index[long_df['id'] < 10]
long_df = long_df.drop(index=low_id_labels)
long_df.head()
long_df['type'].value_counts().sort_index()
and I created a boxplot using these commands:
# Facet-grid settings, kept apart from the call so they are easy to scan:
# box traces of 'value' per 'type', coloured categorically by 'type'.
facet_kwargs = dict(
    x='type',
    y='value',
    trace_type='box',
    color_name='type',
    color_is_cat=True,
    width=1000,
    ggplot2=False,
    showlegend=False,
)
box_plot = ff.create_facet_grid(long_df, **facet_kwargs)
box_plot.show()
Is there any way to set the box width proportional to the number of rows in that category (similar to the way R does)? I expect the box widths to be in this order (from slim to fat): Col2 (n=5) --> Col4 (n=7) --> Col1 (n=9) --> Col3 (n=10)
It can be done with matplotlib:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Same data preparation as in the question.
np.random.seed(1234)
df = pd.DataFrame(
    np.random.randn(10, 4),
    columns=['Col1', 'Col2', 'Col3', 'Col4'],
)
df['id'] = range(1, len(df.index) + 1)

# making a long dataframe
# sorting the dataframe by value (i.e. randomly)
long_df = (
    df.melt(id_vars=['id'], var_name='type', value_name='value')
    .sort_values(by='value')
)
long_df['id'] = range(1, len(long_df.index) + 1)
long_df = long_df.drop(long_df[long_df.id < 10].index)
long_df['type'].value_counts().sort_index()

cols = ['red', 'green', 'blue', 'orange']
plt.style.use('ggplot')
fig, ax = plt.subplots()

# Number of rows per type, looked up while ordering the boxes from the
# smallest group to the largest.
type_sizes = long_df['type'].value_counts()
ordered_types = sorted(long_df['type'].unique(),
                       key=lambda name: type_sizes[name])

for i, col in enumerate(ordered_types):
    col_df = long_df[long_df['type'] == col]
    # Box width grows with the group's row count; the boxes are spaced
    # 120 units apart.
    bp = plt.boxplot(
        col_df['value'],
        positions=[i * 120],
        widths=len(col_df['value']) * 10,
        patch_artist=True,
        labels=[col],
    )
    # Dark shade for every line element, light shade for the box fill.
    for element in ('boxes', 'whiskers', 'fliers', 'means', 'medians', 'caps'):
        plt.setp(bp[element], color=f'xkcd:dark {cols[i]}')
    for patch in bp['boxes']:
        patch.set(facecolor=f'xkcd:light {cols[i]}')

plt.xlabel('type')
plt.show()
Or, if you prefer something closer to R:
from plotnine import ggplot, aes, geom_boxplot
import numpy as np
import pandas as pd
# Same data preparation as in the question.
np.random.seed(1234)
df = pd.DataFrame(
    np.random.randn(10, 4),
    columns=['Col1', 'Col2', 'Col3', 'Col4'],
)
df['id'] = range(1, len(df.index) + 1)

# making a long dataframe
# sorting the dataframe by value (i.e. randomly)
long_df = (
    df.melt(id_vars=['id'], var_name='type', value_name='value')
    .sort_values(by='value')
)
long_df['id'] = range(1, len(long_df.index) + 1)
long_df = long_df.drop(long_df[long_df.id < 10].index)

# Make 'type' categorical with its categories listed from the smallest group
# to the largest, so the boxes appear in that order.
type_list = long_df['type'].value_counts(ascending=True).index.tolist()
long_df['type'] = pd.Categorical(long_df['type'], categories=type_list)

# varwidth=True is the option that ties each box's width to its group size.
p = (
    ggplot(long_df)
    + aes(x='type', y='value', fill='type')
    + geom_boxplot(varwidth=True, alpha=0.8, show_legend=False)
)
print(p)
The following code generates a PDF file that is fed by a bar plot drawn in a loop over a data frame. My goal is to annotate the values above the bars. I have already tried several times to display the values but failed. Could I get any help with this? Thanks
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from matplotlib.backends.backend_pdf import PdfPages
# Sample data: two groups, each scored on an 'old' and a 'new' exam.
# ('targetscore' was missing its opening quote, which is a syntax error.)
d = {
    'index': ['index1', 'index1', 'index2', 'index2'],
    'group': ['gr1', 'gr1', 'gr2', 'gr2'],
    'targetscore': [15, 15, 10, 10],
    'exam': ['old', 'new', 'old', 'new'],
    'score': [5, 6, 7, 8],
}
df = pd.DataFrame(data=d)

# The context manager closes (and thereby finalises) the PDF even when one
# of the plots raises part-way through.
with PdfPages('mypath/extraction.pdf') as pp:
    # One page per distinct group; looping over rows would write the same
    # group once per row. sort=False keeps first-appearance order.
    for _, subdf in df.groupby('group', sort=False):
        sns.catplot(y='score', x='group', data=subdf, hue='exam', kind='bar',
                    row='index', col='exam', col_order=['old', 'new'],
                    height=3, aspect=2)
        # Same plt.gcf() lookup as before: the figure that was just drawn.
        fig = plt.gcf()
        pp.savefig(fig)
        # Close the figure once saved so open figures do not pile up.
        plt.close(fig)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Twenty sales records: five employees, each reported once per quarter.
employees = {
    'Name of Employee': ['Jon', 'Mark', 'Tina', 'Maria', 'Bill'] * 4,
    'Sales': [1000, 300, 400, 500, 800,
              1000, 500, 700, 50, 60,
              1000, 900, 750, 200, 300,
              1000, 900, 250, 750, 50],
    'Quarter': [1, 1, 1, 1, 1,
                2, 2, 2, 2, 2,
                3, 3, 3, 3, 3,
                4, 4, 4, 4, 4],
    'Country': ['US', 'Japan', 'Brazil', 'UK', 'US',
                'Brazil', 'Japan', 'Brazil', 'US', 'US',
                'US', 'Japan', 'Brazil', 'UK', 'Brazil',
                'Japan', 'Japan', 'Brazil', 'UK', 'US'],
}
df = pd.DataFrame(
    employees,
    columns=['Name of Employee', 'Sales', 'Quarter', 'Country'],
)
print(df)

# Sum the sales per country, then draw them as a bar chart.
sales_by_country = df.pivot_table(index=['Country'], values=['Sales'],
                                  aggfunc='sum')
# NOTE: `pivot` ends up holding what plot.bar() returns, not the table.
pivot = sales_by_country.plot.bar()
pivot.set(xlabel='Countries', ylabel='Sales', title='Sales by Countries')
plt.show()
# Total sales per country: one row per country, a single 'Sales' column.
pivot = df.pivot_table(index=['Country'], values=['Sales'], aggfunc='sum')
pivot.plot.bar()

# Label every bar with its total. The bars are drawn at x = 0, 1, 2, ... in
# row order, so enumerate() supplies the x position for however many
# countries the table holds (no hard-coded count).
for i, total in enumerate(pivot['Sales']):
    # Centre the white label inside the bar at half its height, so it stays
    # on the bar whatever the bar's size.
    plt.text(i, total / 2, str(total), c='w', ha='center', va='center')

plt.xlabel('Countries')
plt.ylabel('Sales')
plt.title('Sales by Countries')
plt.show()
There may be a better way to write it than this.
I have the following data:
import pandas as pd
import numpy as np
# Generate dummy data.
# Generate dummy data.
a = np.random.random(75)
b = np.random.random(75) - 0.6
c = np.random.random(75) + 0.75

# Collate into a DataFrame
df = pd.DataFrame({'a': a, 'b': b, 'c': c})
# Two-level column header: genotype on top, parent/offspring status below.
df.columns = pd.MultiIndex.from_arrays(
    [['WT', 'MUT', 'WTxMUT'], ['Parent', 'Parent', 'Offspring']],
    names=['Genotype', 'Status'],
)
# Long form: one row per observation with its Genotype, Status and value.
df_melt = pd.melt(df)
and I plot it in seaborn using this code:
import seaborn as sb
# Swarm plot of "value" for each "Status", with the points split by "Genotype".
sb.swarmplot(data = df_melt, x = "Status", y = "value", hue = "Genotype")
How do I get the x-span of each group? What is the range of the horizontal span of the swarmplot for the Parent group, for instance?
You can get the information from the collections which are created by swarmplot.
swarmplot actually returns the matplotlib Axes instance, and from there we can find the PathCollections that it creates. To get the positions, we can use .get_offsets().
Here is your example, modified to find and print the swarm limits, and then use them to plot a box around the swarms.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from matplotlib.patches import Rectangle
# Generate dummy data.
# Generate dummy data.
a = np.random.random(75)
b = np.random.random(75) - 0.6
c = np.random.random(75) + 0.75

# Collate into a DataFrame
df = pd.DataFrame({'a': a, 'b': b, 'c': c})
# Two-level column header: genotype on top, parent/offspring status below.
df.columns = pd.MultiIndex.from_arrays(
    [['WT', 'MUT', 'WTxMUT'], ['Parent', 'Parent', 'Offspring']],
    names=['Genotype', 'Status'],
)
df_melt = pd.melt(df)

# Keep the returned Axes: its collections are inspected afterwards.
ax = sb.swarmplot(data=df_melt, x="Status", y="value", hue="Genotype")
def getdatalim(coll):
    """Print the extent of one swarm and outline it on the plot.

    Args:
        coll: An object whose ``get_offsets()`` returns the (x, y) positions
            of its points, such as an entry of ``ax.collections``.

    Returns:
        ``(xmin, xmax, ymin, ymax)`` for the points in ``coll``, or ``None``
        when the collection holds no points (nothing is printed or drawn).
    """
    x, y = np.array(coll.get_offsets()).T
    if x.size == 0:
        # An empty collection has no limits; test for it explicitly instead
        # of catching the ValueError that min() raises on an empty array.
        return None

    xmin, xmax = x.min(), x.max()
    ymin, ymax = y.min(), y.max()
    print('xmin={}, xmax={}, ymin={}, ymax={}'.format(xmin, xmax, ymin, ymax))

    # Width and height are the peak-to-peak ranges, computed by subtraction
    # because the ndarray.ptp() method no longer exists in NumPy 2.0.
    rect = Rectangle((xmin, ymin), xmax - xmin, ymax - ymin,
                     edgecolor='k', facecolor='None', lw=3)
    # `ax` is the module-level Axes the swarm plot was drawn on.
    ax.add_patch(rect)
    return xmin, xmax, ymin, ymax
# Report and outline the first two collections on the Axes, then show the
# figure.
getdatalim(ax.collections[0]) # "Parent"
getdatalim(ax.collections[1]) # "Offspring"
plt.show()
which prints:
xmin=-0.107313729132, xmax=0.10661092707, ymin=-0.598534246847, ymax=0.980441247759
xmin=0.942829146473, xmax=1.06105941656, ymin=0.761277608688, ymax=1.74729717464
And here's the figure:
I am trying to plot a bar chart of a pandas data frame that is the result of two group bys.
In particular, my data frame looks exactly like the output from another SO post's answer (https://stackoverflow.com/a/23377155/7243972):
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Reproducible toy data: 12 sales figures across 4 states and 6 offices.
np.random.seed(0)
df = pd.DataFrame({
    'state': ['CA', 'WA', 'CO', 'AZ'] * 3,
    'office_id': list(range(1, 7)) * 2,
    'sales': [np.random.randint(100000, 999999) for _ in range(12)],
})

# Sales summed per (state, office) pair and per state.
state_office = df.groupby(['state', 'office_id'])[['sales']].sum()
state = df.groupby(['state'])[['sales']].sum()

# Each office's share of its state's total sales, in percent.
results = state_office.div(state, level='state') * 100
I would like to plot results so that each state is a different color and the office_id is on the x-axis. This is so that each office_id is grouped together and they can be easily compared.
I've tried adjusting the plot from results['sales'].plot.bar(), but I am struggling.
First you need to flatten the dataframe:
# Turn every ((state, office_id), row) pair of the MultiIndexed frame into a
# flat (state, office_id, sales) record.
# - The comprehension keeps its loop names local, so the `state` DataFrame
#   defined earlier is not overwritten by a state label.
# - `.iloc[0]` reads the first (only) column by position explicitly; plain
#   `row[0]` relies on deprecated positional indexing of a labelled Series.
data = [
    (state_name, office_id, row.iloc[0])
    for (state_name, office_id), row in results.iterrows()
]
flat_df = pd.DataFrame(data, columns=['state', 'office_id', 'sales'])
then plot
import seaborn as sns
sns.set(style="whitegrid")
# Grouped bars: office_id along the x-axis, one coloured bar per state.
# catplot is the current name of the function formerly called factorplot;
# it takes the same arguments here.
g = sns.catplot(x="office_id", y="sales", hue="state", data=flat_df,
                kind="bar", palette="muted")
edit: just realized there is a simpler way to flatten the dataframe:
# Move the index levels of `results` into ordinary columns; `results` itself
# is left untouched because reset_index returns a new frame by default.
flat_df = results.reset_index()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Reproducible toy data: 12 sales figures across 4 states and 6 offices.
np.random.seed(0)
df = pd.DataFrame({
    'state': ['CA', 'WA', 'CO', 'AZ'] * 3,
    'office_id': list(range(1, 7)) * 2,
    'sales': [np.random.randint(100000, 999999) for _ in range(12)],
})

# Each office's share of its state's total sales, in percent.
state_office = df.groupby(['state', 'office_id']).agg({'sales': 'sum'})
state = df.groupby(['state']).agg({'sales': 'sum'})
results = state_office.div(state, level='state') * 100
# Flatten the (state, office_id) index into ordinary columns for plotting.
results = results.reset_index()

fig, ax = plt.subplots()
# One scatter series (and legend entry) per state. The loop variables are
# deliberately not named `df`, so the source DataFrame above stays intact.
for state_name, state_rows in results.groupby('state'):
    ax.scatter(state_rows['office_id'], state_rows['sales'], label=state_name)
ax.legend()
ax.set_title('Scatterplot')
ax.set_xlabel('office_id')
ax.set_ylabel('sales')
This prints a scatterplot. See if you can take it from here!