Showing values on barplot - python

The following code generates a PDF file by looping over a data frame and drawing a bar plot on each iteration. My goal is to annotate the bars with their values. I have tried several times to display the values but failed. Could I get some help with this? Thanks
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from matplotlib.backends.backend_pdf import PdfPages
# Toy data: one row per (index, group, exam) combination.
d = {
    'index': ['index1', 'index1', 'index2', 'index2'],
    'group': ['gr1', 'gr1', 'gr2', 'gr2'],
    'targetscore': [15, 15, 10, 10],
    'exam': ['old', 'new', 'old', 'new'],
    'score': [5, 6, 7, 8],
}
df = pd.DataFrame(data=d)

# The context manager finalises the PDF even if plotting raises.
with PdfPages('mypath/extraction.pdf') as pp:
    # One page per distinct group, in order of first appearance. Iterating
    # over rows instead would draw each group once per row and fill the
    # PDF with duplicate pages.
    for _, subdf in df.groupby('group', sort=False):
        sns.catplot(y='score', x='group', data=subdf, hue='exam', kind='bar',
                    row='index', col='exam', col_order=['old', 'new'],
                    height=3, aspect=2)
        fig = plt.gcf()  # the figure catplot has just drawn
        pp.savefig(fig)
        plt.close(fig)  # release the figure once it is on its page

Related

Exporting looped plots into pdf in Python

I managed to create multiple subplots by looping over the df data frame below, but I cannot export all of them into one PDF. Any idea how to generate the PDF? Thanks
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Toy data: one row per (index, group, exam) combination.
d = {
    'index': ['index1', 'index1', 'index2', 'index2'],
    'group': ['gr1', 'gr1', 'gr2', 'gr2'],
    'targetscore': [15, 15, 10, 10],
    'exam': ['old', 'new', 'old', 'new'],
    'score': [5, 6, 7, 8],
}
df = pd.DataFrame(data=d)

# One facet grid per distinct group, in order of first appearance.
# Iterating over rows instead would draw each group once per row.
for _, subdf in df.groupby('group', sort=False):
    sns.catplot(y='score', x='group', data=subdf, hue='exam', kind='bar',
                row='index', col='exam', col_order=['old', 'new'],
                height=3, aspect=2)

# A bare `plt.show` only names the function; it has to be called for the
# figures to be displayed.
plt.show()
You can try something like this:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from matplotlib.backends.backend_pdf import PdfPages
# Toy data: one row per (index, group, exam) combination.
d = {
    'index': ['index1', 'index1', 'index2', 'index2'],
    'group': ['gr1', 'gr1', 'gr2', 'gr2'],
    'targetscore': [15, 15, 10, 10],
    'exam': ['old', 'new', 'old', 'new'],
    'score': [5, 6, 7, 8],
}
df = pd.DataFrame(data=d)

# Create the pdf named 'foo.pdf'; the context manager closes it on exit,
# even if one of the plots raises.
with PdfPages('yourpath/foo.pdf') as pp:
    # One page per distinct group, in order of first appearance. Iterating
    # over rows instead would save each group once per row.
    for _, subdf in df.groupby('group', sort=False):
        sns.catplot(y='score', x='group', data=subdf, hue='exam', kind='bar',
                    row='index', col='exam', col_order=['old', 'new'],
                    height=3, aspect=2)
        fig = plt.gcf()  # the figure catplot has just drawn
        pp.savefig(fig)  # save each figure as its own page in the pdf
        plt.close(fig)   # release the figure once it has been written

Matplotlib - plotting grouped values with a for loop

I'm trying to plot a graph grouped by column values using a for loop without knowing the number of unique values in that column.
You can see sample code below (without a for loop) and the desired output.
I would like each plot to have a different color and marker (as seen below).
This is the code:
import matplotlib.pyplot as plt  # required for plt.subplots() below
import pandas as pd
from numpy import random

# Five rows of random values; column 'W' is replaced by timestamps below.
df = pd.DataFrame(data=random.randn(5, 4), index=['A', 'B', 'C', 'D', 'E'],
                  columns=['W', 'X', 'Y', 'Z'])
df['W'] = ['10/01/2018 12:00:00', '10/03/2018 13:00:00',
           '10/03/2018 12:30:00', '10/04/2018 12:05:00',
           '10/08/2018 12:00:15']
df['W'] = pd.to_datetime(df['W'])
df['Entity'] = ['C201', 'C201', 'C201', 'C202', 'C202']
print(df.head())

# One hard-coded plot call per entity, all drawn on the same Axes.
fig, ax = plt.subplots()
df[df['Entity'] == "C201"].plot(x="W", y="Y", label='C201', ax=ax, marker='x')
df[df['Entity'] == "C202"].plot(x="W", y="Y", label='C202', ax=ax, marker='o')
plt.show()
This is the output:
You can first find out the unique values of your df['Entity'] and then loop over them. To generate new markers automatically for each Entity, you can define an order of some markers (let's say 5 in the answer below) which will repeat via marker=next(marker).
Complete minimal answer
import itertools
import pandas as pd
from numpy import random
import matplotlib.pyplot as plt
# Markers are handed out in this fixed order and start over after the fifth.
marker = itertools.cycle(('+', 'o', '*', '^', 's'))

# Five rows of random values; column 'W' is replaced by timestamps below.
df = pd.DataFrame(data=random.randn(5, 4), index=['A', 'B', 'C', 'D', 'E'],
                  columns=['W', 'X', 'Y', 'Z'])
df['W'] = ['10/01/2018 12:00:00', '10/03/2018 13:00:00',
           '10/03/2018 12:30:00', '10/04/2018 12:05:00',
           '10/08/2018 12:00:15']
df['W'] = pd.to_datetime(df['W'])
df['Entity'] = ['C201', 'C201', 'C201', 'C202', 'C202']

fig, ax = plt.subplots()
# Loop over the distinct entities in sorted order. The pandas `unique()`
# method is used because this script never imports numpy as `np`.
for idy in sorted(df['Entity'].unique()):
    df[df['Entity'] == idy].plot(x="W", y="Y", label=idy, ax=ax,
                                 marker=next(marker))
plt.legend()
plt.show()

Seaborn swarmplot: Get point coordinates [duplicate]

I have the following data:
import pandas as pd
import numpy as np
# Three synthetic samples of 75 uniform draws each; two are shifted.
a = np.random.random(75)
b = np.random.random(75) - 0.6
c = np.random.random(75) + 0.75

# Build the wide frame directly with its two-level column header
# (Genotype on top, Status underneath).
df = pd.DataFrame(
    np.column_stack((a, b, c)),
    columns=pd.MultiIndex.from_arrays(
        [['WT', 'MUT', 'WTxMUT'], ['Parent', 'Parent', 'Offspring']],
        names=['Genotype', 'Status'],
    ),
)

# Long format: one row per observation, with Genotype/Status/value columns.
df_melt = df.melt()
and I plot it in seaborn using this code:
import seaborn as sb
# Swarm of `value` per Status category, coloured by Genotype.
sb.swarmplot(x="Status", y="value", hue="Genotype", data=df_melt)
How do I get the x-span of each group? What is the range of the horizontal span of the swarmplot for the Parent group, for instance?
You can get the information from the collections which are created by swarmplot.
swarmplot actually returns the matplotlib Axes instance, and from there we can find the PathCollections that it creates. To get the positions, we can use .get_offsets().
Here is your example, modified to find and print the swarm limits, and then use them to plot a box around the swarms.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from matplotlib.patches import Rectangle
# Generate dummy data.
a = np.random.random(75)
b = np.random.random(75) - 0.6
c = np.random.random(75) + 0.75

# Collate into a DataFrame
df = pd.DataFrame({'a': a, 'b': b, 'c': c})
df.columns = [list(['WT', 'MUT', 'WTxMUT']), list(['Parent', 'Parent', 'Offspring'])]
df.columns.names = ['Genotype', 'Status']
df_melt = pd.melt(df)

ax = sb.swarmplot(data=df_melt, x="Status", y="value", hue="Genotype")


def getdatalim(coll):
    """Print the x/y extent of one swarm and draw a box around it on ``ax``.

    ``coll`` is one entry of ``ax.collections``; its point positions are
    read with ``get_offsets()``. A collection without any points is
    skipped: nothing is printed and no box is drawn.
    """
    x, y = np.array(coll.get_offsets()).T
    if x.size == 0:
        # min()/max() raise ValueError on empty arrays; nothing to outline.
        return
    print('xmin={}, xmax={}, ymin={}, ymax={}'.format(
        x.min(), x.max(), y.min(), y.max()))
    # np.ptp() rather than the ndarray.ptp() method, which NumPy 2.0 removed.
    rect = Rectangle((x.min(), y.min()), np.ptp(x), np.ptp(y),
                     edgecolor='k', facecolor='None', lw=3)
    ax.add_patch(rect)


getdatalim(ax.collections[0])  # "Parent"
getdatalim(ax.collections[1])  # "Offspring"
plt.show()
which prints:
xmin=-0.107313729132, xmax=0.10661092707, ymin=-0.598534246847, ymax=0.980441247759
xmin=0.942829146473, xmax=1.06105941656, ymin=0.761277608688, ymax=1.74729717464
And here's the figure:

Plotting percentage of totals with pandas group bys

I am trying to plot a bar chart of a pandas data frame that is the result of two group bys.
In particular, my data frame looks exactly like the output from another SO post's answer (https://stackoverflow.com/a/23377155/7243972):
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Fixed seed so the example figures are reproducible.
np.random.seed(0)

df = pd.DataFrame({
    'state': ['CA', 'WA', 'CO', 'AZ'] * 3,
    'office_id': list(range(1, 7)) * 2,
    'sales': [np.random.randint(100000, 999999) for _ in range(12)],
})

# Sales summed per (state, office) pair and per state.
state_office = df.groupby(['state', 'office_id'])[['sales']].sum()
state = df.groupby('state')[['sales']].sum()

# Each office's share of its state's total, as a percentage.
results = state_office.div(state, level='state') * 100
I would like to plot results so that each state is a different color and the office_id is on the x-axis. This is so that each office_id is grouped together and they can be easily compared.
I've tried adjusting the plot from results['sales'].plot.bar(), but I am struggling.
First you need to flatten the dataframe:
# Each index entry of `results` is a (state, office_id) pair. Pair it with
# its 'sales' percentage to get one flat record per office. The column is
# selected by name, so no integer-position lookup on a label-indexed row
# is needed.
data = [
    (state_name, office_id, sales)
    for (state_name, office_id), sales in results['sales'].items()
]
flat_df = pd.DataFrame(data, columns=['state', 'office_id', 'sales'])
then plot
import seaborn as sns
sns.set(style="whitegrid")
# catplot is the current name of the function formerly called factorplot.
# Grouped bars: office_id on the x-axis, one colour per state.
g = sns.catplot(x="office_id", y="sales", hue="state", data=flat_df,
                kind="bar", palette="muted")
edit: just realized there is a simpler way to flatten the dataframe:
# reset_index() returns a new frame with the index levels turned into
# ordinary columns; `results` itself is left unchanged.
flat_df = results.reset_index()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Fixed seed so the example figures are reproducible.
np.random.seed(0)
df = pd.DataFrame({'state': ['CA', 'WA', 'CO', 'AZ'] * 3,
                   'office_id': list(range(1, 7)) * 2,
                   'sales': [np.random.randint(100000, 999999)
                             for _ in range(12)]})
state_office = df.groupby(['state', 'office_id']).agg({'sales': 'sum'})
state = df.groupby(['state']).agg({'sales': 'sum'})
# Each office's share of its state's total, as a percentage.
results = state_office.div(state, level='state') * 100
# Turn the (state, office_id) index into ordinary columns for plotting.
results = results.reset_index()

fig, ax = plt.subplots()
# One scatter series per state. The loop variables have their own names so
# the source frame `df` is not overwritten by the last group.
for state_name, state_rows in results.groupby('state'):
    ax.scatter(state_rows['office_id'], state_rows['sales'], label=state_name)
ax.legend()
ax.set_title('Scatterplot')
ax.set_xlabel('office_id')
ax.set_ylabel('sales')
This draws a scatterplot. See if you can take it from here!

Obtaining span of plotted points from seaborn swarmplot

I have the following data:
import pandas as pd
import numpy as np
# Three synthetic samples of 75 uniform draws each; two are shifted.
a = np.random.random(75)
b = np.random.random(75) - 0.6
c = np.random.random(75) + 0.75

# Build the wide frame directly with its two-level column header
# (Genotype on top, Status underneath).
df = pd.DataFrame(
    np.column_stack((a, b, c)),
    columns=pd.MultiIndex.from_arrays(
        [['WT', 'MUT', 'WTxMUT'], ['Parent', 'Parent', 'Offspring']],
        names=['Genotype', 'Status'],
    ),
)

# Long format: one row per observation, with Genotype/Status/value columns.
df_melt = df.melt()
and I plot it in seaborn using this code:
import seaborn as sb
# Swarm of `value` per Status category, coloured by Genotype.
sb.swarmplot(x="Status", y="value", hue="Genotype", data=df_melt)
How do I get the x-span of each group? What is the range of the horizontal span of the swarmplot for the Parent group, for instance?
You can get the information from the collections which are created by swarmplot.
swarmplot actually returns the matplotlib Axes instance, and from there we can find the PathCollections that it creates. To get the positions, we can use .get_offsets().
Here is your example, modified to find and print the swarm limits, and then use them to plot a box around the swarms.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from matplotlib.patches import Rectangle
# Generate dummy data.
a = np.random.random(75)
b = np.random.random(75) - 0.6
c = np.random.random(75) + 0.75

# Collate into a DataFrame
df = pd.DataFrame({'a': a, 'b': b, 'c': c})
df.columns = [list(['WT', 'MUT', 'WTxMUT']), list(['Parent', 'Parent', 'Offspring'])]
df.columns.names = ['Genotype', 'Status']
df_melt = pd.melt(df)

ax = sb.swarmplot(data=df_melt, x="Status", y="value", hue="Genotype")


def getdatalim(coll):
    """Print the x/y extent of one swarm and draw a box around it on ``ax``.

    ``coll`` is one entry of ``ax.collections``; its point positions are
    read with ``get_offsets()``. A collection without any points is
    skipped: nothing is printed and no box is drawn.
    """
    x, y = np.array(coll.get_offsets()).T
    if x.size == 0:
        # min()/max() raise ValueError on empty arrays; nothing to outline.
        return
    print('xmin={}, xmax={}, ymin={}, ymax={}'.format(
        x.min(), x.max(), y.min(), y.max()))
    # np.ptp() rather than the ndarray.ptp() method, which NumPy 2.0 removed.
    rect = Rectangle((x.min(), y.min()), np.ptp(x), np.ptp(y),
                     edgecolor='k', facecolor='None', lw=3)
    ax.add_patch(rect)


getdatalim(ax.collections[0])  # "Parent"
getdatalim(ax.collections[1])  # "Offspring"
plt.show()
which prints:
xmin=-0.107313729132, xmax=0.10661092707, ymin=-0.598534246847, ymax=0.980441247759
xmin=0.942829146473, xmax=1.06105941656, ymin=0.761277608688, ymax=1.74729717464
And here's the figure:

Categories

Resources