Plot a list of dictionaries using matplotlib - python

# Monthly sales records, one dict per (month, store) pair.
# Every value is a string, so numeric fields must be converted before plotting.
List = [
    {'Month': '1', 'Store': 'A', 'Sales': '100'},
    {'Month': '2', 'Store': 'A', 'Sales': '50'},
    {'Month': '3', 'Store': 'A', 'Sales': '200'},
    {'Month': '1', 'Store': 'B', 'Sales': '300'},
    {'Month': '2', 'Store': 'B', 'Sales': '200'},
    {'Month': '3', 'Store': 'B', 'Sales': '250'},
]
I do know how to plot the basic line.
But how can I plot both data sets (stores A and B) together on the same chart?
Like this Expected result

This will do it. Putting the data into pandas simplifies things; then plot one line per store on the same axes and they will all be shown on the same chart.
import pandas as pd
import matplotlib.pyplot as plt

# `your_data` is a placeholder for the list of dicts from the question.
df = pd.DataFrame(your_data)
# The values arrive as strings; convert them so both axes are numeric.
# errors='coerce' turns anything unparseable into NaN instead of raising.
df[['Month', 'Sales']] = df[['Month', 'Sales']].apply(pd.to_numeric, errors='coerce')

fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(111)
# Draw one line per store on the same axes. The explicit label makes the
# legend show the store name instead of "Sales" for every line.
for store, store_sales in df.groupby('Store'):
    store_sales.plot(x='Month', y='Sales', ax=ax, label=store)
ax.grid(True)
fig.set_facecolor('white')
plt.show()

Related

A column in a DataFrame holds a list of dictionaries; I want to convert it into separate columns

data = [{'category': 'A', 'maxvalue': '6', 'minvalue': '0'}, {'category': 'B', 'maxvalue': '11', 'minvalue': '6'}, {'category': 'C', 'maxvalue': '21', 'minvalue': '11'}, {'category': 'D', 'maxvalue': '31', 'minvalue': '21'}, {'category': 'E', 'maxvalue': '41', 'minvalue': '31'}, {'category': 'F', 'maxvalue': '9999999999', 'minvalue': '41'}]
This data is stored inside a DataFrame column. The format I want is shown below, alongside the other columns of the DataFrame:
A B C D E F
0-6 6-11 11-21 21-31 31-41 41-51
For example, given data such as the following:
import pandas as pd
import numpy as np

# Example input: each row of the 'bucket' column holds a list of
# {category, maxvalue, minvalue} dicts; the last row is an empty list.
data = [
    [{'category': 'A', 'maxvalue': '6', 'minvalue': '0'},
     {'category': 'B', 'maxvalue': '11', 'minvalue': '6'},
     {'category': 'C', 'maxvalue': '21', 'minvalue': '11'},
     {'category': 'D', 'maxvalue': '31', 'minvalue': '21'},
     {'category': 'E', 'maxvalue': '41', 'minvalue': '31'},
     {'category': 'F', 'maxvalue': '9999999999', 'minvalue': '41'}],
    [{'category': 'A', 'maxvalue': '5', 'minvalue': '0'},
     {'category': 'B', 'maxvalue': '10', 'minvalue': '5'},
     {'category': 'C', 'maxvalue': '20', 'minvalue': '10'},
     {'category': 'D', 'maxvalue': '30', 'minvalue': '20'},
     {'category': 'E', 'maxvalue': '40', 'minvalue': '30'},
     {'category': 'F', 'maxvalue': '9999999999', 'minvalue': '41'}],
    [],
]
df = pd.DataFrame(columns=['bucket'])
df['bucket'] = data
df

# Expand every list of dicts into its own small DataFrame
# (columns: category, maxvalue, minvalue); an empty list gives an empty frame.
df2 = df.bucket.apply(lambda x: pd.DataFrame.from_dict(x))
df2

# The output column labels are the categories of the first row.
categories = df2.iloc[0]['category'].to_list()

# Build one output row per input row. `data` is reused for the result rows
# because the follow-up snippet assigns it back into `df`.
data = []
for bucket in df2:
    if not bucket.empty:
        # "min-max" label for each category, e.g. "0-6"
        bucket['value'] = bucket['minvalue'] + '-' + bucket['maxvalue']
        data.append(bucket['value'].to_list())
    else:
        # no buckets in this row: fill with zeros, one per category
        data.append(np.zeros(len(categories)))
dfinal = pd.DataFrame(data, columns=categories)
dfinal
Alternatively, if you want to add the new columns to the same dataframe:
# Add one column per category (the columns of `dfinal`) to the original frame.
# `data` is the list of result rows collected by the loop above.
df[dfinal.columns]=data
# Bare expression: shows the updated frame when run in a notebook/REPL.
df
One solution could be as follows:
import pandas as pd
import numpy as np
from itertools import chain

# data set without duplicates in "category"
lst = [[{'category': 'A', 'maxvalue': '4', 'minvalue': '0'},
        {'category': 'B', 'maxvalue': '51', 'minvalue': '41'}],
       np.nan,
       [{'category': 'A', 'maxvalue': '6', 'minvalue': '0'},
        {'category': 'B', 'maxvalue': '21', 'minvalue': '11'}]
       ]
df = pd.DataFrame(columns=['bucket'])
df['bucket'] = lst

# code
# select notna rows
out = df[df['bucket'].notna()]
# get dict keys in separate cols with their associated vals as the vals
# repeat index from df according to len of each list, so every dict stays
# tied to the row of `df` it came from
out = pd.DataFrame(list(chain.from_iterable(out['bucket'])),
                   index=out.index.repeat(out['bucket'].str.len()))
out['result'] = out.minvalue.astype(str) + '-' + out.maxvalue.astype(str)

# =============================================================================
# solution 1: no duplicate categories per row
# =============================================================================
# `index` is deliberately omitted so the pivot uses the existing row index.
# Passing index=None makes recent pandas look for a column labelled None.
out1 = out.pivot(columns='category', values='result').fillna('')
print(out1)
category A B
0 0-4 41-51
2 0-6 11-21
# =============================================================================
# solution 2: duplicate categories per row, e.g. 2x "A" in row 0
# =============================================================================
# Same layout as before, but row 0 now holds category "A" twice.
lst = [[{'category': 'A', 'maxvalue': '4', 'minvalue': '0'},
        {'category': 'A', 'maxvalue': '51', 'minvalue': '41'}],
       np.nan,
       [{'category': 'A', 'maxvalue': '6', 'minvalue': '0'},
        {'category': 'B', 'maxvalue': '21', 'minvalue': '11'}]
       ]
# continue after `out['result'] = (out.minvalue...)`
# (i.e. `df` and `out` must first be rebuilt from this `lst`)
out2 = out.copy()
# Join the results of duplicate categories within a row: "0-4,41-51".
out2['result'] = (out2.groupby([out2.index, 'category'])['result']
                      .transform(lambda x: ','.join(x)))
# After the join every duplicate carries the same string, so keep only the
# first entry per (row, category) pair.
out2.set_index('category', append=True, inplace=True)
out2 = out2[~out2.index.duplicated(keep='first')]['result']
# Restore the original row index (the unnamed level comes back as 'level_0').
out2 = out2.reset_index().set_index('level_0').rename_axis(None, axis=0)
# `index` is omitted so the pivot uses the existing row index.
# Passing index=None makes recent pandas look for a column labelled None.
out2 = out2.pivot(columns='category', values='result').fillna('')
print(out2)
category A B
0 0-4,41-51
2 0-6 11-21
# =============================================================================
# solution 3: same data as 2, but only keeping first of duplicates
# =============================================================================
# continue after `out['result'] = (out.minvalue...)`
out3 = out.copy()
# Keep only the first entry per (row, category) pair, then restore the
# original row index without a name.
out3 = (out3.reset_index(drop=False)
            .drop_duplicates(['index', 'category'])
            .set_index('index', drop=True)
            .rename_axis(None, axis=0))
# `index` is omitted so the pivot uses the existing row index.
# Passing index=None makes recent pandas look for a column labelled None.
out3 = out3.pivot(columns='category', values='result').fillna('')
print(out3) # only printing 0-4, not 41-51 for "A" in row 0
category A B
0 0-4
2 0-6 11-21

Two sided grouped barplots with Python seaborn

I'm trying to draw a two-sided graph similar to the population pyramid explained here and here. The problem is that I have categorical variables (male/female) that I want to group together:
import matplotlib.pyplot as plt  # needed for plt.subplots / plt.show below
import pandas as pd
import seaborn as sns

# data
data = {'species': ['X', 'X', 'Y', 'Y', 'Z', 'Z', 'X', 'X', 'Y', 'Y', 'Z', 'Z'],
        'sex': ['male', 'female', 'male', 'female', 'male', 'female', 'male', 'female', 'male', 'female', 'male', 'female'],
        'mass (g)': [4000, 3500, 3800, 3200, 5500, 4900, 2500, 2100, 2400, 2000, 4200, 3800],
        'age': ['adult', 'adult', 'adult', 'adult', 'adult', 'adult', 'juvenile', 'juvenile', 'juvenile', 'juvenile', 'juvenile', 'juvenile']}
df = pd.DataFrame(data)
# convert juvenile mass to negative
df.loc[df.age.eq('juvenile'), 'mass (g)'] = df['mass (g)'].mul(-1)
# plot
sns.set_theme(style="whitegrid")
fig, ax = plt.subplots(figsize=(10,5))
# Draw explicitly on `ax`, the axes that is reconfigured below.
sns.barplot(data=df, x='mass (g)', y='species', hue='sex', ci=False, orient='horizontal', dodge=True, ax=ax)
# move the species tick labels and axis label to the right-hand side
ax.yaxis.tick_right()
ax.yaxis.set_label_position("right")
plt.show()
The figure below is what I'm aiming for. Different color bars are for male/female sex. Different species X, Y, Z are in separate categorical groups. The bars on the right side of the figure show mass of adults for each species.
I sketched in red the bars on the left side to show mass of juveniles for each species. How do I plot this? I can't find anything useful in the seaborn docs or on SO.
If you mix positive and negative values, by default seaborn's barplot will average them out.
You could draw two barplots back to back and reverse the left one:
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd

data = {'species': ['X', 'X', 'Y', 'Y', 'Z', 'Z', 'X', 'X', 'Y', 'Y', 'Z', 'Z'],
        'sex': ['male', 'female', 'male', 'female', 'male', 'female', 'male', 'female', 'male', 'female', 'male', 'female'],
        'mass (g)': [4000, 3500, 3800, 3200, 5500, 4900, 2500, 2100, 2400, 2000, 4200, 3800],
        'age': ['adult', 'adult', 'adult', 'adult', 'adult', 'adult', 'juvenile', 'juvenile', 'juvenile', 'juvenile', 'juvenile', 'juvenile']}
df = pd.DataFrame(data)
# A categorical hue column forces the same male/female order in both panels.
df['sex'] = pd.Categorical(df['sex'])
sns.set_theme(style='whitegrid')
# Two panels sharing the species axis, with no horizontal gap between them.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(10, 5), sharey=True, gridspec_kw={'wspace': 0})

# Keyword arguments common to both barplot calls.
bar_kwargs = dict(x='mass (g)', y='species', hue='sex', ci=False, orient='horizontal', dodge=True)
is_adult = df['age'] == 'adult'
is_juvenile = df['age'] == 'juvenile'

# Right-hand panel: adults, with the species labels on the right edge.
sns.barplot(data=df[is_adult], ax=ax2, **bar_kwargs)
ax2.yaxis.set_label_position('right')
ax2.tick_params(axis='y', labelright=True, right=True)
ax2.set_title(' '+'adult', loc='left')
ax2.get_legend().remove()  # the only legend shown is the one in ax1

# Left-hand panel: juveniles.
sns.barplot(data=df[is_juvenile], ax=ax1, **bar_kwargs)

# Optionally use the same scale on both sides.
xmax = max(ax1.get_xlim()[1], ax2.get_xlim()[1])
for panel in (ax1, ax2):
    panel.set_xlim(xmax=xmax)

# Mirror the left panel so its bars grow away from the centre line.
ax1.invert_xaxis()
ax1.tick_params(labelleft=False, left=False)
ax1.set_ylabel('')
ax1.set_title('juvenile'+' ', loc='right')
plt.tight_layout()
plt.show()
An interesting feature of seaborn's barplots is that it will also do the work of averaging out the values given a dataframe with a row for each individual (and calculate a confidence interval).
Try something like this:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# data
data = {'species': ['X', 'X', 'Y', 'Y', 'Z', 'Z', 'X', 'X', 'Y', 'Y', 'Z', 'Z'],
        'sex': ['male', 'female', 'male', 'female', 'male', 'female', 'male', 'female', 'male', 'female', 'male', 'female'],
        'mass (g)': [4000, 3500, 3800, 3200, 5500, 4900, 2500, 2100, 2400, 2000, 4200, 3800],
        'age': ['adult', 'adult', 'adult', 'adult', 'adult', 'adult', 'juvenile', 'juvenile', 'juvenile', 'juvenile', 'juvenile', 'juvenile']}
df = pd.DataFrame(data)

# Flip the sign of the juvenile masses so their bars extend left of zero.
is_juvenile = df['age'] == 'juvenile'
df.loc[is_juvenile, 'mass (g)'] = -df.loc[is_juvenile, 'mass (g)']

# plot
sns.set_theme(style="whitegrid")
fig, ax = plt.subplots(figsize=(10,5))
# One row per species; columns are a two-level (age, sex) index of masses.
by_group = df.set_index(['species','sex','age'])
df_reshape = by_group.unstack(['age','sex'])['mass (g)']
adult_masses = df_reshape.loc[:, 'adult']
juvenile_masses = df_reshape.loc[:, 'juvenile']
# Both bar sets share one axes: adults on the positive side, juveniles on the negative.
adult_masses.plot.barh(ax=ax)
juvenile_masses.plot.barh(legend=False, ax=ax)
plt.show()
Output:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# data
data = {'species': ['X', 'X', 'Y', 'Y', 'Z', 'Z', 'X', 'X', 'Y', 'Y', 'Z', 'Z'],
        'sex': ['male', 'female', 'male', 'female', 'male', 'female', 'male', 'female', 'male', 'female', 'male', 'female'],
        'mass (g)': [4000, 3500, 3800, 3200, 5500, 4900, 2500, 2100, 2400, 2000, 4200, 3800],
        'age': ['adult', 'adult', 'adult', 'adult', 'adult', 'adult', 'juvenile', 'juvenile', 'juvenile', 'juvenile', 'juvenile', 'juvenile']}
df = pd.DataFrame(data)

# Negate the juvenile masses so they are drawn on the opposite side of zero.
juvenile_rows = df['age'] == 'juvenile'
df.loc[juvenile_rows, 'mass (g)'] = -df.loc[juvenile_rows, 'mass (g)']

# plot
sns.set_theme(style="whitegrid")
fig, ax = plt.subplots(figsize=(10,5))
# One row per species; columns are a two-level (age, sex) index of masses.
indexed = df.set_index(['species','sex','age'])
df_reshape = indexed.unstack(['age','sex'])['mass (g)']
# Selecting with a list keeps the age level in the column labels.
adult_part = df_reshape.loc[:, ['adult']]
juvenile_part = df_reshape.loc[:, ['juvenile']]
adult_part.plot.barh(ax=ax, edgecolor='k')
# Juvenile bars get their own colours, transparency and hatching.
juvenile_part.plot.barh(ax=ax, label='Juvenile', color=['navy','red'], alpha=.6, edgecolor='k', hatch='/')
plt.show()
Output:
I just used `pivot` to shape the data correctly:
import matplotlib.pyplot as plt  # needed for plt.subplots / plt.show below
import pandas as pd
import seaborn as sns

# data
data = {'species': ['X', 'X', 'Y', 'Y', 'Z', 'Z', 'X', 'X', 'Y', 'Y', 'Z', 'Z'],
        'sex': ['male', 'female', 'male', 'female', 'male', 'female', 'male', 'female', 'male', 'female', 'male', 'female'],
        'mass (g)': [4000, 3500, 3800, 3200, 5500, 4900, 2500, 2100, 2400, 2000, 4200, 3800],
        'age': ['adult', 'adult', 'adult', 'adult', 'adult', 'adult', 'juvenile', 'juvenile', 'juvenile', 'juvenile', 'juvenile', 'juvenile']}
df = pd.DataFrame(data)
# convert juvenile mass to negative
df.loc[df.age.eq('juvenile'), 'mass (g)'] = df['mass (g)'].mul(-1)
# pivot data: one row per (species, sex) with separate 'adult' and
# 'juvenile' mass columns. A scalar `values` keeps the columns flat, so one
# reset_index() is enough to get plain species/sex/adult/juvenile columns.
df = df.pivot(index=['species', 'sex'], columns='age', values='mass (g)').reset_index()
# plot
sns.set_theme(style="whitegrid")
fig, ax = plt.subplots(figsize=(10,5))
# Both calls draw on the same axes: adults to the right of zero, juveniles to the left.
sns.barplot(data=df, x='adult', y='species', hue='sex', ci=False, orient='horizontal', dodge=True, ax=ax)
sns.barplot(data=df, x='juvenile', y='species', hue='sex', ci=False, orient='horizontal', dodge=True, ax=ax)
plt.show()

pandas : drop duplicates in the same time when grouping by

I'm doing a simple groupby on my data, as shown in the code below. Is there a way to do it directly, in the same line of code, without the drop_duplicates call?
Thank you
# Replace each row's Revenue with the total of its (cod, date, zone) group,
# then collapse the now-identical rows into a single row per group.
df_brut['Revenue'] = df_brut.groupby(['cod', 'date', 'zone'])['Revenue'].transform('sum')
df_brut = df_brut.drop_duplicates()
# Rename by label, not by position. The frame's columns are ordered
# date, cod, zone, Revenue, so assigning ['cod', 'date', 'zone', 'SUM_']
# positionally would swap the `date` and `cod` labels.
df_brut = df_brut.rename(columns={'Revenue': 'SUM_'})
My data
# Raw input: one row per revenue entry; (date, cod, zone) combinations repeat.
data1 = {
    'date': ['2021-06', '2021-06', '2021-07', '2021-07', '2021-07', '2021-07'],
    'cod': ['12', '12', '14', '15', '15', '18'],
    'zone': ['LA', 'LA', 'LA', 'PARIS', 'PARIS', 'PARIS'],
    'Revenue': [10, 20, 30, 50, 40, 10],
}
df_brut = pd.DataFrame(data1)
the grouped data expected is
# Expected result: one row per (date, cod, zone) with the summed revenue.
data2 = {
    'date': ['2021-06', '2021-07', '2021-07', '2021-07'],
    'cod': ['12', '14', '15', '18'],
    'zone': ['LA', 'LA', 'PARIS', 'PARIS'],
    'SUM_': [30, 30, 90, 10],
}
df_grouped = pd.DataFrame(data2)
You could do:
# Sum Revenue per (cod, date, zone) group in a single expression.
# as_index=False keeps the group keys as ordinary columns, so there is one
# row per group and no drop_duplicates() step is needed. The summed column
# is named 'SUM_' to match the expected output in the question.
(df_brut.groupby(['cod', 'date', 'zone'], as_index=False)['Revenue']
        .sum()
        .rename(columns={'Revenue': 'SUM_'})
)

How to create a heatmap with condition?

I have the following data:
# Group label for each observation: eight 'A', two 'B', eight 'C'.
keys = ['A'] * 8 + ['B'] * 2 + ['C'] * 8
# Value observed for each entry of `keys`, in the same order.
values = (
    ['111', '222', '333', '444', '555', '666', '777', '888']            # A
    + ['222', '888']                                                    # B
    + ['222', '333', '999', '444', '555', '666', '777', '888']          # C
)
I want to create a heatmap as follows:
# NOTE(review): `x` and `y` are used as column names but are not defined in
# this snippet; presumably they are strings set elsewhere - confirm.
mydata = pd.DataFrame({x: values, y: keys})
# Index the keys by their value, one-hot encode them, then transpose so each
# key becomes a row and each original observation becomes a column.
keys_by_value = mydata.set_index(x)[y].astype(str)
one_hot = keys_by_value.str.get_dummies()
df_new = one_hot.T
fig, ax = plt.subplots(figsize=(20, 5))
ax = sns.heatmap(df_new, cbar=False, linewidths=.5)
plt.show()
The only issue is that the values appear as duplicated columns in a heatmap. For example, 222 appears 3 times in the heatmap. How can I push the same value to be in a single column?

Need help in Python Pivot table group by

I have a dataframe structured something like the one below:
I need to make it look like this:
Can anyone help, please?
You can call groupby() with a list of key columns and then apply summarising functions with agg().
import pandas as pd

# Example transactions: one row per (customer, group_code, ind_code) purchase.
df = pd.DataFrame({'customer': [1,2,1,3,1,2,3],
                   "group_code": ['111', '111', '222', '111', '111', '111', '333'],
                   "ind_code": ['A', 'B', 'AA', 'A', 'AAA', 'C', 'BBB'],
                   "amount": [100, 200, 140, 400, 225, 125, 600],
                   "card": ['XXX', 'YYY', 'YYY', 'XXX', 'XXX', 'YYY', 'XXX']})
# Aggregate only the numeric `amount` column: taking the mean of the string
# column `card` raises TypeError on recent pandas. Selecting with a list
# ([['amount']]) keeps the two-level (column, statistic) header.
df_groupby = (df.groupby(['customer', 'group_code', 'ind_code'])[['amount']]
                .agg(['count', 'mean']))

Categories

Resources