# Bucket definitions: one dict per category, bounds kept as strings.
# Each tuple below is (category, minvalue, maxvalue).
data = [
    {'category': name, 'maxvalue': upper, 'minvalue': lower}
    for name, lower, upper in (
        ('A', '0', '6'),
        ('B', '6', '11'),
        ('C', '11', '21'),
        ('D', '21', '31'),
        ('E', '31', '41'),
        ('F', '41', '9999999999'),
    )
]
This data is stored inside a DataFrame column. I want to reshape it into the format shown below, while keeping the other columns of the DataFrame:
A B C D E F
0-6 6-11 11-21 21-31 31-41 41-51
For example, with data like the following:
import numpy as np
import pandas as pd

# Every row of the 'bucket' column holds a list of bucket dicts; the last row
# is deliberately empty so the "no buckets" path is exercised too.
data = [
    [
        {'category': 'A', 'maxvalue': '6', 'minvalue': '0'},
        {'category': 'B', 'maxvalue': '11', 'minvalue': '6'},
        {'category': 'C', 'maxvalue': '21', 'minvalue': '11'},
        {'category': 'D', 'maxvalue': '31', 'minvalue': '21'},
        {'category': 'E', 'maxvalue': '41', 'minvalue': '31'},
        {'category': 'F', 'maxvalue': '9999999999', 'minvalue': '41'},
    ],
    [
        {'category': 'A', 'maxvalue': '5', 'minvalue': '0'},
        {'category': 'B', 'maxvalue': '10', 'minvalue': '5'},
        {'category': 'C', 'maxvalue': '20', 'minvalue': '10'},
        {'category': 'D', 'maxvalue': '30', 'minvalue': '20'},
        {'category': 'E', 'maxvalue': '40', 'minvalue': '30'},
        {'category': 'F', 'maxvalue': '9999999999', 'minvalue': '41'},
    ],
    [],
]
df = pd.DataFrame(columns=['bucket'])
df['bucket'] = data
df

# Turn each list of dicts into its own small DataFrame (an empty list yields
# an empty DataFrame), giving a Series whose elements are DataFrames.
df2 = df['bucket'].apply(pd.DataFrame.from_dict)
df2

# Column labels of the result are the categories of the first row.
categories = df2.iloc[0]['category'].to_list()

# `data` is rebound to the list of output rows: one "min-max" string per
# category, or zeros when a row has no buckets so every row keeps its width.
data = []
for frame in df2:
    if frame.empty:
        data.append(np.zeros(len(categories)))
    else:
        frame['value'] = frame['minvalue'] + '-' + frame['maxvalue']
        data.append(frame['value'].to_list())

dfinal = pd.DataFrame(data, columns=categories)
dfinal
Alternatively, if you want to add the new columns to the same dataframe:
# Add one column per label in `dfinal.columns` to the original frame `df`,
# filling them from `data` (the list of per-row values built in the loop above).
df[dfinal.columns]=data
# Bare expression: presumably there to display the frame in a notebook/REPL.
df
One solution could be as follows:
from itertools import chain

import numpy as np
import pandas as pd

# data set without duplicates in "category"
lst = [
    [{'category': 'A', 'maxvalue': '4', 'minvalue': '0'},
     {'category': 'B', 'maxvalue': '51', 'minvalue': '41'}],
    np.nan,
    [{'category': 'A', 'maxvalue': '6', 'minvalue': '0'},
     {'category': 'B', 'maxvalue': '21', 'minvalue': '11'}],
]
df = pd.DataFrame(columns=['bucket'])
df['bucket'] = lst

# code
# Keep only the rows that actually hold a list of buckets.
out = df[df['bucket'].notna()]
# One row per bucket dict: the dict keys become columns, and the original row
# label is repeated once per dict so every bucket stays tied to its source row.
out = pd.DataFrame(
    list(chain.from_iterable(out['bucket'])),
    index=out.index.repeat(out['bucket'].str.len()),
)
out['result'] = out['minvalue'].astype(str) + '-' + out['maxvalue'].astype(str)

# =============================================================================
# solution 1: no duplicate categories per row
# =============================================================================
# `index` is left out on purpose so pivot uses the existing row labels;
# passing index=None is read as a column label named None on pandas >= 2.0.
out1 = out.pivot(columns='category', values='result').fillna('')
print(out1)
category A B
0 0-4 41-51
2 0-6 11-21
# =============================================================================
# solution 2: duplicate categories per row, e.g. 2x "A" in row 0
# =============================================================================
lst = [
    [{'category': 'A', 'maxvalue': '4', 'minvalue': '0'},
     {'category': 'A', 'maxvalue': '51', 'minvalue': '41'}],
    np.nan,
    [{'category': 'A', 'maxvalue': '6', 'minvalue': '0'},
     {'category': 'B', 'maxvalue': '21', 'minvalue': '11'}],
]

# Rebuild `out` from this data set; the frame built earlier still reflects the
# duplicate-free data. Same steps as before: drop NaN rows, one row per bucket
# dict labelled with its source row, then the "min-max" string.
df = pd.DataFrame(columns=['bucket'])
df['bucket'] = lst
out = df[df['bucket'].notna()]
out = pd.DataFrame(
    list(chain.from_iterable(out['bucket'])),
    index=out.index.repeat(out['bucket'].str.len()),
)
out['result'] = out['minvalue'].astype(str) + '-' + out['maxvalue'].astype(str)

# Join the ranges of a category that occurs more than once in the same row,
# then spread the categories into columns (missing combinations become '').
out2 = (
    out.groupby([out.index, 'category'])['result']
    .agg(','.join)
    .unstack('category')
    .fillna('')
)
print(out2)
category A B
0 0-4,41-51
2 0-6 11-21
# =============================================================================
# solution 3: same data as 2, but only keeping first of duplicates
# =============================================================================
# continue after `out['result'] = (out.minvalue...)`
# `out` is the one-row-per-bucket frame (with its 'result' column) built before
# this point. Move the row label into a column so duplicates can be detected
# per (row, category), keep the first of each, then restore the row label.
out3 = (
    out.reset_index()
    .drop_duplicates(['index', 'category'])
    .set_index('index')
    .rename_axis(None)
)
# `index` is left out on purpose so pivot uses the existing row labels;
# passing index=None is read as a column label named None on pandas >= 2.0.
out3 = out3.pivot(columns='category', values='result').fillna('')
print(out3)  # only printing 0-4, not 41-51 for "A" in row 0
category A B
0 0-4
2 0-6 11-21
I'm trying to draw a two-sided graph similar to the population pyramid explained here and here. The problem is that I have categorical variables (male/female) that I want to group together:
import matplotlib.pyplot as plt  # needed for plt.subplots / plt.show below
import pandas as pd
import seaborn as sns

# data
data = {'species': ['X', 'X', 'Y', 'Y', 'Z', 'Z', 'X', 'X', 'Y', 'Y', 'Z', 'Z'],
        'sex': ['male', 'female', 'male', 'female', 'male', 'female', 'male', 'female', 'male', 'female', 'male', 'female'],
        'mass (g)': [4000, 3500, 3800, 3200, 5500, 4900, 2500, 2100, 2400, 2000, 4200, 3800],
        'age': ['adult', 'adult', 'adult', 'adult', 'adult', 'adult', 'juvenile', 'juvenile', 'juvenile', 'juvenile', 'juvenile', 'juvenile']}
df = pd.DataFrame(data)

# convert juvenile mass to negative so those bars extend to the left of zero
df.loc[df.age.eq('juvenile'), 'mass (g)'] = df['mass (g)'].mul(-1)

# plot
sns.set_theme(style="whitegrid")
fig, ax = plt.subplots(figsize=(10,5))
# NOTE(review): `ci` is deprecated in seaborn >= 0.12 in favour of
# `errorbar=None` - confirm the target seaborn version before switching.
sns.barplot(data=df, x='mass (g)', y='species', hue='sex', ci=False, orient='horizontal', dodge=True)
# put the species tick labels and the axis label on the right-hand side
ax.yaxis.tick_right()
ax.yaxis.set_label_position("right")
plt.show()
The figure below is what I'm aiming for. Different color bars are for male/female sex. Different species X, Y, Z are in separate categorical groups. The bars on the right side of the figure show mass of adults for each species.
I sketched in red the bars on the left side to show mass of juveniles for each species. How do I plot this? I can't find anything useful in the seaborn docs or on SO.
If you mix positive and negative values, by default seaborn's barplot will average them out.
You could draw two barplots back to back and reverse the left one:
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd

# Six adult rows followed by six juvenile rows; species and sex repeat in the
# same order within each age group.
data = {
    'species': ['X', 'X', 'Y', 'Y', 'Z', 'Z'] * 2,
    'sex': ['male', 'female'] * 6,
    'mass (g)': [4000, 3500, 3800, 3200, 5500, 4900, 2500, 2100, 2400, 2000, 4200, 3800],
    'age': ['adult'] * 6 + ['juvenile'] * 6,
}
df = pd.DataFrame(data)
# A categorical hue column gives both subplots the same fixed hue order.
df['sex'] = pd.Categorical(df['sex'])

sns.set_theme(style='whitegrid')
fig, (ax1, ax2) = plt.subplots(
    ncols=2, figsize=(10, 5), sharey=True, gridspec_kw={'wspace': 0},
)

# Keyword arguments shared by both halves of the figure.
bar_options = dict(x='mass (g)', y='species', hue='sex',
                   ci=False, orient='horizontal', dodge=True)

# Right-hand subplot: adults, with the species labels on its right edge.
sns.barplot(data=df[df['age'] == 'adult'], ax=ax2, **bar_options)
ax2.yaxis.set_label_position('right')
ax2.tick_params(axis='y', labelright=True, right=True)
ax2.set_title(' adult', loc='left')
ax2.legend_.remove()  # only the left subplot keeps a legend

# Left-hand subplot: juveniles.
sns.barplot(data=df[df['age'] == 'juvenile'], ax=ax1, **bar_options)

# Optional: give both halves the same upper x limit.
xmax = max(ax1.get_xlim()[1], ax2.get_xlim()[1])
for axis in (ax1, ax2):
    axis.set_xlim(xmax=xmax)

# Mirror the left subplot so its bars grow towards the left.
ax1.invert_xaxis()
ax1.tick_params(labelleft=False, left=False)
ax1.set_ylabel('')
ax1.set_title('juvenile ', loc='right')

plt.tight_layout()
plt.show()
An interesting feature of seaborn's barplots is that it will also do the work of averaging out the values given a dataframe with a row for each individual (and calculate a confidence interval).
Try something like this:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Six adult rows followed by six juvenile rows.
data = {
    'species': ['X', 'X', 'Y', 'Y', 'Z', 'Z'] * 2,
    'sex': ['male', 'female'] * 6,
    'mass (g)': [4000, 3500, 3800, 3200, 5500, 4900, 2500, 2100, 2400, 2000, 4200, 3800],
    'age': ['adult'] * 6 + ['juvenile'] * 6,
}
df = pd.DataFrame(data)

# Negate the juvenile masses so their bars extend to the left of zero.
juvenile_rows = df['age'] == 'juvenile'
df.loc[juvenile_rows, 'mass (g)'] = -df.loc[juvenile_rows, 'mass (g)']

sns.set_theme(style="whitegrid")
fig, ax = plt.subplots(figsize=(10, 5))

# Wide layout: one row per species, columns keyed by (age, sex).
indexed = df.set_index(['species', 'sex', 'age'])
df_reshape = indexed.unstack(['age', 'sex'])['mass (g)']

# Draw both age groups on the same axes; only the adult call adds a legend.
adult_mass = df_reshape.loc[:, 'adult']
juvenile_mass = df_reshape.loc[:, 'juvenile']
adult_mass.plot.barh(ax=ax)
juvenile_mass.plot.barh(legend=False, ax=ax)
plt.show()
Output:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Six adult rows followed by six juvenile rows.
data = {
    'species': ['X', 'X', 'Y', 'Y', 'Z', 'Z'] * 2,
    'sex': ['male', 'female'] * 6,
    'mass (g)': [4000, 3500, 3800, 3200, 5500, 4900, 2500, 2100, 2400, 2000, 4200, 3800],
    'age': ['adult'] * 6 + ['juvenile'] * 6,
}
df = pd.DataFrame(data)

# Negate the juvenile masses so their bars extend to the left of zero.
juvenile_rows = df['age'] == 'juvenile'
df.loc[juvenile_rows, 'mass (g)'] = -df.loc[juvenile_rows, 'mass (g)']

sns.set_theme(style="whitegrid")
fig, ax = plt.subplots(figsize=(10, 5))

# Wide layout: one row per species, columns keyed by (age, sex).
indexed = df.set_index(['species', 'sex', 'age'])
df_reshape = indexed.unstack(['age', 'sex'])['mass (g)']

# Per-group bar styling; the juvenile bars get their own colours and a hatch.
adult_style = {'edgecolor': 'k'}
juvenile_style = {
    'label': 'Juvenile',
    'color': ['navy', 'red'],
    'alpha': .6,
    'edgecolor': 'k',
    'hatch': '/',
}

# Selecting with a list keeps the (age, sex) column labels for both groups.
df_reshape.loc[:, ['adult']].plot.barh(ax=ax, **adult_style)
df_reshape.loc[:, ['juvenile']].plot.barh(ax=ax, **juvenile_style)
plt.show()
Output:
I just used `pivot` to shape the data correctly:
import matplotlib.pyplot as plt  # needed for plt.subplots / plt.show below
import pandas as pd
import seaborn as sns

# data
data = {'species': ['X', 'X', 'Y', 'Y', 'Z', 'Z', 'X', 'X', 'Y', 'Y', 'Z', 'Z'],
        'sex': ['male', 'female', 'male', 'female', 'male', 'female', 'male', 'female', 'male', 'female', 'male', 'female'],
        'mass (g)': [4000, 3500, 3800, 3200, 5500, 4900, 2500, 2100, 2400, 2000, 4200, 3800],
        'age': ['adult', 'adult', 'adult', 'adult', 'adult', 'adult', 'juvenile', 'juvenile', 'juvenile', 'juvenile', 'juvenile', 'juvenile']}
df = pd.DataFrame(data)

# convert juvenile mass to negative so those bars extend to the left of zero
df.loc[df.age.eq('juvenile'), 'mass (g)'] = df['mass (g)'].mul(-1)

# pivot data: one row per (species, sex) with an 'adult' and a 'juvenile'
# column. Passing `values` as a scalar keeps the columns flat, so a single
# reset_index() is enough to get plain species/sex/adult/juvenile columns.
df = df.pivot(index=['species', 'sex'], columns='age', values='mass (g)').reset_index()

# plot: both age groups are drawn on the same axes, one barplot call each
sns.set_theme(style="whitegrid")
fig, ax = plt.subplots(figsize=(10,5))
sns.barplot(data=df, x='adult', y='species', hue='sex', ci=False, orient='horizontal', dodge=True, ax=ax)
sns.barplot(data=df, x='juvenile', y='species', hue='sex', ci=False, orient='horizontal', dodge=True, ax=ax)
# show the figure when run as a script, as the other snippets do
plt.show()
I'm doing a simple groupby on my data, as shown in the code below. Is there a way to do it directly, in the same line of code, without the drop_duplicates call?
Thank you
# Replace each Revenue with the total of its (cod, date, zone) group, so rows
# of the same group become identical and can be collapsed by drop_duplicates.
df_brut['Revenue'] = df_brut.groupby(['cod', 'date', 'zone'])['Revenue'].transform('sum')
df_brut = df_brut.drop_duplicates()
# Rename by label, not by position: the frame's columns are ordered
# date, cod, zone, Revenue, so assigning ['cod', 'date', ...] positionally
# would swap the 'date' and 'cod' labels.
df_brut = df_brut.rename(columns={'Revenue': 'SUM_'})
My data
# Raw input: one row per revenue record.
data1 = {
    'date': ['2021-06'] * 2 + ['2021-07'] * 4,
    'cod': ['12', '12', '14', '15', '15', '18'],
    'zone': ['LA'] * 3 + ['PARIS'] * 3,
    'Revenue': [10, 20, 30, 50, 40, 10],
}
df_brut = pd.DataFrame(data=data1)
the grouped data expected is
# Expected result: one row per (date, cod, zone) with the summed revenue.
data2 = {
    'date': ['2021-06'] + ['2021-07'] * 3,
    'cod': ['12', '14', '15', '18'],
    'zone': ['LA'] * 2 + ['PARIS'] * 2,
    'SUM_': [30, 30, 90, 10],
}
df_grouped = pd.DataFrame(data=data2)
You could do:
# Aggregate directly instead of transform + drop_duplicates: as_index=False
# keeps the group keys as ordinary columns. The keys are listed as
# date, cod, zone and the total is named 'SUM_' so the result has the same
# column order and label as the expected frame.
(df_brut.groupby(['date', 'cod', 'zone'], as_index=False)['Revenue']
        .sum()
        .rename(columns={'Revenue': 'SUM_'})
)