I use seaborn to make a categorical barplot of a df containing Pearson correlation R-values for 17 vegetation classes, 3 carbon species and 4 regions. I try to recreate a smaller sample df here:
import pandas as pd
import seaborn as sns
import random
import numpy as np
# Small stand-in for the real data: 3 vegetation classes x 3 carbon species
# x 4 spatial scales, filled with random R and p values.
veg_classes = ['Tree bl dc', 'Shrubland', 'Grassland']
carbon_species = ['CO2', 'CO', 'CO2 corr']
spatial_scales = ['SH', 'larger AU region', 'AU', 'SE-AU']

df = pd.DataFrame({
    'veg class': veg_classes * 12,
    'Pearson R': np.random.uniform(0, 1, 36),
    'Pearson p': np.random.uniform(0, 0.1, 36),
    'carbon': [species for species in carbon_species for _ in range(3)] * 4,
    'spatial': [scale for scale in spatial_scales for _ in range(9)],
})

# In the original data the number of vegetation classes with an R value
# differs between spatial scales; dropping rows mimics that.
rows_to_drop = [11, 14, 17, 20, 23, 26, 28, 29, 31, 32, 34, 35]
df = df.drop(index=rows_to_drop)

# Two equivalent markers for bars that should be hatched (p > 0.05):
# an integer flag and a hatch string.
not_significant = df['Pearson p'] > 0.05
df['significant'] = (~not_significant).astype('int64')
df['hatch'] = np.where(not_significant, 'x', '')
df.head()
This is my plotting routine:
sns.set(font_scale=2.1)

# Nested bar plot: one facet row per spatial scale, one bar group per
# vegetation class and one bar per carbon species.
g = sns.catplot(
    data=df,
    kind="bar",
    row="spatial",
    x="veg class",
    y="Pearson R",
    hue="carbon",
    ci=None,
    palette="YlOrBr",
    aspect=5,
)

# Cosmetics: no left spine, bare row names as titles, no x-axis label,
# untitled legend and rotated tick labels.
g.despine(left=True)
g.set_titles("{row_name}")
g.set_axis_labels("", "Pearson R")
g.set(xlabel=None)
g.legend.set_title("")
g.set_xticklabels(rotation=60)
(The plot looks as follows: seaborn categorical barplot)
The plot is exactly how I would like it, except that now I would like to add hatching (or any kind of distinction) for all bars where the Pearson R value is insignificant, i.e. where the p value is larger than 0.05. I found this stackoverflow entry, but my problem differs from this, as the plots that should be hatched are not in repetitive order.
Any hints will be highly appreciated!
To decide which bars to hatch, we get a container for each graph unit, get the height of each individual bar, compare it with a specified threshold, and then set the hatching and edge color. Please add the following code at the end.
# Hatch every bar whose height reaches the threshold.
# Iterate over ax.patches (the individual bars): ax.containers holds bar
# groups, so using its length as the loop bound would only inspect the first
# few bars of each facet.
height_threshold = 0.8
for ax in g.axes.flat:
    for bar in ax.patches:
        if bar.get_height() >= height_threshold:
            bar.set_hatch('*')
            bar.set_edgecolor('k')
Edit: The data has been updated to match the actual data, and the code has been modified accordingly. Also, the logic is conditional on the value of the hatching column.
# Hatch the bars of each facet according to the 'hatch' column of df.
for ax in g.axes.flat:
    # Assumes the facet title is the bare 'spatial' value of that row.
    s = ax.get_title()
    # '@s' makes query() read the local variable s.
    dff = df.query('spatial == @s')
    dff = dff.sort_values('veg class', ascending=False)
    ha = dff['hatch'].tolist()
    print(ha)
    # NOTE(review): this presumes ax.patches[k] is the bar of the k-th sorted
    # row -- confirm the bar order for the seaborn version in use.
    for k, flag in enumerate(ha):
        if flag == 'x':
            ax.patches[k].set_hatch('*')
            ax.patches[k].set_edgecolor('k')
Related
I've been trying to work through the code in this function and cannot get my series to show up on my plots. Possibly there is an easier way to do this. In each plot I want to display each of the 7 entities as a time series for 1 indicator.
I'm struggling with how to group values by both year, and country. I am new to python and data science so I appreciate any help.
Here is a link to the csv data from the World Bank
https://datacatalog.worldbank.org/search/dataset/0037712
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (14, 7)

raw = pd.read_csv('WDIData.csv')

# Country codes to keep, and the line colour used for each country name.
countries = ['BIH', 'HRV', 'MKD', 'MNE', 'SRB', 'SVN', 'EUU']
colors = {
    'Bosnia and Herzegovina': "#66C2A5",
    'Croatia': "#FA8D62",
    'North Macedonia': "#F7BA20",
    'Montenegro': "#E68AC3",
    'Serbia': "#8D9FCA",
    'Slovenia': "#A6D853",
    'avg. EU': "#CCCCCC",
}
i = 0

# Restrict to the selected countries and remove the 1960-1989 year columns.
selected = raw['Country Code'].isin(countries)
df = raw.loc[selected].copy()
pre_1990 = list(map(str, range(1960, 1990)))
df = df.drop(columns=pre_1990)

# Strip the spaces from the identifier column names.
df = df.rename(columns={
    'Country Name': 'CountryName',
    'Country Code': 'CountryCode',
    'Indicator Name': 'IndicatorName',
    'Indicator Code': 'IndicatorCode',
})
columns = ['CountryName', 'CountryCode', 'IndicatorName', 'IndicatorCode']

# Wide -> long: one row per (country, indicator, year), without missing values.
df = df.melt(id_vars=columns, var_name='Year', value_name='Value')
df = df.dropna()
def plot_indicator(indicators, title=None,
                   xlim=None, ylim=None, xspace=None,
                   loc=0, loc2=0,
                   drop_eu=False, filename=None):
    """Plot one or two indicators as one time series per country.

    Reads the module-level long-format frame ``df`` (columns CountryName,
    IndicatorCode, IndicatorName, Year, Value) and the ``colors`` mapping.

    Args:
        indicators: A ``(name, indicator_code)`` tuple or a list of such
            tuples. They are paired with the line styles '-' and '--', so at
            most two are drawn.
        title: Axes title; defaults to the indicator name of the last
            indicator drawn.
        xlim: Value passed to ``ax.set_xlim``.
        ylim: Value passed to ``ax.set_ylim``.
        xspace: Tick spacing on the x-axis; only used together with ``xlim``.
        loc: Location of the country legend.
        loc2: Location of the line-style legend (several indicators only).
        drop_eu: If true, skip the 'European Union' series.
        filename: Not used by this function.
    """
    lines = ['-', '--']
    line_styles = []

    fig, ax = plt.subplots()

    indicators = indicators if isinstance(indicators, list) else [indicators]

    for line, (name, indicator) in zip(lines, indicators):
        # Invisible dummy line; only serves as a handle for the style legend.
        ls, = plt.plot(np.nan, linestyle=line, color='#999999')
        line_styles.append([ls, name])

        df_ind = df[(df.IndicatorCode == indicator)]
        # Group by a scalar key so the group keys are plain country names;
        # a one-element list key yields 1-tuples on pandas >= 2, which would
        # break the comparisons and the colour lookup below.
        group = df_ind.groupby('CountryName')

        for country, values in group:
            # Average only the numeric 'Value' column; averaging the whole
            # frame fails on its text columns on pandas >= 2.
            country_values = values.groupby('Year')['Value'].mean()

            if country == 'European Union':
                if drop_eu:
                    continue
                ax.plot(country_values, label=country,
                        linestyle='--', color='#666666', linewidth=1, zorder=1)
            elif country_values.shape[0] > 1:
                ax.plot(country_values, label=country, linestyle=line,
                        color=colors[country], linewidth=2.5)

        # The country legend is built once, from the first indicator's lines.
        if line == lines[0]:
            legend = plt.legend(loc=loc)

    ax.set_xlim(xlim)
    ax.set_ylim(ylim)
    if xlim and xspace:
        ax.set_xticks(np.arange(xlim[0], xlim[1] + 1, xspace))

    plt.tight_layout()
    fig.subplots_adjust(top=0.94)

    if title:
        ax.set_title(title)
    else:
        ax.set_title(df_ind.IndicatorName.values[0])

    if len(indicators) > 1:
        # A second legend replaces the first one, so re-add the country legend.
        plt.legend(*zip(*line_styles), loc=loc2)
        ax.add_artist(legend)
# (short label, World Bank indicator code) pairs.
population = [
    ('pop_dens', 'EN.POP.DNST'),        # Population density
    ('rural', 'SP.RUR.TOTL.ZS'),        # Rural population
    ('under14', 'SP.POP.0014.TO.ZS'),   # Population, ages 0-14
    ('above65', 'SP.POP.65UP.TO.ZS'),   # Population ages 65 and above
]

# One figure per indicator, limited to the years 1990-2020.
for indicator in population:
    plot_indicator(indicator, xlim=(1990, 2020), loc=0)
EDIT
I have re-written this answer to be more clear and concise.
This is a clever bit of code! I found the problem: it was with xlim. As the years are strings, not integers, the x-axis is index-based, not integer-based. This means that when you pass the range between 1990 and 2020 you are looking at the 1990th to 2020th values! Obviously, there are not this many values (only 30 years between 1990 and 2020), so there was no data within that range, hence the blank plot.
If you change the code within the function to ax.set_xlim(xlim[0]-int(df_ind['Year'].min()), xlim[1]-int(df_ind['Year'].min())) then you can pass the year and it will subtract the minimum year to give the appropriate index values. I would also add plt.xticks(rotation=45) underneath to stop the ticks overlapping.
ALTERNATIVELY!! (this is the option I would choose):
You can simply change the DataFrame column type to integer, then everything you have remains unchanged. Underneath df.dropna(inplace=True) (just before the function), you can add df['Year'] = df['Year'].astype(int), which solves the problem with the non-integer x-axis above.
Once one or the other has been changed, you should be able to see the lines of the plots.
I seem unable to show the color bar for a two dimensional histplot using seaborn FacetGrid. Can someone point me to the missing link please?
Understanding that similar solutions have been discussed I have not been able to adapt to my use case:
Has the right position and values for color bar but isn't working for histplot
This proposal is not running at all & is rather dated so I am not sure it is still supposed to work
Seems to have fixed vmin/vmax and does not work with histplot
Specifically I am looking to extend the code below so that color bar is shown.
import pandas as pd
import numpy as np
import seaborn as sns
import random

# 1000 rows: a numeric "range", a date (250 consecutive days repeated four
# times) and a random case type A-D.
n_rows = 1000
day_labels = [d.strftime('%Y-%m-%d')
              for d in pd.date_range('1800-01-01', periods=250, freq='1d')]
df = pd.DataFrame({
    # Stored as float because case type C receives fractional values below.
    "range": [float(random.randint(0, 10)) for _ in range(n_rows)],
    "date": pd.to_datetime(4 * day_labels),
    # Plain "ABCD" avoids depending on the `string` module, which this
    # snippet never imports.
    "case_type": [random.choice("ABCD") for _ in range(n_rows)],
})

# Give two case types very different value scales. Use .loc (not chained
# indexing) and draw exactly one value per selected row.
is_a = df["case_type"] == "A"
is_c = df["case_type"] == "C"
df.loc[is_a, "range"] = [float(random.randint(4562, 873645)) for _ in range(int(is_a.sum()))]
df.loc[is_c, "range"] = [random.random() for _ in range(int(is_c.sum()))]
# One 2D histogram (date vs. range) per case type, two facets per row,
# each facet with its own y-axis.
fg = sns.FacetGrid(data=df, col="case_type", col_wrap=2, sharey=False)
fg.map(sns.histplot, "date", "range", stat="count", data=df)
fg.set_xticklabels(rotation=30)
fg.fig.show()
The objective would be to have a color bar on the right side of the facet grid, spanning the entire chart - two rows here but more may be shown. The displayed 2D histogram feature some very different data types so the counts per bin & color are likely very different and it matters to know if "dark blue" is 100 or 1000.
EDIT: For sake of clarity it appears from comments that the problem breaks down into two steps:
How to normalize the color coding among all plots and
Display a color bar on the right side of the plot using the normalized color mapping
I am not sure there is a seaborn-inherent way to achieve your desired plot. But we can pre-compute sensible values for bin number and vmin/vmax and apply them to all histplots:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
# Test dataset in which the case types occur with different probabilities.
np.random.seed(123)
p1, p2, p3 = 0.8, 0.1, 0.03
range_values = np.random.randint(0, 20, 1000)
day_labels = [d.strftime('%Y-%m-%d') for d in pd.date_range('1800-01-01', periods=250, freq='1d')]
dates = pd.to_datetime(4 * day_labels)
case_types = np.random.choice(list("ABCD"), size=1000, p=[p1, p2, p3, 1-(p1+p2+p3)])
df = pd.DataFrame(list(zip(range_values, dates, case_types)),
                  columns=["range", "date", "case_type"])

# Put the "range" values of three case types on different scales.
for c_type, factor in (("A", 3), ("B", 23), ("C", 123)):
    df.loc[df.case_type == c_type, "range"] *= factor

# Number of bins along the x-axis, chosen by numpy's "auto" rule.
_, bin_edges = np.histogram(df["date"].dt.strftime("%Y%m%d").astype(int), bins="auto")
bin_nr = len(bin_edges)-1

# Lowest and highest bin count over the 2D histograms of all case types.
vmin_list, vmax_list = [], []
for c_type in df["case_type"].unique():
    of_type = df.case_type == c_type
    arr, _, _ = np.histogram2d(df.loc[of_type, "date"], df.loc[of_type, "range"], bins=bin_nr)
    vmin_list.append(arr.min())
    vmax_list.append(arr.max())
vmin_all = min(vmin_list)
vmax_all = max(vmax_list)

# Plot: every facet uses the same bins and colour limits, and all of them
# draw their colorbar into one shared axis on the right.
fg = sns.FacetGrid(df, col="case_type", col_wrap=2, sharey=False)
cax = fg.fig.add_axes([.92, .12, .02, .8])
fg.map(sns.histplot, "date", "range", stat="count", bins=bin_nr,
       vmin=vmin_all, vmax=vmax_all, cbar=True, cbar_ax=cax, data=df)
# Leave room for the colorbar axis.
fg.fig.subplots_adjust(right=.9)
fg.set_xticklabels(rotation=30)
plt.show()
Sample output:
You may also notice that I changed your sample dataframe so that the case_types occur at different frequencies, otherwise you don't see much difference between histplots. You should also be aware that the histplots are plotted in the order they appear in the dataframe, which might not be the order you would like to see in your graph.
Disclaimer: This is largely based on mwaskom's answer.
I am trying to create multiple box plot charts for about 5 columns in my dataframe (df_summ):
columns = ['dimension_a', 'dimension_b']

# Draw one boxplot per dimension column and write the group medians on it.
for dimension in columns:
    sns.set(style="ticks", palette="pastel")
    box_plot = sns.boxplot(data=df_summ_1500_delta, x=dimension, y="measure",
                           palette=["m", "g"])
    sns.despine(offset=10, trim=True)

    medians = df_summ_1500_delta.groupby([dimension])['measure'].median()
    # Vertical shift of the labels, proportional to the overall median.
    vertical_offset = df_summ_1500_delta['measure'].median() * -0.5

    for xtick in box_plot.get_xticks():
        box_plot.text(xtick, medians[xtick] + vertical_offset, medians[xtick],
                      horizontalalignment='center', size='small',
                      color='blue', weight='semibold')
My only issue is that they aren't separated into different facets, but are instead drawn on top of each other.
I would appreciate any help on how I can put each one on its own separate chart, with the x-axis of the first chart being 'dimension a' and the x-axis of the second chart being 'dimension b'.
To draw two boxplots next to each other at each x-position, you can use a hue for dimension_a and dimension_b separately. These two columns need to be transformed (with pd.melt()) to "long form".
Here is some example code starting from generated test data. Note that the order of both the x-values and the hue-values needs to be enforced to be sure of their exact position. The individual box plots are distributed over a width of 0.8.
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
# Generated test data: two hot/cold dimensions and a measure that is shifted
# up for 'hot' and down for 'cold' in dimension_a.
n_samples = 100
temperatures = ['hot', 'cold']
df = pd.DataFrame({
    'dimension_a': np.random.choice(temperatures, n_samples),
    'dimension_b': np.random.choice(temperatures, n_samples),
    'measure': np.random.uniform(100, 500, n_samples),
})
df['measure'] += df['dimension_a'].map({'hot': 100, 'cold': -100})

x_order = ['hot', 'cold']
columns = ['dimension_a', 'dimension_b']

# Long form: one row per (measure, dimension column, value of that column).
df1 = pd.melt(df, id_vars='measure', value_vars=columns,
              var_name='dimension', value_name='value')
sns.set(style="ticks", palette="pastel")
ax = sns.boxplot(data=df1, x='value', order=x_order, y='measure',
                 hue='dimension', hue_order=columns, palette=["m", "g"], dodge=True)
ax.set_xlabel('')
sns.despine(offset=10, trim=True)

# The boxes of one tick share a total width of 0.8, split evenly between the
# hue levels. The box centres are therefore the midpoints of len(columns)
# equal slots -- the slot count follows the number of hue levels, not the
# number of x categories.
dodge_dists = np.linspace(-0.4, 0.4, 2 * len(columns) + 1)[1::2]
# Vertical shift of the labels, proportional to the overall median.
vertical_offset = df['measure'].median() * -0.5

for col, dodge_dist in zip(columns, dodge_dists):
    medians = df.groupby(col)['measure'].median()
    for x_ind, xtick in enumerate(x_order):
        ax.text(x_ind + dodge_dist, medians[xtick] + vertical_offset, f'{medians[xtick]:.2f}',
                horizontalalignment='center', size='small', color='blue', weight='semibold')
plt.show()
I have a number of charts, made with matplotlib and seaborn, that look like the example below.
I show how certain quantities evolve over time on a lineplot
The x-axis labels are not numbers but strings (e.g. 'Q1' or '2018 first half' etc)
I need to "extend" the x-axis to the right, with an empty period. The chart must show from Q1 to Q4, but there is no data for Q4 (the Q4 column is full of nans)
I need this because I need the charts to be side-by-side with others which do have data for Q4
matplotlib doesn't display the column full of nans
If the x-axis were numeric, it would be easy to extend the range of the plot; since it's not numeric, I don't know which x_range each tick corresponds to
I have found the solution below. It works, but it's not elegant: I use integers for the x-axis, add 1, then set the labels back to the strings. Is there a more elegant way?
This is the code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
from matplotlib.ticker import FuncFormatter
import seaborn as sns
# Quarterly data; Q4 holds only NaNs.
df = pd.DataFrame(
    {'a': [3, 4, 5, np.nan], 'b': [4, 4, 6, np.nan]},
    index=pd.Index(['Q1', 'Q2', 'Q3', 'Q4'], name='period'),
)

fig, ax = plt.subplots(1, 2)

# Left: plot against the string index directly.
sns.lineplot(data=df, ax=ax[0])

# Right: plot against the integers 1..n, widen the x-range to include the
# last period, then put the string labels back.
df_idx = df.index
df2 = df.set_index(np.arange(1, len(df_idx) + 1))
sns.lineplot(data=df2, ax=ax[1])
ax[1].set_xlim(1, 4)
ax[1].set_xticklabels(df.index)
You can add these lines of code for ax[0]
# Number of empty tick positions to add on each side of the real labels.
left_buffer, right_buffer = 3, 2
labels = ['Q1', 'Q2', 'Q3', 'Q4']
n_labels = len(labels)

# Real labels padded with empty strings, and the matching tick positions
# (negative positions lie to the left of the first label).
extended_labels = [''] * left_buffer + labels + [''] * right_buffer
ticks_range = list(range(-left_buffer, n_labels + right_buffer))
ax[0].set_xticks(ticks_range)
ax[0].set_xticklabels(extended_labels)

# Hide the tick marks of the padding positions on both sides.
xticks = ax[0].xaxis.get_major_ticks()
first_right = left_buffer + n_labels
padding_positions = list(range(left_buffer)) + list(range(first_right, first_right + right_buffer))
for ind in padding_positions:
    xticks[ind].tick1line.set_visible(False)
in which left_buffer and right_buffer are margins you want to add to the left and to the right, respectively. Running the code, you will get
I may have actually found a simpler solution: I can draw a transparent line (alpha = 0 ) by plotting x = index of the dataframe, ie with all the labels, including those for which all values are nans, and y = the average value of the dataframe, so as to be sure it's within the range:
# Fully transparent horizontal line at the overall mean, spanning every label.
sns.lineplot(x=df.index, y=np.full(df.shape[0], df.mean().mean()), ax=ax[0], alpha=0)
This assumes the scale of the y-axis has not been changed manually; a more robust way of doing it is to use the centre of the current y-axis limits:
# Midpoint of the current y-limits of the left axis.
y_centre = np.mean(ax[0].get_ylim())
# Fully transparent horizontal line at that height, spanning every label.
sns.lineplot(x=df.index, y=np.full(df.shape[0], y_centre), ax=ax[0], alpha=0)
Drawing a transparent line forces matplotlib to extend the axes so as to show all the x values, even those for which all the other values are nans.
For a dataframe
import pandas as pd
# One row per observation: its group letter and a 0/1 value.
group_letters = list("AADABCBCCCD")
binary_values = [1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0]
df = pd.DataFrame({'group': group_letters, 'Values': binary_values})
I am trying to plot a barplot showing percentage of times A, B, C, D takes zero (or one).
I have a roundabout way which works, but I am thinking there has to be a more straightforward way
# Count how often each value (0 / 1) occurs per group; one column per value,
# with 0 where a combination never occurs.
tempdf = df.groupby(['group', 'Values']).Values.count().unstack().fillna(0)
# Number of rows per group (aligned on the group index).
tempdf['total'] = df['group'].value_counts()
# Percentage of rows in each group whose value is 0.
tempdf['percent'] = tempdf[0] / tempdf['total'] * 100
tempdf.reset_index(inplace=True)
# print() call form: the bare `print tempdf` statement is Python 2 only.
print(tempdf)
sns.barplot(x='group', y='percent', data=tempdf)
If I were plotting just the mean value, I could simply call sns.barplot on the df dataframe rather than on tempdf. I am not sure how to do it elegantly when I am interested in plotting percentages.
Thanks,
You can use Pandas in conjunction with seaborn to make this easier:
import pandas as pd
import seaborn as sns
df = sns.load_dataset("tips")
x, y, hue = "day", "proportion", "sex"
hue_order = ["Male", "Female"]

# Within each hue level, compute the share of every x category, flatten the
# result to columns (hue, x, proportion) and hand it to barplot as `data`.
# hue_order is passed on so the declared bar order is actually applied.
(df[x]
 .groupby(df[hue])
 .value_counts(normalize=True)
 .rename(y)
 .reset_index()
 .pipe((sns.barplot, "data"), x=x, y=y, hue=hue, hue_order=hue_order))
You could use your own function in sns.barplot estimator, as from docs:
estimator : callable that maps vector -> scalar, optional
Statistical function to estimate within each categorical bin.
For your case you could define the function as a lambda:
def percent_zero(x):
    """Return the percentage of entries in *x* that are equal to zero."""
    return sum(x == 0) * 100.0 / len(x)


sns.barplot(data=df, x='group', y='Values', estimator=percent_zero)
You can follow these steps so that you can see the count and percentages on top of the bars in your plot. Check the example outputs down below
with_hue function will plot percentages on the bar graphs if you have the 'hue' parameter in your plots. It takes the actual graph, feature, Number_of_categories in feature, and hue_categories(number of categories in hue feature) as a parameter.
without_hue function will plot percentages on the bar graphs if you have a normal plot. It takes the actual graph and feature as a parameter.
def with_hue(plot, feature, Number_of_categories, hue_categories):
    """Write a percentage above every bar of a plot drawn with ``hue``.

    Args:
        plot: The axes-like object holding the bars (``plot.patches``); the
            annotations are added to it.
        feature: Series of the x-axis variable; its value counts are used as
            the per-category totals.
        Number_of_categories: Number of categories in ``feature``.
        hue_categories: Number of categories in the hue variable.
    """
    bars = list(plot.patches)
    # NOTE(review): totals are taken in value_counts() order; this presumes
    # the bars on the axis follow the same category order -- confirm.
    totals = feature.value_counts().values
    for i in range(Number_of_categories):
        total = totals[i]
        for j in range(hue_categories):
            # Bars are indexed hue level by hue level: bar i of hue level j.
            bar = bars[j * Number_of_categories + i]
            percentage = '{:.1f}%'.format(100 * bar.get_height() / total)
            x = bar.get_x() + bar.get_width() / 2 - 0.15
            y = bar.get_y() + bar.get_height()
            # Annotate the plot that was passed in, not a global `ax`.
            plot.annotate(percentage, (x, y), size=12)
    plt.show()
def without_hue(plot, feature):
    """Write a percentage above every bar of a plot drawn without ``hue``.

    Args:
        plot: The axes-like object holding the bars (``plot.patches``); the
            annotations are added to it.
        feature: The plotted variable; its length is the 100 % total.
    """
    total = len(feature)
    # Read and annotate the plot that was passed in, not a global `ax`.
    for p in plot.patches:
        percentage = '{:.1f}%'.format(100 * p.get_height() / total)
        x = p.get_x() + p.get_width() / 2 - 0.05
        y = p.get_y() + p.get_height()
        plot.annotate(percentage, (x, y), size=12)
    plt.show()
You can use the library Dexplot, which has the ability to return relative frequencies for categorical variables. It has a similar API to Seaborn. Pass the column you would like to get the relative frequency for to the count function. If you would like to subdivide this by another column, do so with the split parameter. The following returns raw counts.
import dexplot as dxp
# Raw counts of 'group', subdivided by 'Values'.
dxp.count('group', split='Values', data=df)
To get the relative frequencies, set the normalize parameter to the column you want to normalize over. Use True to normalize over the overall total count.
# Relative frequencies, normalized over the 'group' column.
dxp.count('group', split='Values', normalize='group', data=df)
Normalizing over the 'Values' column would produce the following graph, where the total of all the '0' bars are 1.
# Relative frequencies, normalized over the 'Values' column.
dxp.count('group', split='Values', normalize='Values', data=df)