How to plot a histogram for all unique combinations of data? - python

Is there a way to get a size-frequency histogram of a population under different scenarios for specific days in Python, showing the means with error bars?
My data are in a format similar to this table:
SCENARIO RUN MEAN DAY
A 1 25 10
A 1 15 30
A 2 20 10
A 2 27 30
B 1 45 10
B 1 50 30
B 2 43 10
B 2 35 30
results_data.groupby(['Scenario', 'Run']).mean() does not give me the days I want to visualize the data by;
it returns the mean over the days in each run.

Use seaborn.FacetGrid
FacetGrid is a multi-plot grid for plotting conditional relationships.
Map seaborn.distplot onto the FacetGrid and use hue=DAY.
Setup Data and DataFrame
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import random # just for test data
import numpy as np # just for test data
# Seed both generators so the synthetic sample is reproducible.
np.random.seed(365)
random.seed(365)

# Draw the columns in the same order as the dict keys: MEAN comes from NumPy's
# generator, then SCENARIO, DAY and RUN from the stdlib generator.
n_rows = 500
mean_col = [np.random.randint(20, 51) for _ in range(n_rows)]
scenario_col = [random.choice(['A', 'B']) for _ in range(n_rows)]
day_col = [random.choice([10, 30]) for _ in range(n_rows)]
run_col = [random.choice([1, 2]) for _ in range(n_rows)]
data = {'MEAN': mean_col, 'SCENARIO': scenario_col, 'DAY': day_col, 'RUN': run_col}

# Assemble the test DataFrame.
df = pd.DataFrame(data)
Plot with kde=False
# One facet per (SCENARIO, RUN) pair; DAY is encoded as the bar colour.
hist_style = dict(edgecolor="k", linewidth=1)
g = sns.FacetGrid(df, col='RUN', row='SCENARIO', hue='DAY', height=5)
# Histogram only (no density curve), using 5-wide bins from 20 to 50.
g = g.map(sns.distplot, 'MEAN', bins=range(20, 51, 5), kde=False, hist_kws=hist_style)
g = g.add_legend()
plt.show()
Plot with kde=True
# Same grid as before, now with the 'GnBu' palette for the DAY hue.
hist_style = dict(edgecolor="k", linewidth=1)
g = sns.FacetGrid(df, col='RUN', row='SCENARIO', hue='DAY', height=5, palette='GnBu')
# Histogram plus a kernel density estimate on each facet.
g = g.map(sns.distplot, 'MEAN', bins=range(20, 51, 5), kde=True, hist_kws=hist_style)
g = g.add_legend()
plt.show()
Plots with error bars
Based on the answer to "How to add error bars to histogram diagram in python"
Using df from above
Use matplotlib.pyplot.errorbar to plot the error bars on the histogram.
from itertools import product

# create unique combinations for filtering df
scenarios = df.SCENARIO.unique()
runs = df.RUN.unique()
days = df.DAY.unique()
combo_list = [scenarios, runs, days]
results = list(product(*combo_list))

# Size the subplot grid from the data instead of hard-coding 2 x 4: one row
# per scenario and one column per (run, day) pair, so the grid always holds
# exactly len(results) cells. product() varies the scenario slowest, so each
# scenario fills one row.
n_rows = len(scenarios)
n_cols = len(runs) * len(days)

# plot
for i, (s, r, d) in enumerate(results, 1):  # subplot indices are 1-based
    # filter dataframe; a separate name keeps the earlier `data` dict intact
    subset = df[(df.SCENARIO == s) & (df.RUN == r) & (df.DAY == d)]
    plt.subplot(n_rows, n_cols, i)
    # plot hist and unpack the bin counts and bin edges
    n, bins, _ = plt.hist(x='MEAN', bins=range(20, 51, 5), data=subset, color='g')
    # calculate bin centers
    bin_centers = 0.5 * (bins[:-1] + bins[1:])
    # draw error bars using the sqrt(n) error; another estimate (e.g.
    # Poissonian 1-sigma intervals) can be substituted here
    plt.errorbar(bin_centers, n, yerr=np.sqrt(n), fmt='k.')
    plt.title(f'Scenario: {s} | Run: {r} | Day: {d}')
plt.tight_layout()
plt.show()

Related

Add bar-plot along a particular axis of clustermap with index specific data

I'm trying to add a bar-plot (stacked or otherwise) for each row in a seaborn clustermap.
Let's say that I have a dataframe like this:
import pandas as pd
import numpy as np
import random
df = pd.DataFrame(np.random.randint(0,100,size=(100, 8)), columns=["heatMap_1","heatMap_2","heatMap_3","heatMap_4","heatMap_5", "barPlot_1","barPlot_1","barPlot_1"])
df['index'] = [ random.randint(1,10000000) for k in df.index]
df.set_index('index', inplace=True)
df.head()
heatMap_1 heatMap_2 heatMap_3 heatMap_4 heatMap_5 barPlot_1 barPlot_1 barPlot_1
index
4552288 9 3 54 37 23 42 94 31
6915023 7 47 59 92 70 96 39 59
2988122 91 29 59 79 68 64 55 5
5060540 68 80 25 95 80 58 72 57
2901025 86 63 36 8 33 17 79 86
I can use the first 5 columns (in this example starting with prefix heatmap_) to create seaborn clustermap using this(or the seaborn equivalent):
sns.clustermap(df.iloc[:,0:5], )
and the stacked barplot for the last three columns (in this example starting with prefix barPlot_) using this:
df.iloc[:,5:8].plot(kind='bar', stacked=True)
but I'm a bit confused about how to merge the two plot types. I understand that clustermap creates its own figure, and I'm not sure whether I can extract just the heatmap from the clustermap and then use it with subfigures (discussed here: Adding seaborn clustermap to figure with other plots). This creates a weird output.
Edit:
Using this:
import pandas as pd
import numpy as np
import random
import seaborn as sns; sns.set(color_codes=True)
import matplotlib.pyplot as plt
import matplotlib.gridspec
df = pd.DataFrame(np.random.randint(0,100,size=(100, 8)), columns=["heatMap_1","heatMap_2","heatMap_3","heatMap_4","heatMap_5", "barPlot_1","barPlot_2","barPlot_3"])
df['index'] = [ random.randint(1,10000000) for k in df.index]
df.set_index('index', inplace=True)
g = sns.clustermap(df.iloc[:,0:5], )
g.gs.update(left=0.05, right=0.45)
gs2 = matplotlib.gridspec.GridSpec(1,1, left=0.6)
ax2 = g.fig.add_subplot(gs2[0])
df.iloc[:,5:8].plot(kind='barh', stacked=True, ax=ax2)
creates this:
which does not really match well (i.e. due to dendrograms there is a shift).
Another option is to perform the clustering manually, create a matplotlib heatmap, and then add associated subfigures such as barplots (discussed here: How to get flat clustering corresponding to color clusters in the dendrogram created by scipy).
Is there a way I can use clustermap as a subplot along with other plots ?
This is the result I'm looking for[1]:
While not a proper answer, I decided to break it down and do everything manually.
Taking inspiration from answer here, I decided to cluster and reorder the heatmap separately:
def heatMapCluter(
    df,
    row_method="ward",
    column_method="ward",
    row_metric="euclidean",
    column_metric="euclidean",
):
    """Reorder the rows and columns of *df* by hierarchical clustering.

    Columns are clustered first, then rows (on the column-reordered frame).
    Each axis is rearranged to follow the leaf order of its dendrogram, so
    similar rows/columns end up next to each other.

    Relies on ``dist`` (``scipy.spatial.distance``) and ``sch``
    (``scipy.cluster.hierarchy``) being imported by the surrounding module.

    Args:
        df: Numeric DataFrame to reorder.
        row_method: Linkage method for the rows; a falsy value keeps the
            current row order.
        column_method: Linkage method for the columns; a falsy value keeps
            the current column order.
        row_metric: Distance metric handed to ``pdist`` for the rows.
        column_metric: Distance metric handed to ``pdist`` for the columns.

    Returns:
        A DataFrame with the same labels and values as *df*, reordered.
    """
    # Clustering needs at least two observations along an axis; with fewer
    # there is nothing to reorder.
    if column_method and df.shape[1] > 1:
        col_distances = dist.pdist(df.transpose().values, metric=column_metric)
        # linkage() must receive the condensed vector returned by pdist(); a
        # square-form matrix would be interpreted as raw observation vectors.
        col_linkage = sch.linkage(col_distances, method=column_method)
        col_order = sch.dendrogram(col_linkage, no_plot=True)["leaves"]
        df = df.iloc[:, col_order]

    if row_method and df.shape[0] > 1:
        row_distances = dist.pdist(df.values, metric=row_metric)
        row_linkage = sch.linkage(row_distances, method=row_method)
        row_order = sch.dendrogram(row_linkage, no_plot=True)["leaves"]
        df = df.iloc[row_order, :]

    return df
Rearranged the original dataframe:
# Cluster only the heat-map columns, working on a copy of that slice.
clusteredHeatmap = heatMapCluter(df.iloc[:, 0:5].copy())

# Put the rows into clustered order, then lay the columns out as the clustered
# heat-map columns followed by the bar-plot columns.
row_order = list(clusteredHeatmap.index.values)
column_order = list(clusteredHeatmap.columns.values) + list(df.iloc[:, 5:8].columns.values)
clusteredDataframe = df.reindex(row_order).reindex(column_order, axis=1)
and then used the gridspec to plot both "subfigures" (clustermap and barplot):
# Two-panel figure: the heat map on the left and the stacked bars on the
# right. Both panels share one y axis, so the layout is built with GridSpec.
fig = plt.figure(figsize=(12, 12))
gs = GridSpec(3, 3)
gs.update(wspace=0.015, hspace=0.05)
ax_main = plt.subplot(gs[0:3, :2])
ax_yDist = plt.subplot(gs[0:3, 2], sharey=ax_main)

# Heat map of the five clustered columns.
heat_values = clusteredDataframe.iloc[:, 0:5]
im = ax_main.imshow(heat_values, cmap="Greens", interpolation="nearest", aspect="auto")

# Horizontal stacked bars of the remaining three columns.
bar_values = clusteredDataframe.iloc[:, 5:8]
bar_values.plot(kind="barh", stacked=True, ax=ax_yDist, sharey=True)

# Strip the bar panel down to its bottom axis only.
for side in ("right", "top"):
    ax_yDist.spines[side].set_color("none")
ax_yDist.spines["left"].set_visible(False)
ax_yDist.xaxis.set_ticks_position("bottom")
ax_yDist.set_xlim([0, 100])
ax_yDist.set_yticks([])
for axis in (ax_yDist.xaxis, ax_yDist.yaxis):
    axis.grid(False)
Jupyter notebook: https://gist.github.com/siddharthst/2a8b7028d18935860062ac7379b9279f
Image:
1 - http://code.activestate.com/recipes/578175-hierarchical-clustering-heatmap-python/

Is it possible to get plot from panda dataframe includes missing data by Heatmap with especial color?

I was wondering whether I can plot all columns of a pandas DataFrame in one window as heatmaps, using a self-made 24x20 matrix layout onto which I map the 480 values of each column (one cycle), for every cycle. The challenging part is that I want to show missing data in a special color that lies outside the color range of the colormap cmap='coolwarm'.
I already tried by using df = df.replace([np.inf, -np.inf], np.nan) make sure that all inf convert to nan and then by using df = df.replace(0,np.nan) before sns.heatmap(df, vmin=-1, vmax=+1, cmap ='coolwarm' I can recognize missing values via white color since in cmap ='coolwarm' white color represents nan/inf in this interval [vmin=-1, vmax=+1] after applying above-mentioned instructions however it has 2 problem:
First in case that you have 0 in your dataset it will be shown like missing data by white color too and you can't distinguish between inf/nan and 0 in columns. Second problem is you can't even differentiate between nan and inf values!
I also tried mask=df.isnull() inside sns.heatmap() by specifying a mask, where data will not be shown for those cells whose mask values are True but it covers again 0 based on this answer GH375. I'm not sure the answer here mentioned by #Scotty1- is right solution for my case by adding marker to interpolate the values by newdf = newdf.interpolate().
Is it good idea to filter missing data by subsetting :
import math
df = df[df['a'].apply(lambda x: math.isnan(x))]
df = df[df['a'] == float('inf')]
My script follows. In the for-loop I can't get the proper output: in each cycle every plot is drawn three times in different positions. For example, A is drawn on the left and then drawn again under the names B and C in the middle and on the right of the same window. Likewise B is drawn three times instead of once (it should appear only in the middle), and finally C is drawn three times instead of once (it should appear only on the right).
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
# Extract the parameters and put them in lists based on id_set.
# Raw string: in a normal literal '\S' is an invalid escape sequence (the
# runtime value of the path is unchanged).
df = pd.read_csv(r'D:\SOF.TXT', header=None)
# Rows are split by position modulo 4: ids first, then the A, B and C values.
id_set = df[df.index % 4 == 0].astype('int').values
a = df[df.index % 4 == 1].values
b = df[df.index % 4 == 2].values
c = df[df.index % 4 == 3].values
data = {'A': a[:, 0], 'B': b[:, 0], 'C': c[:, 0]}
# main_data contains all the data
main_data = pd.DataFrame(data, columns=['A', 'B', 'C'], index=id_set[:, 0])
# Number of complete 480-value cycles; floor division keeps this an exact
# integer instead of going through a float.
cycles = len(main_data) // 480
print(cycles)
# NOTE(review): the loop-body indentation was lost in this listing; the nesting
# has to be restored before the code can run. mkdf, print_df and normalize are
# called below but are not defined anywhere in this listing.
# Iterating a DataFrame yields its column labels, so i is 'A', 'B', then 'C'.
for i in main_data:
# Create one output directory per column.
# NOTE(review): the bare except hides every failure of os.mkdir, not only
# "directory already exists".
try:
os.mkdir(i)
except:
pass
# Column-wide extremes, used as the source range for normalization.
min_val = main_data[i].min()
min_nor = -1
max_val = main_data[i].max()
max_nor = 1
for cycle in range(1): # iterate through all cycles by replacing range(1) with range(int(len(main_data)/480))
count = '{:04}'.format(cycle)
# Each cycle is a slice of 480 consecutive rows.
j = cycle * 480
ordered_data = mkdf(main_data.iloc[j:j+480][i])
csv = print_df(ordered_data)
# Write the matrix of this parameter to a .csv file named after the cycle
csv.to_csv(f'{i}/{i}{count}.csv', header=None, index=None)
if 'C' in i:
min_nor = -40
max_nor = 150
# Apply normalization for C to [-40, +150]
new_value = normalize(main_data.iloc[j:j+480][i].values, min_val, max_val, -40, 150)
n_cbar_kws = {"ticks":[-40,150,-20,0,25,50,75,100,125]}
else:
# Apply normalization for A and B to [-1, +1]
new_value = normalize(main_data.iloc[j:j+480][i].values, min_val, max_val, -1, 1)
n_cbar_kws = {"ticks":[-1.0,-0.75,-0.50,-0.25,0.00,0.25,0.50,0.75,1.0]}
Sections = mkdf(new_value)
# NOTE(review): this rebinds the module-level name df (previously the raw
# CSV contents) to the normalized matrix of the current column.
df = print_df(Sections)
# Plot the parameter as a heat map
plt.figure()
sns.heatmap(df, vmin=min_nor, vmax=max_nor, cmap ='coolwarm', cbar_kws=n_cbar_kws)
plt.title(i, fontsize=12, color='black', loc='left', style='italic')
plt.axis('off')
# Save the heat map as a .png image named after the cycle
plt.savefig(f'{i}/{i}{count}.png')
# Plot all columns ['A','B','C'] in one window side by side.
# NOTE(review): all three panels below are drawn from the same `df` (the
# normalized matrix of the column currently being processed); only the titles,
# colour limits and colour-bar labels differ. That matches the "each plot
# appears three times" symptom described in the question.
fig, axes = plt.subplots(nrows=1, ncols=3 , figsize=(20,10))
plt.subplot(131)
sns.heatmap(df, vmin=-1, vmax=1, cmap ="coolwarm", cbar=True , cbar_kws={"ticks":[-1.0,-0.75,-0.5,-0.25,0.00,0.25,0.5,0.75,1.0]})
fig.axes[-1].set_ylabel('[MPa]', size=20) #cbar_kws={'label': 'Celsius'}
plt.title('A', fontsize=12, color='black', loc='left', style='italic')
plt.axis('off')
plt.subplot(132)
sns.heatmap(df, vmin=-1, vmax=1, cmap ="coolwarm", cbar=True , cbar_kws={"ticks":[-1.0,-0.75,-0.5,-0.25,0.00,0.25,0.5,0.75,1.0]})
fig.axes[-1].set_ylabel('[Mpa]', size=20) #cbar_kws={'label': 'Celsius'}
#sns.despine(left=True)
plt.title('B', fontsize=12, color='black', loc='left', style='italic')
plt.axis('off')
plt.subplot(133)
sns.heatmap(df, vmin=-40, vmax=150, cmap ="coolwarm" , cbar=True , cbar_kws={"ticks":[-40,150,-20,0,25,50,75,100,125]})
fig.axes[-1].set_ylabel('[°C]', size=20) #cbar_kws={'label': 'Celsius'}
#sns.despine(left=True)
plt.title('C', fontsize=12, color='black', loc='left', style='italic')
plt.axis('off')
plt.suptitle(f'Analysis of data in cycle Nr.: {count}', color='yellow', backgroundcolor='black', fontsize=48, fontweight='bold')
plt.subplots_adjust(top=0.7, bottom=0.3, left=0.05, right=0.95, hspace=0.2, wspace=0.2)
#plt.subplot_tool()
# The combined figure is saved inside the current column's directory.
plt.savefig(f'{i}/{i}{i}{count}.png')
plt.show()
my data frame looks like following:
A B C
0 2.291171 -2.689658 -344.047912
10 2.176816 -4.381186 -335.936524
20 2.291171 -2.589725 -342.544885
30 2.176597 -6.360999 0.000000
40 2.577268 -1.993412 -344.326376
50 9.844076 -2.690917 -346.125859
60 2.061782 -2.889378 -346.375655
Here below is overview of my dataset sample from .TXT file: dataset
in case that you want to check out with missing data values please change the last 3 values of end of text file to nan/inf and save it and debug it.
7590 7590
0 nan
7.19025828418 nan
-1738.000075 inf
I'd like to visualise a large pandas-dataframe includes 3 columns columns=['A','B','C'] via heatmaps in-one-window. This dataframe has two types of variables: strings (nan or inf) and floats.
I want the heatmap to show missing data cells inside of matrix-squared-model by fixed colors like nan by black and inf by silver or gray, and the rest of the dataframe as a normal heatmap, with the floats in a scale of cmap ='coolwarm'.
Here is image of desired output when there is no nan/inf in dataset:
I'm looking forward to hearing from those people they are dealing with these issues.

stacked bar chart using seaborn and matplotlib

I am new to pandas and matplotlib and trying to accomplish following. I have a data frame as shown below which is actually a listing the performance of players based on match date
name runs match_date player_id
Dockrell, G H 0 2018-06-17 3752
Stirling, P R 81 2018-06-17 3586
O'Brien, K J 28 2018-06-17 3391
McCarthy, B J 0 2018-06-17 4563
Poynter, S W 0 2018-06-17 4326
Poynter, S W 2 2018-06-17 4326
McCarthy, B J 0 2018-06-17 4563
Shannon, J N K 5 2018-06-17 4219
Shannon, J N K 6 2018-06-17 4219
Stirling, P R 51 2018-06-17 3586
This is a subset of data that I have created based on following code
match_performance = dataFrame[['name','runs','match_date','player_id']].sort_values('match_date',ascending=False).groupby('player_id').head(5)
sns.set_context({"figure.figsize": (10, 4)})
ax = sns.barplot(x="name", y="runs", data=match_performance)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
I need to plot this either as stacked bar or grouped bar to display performance of players in there last 5 matches based on player id which I have in the dataframe but I am not sure how to go about plotting this data as required.
Michael Waskom, the creator of Seaborn, posted this on Twitter:
#randyzwitch I don't really like stacked bar charts, I'd suggest maybe
using pointplot / factorplot with kind=point
— Michael Waskom (#michaelwaskom) September 4, 2014
Regretfully, the answer is no. There is no built-in function in Seaborn for plotting stacked bar charts.
While this is an older question, I found it while looking for a solution, so I hope this may help someone. Achieving stacked bars is a bit tricky with seaborn, but this should do the trick:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
def stacked_chart_sns(df, x, y, group_by, palette):
    """Draw a stacked bar chart with seaborn by overlaying cumulative bars.

    For every value of *group_by* (in order of first appearance) a frame is
    built whose *y* column holds the running total of that group and all
    previous groups. The frames are then drawn tallest first, so each later,
    shorter bar covers the lower part of the one behind it.

    Args:
        df: Source DataFrame.
        x: Column name used for the x axis.
        y: Column name holding the values to stack.
        group_by: Column name whose unique values define the stacked layers.
        palette: Sequence of colours; it is cycled when there are more
            groups than colours.

    Returns:
        None. The chart is drawn on the current axes and shown with
        ``plt.show()``.
    """
    groups = list(df[group_by].unique())

    cumulative_frames = []
    previous = None
    for group in groups:
        current = df[df[group_by] == group].copy()
        if previous is not None:
            # Outer merge keeps x positions that occur in only one of the
            # layers; their missing values count as 0.
            current = current.merge(previous, how='outer').fillna(0)
            current[group_by] = group
            # Vectorised running total (replaces the row-wise apply).
            current[y] = current[y] + current['y_prev']
            current = current.drop(columns=['y_prev'])
        cumulative_frames.append(current)
        previous = current.drop(columns=[group_by]).rename(columns={y: 'y_prev'}).copy()

    patches = []
    # Draw the largest running total first so smaller ones are painted on top.
    layers = zip(reversed(cumulative_frames), reversed(groups))
    for i, (frame, group) in enumerate(layers):
        # Cycle the palette instead of raising IndexError when the groups
        # outnumber the available colours.
        color = palette[i % len(palette)]
        sns.barplot(x=x, y=y, data=frame, color=color)
        patches.append(mpatches.Patch(label=group, color=color))
    plt.legend(handles=patches, loc='upper left', ncol=1, labelspacing=1)
    plt.show()
### use it with - for the example data in the question:
stacked_chart_sns(match_performance, 'match_date', 'runs', 'player_id', sns.color_palette("Spectral"))

group multiple plot in one figure python

My function returns 28 plots (figures), but I need to group them into one figure. This is my code for generating the 28 plots:
# Draw one curve per assignment category.
for cat in df.ASS_ASSIGNMENT.unique():
    subset = df.loc[df['ASS_ASSIGNMENT'] == cat]
    plt.plot(subset['DATE'], subset['CSPL_RECEIVED_CALLS'])
    plt.ylabel("nmb_app")
    # The category value is decoded from UTF-8 for the legend and x label.
    label = cat.decode('utf-8')
    plt.legend([label], loc='best')
    plt.xlabel(label)
Use plt.subplots. For example,
import numpy as np
import matplotlib.pyplot as plt

# A 4 x 7 grid gives 28 axes inside a single figure.
fig, axes = plt.subplots(nrows=4, ncols=7)
for index, axis in enumerate(axes.ravel()):
    # 20 random integer points in [-5, 5) per axis.
    xs = np.random.randint(-5, 5, 20)
    ys = np.random.randint(-5, 5, 20)
    axis.scatter(xs, ys)
    axis.set_title(f'Axis {index}')
plt.tight_layout()
Going a little deeper, as Mauve points out, it depends if you want 28 curves in a single plot in a single figure or 28 individual plots each with its own axis all in one figure.
Assuming you have a dataframe, df, with 28 columns you can put all 28 curves on a single plot in a single figure using plt.subplots like so,
fig1, ax1 = plt.subplots()
df.plot(color=colors, ax=ax1)
plt.legend(ncol=4, loc='best')
If instead you want 28 individual axes all in one figure you can use plt.subplots this way
fig2, axes = plt.subplots(nrows=4, ncols=7)
# The k-th axis shows the k-th column, drawn in the k-th colour.
for position, axis in enumerate(axes.ravel()):
    column = df.columns[position]
    df[column].plot(ax=axis, color=colors[position])
    axis.set_title(column)
Here df looks like
In [114]: df.shape
Out[114]: (15, 28)
In [115]: df.head()
Out[115]:
IYU ZMK DRO UIC DOF ASG DLU \
0 0.970467 1.026171 -0.141261 1.719777 2.344803 2.956578 2.433358
1 7.982833 7.667973 7.907016 7.897172 6.659990 5.623201 6.818639
2 4.608682 4.494827 6.078604 5.634331 4.553364 5.418964 6.079736
3 1.299400 3.235654 3.317892 2.689927 2.575684 4.844506 4.368858
4 10.690242 10.375313 10.062212 9.150162 9.620630 9.164129 8.661847
BO1 JFN S9Q ... X4K ZQG 2TS \
0 2.798409 2.425745 3.563515 ... 7.623710 7.678988 7.044471
1 8.391905 7.242406 8.960973 ... 5.389336 5.083990 5.857414
2 7.631030 7.822071 5.657916 ... 2.884925 2.570883 2.550461
3 6.061272 4.224779 5.709211 ... 4.961713 5.803743 6.008319
4 10.240355 9.792029 8.438934 ... 6.451223 5.072552 6.894701
RS0 P6T FOU LN9 CFG C9D ZG2
0 9.380106 9.654287 8.065816 7.029103 7.701655 6.811254 7.315282
1 3.931037 3.206575 3.728755 2.972959 4.436053 4.906322 4.796217
2 3.784638 2.445668 1.423225 1.506143 0.786983 -0.666565 1.120315
3 5.749563 7.084335 7.992780 6.998563 7.253861 8.845475 9.592453
4 4.581062 5.807435 5.544668 5.249163 6.555792 8.299669 8.036408
and was created by
import pandas as pd
import numpy as np
import string
import random
m = 28
n = 15
def random_data(m, n):
    """Return an (m, n) array holding one cumulative random walk.

    ``m * n`` standard-normal steps are drawn, accumulated into a running
    sum, and the resulting 1-D walk is laid out row by row.
    """
    steps = np.random.randn(m * n)
    walk = np.cumsum(steps)
    return walk.reshape(m, n)
def id_generator(number, size=6, chars=string.ascii_uppercase + string.digits):
    """Return a list of `number` random identifiers.

    Each identifier is `size` characters long, every character drawn
    independently from `chars` with ``random.choice``.
    """
    return [
        ''.join(random.choice(chars) for _ in range(size))
        for _ in range(number)
    ]
df = pd.DataFrame(random_data(n, m), columns=id_generator(number=m, size=3))
Colors was defined as
import seaborn as sns
colors = sns.cubehelix_palette(28, rot=-0.4)

Grouped Bar graph Pandas

I have a table in a pandas DataFrame named df:
+--- -----+------------+-------------+----------+------------+-----------+
|avg_views| avg_orders | max_views |max_orders| min_views |min_orders |
+---------+------------+-------------+----------+------------+-----------+
| 23 | 123 | 135 | 500 | 3 | 1 |
+---------+------------+-------------+----------+------------+-----------+
What I am looking for now is to plot a grouped bar graph which shows me
(avg, max, min) of views and orders in one single bar chart.
i.e on x axis there would be Views and orders separated by a distance
and 3 bars of (avg, max, min) for views and similarly for orders.
I have attached a sample bar graph image, just to know how the bar graph should look.
Green color should be for avg, yellow for max and pink for min.
I took the following code from setting spacing between grouped bar plots in matplotlib but it is not working for me:
# NOTE(review): this is a Python 2 listing (print statement) and the loop-body
# indentation was lost; it is kept as posted because it reproduces the error
# quoted below it.
plt.figure(figsize=(13, 7), dpi=300)
# Each inner list holds the three bar values of one group.
groups = [[23, 135, 3], [123, 500, 1]]
group_labels = ['views', 'orders']
# num_items is the number of groups (2), not the number of bars per group (3).
num_items = len(group_labels)
ind = np.arange(num_items)
margin = 0.05
width = (1. - 2. * margin) / num_items
s = plt.subplot(1, 1, 1)
for num, vals in enumerate(groups):
print 'plotting: ', vals
# The position of the xdata must be calculated for each of the two data
# series.
xdata = ind + margin + (num * width)
# Removing the "align=center" feature will left align graphs, which is
# what this method of calculating positions assumes.
# NOTE(review): xdata has len(group_labels) == 2 positions while vals has 3
# heights; presumably this length mismatch is what raises the ValueError.
gene_rects = plt.bar(xdata, vals, width)
s.set_xticks(ind + 0.5)
s.set_xticklabels(group_labels)
plotting: [23, 135, 3]
...
ValueError: shape mismatch: objects cannot be broadcast to a single shape
Using pandas:
import pandas as pd

# Each inner list already holds the aggregated values in [avg, max, min] order.
groups = [[23, 135, 3], [123, 500, 1]]
group_labels = ['views', 'orders']
stat_labels = ['avg', 'max', 'min']

# One row per group and one column per statistic, so every group on the
# x axis gets three bars.
df = pd.DataFrame(groups, index=group_labels, columns=stat_labels)

# Plot the values as given. Recomputing mean/min/max over them would be
# wrong: the "avg" bar for views would show mean(23, 135, 3) ~= 53.7 instead
# of the stated 23.
df.plot.bar(color=['green', 'yellow', 'pink'], rot=0)
You should not have to modify your dataframe just to plot it in a certain way right ?
Use seaborn !
import seaborn as sns
sns.catplot(x = "x", # x variable name
y = "y", # y variable name
hue = "type", # group variable name
data = df, # dataframe to plot
kind = "bar")
source

Categories

Resources