Distribution Graph - python

I would like to show the distribution of Income based on location and on whether the user left or not. Which graph should I use for this task? How can I show the distribution of a numeric column according to two other categorical columns?

You can use seaborn.FacetGrid in order to quickly organize a subplot with two columns: one for users who left and the other for the ones who didn't. Then you can use a hue in order to distinguish locations:
g = sns.FacetGrid(data = df, col = 'Left', hue = 'Location')
g.map(sns.histplot, 'Income').add_legend()
Complete code
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# Mean income per country, used to synthesise the demo data.
base = {'Germany': 120000, 'France': 100000, 'Spain': 80000}


def func(df):
    """Draw a random income for one row of the demo frame.

    Despite its name, `df` is a single row: the function is passed to
    ``DataFrame.apply(..., axis=1)``.  The result is the country mean plus
    Gaussian noise, minus a second Gaussian term that only applies when
    ``Left`` is 1.
    """
    country_mean = base[df['Location']]
    base_noise = 10000 * np.random.randn()
    left_noise = df['Left'] * 5000 * np.random.randn()
    return country_mean + base_noise - left_noise


N = 1000
df = pd.DataFrame({
    'Location': np.random.choice(a=['France', 'Germany', 'Spain'], size=N),
    'Left': np.random.choice(a=[0, 1], size=N),
})
df['Income'] = df.apply(func, axis=1)

# One panel per value of Left; locations are told apart by colour.
g = sns.FacetGrid(data=df, col='Left', hue='Location')
g.map(sns.histplot, 'Income')
g.add_legend()
plt.show()
Another solution, suggested by @JohanC in the comments, is to use a violin plot: the x axis shows the locations, the y axis shows the income, and the hue distinguishes users who left from those who didn't (with split = True, each violin is split by hue into two halves):
fig, ax = plt.subplots()
sns.violinplot(ax = ax, data = df, x = 'Location', y = 'Income', hue = 'Left', split = True)
plt.show()
If you are not allowed to use seaborn, you can achieve a similar result of the first example by using only matplotlib through a loop over different locations:
# Two histogram panels (Left = 0 and Left = 1), one overlaid series per location.
fig, ax = plt.subplots(1, 2, sharex='all', sharey='all', figsize=(8, 4))
for location in df['Location'].unique():
    at_location = df['Location'] == location
    # ax[0] receives the rows with Left == 0, ax[1] those with Left == 1.
    for left_value, panel in enumerate(ax):
        incomes = df[at_location & (df['Left'] == left_value)]['Income']
        panel.hist(x=incomes, label=location, alpha=0.7, edgecolor='black')

for panel, title in zip(ax, ('Left = 0', 'Left = 1')):
    panel.set_title(title)
    panel.set_xlabel('Income')

# The y axis is shared, so only the left panel is labelled.
ax[0].set_ylabel('Count')
ax[1].legend(title='Location', loc='upper left', bbox_to_anchor=(1.05, 1))
plt.tight_layout()
plt.show()

Related

Seaborn set color for unique categorical over several pair-plots

I am using seaborn and t-SNE to visualise class separability/overlap in my dataset, which contains five classes. My plot is thus a 2x2 grid of subplots. I used the following function, which generates the figure below.
def pair_plot_tsne(df):
    """Embed four subsets of `df` with t-SNE and scatter them on a 2x2 grid.

    Each subset is `df` with one or two values of the ``mode`` column filtered
    out.  Every panel is coloured with its own 'hls' palette of 3 or 4
    colours.  Returns the figure followed by the four axes in panel order.
    """
    tsne = TSNE(verbose=1, random_state=234)
    mode = df['mode']

    # (row filter, number of palette colours) for panels 1..4.
    subsets = (
        ((mode != 'car') & (mode != 'bus'), 3),
        ((mode != 'foot') & (mode != 'bus'), 3),
        (mode != 'car', 4),
        (mode != 'foot', 4),
    )

    embedded = []
    for keep, n_colors in subsets:
        part = df[keep]
        coords = tsne.fit_transform(part[cols].values)  # cols - df's columns list
        part['tsne_one'] = coords[:, 0]
        part['tsne-two'] = coords[:, 1]
        embedded.append((part, n_colors))

    # create figure
    f = plt.figure(figsize=(16, 4))
    axes = []
    for position, (part, n_colors) in enumerate(embedded, start=1):
        ax = plt.subplot(2, 2, position)
        sns.scatterplot(
            x='tsne_one', y='tsne-two', hue='mode', data=part,
            palette=sns.color_palette('hls', n_colors),
            legend='full', alpha=0.7, ax=ax)
        axes.append(ax)

    ax1, ax2, ax3, ax4 = axes
    return f, ax1, ax2, ax3, ax4
Since I'm plotting a subset of the dataset in each subplot, I would like the color of each class to be consistent in whichever plot it appears. For example, a blue color for the car mode in whichever subplot it appears, a black color for the bus mode in whichever subplot it appears, etc.
As it is now, foot is red in subplot(2, 2, 1) and car is also red in subplot(2, 2, 2), although the rest are consistent.
For this use case, seaborn allows a dictionary as palette. The dictionary will assign a color to each hue value.
Here is an example of how such a dictionary could be created for your data:
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
def _demo_frame(mode_names, n_points=10):
    """Build a random stand-in for one t-SNE result limited to `mode_names`."""
    return pd.DataFrame({'tsne_one': np.random.randn(n_points),
                         'tsne-two': np.random.randn(n_points),
                         'mode': np.random.choice(mode_names, n_points)})


df1 = _demo_frame(['foot', 'metro', 'bike'])
df2 = _demo_frame(['car', 'metro', 'bike'])
df3 = _demo_frame(['foot', 'bus', 'metro', 'bike'])
df4 = _demo_frame(['car', 'bus', 'metro', 'bike'])
frames = (df1, df2, df3, df4)

# Collect every mode that occurs in any frame and give each one a fixed colour;
# passing this dict as `palette` keeps a mode's colour identical in all panels.
modes = pd.concat([frame['mode'] for frame in frames], ignore_index=True).unique()
colors = sns.color_palette('hls', len(modes))
palette = dict(zip(modes, colors))

fig, axs = plt.subplots(2, 2, figsize=(12, 6))
for frame, ax in zip(frames, axs.flatten()):
    sns.scatterplot(x='tsne_one', y='tsne-two', hue='mode', data=frame,
                    palette=palette, legend='full', alpha=0.7, ax=ax)
plt.tight_layout()
plt.show()

Matplotlib - Implement multiple y-axis scales in animated line graph

I'm trying to remake an existing animated line graph I made where each line has a uniquely scaled y-axis - one on the left, one on the right. The graph is comparing the value of two cryptocurrencies that have vastly different sizes (eth/btc), which is why I need multiple scales to actually see changes.
My data has been formatted in a pd df (numbers here are random):
Date ETH Price BTC Price
0 2020-10-30 00:00:00 0.155705 1331.878496
1 2020-10-31 00:00:00 0.260152 1337.174272
.. ... ... ...
290 2021-08-15 16:42:09 0.141994 2846.719819
[291 rows x 3 columns]
And code is roughly:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as ani
color = ['cyan', 'orange', 'red']
fig = plt.figure()
plt.xticks(rotation=45, ha="right", rotation_mode="anchor")
plt.subplots_adjust(bottom = 0.2, top = 0.9)
plt.ylabel('Coin Value (USD)')
plt.xlabel('Date')
# Frame callback handed to FuncAnimation (question code, reproduced as posted).
# NOTE(review): the default `i=int` binds the type object `int`, not a number;
# it looks like it was meant as a type hint.
def buildChart(i=int):
df1 = df.set_index('Date', drop=True)
plt.legend(["ETH Price", "BTC Price"])
# Plot every column of the first i rows; nothing is cleared between calls, so
# each call adds a fresh set of lines.
p = plt.plot(df1[:i].index, df1[:i].values)
# NOTE(review): this loop reuses the name `i`, shadowing the frame argument.
for i in range(0,2):
p[i].set_color(color[i])
animator = ani.FuncAnimation(fig, buildChart, interval = 10)
plt.show()
Resulting Animation
I tried to create a second axis with a twin x to the first axis.
color = ['cyan', 'orange', 'blue']
fig, ax1 = plt.subplots() #Changes over here
plt.xticks(rotation=45, ha="right", rotation_mode="anchor")
plt.subplots_adjust(bottom = 0.2, top = 0.9)
plt.ylabel('Coin Value (USD)')
plt.xlabel('Date')
# Frame callback handed to FuncAnimation (question code, reproduced as posted).
# NOTE(review): the default `i=int` binds the type object `int`, not a number;
# it looks like it was meant as a type hint.
def buildChart(i=int):
df1 = df.set_index('Date', drop=True)
plt.legend(["ETH Price", "Bitcoin Price"])
# First data column only, limited to the first i rows.
data1 = df1.iloc[:i, 0:1] # Changes over here
# ------------- More Changes Start
# NOTE(review): twinx() runs on every call, so each frame creates another
# twin axes on top of the previous ones.
ax2 = ax1.twinx()
ax2.set_ylabel('Cost of Coin (USD)')
# Second data column only, limited to the first i rows.
data2 = df1.iloc[:i, 1:2]
ax2.plot(df1[:i].index, data2)
ax2.tick_params(axis='y')
# -------------- More Changes End
p = plt.plot(df1[:i].index, data1)
# NOTE(review): this loop reuses the name `i`, shadowing the frame argument.
for i in range(0,1):
p[i].set_color(color[i])
import matplotlib.animation as ani
animator = ani.FuncAnimation(fig, buildChart, interval = 10)
plt.show()
Resulting Animation After Changes
Current issues:
X-Axis start at ~1999 rather than late 2020
---- Causes all changes on the y-axis to be a nearly vertical line
Left Y-Axis label is on a scale of 0-1?
Right y-axis labels are recurring, overlapping, moving.
I believe my approach to making a second scale must have been wrong to get this many errors, but this seems like the way to do it.
I re-structured your code in order to easily set up a secondary axis animation.
Here the code of the animation with a single y axis:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
df = pd.DataFrame({'date': pd.date_range(start = '2020-01-01', end = '2020-04-01', freq = 'D')})
df['ETH'] = 2*df.index + 300 + 100*np.random.randn(len(df))
df['BTC'] = 5*df.index + 13000 + 200*np.random.randn(len(df))
def update(i):
    """Redraw the single-axis chart with the rows up to and including `i`.

    Used as the FuncAnimation frame callback: `i` is the frame number, which
    doubles as the last row label to show (``df.loc[:i]`` is label-inclusive).
    """
    ax.cla()  # start every frame from an empty axes
    shown = df.loc[:i]
    ax.plot(shown['date'], shown['ETH'], label='ETH Price', color='red')
    ax.plot(shown['date'], shown['BTC'], label='BTC Price', color='blue')
    ax.legend(frameon=True, loc='upper left', bbox_to_anchor=(1.15, 1))
    # Limits come from the full series, so the y scale is the same in every frame.
    ax.set_ylim(0.9 * min(df['ETH'].min(), df['BTC'].min()),
                1.1 * max(df['ETH'].max(), df['BTC'].max()))
    ax.tick_params(axis='x', which='both', top=False)
    ax.tick_params(axis='y', which='both', right=False)
    plt.setp(ax.xaxis.get_majorticklabels(), rotation=45)
    ax.set_xlabel('Date')
    # Both coins are drawn on this one axis, so the label must not name only ETH.
    ax.set_ylabel('Coin Value (USD)')
    plt.tight_layout()


fig, ax = plt.subplots(figsize=(6, 4))
# Keep a reference to the animation so it is not garbage-collected.
ani = FuncAnimation(fig=fig, func=update, frames=len(df), interval=100)
plt.show()
Starting from the code above, you should create the twin axis outside the update function: if you keep ax.twinx() inside the function, the call is repeated on every frame and you get a new axis each time.
Below the code for an animation with a secondary axis:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
df = pd.DataFrame({'date': pd.date_range(start = '2020-01-01', end = '2020-04-01', freq = 'D')})
df['ETH'] = 2*df.index + 300 + 100*np.random.randn(len(df))
df['BTC'] = 5*df.index + 13000 + 200*np.random.randn(len(df))
def update(i):
    """Redraw both y-axes with the rows up to and including `i`.

    ETH is drawn on the left axis (`ax1`, red) and BTC on the twinned right
    axis (`ax2`, blue).  Both axes are cleared and rebuilt on every frame.
    """
    for axis in (ax1, ax2):
        axis.cla()

    shown = df.loc[:i]
    eth_line = ax1.plot(shown['date'], shown['ETH'], label='ETH Price', color='red')
    btc_line = ax2.plot(shown['date'], shown['BTC'], label='BTC Price', color='blue')

    # A single legend on ax1 that lists the lines of both axes.
    handles = eth_line + btc_line
    ax1.legend(handles, [handle.get_label() for handle in handles],
               frameon=True, loc='upper left', bbox_to_anchor=(1.15, 1))

    # Limits come from the full series, so each scale is the same in every frame.
    ax1.set_ylim(0.9 * df['ETH'].min(), 1.1 * df['ETH'].max())
    ax2.set_ylim(0.9 * df['BTC'].min(), 1.1 * df['BTC'].max())

    # Left axis: red ticks and label; right axis: blue ticks and label.
    ax1.tick_params(axis='x', which='both', top=False)
    ax1.tick_params(axis='y', which='both', right=False, colors='red')
    ax2.tick_params(axis='y', which='both', right=True, labelright=True,
                    left=False, labelleft=False, colors='blue')
    plt.setp(ax1.xaxis.get_majorticklabels(), rotation=45)

    ax1.set_xlabel('Date')
    ax1.set_ylabel('ETH Coin Value (USD)')
    ax2.set_ylabel('BTC Coin Value (USD)')
    ax1.yaxis.label.set_color('red')
    ax2.yaxis.label.set_color('blue')
    ax2.spines['left'].set_color('red')
    ax2.spines['right'].set_color('blue')
    plt.tight_layout()


fig, ax1 = plt.subplots(figsize=(6, 4))
# The twin axis is created once, outside the frame callback.
ax2 = ax1.twinx()
ani = FuncAnimation(fig=fig, func=update, frames=len(df), interval=100)
plt.show()

matplotlib scatter plotting with noncontiguous yaxis ticks with datatype as integer

My question:
while plotting x and y values from a dataframe, if we have y values as discrete numbers say, id_number or category. if we use scatter plot, it will give linearly spaced yaxis ticks which may have large vertical spacing in between the plotted values depending on how much spaced our original values are.
What I need is to plot some category values (fixed discrete values) against the time events (x axis) in a scatter plot, but the values in the table are integers, not strings. As I don't have a clear idea how to do this, the following is what I have achieved so far, using a modified copy of the original table with string values. Here is my test data (the original data is large):
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mtic
import matplotlib.category as mcat
np.random.seed(432987435)
nofpoints = 160
xval = np.arange(nofpoints)
disc = [ 200, 240, 250, 290 ]
yval = np.random.choice( disc , nofpoints)
yval_str = yval.astype(str)
yval , yval_str
cval = np.random.random( nofpoints )
df = pd.DataFrame( { 'xval': xval , 'yval':yval , 'cval': cval })
df_str = pd.DataFrame( { 'xval': xval , 'yval':yval_str , 'cval': cval })
using usual plotting method
fig = plt.figure(dpi=128 , figsize=(12,6))
ax1 = fig.add_subplot(111)
# here we are using the original dataframe(df), without any string field inside.
#ax1.grid(True)
ax1.scatter( 'xval' , 'yval' , data=df , marker='o', facecolor='None' , edgecolor='g')
plt.show()
this is what we get
see the large spacing between the values and each plot point is not against the tick values. (I don't want to use legend to show the category using colourmap, since it is preserved for some other purpose)
with modified dataframe having string as yaxis value
fig = plt.figure(dpi=128 , figsize=(12,6))
ax2 = fig.add_subplot(111)
# dataframe used is modified one with a string field inside.
# as we can see the order is shuffled.
ax2.scatter( 'xval' , 'yval' , data=df_str , marker='o', facecolor='None' , edgecolor='k')
plt.show()
to avoid shuffling
fig = plt.figure(dpi=128 , figsize=(12,6))
ax3 = fig.add_subplot(111)
# to maintain the same order and avoid shuffling we used matplotlib.category
#ax3.grid(True)
disc_str = [ str(x) for x in disc ]
units = mcat.UnitData(sorted(disc_str))
ax3.yaxis.set_units(units)
ax3.yaxis.set_major_locator( mcat.StrCategoryLocator(units._mapping))
ax3.yaxis.set_major_formatter( mcat.StrCategoryFormatter(units._mapping))
ax3.scatter( 'xval' , 'yval' , data=df_str , marker='o', facecolor='None' , edgecolor='y')
plt.show()
Is there any way to achieve this, without modifying the original table, i mean to plot integer category values as yaxis values.
You can do it by replacing ax1.scatter with seaborn.stripplot:
sns.stripplot(ax = ax1, data = df, x = 'xval', y = 'yval_str', marker = 'o', color = 'white', edgecolor = 'green', linewidth = 1)
Before you do that, if you want y axis in a particular order, you should sort your df:
df = pd.DataFrame({'xval': xval, 'yval': yval, 'yval_str': yval_str, 'cval': cval}).sort_values(by = 'yval', ascending = False)
Complete Code
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
np.random.seed(432987435)
nofpoints = 160
disc = [200, 240, 250, 290]

xval = np.arange(nofpoints)
yval = np.random.choice(disc, nofpoints)
yval_str = yval.astype(str)  # string copy of the integer categories, used on the y axis
cval = np.random.random(nofpoints)

df = pd.DataFrame({'xval': xval, 'yval': yval, 'yval_str': yval_str, 'cval': cval})
# Sort by the integer value so the frame lists the largest category first.
df = df.sort_values(by='yval', ascending=False)

fig = plt.figure(dpi=128, figsize=(12, 6))
ax1 = fig.add_subplot(111)
# Hollow green circles: white face with a green edge.
marker_style = {'marker': 'o', 'color': 'white', 'edgecolor': 'green', 'linewidth': 1}
sns.stripplot(ax=ax1, data=df, x='xval', y='yval_str', **marker_style)
plt.show()
If you want perfectly horizontally aligned points, you have to pass jitter = False to sns.stripplot:
sns.stripplot(ax = ax1, data = df, x = 'xval', y = 'yval_str', marker = 'o', color = 'white', edgecolor = 'green', linewidth = 1, jitter = False)

Tick labels appearing twice

I am trying to create a figure that is a dendrogram on top of a scatterplot, where the ends of the leaves on the dendrogram match up with the dots on the scatterplot, which in turn match up with the tick labels below. I have this working, but for some reason the tick labels appear twice. The labels in red and green are the ones I'm trying to keep.
This is my code:
import pandas as pd
from matplotlib import pyplot as plt
import scipy.cluster.hierarchy as sch
import numpy as np
import json
import random
def scatter_and_dendrogram(df, colors,wn='',label_x=False):
'''Draw a Ward dendrogram with a row of scatter points underneath it.

Question code, reproduced as posted; the duplicated x tick labels it
produces are what the question is about.

Args:
df (Pandas DataFrame): similarity matrix
colors (list of strs): list of colors (stored in a local list but never
read afterwards)
wn (str): window name, also used as the title of the top axes
label_x=False(Bool): whether or not to label x axis
Returns: None
'''
norm = plt.Normalize(1,4)
dist_matrix = [] #linkage
# Copy each row of df into a plain list; the inner range starts at 1, so
# column 0 is left out.
for i in range(len(df)):
arr = []
for j in range(1,len(df.iloc[i])):
arr.append(df.iloc[i,j])
dist_matrix.append(list(arr))
X = np.asarray(dist_matrix)
Z = sch.linkage(X, 'ward')
sch.set_link_color_palette(['b'])
# NOTE(review): this figure is replaced right away by the one that
# plt.subplots() returns on the next statement.
fig = plt.figure()
# Two stacked axes sharing the x axis: a tall one for the dendrogram and a
# thin one for the scatter points, with no gap in between.
fig, axs = plt.subplots(2, 1, sharex='col', sharey='row',
gridspec_kw={'width_ratios': [1],
'height_ratios': [30, 1],
'hspace': 0, 'wspace': 0})
(ax1, ax2) = axs
dendrogram = sch.dendrogram(Z=Z, p=3,ax=ax1)
icoords = dendrogram['icoord']
dcoords = dendrogram['dcoord']
lst = [[],[],colors]
# Collect the x coordinate of every link end whose height is 0
# (presumably the leaf positions -- confirm against the scipy docs).
for i in range(len(icoords)):
ic = icoords[i]
dc = dcoords[i]
if dc.count(0) == 2:
lst[0].append(ic[0])
lst[0].append(ic[-1])
elif dc.count(0) == 1:
ind = dc.index(0)
lst[0].append(ic[ind])
# Every collected point is drawn at the same y value, -0.1.
lst[1] = [-0.1]*len(lst[0])
ax2.scatter(lst[0],lst[1],s=10,norm=norm, alpha=0.7)
fig.canvas.set_window_title(wn)
# Blank out the tick labels on both axes.
ax1.set_yticklabels([])
ax1.set_xticklabels([])
ax2.set_yticklabels([])
ax2.set_xticklabels([])
if label_x:
letters = list('ABCD')
# Letters in the order of the dendrogram's 'leaves' entry.
labels = [letters[ind] for ind in dendrogram['leaves']]
c1 = '#ff0033' #red
c2 = '#006600'#green
xlbls = ax2.set_xticklabels(labels,fontsize=11,linespacing=3)
# Colour the labels: the first two letters of `letters` use c1, the rest c2.
for lbl in xlbls:
t = lbl.get_text()
c = c2
if letters.index(t) < 2:
c = c1
print(c)
lbl.set_color(c)
ax1.set_title(wn)
ax1.set_ylabel('Aggregation Criterion',fontsize=15)
ax2.set_xlabel('Articles', fontsize=15)
plt.show()
l = ['A','B','C','D']
df = pd.DataFrame(index=l, columns=l)
for i in range(len(l)-1):
for j in range(i+1, len(l)):
r = random.randint(0, 10)
df.iloc[i,j] = r
df.iloc[j, i] = r
df.fillna(0,inplace=True)
print(df)
wn = 'Set C'
scatter_and_dendrogram(df, l, wn,True)
This is what it looks like:
According to matplotlib.pyplot.subplots about sharex and sharey
When subplots have a shared x-axis along a column, only the x tick
labels of the bottom subplot are created.
Similarly, when subplots have a shared y-axis along a row, only the y tick labels of the first column subplot are created.
To later turn other subplots' ticklabels on, use tick_params.
You need to add ax1.tick_params(axis='x', labelbottom=False) right after the xlbls = ax2.set_xticklabels(...) line.
Besides, fig = plt.figure() is redundant, because plt.subplots() already creates the figure it returns; remove it.

Trying to export 6 violin subplots (made using seaborn) (formatted into 3 rows and 2 columns) to a single page PDF

I need to export 6 violin subplots made using seaborn through python onto a single page PDF. They need to be formatted into 3 rows x 2 columns. Right now my code is generating a single page PDF with 6 empty plots and in the console this grid of empty plots appears as well as my 6 individual violin subplots (which I need to appear in the 3x2 grid format). I need to fix my code to make the violin plots export correctly as a PDF.
data = pd.read_csv(os.path.join(input_folder, input_file))
x = "Subregion"
hue = "Mutation"
col = "Subregion"
kind = "violin"
data = M1
title_name = "M1"
fig, ([ax1, ax2], [ax3, ax4], [ax5, ax6]) = plt.subplots(nrows=3, ncols=2,figsize = (6,6))
fig.subplots_adjust(hspace=0.4, wspace=0.4)
ax1 = sns.catplot(x = x, y = "Area_mm", hue = hue, col = None, kind = kind, data = data, legend = False)
ax1.set_ylabels("Area (mm^2)")
ax2 = sns.catplot(x = x, y = "DAPI_count", hue = hue, col = None, kind = kind, data = data, legend = False)
ax2.set_ylabels("DAPI Cell Count")
ax3 = sns.catplot(x = x, y = "SST_count", hue = hue, col = None, kind = kind, data = data, legend = False)
ax3.set_ylabels("SST Cell Count")
ax4 = sns.catplot(x = x, y = "DAPI_per_area", hue = hue, col = None, kind = kind, data = data, legend = False)
ax4.set_ylabels("DAPI Cell Density (DAPI/mm^2)")
ax5 = sns.catplot(x = x, y = "SST_per_area", hue = hue, col = None, kind = kind, data = data, legend = False)
ax5.set_ylabels("SST Cell Density (SST/mm^2)")
ax6 = sns.catplot(x = x, y = "SST_per_DAPI", hue = hue, col = None, kind = kind, data = data, legend = False)
ax6.set_ylabels("SST Cell Density (% SST/DAPI cells)")
fig.savefig(os.path.join(output_folder, title_name + '.pdf'))
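The problem is that sns.catplot is a figure-level function: every call creates its own new figure and returns a FacetGrid, so the axes created by plt.subplots stay empty and the names ax1…ax6 are simply rebound. Use the axes-level sns.violinplot with ax=... instead. You can flatten the array of axes and iterate through your labels and y-axis columns to plot, for example: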
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
# `x` and `kind` are carried over from the question's catplot call and are not
# used by the plot below; only `hue` is.
x = "Subregion"
hue = "Mutation"
kind = "violin"
title_name = "M1"

# Random stand-in for the real measurements: six numeric columns plus the two
# categorical columns used for grouping ('x') and for the hue ('Mutation').
M1 = pd.DataFrame(np.random.normal(0, 1, (100, 6)),
                  columns=["Area_mm", "DAPI_count", "SST_count",
                           "DAPI_per_area", "SST_per_area", "SST_per_DAPI"])
M1['x'] = np.random.choice(['p', 'q', 'r'], 100)
M1["Mutation"] = np.random.choice(['A', 'B'], 100)

VAR = M1.columns[:6]
YL = ["Area (mm^2)", "DAPI Cell Count", "SST Cell Count",
      "DAPI Cell Density (DAPI/mm^2)", "SST Cell Density (SST/mm^2)",
      "SST Cell Density (% SST/DAPI cells)"]

fig, axs = plt.subplots(3, 2, figsize=(8, 8))
fig.subplots_adjust(hspace=0.4, wspace=0.4)
axs = axs.reshape(-1)

handles, labels = [], []
for ax, column, ylabel in zip(axs, VAR, YL):
    # violinplot is axes-level and draws into the axes passed as `ax`.  The
    # catplot-only arguments `col` and `kind` do not belong here, so they are
    # no longer passed.
    sns.violinplot(x="x", y=column, hue=hue, data=M1, ax=ax)
    ax.set_ylabel(ylabel, fontsize=8)
    # Remember this panel's hue entries, then drop its own legend (if any).
    handles, labels = ax.get_legend_handles_labels()
    if ax.legend_ is not None:
        ax.legend_.remove()

# One shared legend, anchored outside the last panel, built from explicit handles.
axs[-1].legend(handles, labels, loc='upper right', ncol=1, bbox_to_anchor=(1.5, 1.5))
# To export the grid as one page, call fig.savefig(title_name + '.pdf') before plt.show().
plt.show()

Categories

Resources