How to locate the median in a (seaborn) KDE plot? - python

I am trying to do a Kernel Density Estimation (KDE) plot with seaborn and locate the median. The code looks something like this:
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
sns.set_palette("hls", 1)
data = np.random.randn(30)
sns.kdeplot(data, shade=True)
# x_median, y_median = magic_function()
# plt.vlines(x_median, 0, y_median)
plt.show()
As you can see I need a magic_function() to fetch the median x and y values from the kdeplot. Then I would like to plot them with e.g. vlines. However, I can't figure out how to do that. The result should look something like this (obviously the black median bar is wrong here):
I guess my question is not strictly related to seaborn and also applies to other kinds of matplotlib plots. Any ideas are greatly appreciated.

You need to:
Extract the data of the kde line
Integrate it to calculate the cumulative distribution function (CDF)
Find the value that makes CDF equal 1/2, that is the median
import numpy as np
import scipy
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_palette("hls", 1)
data = np.random.randn(30)
p=sns.kdeplot(data, shade=True)
x,y = p.get_lines()[0].get_data()
#care with the order, it is first y
#initial fills a 0 so the result has same length than x
cdf = scipy.integrate.cumtrapz(y, x, initial=0)
nearest_05 = np.abs(cdf-0.5).argmin()
x_median = x[nearest_05]
y_median = y[nearest_05]
plt.vlines(x_median, 0, y_median)
plt.show()

Related

numpy and matplot - plotting on the same graph while one element changes

I'm trying to create a graph with k_b as the x-value and delta_P as the y-value. I want to plot k_b against delta_P but S=3 for one curve and S=0.1 for another curve. However, I want the two lines to be on the same graph. Does anyone have any advice on how to do that? Below is what I have for S=3 and it works.
def rocproduct(k_cat,E0,S,k_b,k_f):
return k_cat*E0*S/((k_b/k_f)+S)
import numpy as np
import matplotlib.pyplot as plt
k_cat=0.1;E0=1;k_f=0.3;S=3
k_b=np.array([0.01,0.1,0.2,0.5,1,1.5,2,5,10])
delta_P=rocproduct(k_cat,E0,S,k_b,k_f)
plt.ylabel('rate of change of product')
plt.xlabel('kb')
plt.plot(k_b,delta_P)
Just call rocproduct for S=0.1 and plot it again
import numpy as np
import matplotlib.pyplot as plt
# Parameters
k_cat=0.1
E0=1
k_f=0.3
S=3
# Function for data
def rocproduct(k_cat,E0,S,k_b,k_f):
return k_cat*E0*S/((k_b/k_f)+S)
# Data to plot
k_b=np.array([0.01,0.1,0.2,0.5,1,1.5,2,5,10])
delta_P_1=rocproduct(k_cat,E0,S,k_b,k_f)
S = 0.1
delta_P_2=rocproduct(k_cat,E0,S,k_b,k_f)
# Plotting
plt.ylabel('rate of change of product')
plt.xlabel('kb')
plt.plot(k_b,delta_P_1)
plt.plot(k_b, delta_P_2)
plt.show()

How to plot a smooth curve in python for a list of values?

I have created a list of values of Shannon entropy for a pair of multiple sequence aligned sequences. While plotting the values I get a simple plot. I want to plot a smooth curve over the lines. Can anyone suggest to me what will be the right way to process it? BAsically I want to plot a smooth curve that touches the tip of every bar and goes to zero where the "y axis value" is zero.
link for image: [1]: https://i.stack.imgur.com/SY3jH.png
#importing the relevant packages
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.interpolate import make_interp_spline
from Bio import AlignIO
import warnings
warnings.filterwarnings("ignore")
#function to calculate the Shannon Entropy of a MSA
# H = -sum[p(x).log2(px)]
def shannon_entropy(list_input):
unique_aa = set(list_input)
M = len(list_input)
entropy_list = []
# Number of residues in column
for aa in unique_aa:
n_i = list_input.count(aa)
P_i = n_i/float(M)
entropy_i = P_i*(math.log(P_i,2))
entropy_list.append(entropy_i)
sh_entropy = -(sum(entropy_list))
#print(sh_entropy)
return sh_entropy
#importing the MSA file
#importing the clustal file
align_clustal1 =AlignIO.read("/home/clustal.aln", "clustal")
def shannon_entropy_list_msa(alignment_file):
shannon_entropy_list = []
for col_no in range(len(list(alignment_file[0]))):
list_input = list(alignment_file[:, col_no])
shannon_entropy_list.append(shannon_entropy(list_input))
return shannon_entropy_list
clustal_omega1 = shannon_entropy_list_msa(align_clustal1)
# Plotting the data
plt.figure(figsize=(18,10))
plt.plot(clustal_omega1, 'r')
plt.xlabel('Residue', fontsize=16)
plt.ylabel("Shannon's entropy", fontsize=16)
plt.show()
Edit 1:
Here is what my graph looks like after implementing the "pchip" method. link for the pchip output: https://i.stack.imgur.com/hA3KW.png
pchip monotonic spline output
One approach would be to use PCHIP interpolation, which will give you the monotonic curve with the required behaviour for zero values on the y-axis.
We can't run your exact code example on our machines because you point to a local Clustal file in your 'home' directory.
Here's a simple working example, with link to output image:
import numpy as np
import matplotlib.pyplot as plt
from scipy.interpolate import pchip
mylist = [10,0,0,0,0,9,9,0,0,0,11,11,11,0,0]
mylist_np = np.array(mylist)
samples = np.array(range(len(mylist)))
xnew = np.linspace(samples.min(), samples.max(), 100)
plt.plot(xnew,pchip(samples, mylist_np )(xnew))
plt.show()

Displaying Colormap/legend with x,y,z plot and fourth variable

I'm using Pandas and am very new to programming. I'm plotting Energy Deposited (eDep) as a function of its x,y and z positions. So far, was successful in getting it to plot, but it won't let me plot the colormap beside my scatter plot! Any help is much appreciated
%matplotlib inline
import pandas as pd
import numpy as np
IncubatorBelow = "./Analysis.Test.csv"
df = pd.read_csv(IncubatorBelow, sep = ',', names['Name','TrackID','ParentID','xPos','yPos','zPos','eDep','DeltaE','Einit','EventID'],low_memory=False,error_bad_lines=False)
df["xPos"] = df["xPos"].str.replace("(","")
df["zPos"] = df["zPos"].str.replace(")","")
df.sort_values(by='Name', ascending=[False])
df.dropna(how='any',axis=0,subset=['Name','TrackID','ParentID','xPos','yPos','zPos','eDep','DeltaE','Einit','EventID'], inplace=True)
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
df['xPos'] = df['xPos'].astype(float)
df['yPos'] = df['yPos'].astype(float)
df['zPos'] = df['zPos'].astype(float)
#df10[df10['Name'].str.contains("e-")]
threedee = plt.figure().gca(projection='3d')
threedee.scatter(df["xPos"], df["yPos"], df["zPos"], c=df["eDep"], cmap=plt.cm.coolwarm)
threedee.set_xlabel("x(mm)")
threedee.set_ylabel("y(mm)")
threedee.set_zlabel("z(mm)")
plt.show()
Heres what the plot looks like!
Its from a particle physics simulation using GEANT4. The actual files are extremely large (3.7GB's that I've chunked into 40ish MB's) and this plot only represents a small fraction of the data.

Difference between specified and measured colours, matplotlib colormap

I'm having trouble replicating an old colormap I've used in matplotlib. It seems as if it was the default colormap because in the original code, no colormap was specified.
So looking at the old figure I made I've measured the colours from the colorbar using gpick. I've inputted these into a custom colormap as follows:
blue_red1 = LinearSegmentedColormap.from_list('mycmap', [
(0, '#6666de'),
(0.1428, '#668cff'),
(0.2856, '#66d9ff'),
(0.4284, '#92ffce'),
(0.5712, '#d0ff90'),
(0.714, '#ffe366'),
(0.8568, '#ff9b66'),
(1, '#db6666')])
CS = plt.contourf(H, temps, diff_list, cmap=blue_red1)
plt.savefig('out.png')
Yet when I measure the output colours with gpick again they have different hex values (and I can tell they're different).
What could be causing this?
The original I'm trying to replicate, and the output from the custom colour map are linked below:
You may get much closer to the desired result using the following.
The logic is that each color in the colorbar is the value corresponding to the mean of its interval.
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
X,Y=np.meshgrid(np.linspace(0,1),np.linspace(0,1) )
Z = X+Y
blue_red1 = LinearSegmentedColormap.from_list('mycmap', [
(0.0000, '#6666de'),
(0.0625, '#6666de'),
(0.1875, '#668cff'),
(0.3125, '#66d9ff'),
(0.4375, '#92ffce'),
(0.5625, '#d0ff90'),
(0.6875, '#ffe366'),
(0.8125, '#ff9b66'),
(0.9375, '#db6666'),
(1.0000, '#db6666')])
CS = plt.contourf(X,Y,Z, cmap=blue_red1)
plt.colorbar()
plt.show()
The other option is to use a ListedColormap. This gives the accurate colors.
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
X,Y=np.meshgrid(np.linspace(0,1),np.linspace(0,1) )
Z = X+Y
blue_red1 = ListedColormap(['#6666de','#668cff','#66d9ff','#92ffce','#d0ff90',
'#ffe366','#ff9b66','#db6666'],'mycmap')
CS = plt.contourf(X,Y,Z, cmap=blue_red1)
plt.colorbar()
plt.show()

matplotlib overlay a normal distribution with stddev axis onto another plot

I have a series of data that I'm reading in from a tutorial site.
I've managed to plot the distribution of the TV column in that data, however I also want to overlay a normal distribution curve with StdDev ticks on a second x-axis (so I can compare the two curves). I'm struggling to work out how to do it..
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import matplotlib.mlab as mlab
import math
# read data into a DataFrame
data = pd.read_csv('http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv', index_col=0)
# draw distribution curve
h = sorted(data.TV)
hmean = np.mean(h)
hstd = np.std(h)
pdf = stats.norm.pdf(h, hmean, hstd)
plt.plot(h, pdf)
Here is a diagram close to what I'm after, where x is the StdDeviations. All this example needs is a second x axis to show the values of data.TV
Not sure what you really want, but you could probably use second axis like this
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import matplotlib.mlab as mlab
import math
# read data into a DataFrame
data = pd.read_csv('Advertising.csv', index_col=0)
fig, ax1 = plt.subplots()
# draw distribution curve
h = sorted(data.TV)
ax1.plot(h,'b-')
ax1.set_xlabel('TV')
ax1.set_ylabel('Count', color='b')
for tl in ax1.get_yticklabels():
tl.set_color('b')
hmean = np.mean(h)
hstd = np.std(h)
pdf = stats.norm.pdf(h, hmean, hstd)
ax2 = ax1.twinx()
ax2.plot(h, pdf, 'r.')
ax2.set_ylabel('pdf', color='r')
for tl in ax2.get_yticklabels():
tl.set_color('r')
plt.show()
Ok, assuming that you want to plot the distribution of your data, the fitted normal distribution with two x-axes, one way to achieve this is as follows.
Plot the normalized data together with the standard normal distribution. Then use matplotlib's twiny() to add a second x-axis to the plot. Use the same tick positions as the original x-axis on the second axis, but scale the labels so that you get the corresponding original TV values. The result looks like this:
Code
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import matplotlib.mlab as mlab
import math
# read data into a DataFrame
data = pd.read_csv('http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv', index_col=0)
h = sorted(data.TV)
hmean = np.mean(h)
hstd = np.std(h)
h_n = (h - hmean) / hstd
pdf = stats.norm.pdf( h_n )
# plot data
f,ax1 = plt.subplots()
ax1.hist( h_n, 20, normed=1 )
ax1.plot( h_n , pdf, lw=3, c='r')
ax1.set_xlim( [h_n.min(), h_n.max()] )
ax1.set_xlabel( r'TV $[\sigma]$' )
ax1.set_ylabel( r'Relative Frequency')
ax2 = ax1.twiny()
ax2.grid( False )
ax2.set_xlim( ax1.get_xlim() )
ax2.set_ylim( ax1.get_ylim() )
ax2.set_xlabel( r'TV' )
ticklocs = ax2.xaxis.get_ticklocs()
ticklocs = [ round( t*hstd + hmean, 2) for t in ticklocs ]
ax2.xaxis.set_ticklabels( map( str, ticklocs ) )

Categories

Resources