Plot 95% confidence interval errorbar with Python pandas DataFrames

I want to show a 95% confidence interval with Python pandas and matplotlib, but I'm stuck: for the usual .std() I would do something like this:
import pandas as pd
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

data = pd.read_table('output.txt', sep=r'\,', engine='python')
Ox = data.groupby(['Ox'])['Ox'].mean()
Oy = data.groupby(['Ox'])['Oy'].mean()
std = data.groupby(['Ox'])['Oy'].std()
plt.plot(Ox, Oy, label='STA = ' + str(x))  # x is defined elsewhere in my script
plt.errorbar(Ox, Oy, std, label='errorbar', linewidth=2)
plt.legend(loc='best', prop={'size': 9.2})
plt.savefig('plot.pdf')
plt.close()
But I haven't found anything in the pandas methods that can help me. Does anybody know?

Using 2 * std to estimate the 95% interval
In a normal distribution, the interval [μ - 2σ, μ + 2σ] covers about 95.5% of the probability mass, so
you can use 2 * std to estimate the 95% interval:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
df = pd.DataFrame()
df['category'] = np.random.choice(np.arange(10), 1000, replace=True)
df['number'] = np.random.normal(df['category'], 1)
mean = df.groupby('category')['number'].mean()
std = df.groupby('category')['number'].std()
plt.errorbar(mean.index, mean, xerr=0.5, yerr=2*std, linestyle='')
plt.show()
Result: (errorbar plot with symmetric 95% intervals; image omitted)
Using percentiles
If your distribution is skewed, it is better to use asymmetrical errorbars and get your 95% interval from the percentiles.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import skewnorm
df = pd.DataFrame()
df['category'] = np.random.choice(np.arange(10), 1000, replace=True)
df['number'] = skewnorm.rvs(5, df['category'], 1)
mean = df.groupby('category')['number'].mean()
p025 = df.groupby('category')['number'].quantile(0.025)
p975 = df.groupby('category')['number'].quantile(0.975)
plt.errorbar(
    mean.index,
    mean,
    xerr=0.5,
    yerr=[mean - p025, p975 - mean],
    linestyle='',
)
plt.show()
Result: (errorbar plot with asymmetric percentile intervals; image omitted)

For a normal distribution, ~95% of the values lie within a window of 4 standard deviations around the mean; in other words, 95% of the values are within plus/minus 2 standard deviations of the mean. See, e.g., the 68–95–99.7 rule.
plt.errorbar's yerr argument specifies the length of the single-sided errorbar. Thus
plt.errorbar(x, y, yerr=2*std)
where std is the standard deviation, shows errorbars covering the 95% interval.

To obtain the 95% confidence interval you'll need to define a function.
Tweaking: Compute a confidence interval from sample data
import numpy as np
import scipy.stats

def mean_confidence_interval(data, confidence=0.95):
    a = 1.0 * np.array(data)
    m = np.mean(a)
    return m

def bound_confidence_interval(data, confidence=0.95):
    a = 1.0 * np.array(data)
    n = len(a)
    se = scipy.stats.sem(a)
    # half-width of the confidence interval, from the t-distribution
    h = se * scipy.stats.t.ppf((1 + confidence) / 2., n - 1)
    return h
Then compute:
mean = df.groupby(by='yourquery').agg(mean_confidence_interval)
bound = df.groupby(by='yourquery').agg(bound_confidence_interval)
Finally, plot with a library of your choice, e.g. plotly:
import plotly.graph_objects as go

fig = go.Figure(data=go.Scatter(
    x=mean[yourquery],
    y=mean[yourquery2],
    error_y=dict(
        type='data',  # value of error bar given in data coordinates
        array=bound[yourquery2],
        visible=True)
))
fig.show()
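If you'd rather stay with matplotlib, as in the rest of this thread, the same errorbars can be drawn directly; a minimal sketch, assuming mean and bound are the groupby results from above and 'yourquery2' is a placeholder for your value column:
import matplotlib.pyplot as plt

# 'yourquery2' is a placeholder column name, substitute your own
plt.errorbar(mean.index, mean['yourquery2'], yerr=bound['yourquery2'],
             linestyle='', marker='o', capsize=3)
plt.show()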

Related

Recover the time shift from numpy.correlate result in Python

This is not a duplicate question, since other answers only explain how to plot the cross-correlation function and do not explain how to get the time difference.
Given a sine signal and a shifted version of it, we should be able to recover the time delay between them.
I have created a sine signal and shifted it by t_d = 0.05. The following is my code and its output:
import numpy as np
import matplotlib.pyplot as plt
fs = 1000
x = np.linspace(0, 1, fs)
f = 5
t_shift = 0.05
y = np.sin(2*np.pi*f*x)
y_shifted = np.sin(2*np.pi*f*(x-t_shift))
fig, ax = plt.subplots()
ax.plot(x, y, x, y_shifted)
plt.show()
By normalizing signals and applying numpy.correlate we get the following:
y_norm = (y-y.mean())/y.std()
y_shifted_norm = (y_shifted - y_shifted.mean())/y_shifted.std()
cc = np.correlate(y_norm, y_shifted_norm, 'full')
fig, ax = plt.subplots()
ax.plot(range(len(cc)), cc)
plt.show()
Question
From the indices of the cross-correlation function, how can I get t_shift = 0.05?
@Sepide: It seems to me as if you are trying to maximise the correlation between the signal y and its shifted version y_shifted. This might be accomplished using np.correlate(), but it seems nontrivial indeed to recover the time shift from it. In the solution below I manually shift the time series and compute the correlation coefficient using np.corrcoef. As soon as this Pearson correlation coefficient equals 1, the two signals are aligned.
import numpy as np
import matplotlib.pyplot as plt

# Settings
fs = 1000
x = np.linspace(0, 1, fs)
f = 5
t_shift = 0.05
t_step = 1/fs

# Data
y = np.sin(2*np.pi*f*x)
y_shifted = np.sin(2*np.pi*f*(x - t_shift))

# Compute correlation for various shifts
MaxTimeShift = 200
CorrelationList = np.full(MaxTimeShift, np.nan)
for i in range(MaxTimeShift):
    CorrelationList[i] = np.corrcoef(y[0:801], y_shifted[i:(801 + i)])[0, 1]

# Plot 1: the two signals
plt.figure(1)
plt.plot(x, y, x, y_shifted)
plt.show()

# Plot 2: correlation coefficient vs. shift
plt.figure(2)
ShiftList = t_step * np.arange(MaxTimeShift)
plt.plot(ShiftList, CorrelationList)
plt.title("Correlation coefficient")
plt.show()

print("The time shift between the signals is: ", ShiftList[np.argmax(CorrelationList)])

python - frequency of power spectrum

I want to plot a power spectrum from my data set (array of about 2000 values, the data is recorded every minute).
I've gotten so far as:
y= np.fft.fft(data)
abs = np.abs(y) #absolute value
p = np.square(abs) #power
but am confused about setting the frequency.
I've tried using freqs = np.fft.fftfreq(len(y)), but when I plot the result it looks wrong (plot omitted).
What am I doing wrong?
Here is an example of how to plot a power spectrum:
import matplotlib.pyplot as plt
import numpy as np

# 2 seconds sampled at 1000 Hz, so the 60 Hz and 42 Hz tones are resolvable
t = np.linspace(0, 2, 2000, endpoint=False)
data = 2 * np.sin(2*np.pi*60*t) + 2 * np.sin(2*np.pi*42*t)
spectrum = np.fft.fft(data)
power_spectrum = np.square(np.abs(spectrum))
fig, ax = plt.subplots()
ax.plot(np.arange(len(power_spectrum)), power_spectrum)
plt.show()
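To put a physical frequency axis on the plot, pass the sample spacing to np.fft.fftfreq. A minimal sketch for the asker's situation, assuming data is the once-per-minute series (sample spacing d = 60 seconds):
import numpy as np
import matplotlib.pyplot as plt

spectrum = np.fft.fft(data)
power = np.abs(spectrum)**2
freqs = np.fft.fftfreq(len(data), d=60.0)  # frequencies in Hz
half = len(data) // 2                      # keep the positive-frequency half
plt.plot(freqs[:half], power[:half])
plt.xlabel('frequency [Hz]')
plt.ylabel('power')
plt.show()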

Identifying Outliers with Quantile Regression and Python

I am trying to identify outliers in a dataset using the 5th and 95th percentiles of a regression line, so I'm using quantile regression in Python with statsmodels, matplotlib and pandas. Based on this answer from blokeley, I can create a scatterplot of my data and show the best fit line and the lines for the 5th and 95th percentiles based on quantile regression. But how do I identify those points that fall above and below those lines, and then save them out to a pandas dataframe?
My data looks like this (there are 95 values in total):
Month Year LST NDVI
0 June 1984 310.550975 0.344335
1 June 1985 310.495331 0.320504
2 June 1986 306.820900 0.369494
3 June 1987 308.945602 0.369946
4 June 1988 308.694022 0.318632
and the script I have so far is this:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf

excel = my_excel  # path to my Excel file
df = pd.read_excel(excel)
df.head()

model = smf.quantreg('NDVI ~ LST', df)
quantiles = [0.05, 0.95]
fits = [model.fit(q=q) for q in quantiles]
figure, axes = plt.subplots()
x = df['LST']
y = df['NDVI']
axes.scatter(x, df['NDVI'], c='green', alpha=0.3, label='data point')
fit = np.polyfit(x, y, deg=1)
axes.plot(x, fit[0] * x + fit[1], color='grey', label='best fit')
_x = np.linspace(x.min(), x.max())
for index, quantile in enumerate(quantiles):
    _y = fits[index].params['LST'] * _x + fits[index].params['Intercept']
    axes.plot(_x, _y, label=quantile)
title = 'LST/NDVI Jun-Aug'
plt.title(title)
axes.legend()
axes.set_xticks(np.arange(298, 320, 4))
axes.set_yticks(np.arange(0.25, 0.5, .05))
axes.set_xlabel('LST')
axes.set_ylabel('NDVI');
And the chart I get out of that is this: (chart image omitted)
So I can definitely see data points above the 95th line and below the 5th line that I would classify as outliers, but I want to identify those in my original dataframe and maybe plot them on the chart or highlight them in some way to show them as "outliers".
I have been searching for a method but coming up empty and could use some help.
You need to figure out whether certain points are above the 95% quantile line or below the 5% quantile line. You can do this using the cross product; see this answer for a straightforward implementation.
In your example, you would need to combine the points above and below the quantile lines, possibly in a mask.
Here is an example:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
df = pd.DataFrame(np.random.normal(0, 1, (100, 2)))
df.columns = ['LST', 'NDVI']
model = smf.quantreg('NDVI ~ LST', df)
quantiles = [0.05, 0.95]
fits = [model.fit(q=q) for q in quantiles]
figure, axes = plt.subplots()
x = df['LST']
y = df['NDVI']
fit = np.polyfit(x, y, deg=1)
_x = np.linspace(x.min(), x.max(), num=len(y))
# fit lines
_y_005 = fits[0].params['LST'] * _x + fits[0].params['Intercept']
_y_095 = fits[1].params['LST'] * _x + fits[1].params['Intercept']
# start and end coordinates of fit lines
p = np.column_stack((x, y))
a = np.array([_x[0], _y_005[0]]) #first point of 0.05 quantile fit line
b = np.array([_x[-1], _y_005[-1]]) #last point of 0.05 quantile fit line
a_ = np.array([_x[0], _y_095[0]])
b_ = np.array([_x[-1], _y_095[-1]])
#mask based on if coordinates are above 0.95 or below 0.05 quantile fitlines using cross product
mask = lambda p, a, b, a_, b_: (np.cross(p-a, b-a) > 0) | (np.cross(p-a_, b_-a_) < 0)
mask = mask(p, a, b, a_, b_)
axes.scatter(x[mask], df['NDVI'][mask], facecolor='r', edgecolor='none', alpha=0.3, label='outlier')
axes.scatter(x[~mask], df['NDVI'][~mask], facecolor='g', edgecolor='none', alpha=0.3, label='data point')
axes.plot(x, fit[0] * x + fit[1], label='best fit', c='lightgrey')
axes.plot(_x, _y_095, label=quantiles[1], c='orange')
axes.plot(_x, _y_005, label=quantiles[0], c='lightblue')
axes.legend()
axes.set_xlabel('LST')
axes.set_ylabel('NDVI')
plt.show()
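Since the question asked for the outliers in a pandas dataframe, here is a short follow-up, assuming df and mask from the snippet above:
# rows outside the 5%-95% quantile band
outliers = df[mask]
print(outliers)
# outliers.to_csv('outliers.csv', index=False)  # save them if needed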

How to use Python to draw a normal probability plot using column data in a DataFrame

I have a DataFrame that contains two columns named "thousands of dollars per year" and "EMPLOY".
I create a new variable in this data frame named "cubic_Root" by transforming the data in df['thousands of dollars per year']:
df['cubic_Root'] = -1 / df['thousands of dollars per year'] ** (1. / 3)
The data in df['cubic_Root'] looks like this:
ID cubic_Root
1 -0.629961
2 -0.405480
3 -0.329317
4 -0.480750
5 -0.305711
6 -0.449644
7 -0.449644
8 -0.480750
Now, how can I draw a normal probability plot using the data in df['cubic_Root']?
You want the "Probability" plots.
So for a single plot, you'd have something like the following:
import scipy.stats
import numpy as np
import matplotlib.pyplot as plt
# 100 values from a normal distribution with a std of 3 and a mean of 0.5
data = 3.0 * np.random.randn(100) + 0.5
counts, start, dx, _ = scipy.stats.cumfreq(data, numbins=20)
x = np.arange(counts.size) * dx + start
plt.plot(x, counts, 'ro')
plt.xlabel('Value')
plt.ylabel('Cumulative Frequency')
plt.show()
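If by "normal probability plot" you mean the classical quantile-quantile plot against a normal distribution, scipy has that built in; a minimal sketch using scipy.stats.probplot on the same kind of sample:
import scipy.stats
import numpy as np
import matplotlib.pyplot as plt

data = 3.0 * np.random.randn(100) + 0.5
# orders the data against theoretical normal quantiles and adds a fit line
scipy.stats.probplot(data, dist='norm', plot=plt)
plt.show()
For the asker's case, pass df['cubic_Root'] instead of data.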
If you want to plot a distribution, and you know it, define it as a function and plot it like so:
import numpy as np
from matplotlib import pyplot as plt
def my_dist(x):
    return np.exp(-x ** 2)
x = np.arange(-100, 100)
p = my_dist(x)
plt.plot(x, p)
plt.show()
If you don't have the exact distribution as an analytical function, perhaps you can generate a large sample, take a histogram and somehow smooth the data:
import numpy as np
from scipy.interpolate import UnivariateSpline
from matplotlib import pyplot as plt
N = 1000
n = N // 10                      # number of bins (must be an integer)
s = np.random.normal(size=N)     # generate your data sample with N elements
p, x = np.histogram(s, bins=n)   # bin it into n = N/10 bins
x = x[:-1] + (x[1] - x[0])/2     # convert bin edges to centers
f = UnivariateSpline(x, p, s=n)  # smoothing spline through the bin counts
plt.plot(x, f(x))
plt.show()
You can increase or decrease s (the smoothing factor) in the UnivariateSpline call to get more or less smoothing.
Probability density function (PDF) of the inter-arrival time of events:
import numpy as np
import scipy.stats
# generate data samples
data = scipy.stats.expon.rvs(loc=0, scale=1, size=1000, random_state=123)
A kernel density estimation can then be obtained by simply calling
scipy.stats.gaussian_kde(data,bw_method=bw)
where bw is an (optional) parameter for the estimation procedure. For this data set, and considering three values for bw the fit is as shown below
# test values for the bw_method option ('None' is the default value)
bw_values = [None, 0.1, 0.01]
# generate a list of kde estimators for each bw
kde = [scipy.stats.gaussian_kde(data, bw_method=bw) for bw in bw_values]
# plot (normalized) histogram of the data
import matplotlib.pyplot as plt
plt.hist(data, 50, density=True, facecolor='green', alpha=0.5)
# plot density estimates
t_range = np.linspace(-2, 8, 200)
for i, bw in enumerate(bw_values):
    plt.plot(t_range, kde[i](t_range), lw=2, label='bw = ' + str(bw))
plt.xlim(-1, 6)
plt.legend(loc='best')
plt.show()
References:
Python: Matplotlib - probability plot for several data set
how to plot Probability density Function (PDF) of inter-arrival time of events?

Plotting profile histograms in python

I am trying to make a profile plot for two columns of a pandas.DataFrame. I would not expect this to be in pandas directly but it seems there is nothing in matplotlib either. I have searched around and cannot find it in any package other than rootpy. Before I take the time to write this myself I thought I would ask if there was a small package that contained profile histograms, perhaps where they are known by a different name.
If you don't know what I mean by "profile histogram" have a look at the ROOT implementation. http://root.cern.ch/root/html/TProfile.html
You can easily do it using scipy.stats.binned_statistic.
import scipy.stats
import numpy
import matplotlib.pyplot as plt
x = numpy.random.rand(10000)
y = x + scipy.stats.norm(0, 0.2).rvs(10000)
means_result = scipy.stats.binned_statistic(x, [y, y**2], bins=50, range=(0,1), statistic='mean')
means, means2 = means_result.statistic
standard_deviations = numpy.sqrt(means2 - means**2)
bin_edges = means_result.bin_edges
bin_centers = (bin_edges[:-1] + bin_edges[1:])/2.
plt.errorbar(x=bin_centers, y=means, yerr=standard_deviations, linestyle='none', marker='.')
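Note that yerr above is the spread (standard deviation) of y in each bin, which matches ROOT's default. If you want the error on the bin mean instead, divide by the square root of the bin counts; a small sketch reusing the arrays from above:
# counts per bin, to turn the spread into an error on the mean
counts, _, _ = scipy.stats.binned_statistic(x, y, statistic='count',
                                            bins=50, range=(0, 1))
sem = standard_deviations / numpy.sqrt(counts)
plt.errorbar(x=bin_centers, y=means, yerr=sem, linestyle='none', marker='.')
plt.show()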
Use seaborn. Data as from @MaxNoe:
import numpy as np
import seaborn as sns
# just some random numbers to get started
x = np.random.uniform(-2, 2, 10000)
y = np.random.normal(x**2, np.abs(x) + 1)
sns.regplot(x=x, y=y, x_bins=10, fit_reg=None)
You can do much more (error bands are from bootstrap, you can change the estimator on the y-axis, add regression, ...)
While @Keith's answer seems to fit what you mean, it is quite a lot of code. I think this can be done much more simply, so one gets the key concepts and can adjust and build on top of it.
Let me stress one thing: what ROOT is calling a ProfileHistogram is not a special kind of plot. It is an errorbar plot, which can simply be done in matplotlib.
It is a special kind of computation, and that's not the task of a plotting library. This lies in the pandas realm, and pandas is great at stuff like this. It's symptomatic of ROOT, the giant monolithic pile it is, to have an extra class for this.
So what you want to do is: discretise in some variable x and for each bin, calculate something in another variable y.
This can easily be done using np.digitize together with the pandas groupby and aggregate methods.
Putting it all together:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# just some random numbers to get started
x = np.random.uniform(-2, 2, 10000)
y = np.random.normal(x**2, np.abs(x) + 1)
df = pd.DataFrame({'x': x, 'y': y})
# calculate which bin each row belongs in, based on `x`
# np.digitize needs the bin edges, so this gives us 100 equally sized bins
bins = np.linspace(-2, 2, 101)
df['bin'] = np.digitize(x, bins=bins)
bin_centers = 0.5 * (bins[:-1] + bins[1:])
bin_width = bins[1] - bins[0]
# group by bin, so we can calculate stuff
binned = df.groupby('bin')
# calculate mean and standard error of the mean for y in each bin
result = binned['y'].agg(['mean', 'sem'])
result['x'] = bin_centers
result['xerr'] = bin_width / 2
# plot it
result.plot(
    x='x',
    y='mean',
    xerr='xerr',
    yerr='sem',
    linestyle='none',
    capsize=0,
    color='black',
)
plt.savefig('result.png', dpi=300)
Just like ROOT ;)
I made a module myself for this functionality.
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
import matplotlib.pyplot as plt

def Profile(x, y, nbins, xmin, xmax, ax):
    df = DataFrame({'x': x, 'y': y})
    binedges = xmin + ((xmax-xmin)/nbins) * np.arange(nbins+1)
    df['bin'] = np.digitize(df['x'], binedges)
    bincenters = xmin + ((xmax-xmin)/nbins)*np.arange(nbins) + ((xmax-xmin)/(2*nbins))
    ProfileFrame = DataFrame({'bincenters': bincenters,
                              'N': df['bin'].value_counts(sort=False)},
                             index=range(1, nbins+1))
    bins = ProfileFrame.index.values
    for bin in bins:
        ProfileFrame.loc[bin, 'ymean'] = df.loc[df['bin'] == bin, 'y'].mean()
        ProfileFrame.loc[bin, 'yStandDev'] = df.loc[df['bin'] == bin, 'y'].std()
        ProfileFrame.loc[bin, 'yMeanError'] = ProfileFrame.loc[bin, 'yStandDev'] / np.sqrt(ProfileFrame.loc[bin, 'N'])
    ax.errorbar(ProfileFrame['bincenters'], ProfileFrame['ymean'],
                yerr=ProfileFrame['yMeanError'], xerr=(xmax-xmin)/(2*nbins), fmt='none')
    return ax
def Profile_Matrix(frame):
    # Much of this is adapted from https://github.com/pydata/pandas/blob/master/pandas/tools/plotting.py
    # NB: relies on private pandas plotting internals that have moved or been
    # removed in newer pandas versions.
    import pandas.core.common as com
    import pandas.tools.plotting as plots
    from pandas.compat import lrange
    from matplotlib.artist import setp

    range_padding = 0.05
    df = frame._get_numeric_data()
    n = df.columns.size
    fig, axes = plots._subplots(nrows=n, ncols=n, squeeze=False)
    # no gaps between subplots
    fig.subplots_adjust(wspace=0, hspace=0)
    mask = com.notnull(df)
    boundaries_list = []
    for a in df.columns:
        values = df[a].values[mask[a].values]
        rmin_, rmax_ = np.min(values), np.max(values)
        rdelta_ext = (rmax_ - rmin_) * range_padding / 2.
        boundaries_list.append((rmin_ - rdelta_ext, rmax_ + rdelta_ext))
    for i, a in zip(lrange(n), df.columns):
        for j, b in zip(lrange(n), df.columns):
            common = (mask[a] & mask[b]).values
            nbins = 100
            (xmin, xmax) = boundaries_list[i]
            ax = axes[i, j]
            Profile(df[a][common], df[b][common], nbins, xmin, xmax, ax)
            ax.set_xlabel('')
            ax.set_ylabel('')
            plots._label_axis(ax, kind='x', label=b, position='bottom', rotate=True)
            plots._label_axis(ax, kind='y', label=a, position='left')
            if j != 0:
                ax.yaxis.set_visible(False)
            if i != n-1:
                ax.xaxis.set_visible(False)
    for ax in axes.flat:
        setp(ax.get_xticklabels(), fontsize=8)
        setp(ax.get_yticklabels(), fontsize=8)
    return axes
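A quick usage sketch for the Profile function above (Profile_Matrix leans on the private pandas plotting internals noted in the comment, so treat it as a historical snippet):
# hypothetical toy data, just to exercise Profile
x = np.random.uniform(-2, 2, 10000)
y = np.random.normal(x**2, np.abs(x) + 1)
fig, ax = plt.subplots()
Profile(x, y, nbins=40, xmin=-2, xmax=2, ax=ax)
plt.show()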
To my knowledge, matplotlib still doesn't allow one to directly produce profile histograms.
You can instead take a look at Hippodraw, a package developed at SLAC that can be used as a Python extension module.
Here is a profile histogram example:
http://www.slac.stanford.edu/grp/ek/hippodraw/datareps_root.html#datareps_profilehist
