Weighted empirical distribution function (ECDF) in python - python

I am trying to generate weighted empirical CDF in python. I know statsmodel.distributions.empirical_distribution provides an ECDF function, but it is unweighted. Is there a library that I can use or how can I go about extending this to write a function which calculates the weighted ECDF (EWCDF) like ewcdf {spatstat} in R.

Seaborn library has ecdfplot function which implements a weighted version of ECDF. I looked into the code of how seaborn calculates it.
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
sample = np.arange(100)
weights = np.random.randint(10, size=100)
estimator = sns.distributions.ECDF('proportion', complementary=True)
stat, vals = estimator(sample, weights=weights)
plt.plot(vals, stat)

Seaborn provides ecdfplot which allows you to plot a weighted CDF. See seaborn.ecdf. Based on deepAgrawal's answer, I adapted it a little bit so that what's plotted is CDF rather than 1-CDF.
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
sample = np.arange(15)
weights = np.random.randint(5, size=15)
df = pd.DataFrame(np.vstack((sample, weights)).T, columns = ['sample', 'weights'])
sns.ecdfplot(data = df, x = 'sample', weights = 'weights', stat = 'proportion', legend = True)

def ecdf(x):
Sorted = np.sort(x)
Length = len(x)
ecdf = np.zeros(Length)
for i in range(Length):
ecdf[i] = sum(Sorted <= x[i])/Length
return ecdf
x = np.array([1, 2, 5, 4, 3, 6, 7, 8, 9, 10])
ecdf(x)

Related

generating uniform distribution of integeres with python

I tried to generate an uniform distribution of random integeres on a given interval (it's unimportant whether it contains its upper limit or not) with python. I used the next snippet of code to do so and plot the result:
import numpy as np
import matplotlib.pyplot as plt
from random import randint
propsedPython = np.random.randint(0,32767,8388602)%2048
propsedPythonNoMod = np.random.randint(0,2048,8388602)
propsedPythonNoModIntegers = np.random.random_integers(0,2048,8388602)
propsedPythonNoModRandInt = np.empty(8388602)
for i in range(8388602):
propsedPythonNoModRandInt[i] = randint(0,2048)
plt.figure(figsize=[16,10])
plt.title(r'distribution $\rho_{prop}$ off all the python simulated proposed indices')
plt.xlabel(r'indices')
plt.ylabel(r'$\rho_{prop}$')
plt.yscale('log')
plt.hist(propsedPython,bins=1000,histtype='step',label=r'np.random.randint(0,32767,8388602)%2048')
plt.hist(propsedPythonNoMod,bins=1000,histtype='step',label=r'np.random.randint(0,2048,8388602')
plt.hist(propsedPythonNoModIntegers,bins=1000,histtype='step',label=r'np.random.random_integers(0,2048,8388602)')
plt.hist(propsedPythonNoModRandInt,bins=1000,histtype='step',label=r'for i in range(8388602):propsedPythonNoModRandInt[i] = randint(0,2048)')
plt.legend(loc=0)
The resulting plot is: Could somebody point me in the right direction why these spikes appear in al the different cases and or gives some advice which routine to use to got uniformly distributed random integers?
Thanks a lot!
Mmm...
I used new NumPy rng facility, and graph looks ok to me.
Code
import numpy as np
import matplotlib.pyplot as plt
rng = np.random.default_rng()
N = 1024*500
hist = np.zeros(2048, dtype=np.int32)
q = rng.integers(0, 2048, dtype=np.int32, size=N, endpoint=False)
for k in range(0, N):
hist[q[k]] += 1
x = np.arange(0, 2048, dtype=np.int32)
fig, ax = plt.subplots()
ax.stem(x, hist, markerfmt=' ')
plt.show()
and graph

Smoothing out a Line chart with Matplotlib

I have a pandas data frame that looks like this:
import pandas as pd
import matplotlib.pyplot
data = [{'A': 21, 'B': 23, 'C':19, 'D':26,'E':28,
'F':26,'G':23,'H':22,'I':24,'J':21}]
# Creates DataFrame.
df = pd.DataFrame(data)
plt.figure(figsize=(12,8))
df.iloc[-1].plot(marker='o',markersize=5)
plt.show()
When I try and plot this in Matplotlib, I end up with a very jagged line.
Is there a way I can smooth the line out to make it look more curved and fluid?
I have tried to use scipy's interpolate, but have not been successful.
Thanks
This should do the trick:
import pandas as pd
import matplotlib.pyplot as plt
from scipy import make_interp_spline
data = [{'A': 21, 'B': 23, 'C':19, 'D':26,'E':28,
'F':26,'G':23,'H':22,'I':24,'J':21}]
# Creates DataFrame.
df = pd.DataFrame(data)
y = np.array(df.iloc[-1].tolist())
x = np.arange(len(df.iloc[-1]))
xnew = np.linspace(x.min(), x.max(), 300)
spl = make_interp_spline(x, y, k=3)
ysmooth= spl(xnew)
plt.plot(xnew, ysmooth)
This is one option (not necessarily the ideal answer though):
You can try and use a polynomial approximation for the data, however you need numeric values for both you x and y axis, i've tried the below:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
#note i've changed the A,B,C... to 1,2,3...
data = [{1: 21, 2: 23, 3:19, 4:26,5:28,
6:26,7:23,8:22,9:24,10:21}]
#Creates DataFrame.
df = pd.DataFrame(data)
#define your lists
xlist = df.columns.tolist()
ylist = df.values.tolist()
ylist = ylist[0]
#plot data
plt.figure()
poly = np.polyfit(xlist,ylist,5)
poly_y = np.poly1d(poly)(xlist)
plt.plot(xlist,poly_y)
plt.plot(xlist,ylist)
plt.show()
Another option could be the Spline interpolation, the s parameters will allow you to adjust the smoothness of the curve, you can test several values for s:
from scipy.interpolate import splrep, splev
plt.figure()
bspl = splrep(xlist,ylist,s=25)
bspl_y = splev(xlist,bspl)
plt.plot(xlist,ylist)
plt.plot(xlist,bspl_y)
plt.show()

Plot RidgeCV coefficients as a function of the regularization

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import RidgeCV
tips = sns.load_dataset('tips')
X = tips.drop(columns=['tip','sex', 'smoker', 'day', 'time'])
y = tips['tip']
alphas = 10**np.linspace(10,-2,100)*0.5
ridge_clf = RidgeCV(alphas=alphas,scoring='r2').fit(X, y)
ridge_clf.score(X, y)
I wanted to plot the following graph for RidgeCV. I don't see any option to do that like GridSearhCV. I appreciate your suggestions!
There is no indication what the colors stand for. I assume they stand for features and we investigate the size of each feature weight as function of alpha. Here is my solution:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import RidgeCV
tips = sns.load_dataset('tips')
X = tips.drop(columns=['tip','sex', 'smoker', 'day', 'time'])
y = tips['tip']
alphas = 10**np.linspace(10,-2,100)*0.5
w = list()
for a in alphas:
ridge_clf = RidgeCV(alphas=[a],cv=10).fit(X, y)
w.append(ridge_clf.coef_)
w = np.array(w)
plt.semilogx(alphas,w)
plt.title('Ridge coefficients as function of the regularization')
plt.xlabel('alpha')
plt.ylabel('weights')
plt.legend(X.keys())
Output:
Since you only have two features in X there are only two lines.
Here is the code for generating the plot that you had posted.
Firstly, we need to understand that RidgeCV would not return the coef for each alpha value that we had fed in the alphas param.
The motivation behind having the RidgeCV is that it will try for different alpha values mentioned in alphas param, then based on cross validation scoring, it will return the best alpha along with the fitted model.
Hence, the only way to get the coef for each alpha value using cv is iterate through RidgeCV using each alpha value.
Example:
# Author: Fabian Pedregosa -- <fabian.pedregosa#inria.fr>
# License: BSD 3 clause
print(__doc__)
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
# X is the 10x10 Hilbert matrix
X = 1. / (np.arange(1, 11) + np.arange(0, 10)[:, np.newaxis])
y = np.ones(10)
# #############################################################################
# Compute paths
n_alphas = 200
alphas = np.logspace(-10, -2, n_alphas)
coefs = []
for a in alphas:
ridge = linear_model.RidgeCV(alphas=[a], fit_intercept=False, cv=3)
ridge.fit(X, y)
coefs.append(ridge.coef_)
# #############################################################################
# Display results
ax = plt.gca()
ax.plot(alphas, coefs)
ax.set_xscale('log')
ax.set_xlim(ax.get_xlim()[::-1]) # reverse axis
plt.xlabel('alpha')
plt.ylabel('weights')
plt.title('RidgeCV coefficients as a function of the regularization')
plt.axis('tight')
plt.show()

Python scatter-plot: Conditions for marker styles?

I have a data set I wish to plot as scatter plot with matplotlib, and a vector the same size that categorizes and labels the data points (discretely, e.g. from 0 to 3). I want to use different markers for different labels (e.g. 'x' for 0, 'o' for 1 and so on). How can I solve this elegantly? I am quite sure I am just missing out on something, but didn't really find it, and my naive approaches failed so far...
What about iterating over all markers like this:
import numpy as np
import matplotlib.pyplot as plt
x = np.random.rand(100)
y = np.random.rand(100)
category = np.random.random_integers(0, 3, 100)
markers = ['s', 'o', 'h', '+']
for k, m in enumerate(markers):
i = (category == k)
plt.scatter(x[i], y[i], marker=m)
plt.show()
Matplotlib does not accepts different markers per plot.
However, a less verbose and more robust solution for large dataset is using the pandas and seaborn library:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
x = [48.959, 49.758, 49.887, 50.593, 50.683 ]
y = [122.310, 121.29, 120.525, 120.252, 119.509]
z = [136.993, 133.128, 143.710, 129.088, 139.860]
kmean = np.array([0, 1, 0, 2, 2])
df = pd.DataFrame({'x':x,'y':y,'z':z, 'km_z':kmean})
sns.scatterplot(data = df, x='x', y='y', hue='km_z', style='km_z')
which produces the following output
Additionally you can use the pandas.cut function to plot bins (Its something I regularly need to produce graphs where I can use a third continuous value as a parameter). The way to use it is :
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
x = [48.959, 49.758, 49.887, 50.593, 50.683 ]
y = [122.310, 121.29, 120.525, 120.252, 119.509]
z = [136.993, 133.128, 143.710, 129.088, 139.860]
df = pd.DataFrame({'x':x,'y':y,'z':z})
df['bins'] = pd.cut(df.z, bins=3)
sns.scatterplot(data = df, x='x', y='y', hue='bins', style='bins')
and it produces the following example:
I've used the latter method to produce graphs like the following:

Plot smooth line with PyPlot

I've got the following simple script that plots a graph:
import matplotlib.pyplot as plt
import numpy as np
T = np.array([6, 7, 8, 9, 10, 11, 12])
power = np.array([1.53E+03, 5.92E+02, 2.04E+02, 7.24E+01, 2.72E+01, 1.10E+01, 4.70E+00])
plt.plot(T,power)
plt.show()
As it is now, the line goes straight from point to point which looks ok, but could be better in my opinion. What I want is to smooth the line between the points. In Gnuplot I would have plotted with smooth cplines.
Is there an easy way to do this in PyPlot? I've found some tutorials, but they all seem rather complex.
You could use scipy.interpolate.spline to smooth out your data yourself:
from scipy.interpolate import spline
# 300 represents number of points to make between T.min and T.max
xnew = np.linspace(T.min(), T.max(), 300)
power_smooth = spline(T, power, xnew)
plt.plot(xnew,power_smooth)
plt.show()
spline is deprecated in scipy 0.19.0, use BSpline class instead.
Switching from spline to BSpline isn't a straightforward copy/paste and requires a little tweaking:
from scipy.interpolate import make_interp_spline, BSpline
# 300 represents number of points to make between T.min and T.max
xnew = np.linspace(T.min(), T.max(), 300)
spl = make_interp_spline(T, power, k=3) # type: BSpline
power_smooth = spl(xnew)
plt.plot(xnew, power_smooth)
plt.show()
Before:
After:
For this example spline works well, but if the function is not smooth inherently and you want to have smoothed version you can also try:
from scipy.ndimage.filters import gaussian_filter1d
ysmoothed = gaussian_filter1d(y, sigma=2)
plt.plot(x, ysmoothed)
plt.show()
if you increase sigma you can get a more smoothed function.
Proceed with caution with this one. It modifies the original values and may not be what you want.
See the scipy.interpolate documentation for some examples.
The following example demonstrates its use, for linear and cubic spline interpolation:
import matplotlib.pyplot as plt
import numpy as np
from scipy.interpolate import interp1d
# Define x, y, and xnew to resample at.
x = np.linspace(0, 10, num=11, endpoint=True)
y = np.cos(-x**2/9.0)
xnew = np.linspace(0, 10, num=41, endpoint=True)
# Define interpolators.
f_linear = interp1d(x, y)
f_cubic = interp1d(x, y, kind='cubic')
# Plot.
plt.plot(x, y, 'o', label='data')
plt.plot(xnew, f_linear(xnew), '-', label='linear')
plt.plot(xnew, f_cubic(xnew), '--', label='cubic')
plt.legend(loc='best')
plt.show()
Slightly modified for increased readability.
One of the easiest implementations I found was to use that Exponential Moving Average the Tensorboard uses:
def smooth(scalars: List[float], weight: float) -> List[float]: # Weight between 0 and 1
last = scalars[0] # First value in the plot (first timestep)
smoothed = list()
for point in scalars:
smoothed_val = last * weight + (1 - weight) * point # Calculate smoothed value
smoothed.append(smoothed_val) # Save it
last = smoothed_val # Anchor the last smoothed value
return smoothed
ax.plot(x_labels, smooth(train_data, .9), x_labels, train_data)
I presume you mean curve-fitting and not anti-aliasing from the context of your question. PyPlot doesn't have any built-in support for this, but you can easily implement some basic curve-fitting yourself, like the code seen here, or if you're using GuiQwt it has a curve fitting module. (You could probably also steal the code from SciPy to do this as well).
Here is a simple solution for dates:
from scipy.interpolate import make_interp_spline
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as dates
from datetime import datetime
data = {
datetime(2016, 9, 26, 0, 0): 26060, datetime(2016, 9, 27, 0, 0): 23243,
datetime(2016, 9, 28, 0, 0): 22534, datetime(2016, 9, 29, 0, 0): 22841,
datetime(2016, 9, 30, 0, 0): 22441, datetime(2016, 10, 1, 0, 0): 23248
}
#create data
date_np = np.array(list(data.keys()))
value_np = np.array(list(data.values()))
date_num = dates.date2num(date_np)
# smooth
date_num_smooth = np.linspace(date_num.min(), date_num.max(), 100)
spl = make_interp_spline(date_num, value_np, k=3)
value_np_smooth = spl(date_num_smooth)
# print
plt.plot(date_np, value_np)
plt.plot(dates.num2date(date_num_smooth), value_np_smooth)
plt.show()
It's worth your time looking at seaborn for plotting smoothed lines.
The seaborn lmplot function will plot data and regression model fits.
The following illustrates both polynomial and lowess fits:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
T = np.array([6, 7, 8, 9, 10, 11, 12])
power = np.array([1.53E+03, 5.92E+02, 2.04E+02, 7.24E+01, 2.72E+01, 1.10E+01, 4.70E+00])
df = pd.DataFrame(data = {'T': T, 'power': power})
sns.lmplot(x='T', y='power', data=df, ci=None, order=4, truncate=False)
sns.lmplot(x='T', y='power', data=df, ci=None, lowess=True, truncate=False)
The order = 4 polynomial fit is overfitting this toy dataset. I don't show it here but order = 2 and order = 3 gave worse results.
The lowess = True fit is underfitting this tiny dataset but may give better results on larger datasets.
Check the seaborn regression tutorial for more examples.
Another way to go, which slightly modifies the function depending on the parameters you use:
from statsmodels.nonparametric.smoothers_lowess import lowess
def smoothing(x, y):
lowess_frac = 0.15 # size of data (%) for estimation =~ smoothing window
lowess_it = 0
x_smooth = x
y_smooth = lowess(y, x, is_sorted=False, frac=lowess_frac, it=lowess_it, return_sorted=False)
return x_smooth, y_smooth
That was better suited than other answers for my specific application case.

Categories

Resources