I think numpy or scipy will do it, but didn't find. Thanks!
import numpy as np
import scipy.stats as stats
np.random.seed(0)
gaussian = stats.norm
Generating some random, normal data:
data = gaussian.rvs(loc = 5, scale = 22, size = 1000)
Computing descriptive statistics:
print(data.mean())
# 4.00435243522
print(data.std())
# 21.7147294907
Fitting the data to a normal distribution:
mean, std = gaussian.fit(data)
print(mean, std)
# (4.0043524352157016, 21.714729490718568)
Related
I'm trying to remove the trend present in the waveform which looks like the following:
For doing so, I use scipy.signal.detrend() as follows:
autocorr = scipy.signal.detrend(autocorr)
But I don't see any significant flattening in trend. I get the following:
My objective is to have the trend completely eliminated from the waveform. And I need to also generalize it so that it can detrend any kind of waveform - be it linear, piece-wise linear, polynomial, etc.
Can you please suggest a way to do the same?
Note: In order to replicate the above waveform, you can simply run the following code that I used to generate it:
#Loading Libraries
import warnings
warnings.filterwarnings("ignore")
import json
import sys, os
import numpy as np
import pandas as pd
import glob
import pickle
from statsmodels.tsa.stattools import adfuller, acf, pacf
from scipy.signal import find_peaks, square
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import matplotlib.pyplot as plt
#Generating a function with Dual Seasonality:
def white_noise(mu, sigma, num_pts):
""" Function to generate Gaussian Normal Noise
Args:
sigma: std value
num_pts: no of points
mu: mean value
Returns:
generated Gaussian Normal Noise
"""
noise = np.random.normal(mu, sigma, num_pts)
return noise
def signal_line_plot(input_signal: pd.Series, title: str = "", y_label: str = "Signal"):
""" Function to plot a time series signal
Args:
input_signal: time series signal that you want to plot
title: title on plot
y_label: label of the signal being plotted
Returns:
signal plot
"""
plt.plot(input_signal)
plt.title(title)
plt.ylabel(y_label)
plt.show()
# Square with two periodicities of daily and weekly. With #15min sampling frequency it means 4*24=96 samples and 4*24*7=672
t_week = np.linspace(1,480, 480)
t_weekend=np.linspace(1,192,192)
T=96 #Time Period
x_weekday = 10*square(2*np.pi*t_week/T, duty=0.7)+10 + white_noise(0, 1,480)
x_weekend = 2*square(2*np.pi*t_weekend/T, duty=0.7)+2 + white_noise(0,1,192)
x_daily_weekly = np.concatenate((x_weekday, x_weekend))
x_daily_weekly_long = np.concatenate((x_daily_weekly,x_daily_weekly,x_daily_weekly,x_daily_weekly,x_daily_weekly,x_daily_weekly,x_daily_weekly,x_daily_weekly,x_daily_weekly,x_daily_weekly))
signal_line_plot(x_daily_weekly_long)
signal_line_plot(x_daily_weekly_long[0:1000])
#Finding Autocorrelation & Lags for the signal [WHICH THE FINAL PARAMETERS WHICH ARE TO BE PLOTTED]:
#Determining Autocorrelation & Lag values
import scipy.signal as signal
autocorr = signal.correlate(x_daily_weekly_long, x_daily_weekly_long, mode="same")
#Normalize the autocorr values (such that the hightest peak value is at 1)
autocorr = (autocorr-min(autocorr))/(max(autocorr)-min(autocorr))
lags = signal.correlation_lags(len(x_daily_weekly_long), len(x_daily_weekly_long), mode = "same")
#Visualization
f = plt.figure()
f.set_figwidth(40)
f.set_figheight(10)
plt.plot(lags, autocorr)
#DETRENDING:
autocorr = scipy.signal.detrend(autocorr)
#Visualization
f = plt.figure()
f.set_figwidth(40)
f.set_figheight(10)
plt.plot(lags, autocorr)
Since it's an auto-correlation, it will always be even; so detrending with a breakpoint at lag=0 should get you part of the way there.
An alternative way to detrend is to use a high-pass filter; you could do this in two ways. What will be tricky is deciding what the cut-off frequency should be.
Here's a possible way to do this:
#Loading Libraries
import numpy as np
from scipy import signal
import matplotlib.pyplot as plt
#Generating a function with Dual Seasonality:
def white_noise(mu, sigma, num_pts):
""" Function to generate Gaussian Normal Noise
Args:
sigma: std value
num_pts: no of points
mu: mean value
Returns:
generated Gaussian Normal Noise
"""
noise = np.random.normal(mu, sigma, num_pts)
return noise
# High-pass filter via discrete Fourier transform
# Drop all components from 0th to dropcomponent-th
def dft_highpass(x, dropcomponent):
fx = np.fft.rfft(x)
fx[:dropcomponent] = 0
return np.fft.irfft(fx)
# Square with two periodicities of daily and weekly. With #15min sampling frequency it means 4*24=96 samples and 4*24*7=672
t_week = np.linspace(1,480, 480)
t_weekend=np.linspace(1,192,192)
T=96 #Time Period
x_weekday = 10*signal.square(2*np.pi*t_week/T, duty=0.7)+10 + white_noise(0, 1,480)
x_weekend = 2*signal.square(2*np.pi*t_weekend/T, duty=0.7)+2 + white_noise(0,1,192)
x_daily_weekly = np.concatenate((x_weekday, x_weekend))
x_daily_weekly_long = np.concatenate((x_daily_weekly,x_daily_weekly,x_daily_weekly,x_daily_weekly,x_daily_weekly,x_daily_weekly,x_daily_weekly,x_daily_weekly,x_daily_weekly,x_daily_weekly))
#Finding Autocorrelation & Lags for the signal [WHICH THE FINAL PARAMETERS WHICH ARE TO BE PLOTTED]:
#Determining Autocorrelation & Lag values
autocorr = signal.correlate(x_daily_weekly_long, x_daily_weekly_long, mode="same")
#Normalize the autocorr values (such that the hightest peak value is at 1)
autocorr = (autocorr-min(autocorr))/(max(autocorr)-min(autocorr))
lags = signal.correlation_lags(len(x_daily_weekly_long), len(x_daily_weekly_long), mode = "same")
# detrend w/ breakpoints
dautocorr = signal.detrend(autocorr, bp=len(lags)//2)
# detrend w/ high-pass filter
# use `filtfilt` to get zero-phase
b, a = signal.butter(1, 1e-3, 'high')
fautocorr = signal.filtfilt(b, a, autocorr)
# detrend with DFT HPF
rautocorr = dft_highpass(autocorr, len(autocorr) // 1000)
#Visualization
fig, ax = plt.subplots(3)
for i in range(3):
ax[i].plot(lags, autocorr, label='orig')
ax[0].plot(lags, dautocorr, label='detrend w/ bp')
ax[1].plot(lags, fautocorr, label='HPF')
ax[2].plot(lags, rautocorr, label='DFT')
for i in range(3):
ax[i].legend()
ax[i].set_ylabel('autocorr')
ax[-1].set_xlabel('lags')
giving
Is it possible to add binned_statistic to dask.stats? Or other solutions like embedding it in apply.ufunc()?
Here's an example using scipy.stats:
from scipy.stats import binned_statistic
import numpy as np
x = np.arange(500)
values = x*50
statistics, _, _ = binned_statistic(x, values, statistic='min', bins=500, range=(0, 500))
I'm having trouble obtaining the dispersion parameter of simulated data using statsmodels' GLM function.
import statsmodels.api as sm
import matplotlib.pyplot as plt
import scipy.stats as stats
import numpy as np
np.random.seed(1)
# Generate data
x=np.random.uniform(0, 100,50000)
x2 = sm.add_constant(x)
a = 0.5
b = 0.2
y_true = 1/(a+(b*x))
# Add error
scale = 2 # the scale parameter I'm trying to obtain
shape = y_true/scale # given that, for Gamma, mu = scale*shape
y = np.random.gamma(shape=shape, scale=scale)
# Run model
model = sm.GLM(y, x2, family=sm.families.Gamma()).fit()
model.summary()
Here's the summary from above:
Note that the coefficient estimates are correct (0.5 and 0.2), but the scale (21.995) is way off the scale I set (2).
Can someone point out what it is I'm misunderstanding/doing wrong? Thanks!
As Josef noted in the comments, statsmodels uses a different kind of parameterization.
I have a set whose samples are discrete values (in particular, the size of a queue over time). Now I'd like to find what distribution they belong to. To achieve this goal I'd act the same way I did for the other quantities, i.e. plotting a qqplot, launching
import statsmodels.api as sm
sm.qqplot(df, dist = 'geom', sparams = (.5,), line ='s', alpha = 0.3, marker ='.')
This works if dist is not a discrete random variables (e.g. 'exp' or 'norm') and indeed I used to get some results, but when the distribution is discrete (say, 'geom'), I get
AttributeError: 'geom_gen' object has no attribute 'fit'
I searched on the Internet how to make a qqplot (or something similar) to spot what distribution my samples belong to but I found nothing
def discreteQQ(x_sample):
p_test = np.array([])
for i in range(0, 1001):
p_test = np.append(p_test, i/1000)
i = i + 1
x_sample = np.sort(x_sample)
x_theor = stats.geom.rvs(.5, size=len(x_sample))
ecdf_sample = np.arange(1, len(x_sample) + 1)/(len(x_sample)+1)
x_theor = stats.geom.ppf(ecdf_sample, p=0.5)
for p in p_test:
plt.scatter(np.quantile(x_theor, p), np.quantile(x_sample, p), c = 'blue')
plt.xlabel('Theoretical quantiles')
plt.ylabel('Sample quantiles')
plt.show()
Generate a theoretical geometric distribution using scipy.stats.geom, convert the sample and theoretical data using statsmodels' ProbPlot and pass these to statsmodels' qqplot_2samples.
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from statsmodels.graphics.gofplots import ProbPlot
from statsmodels.graphics.gofplots import qqplot_2samples
p_theor = 1/4 # The probability we check for
p_sample = 1/5 # The true probability of the sample distribution
# The experimental data
x_sample = stats.geom.rvs(p_sample, size=50)
# The model data
x_theor = stats.geom.rvs(p_theor, size=100)
qqplot_2samples(ProbPlot(x_sample), ProbPlot(x_theor), line='45')
plt.show()
So, let's say I have the following 2-dimensional target distribution that I would like to sample from (a mixture of bivariate normal distributions) -
import numba
import numpy as np
import scipy.stats as stats
import seaborn as sns
import pandas as pd
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
%matplotlib inline
def targ_dist(x):
target = (stats.multivariate_normal.pdf(x,[0,0],[[1,0],[0,1]])+stats.multivariate_normal.pdf(x,[-6,-6],[[1,0.9],[0.9,1]])+stats.multivariate_normal.pdf(x,[4,4],[[1,-0.9],[-0.9,1]]))/3
return target
and the following proposal distribution (a bivariate random walk) -
def T(x,y,sigma):
return stats.multivariate_normal.pdf(y,x,[[sigma**2,0],[0,sigma**2]])
The following is the Metropolis Hastings code for updating the "entire" state in every iteration -
#Initialising
n_iter = 30000
# tuning parameter i.e. variance of proposal distribution
sigma = 2
# initial state
X = stats.uniform.rvs(loc=-5, scale=10, size=2, random_state=None)
# count number of acceptances
accept = 0
# store the samples
MHsamples = np.zeros((n_iter,2))
# MH sampler
for t in range(n_iter):
# proposals
Y = X+stats.norm.rvs(0,sigma,2)
# accept or reject
u = stats.uniform.rvs(loc=0, scale=1, size=1)
# acceptance probability
r = (targ_dist(Y)*T(Y,X,sigma))/(targ_dist(X)*T(X,Y,sigma))
if u < r:
X = Y
accept += 1
MHsamples[t] = X
However, I would like to update "per component" (i.e. component-wise updating) in every iteration. Is there a simple way of doing this?
Thank you for your help!
From the tone of your question I assume you are looking performance improvements.
MonteCarlo algorithms are quite compute intensive. You will get better results, if you perform in algorithms on a lower level than in an interpreted language like python, e.g. writing a c-extension.
There are also implementations available for python (PyStan, PyMC3).