Struggling to graph a Beta Distribution using Python

Given some measurements, I am trying to create a beta distribution. Given a max, min, mean, and also an alpha and beta, how do I call beta.ppf or beta.pdf to generate a proper data set?
Working Sample
https://www.kaggle.com/iancoetzer/betaworking
Broken Sample
https://www.kaggle.com/iancoetzer/betaproblem
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import beta
#
# Set the shape parameters
#
a = 2.8754
b = 3.0300
minv = 82.292
maxv = 129.871
mean = 105.46
#
# Generate the values between min and max
#
x = np.linspace(beta.ppf(minv, a, b), beta.ppf(maxv, a, b), 100)
#
# Plot the beta distribution
#
plt.figure(figsize=(7,7))
plt.xlim(0.7, 1)
plt.plot(x, beta.pdf(x, a, b), 'r-')
plt.title('Beta Distribution', fontsize='15')
plt.xlabel('Values of Random Variable X (0, 1)', fontsize='15')
plt.ylabel('Probability', fontsize='15')
plt.show()

We managed to code a simple solution to compute and plot the Beta Distribution as follows: see the red beta curve.
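For reference, a minimal sketch of what that working version might look like, assuming the standard beta is shifted and stretched with loc/scale so it spans [minv, maxv] and that beta.ppf is evaluated on probabilities rather than on the raw measurements (the linked Kaggle notebook remains the actual working sample):
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import beta

a, b = 2.8754, 3.0300
minv, maxv = 82.292, 129.871

# Evaluate the quantile function on a probability grid, then map onto [minv, maxv]
p = np.linspace(0.001, 0.999, 100)
x = beta.ppf(p, a, b, loc=minv, scale=maxv - minv)

plt.figure(figsize=(7, 7))
plt.plot(x, beta.pdf(x, a, b, loc=minv, scale=maxv - minv), 'r-')
plt.title('Beta Distribution')
plt.xlabel('Values of Random Variable X')
plt.ylabel('Density')
plt.show()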
Now we are trying to plot a Weibull distribution ...
#import libraries
import pandas as pd, numpy as np, gc, time, os, uuid, math, datetime
from joblib import Parallel, delayed
from numpy.random import default_rng
from scipy.stats import beta
from scipy import special
from scipy.stats import exponweib
import matplotlib.pyplot as plt
# sample parameters
low, high, mean, a, b, trials = 82.292, 129.871, 105.46, 2.8754, 3.0300, 10000
scale = (high - low) / 6
# normal
normal_arr = np.random.normal(loc=mean, scale=scale, size=trials)
# triangular
triangular_arr = np.random.triangular(left=low, mode=mean, right=high, size=trials)
# lognormal: convert the desired mean/std into the underlying normal's mu/sigma
mu = math.log(math.pow(mean, 2) / math.sqrt(math.pow(scale, 2) + math.pow(mean, 2)))
sigma = math.sqrt(math.log(math.pow(scale, 2) / (math.pow(mean, 2)) + 1))
lognorm_arr = np.random.lognormal(mean=mu, sigma=sigma, size=trials)
# beta: evaluate the quantile function on an evenly spaced probability grid,
# shifted and stretched onto [low, high]
beta_x = np.linspace(beta.ppf(0.0, a, b), beta.ppf(1, a, b), trials)
# by = beta.pdf(bx, a, b)
beta_arr = beta.ppf(beta_x, a, b, loc=low, scale=high - low)
# define binning(arr) method:
def binning(arr):
    df = pd.DataFrame(arr)
    df["Trial"] = range(1, len(df) + 1)
    df[0] = df[0].astype(float)
    df.rename(columns={0: "Result"}, inplace=True)
    minval = df["Result"].min()
    maxval = df["Result"].max()
    binCount = 100
    bins = np.linspace(minval, maxval, binCount + 1)
    labels = np.arange(1, binCount + 1)
    df["bins"] = pd.cut(df["Result"], bins=bins, labels=labels, include_lowest=True)
    dfBin = df.groupby(["bins"])["Result"].mean()
    dfCount = df.groupby(["bins"])["Result"].count()
    dfBin.replace(np.nan, 0.0, inplace=True)
    dfCount.replace(np.nan, 0, inplace=True)
    dfCount = pd.DataFrame(dfCount)
    dfBin = pd.DataFrame(dfBin)
    dfBin["bin"] = range(1, len(dfBin) + 1)
    dfBin["Result"] = dfBin["Result"].astype(float)
    df = pd.merge(dfBin, dfCount, left_index=True, right_index=True)
    # Rename the resulting columns
    df.rename(columns={'Result_x': 'Mean'}, inplace=True)
    df.rename(columns={'Result_y': 'Trials'}, inplace=True)
    return df
dfNormal = binning(normal_arr)
dfLog = binning(lognorm_arr)
dfTriangular = binning(triangular_arr)
dfBeta = binning(beta_arr)
dfWeibull = binning(wei_arr)
dfNormal.drop(dfNormal[dfNormal["Mean"] == 0].index, inplace=True)
dfLog.drop(dfLog[dfLog["Mean"] == 0].index, inplace=True)
dfTriangular.drop(dfTriangular[dfTriangular["Mean"] == 0].index, inplace=True)
dfBeta.drop(dfBeta[dfBeta["Mean"] == 0].index, inplace=True)
dfWeibull.drop(dfWeibull[dfWeibull["Mean"] == 0].index, inplace=True)
plt.plot(dfNormal["Mean"], dfNormal["Trials"], label="Normal")
plt.plot(dfLog["Mean"], dfLog["Trials"], label="Lognormal")
plt.plot(dfTriangular["Mean"], dfTriangular["Trials"], label="Triangular")
plt.plot(dfBeta["Mean"], dfBeta["Trials"], label="Beta")
plt.plot(dfWeibull["Mean"], dfWeibull["Trials"], label="Weibull")
plt.legend(loc='upper right')
plt.xlabel("R amount")
plt.ylabel("# Trials")
#plt.xlim(low, high)
plt.show()
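The missing piece above is wei_arr, which is used in the binning calls but never defined. One hedged option, placed before those calls, is to draw samples from a two-parameter Weibull with numpy and shift them onto the same range as the other arrays; the shape value of 2.0 below is purely an illustrative assumption, not something derived from the data:
# Sketch only: Weibull samples comparable to the other arrays.
shape_k = 2.0  # assumed shape parameter, for illustration only
wei_scale = (mean - low) / special.gamma(1 + 1 / shape_k)  # match the sample mean roughly
wei_arr = low + wei_scale * np.random.weibull(shape_k, size=trials)
# equivalently, with the already-imported scipy distribution:
# wei_arr = exponweib.rvs(1, shape_k, loc=low, scale=wei_scale, size=trials)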

Related

Using Polyfit with Lists

I am trying to construct a function with np.polyfit() to extrapolate data according to my need. I have some temperature and pressure observations which I have plotted. I need to fit a best-fit line to the observations so that I can extrapolate them, i.e. get the temperature at each pressure level for a different surface temperature (the temperature of the last pressure level), assuming the shape of the fit stays constant. This is what I have done so far:
import pandas as pd
import glob
import numpy as np
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
from datetime import datetime
dfs = []
pressure = []
temp = []
#reading the data
for fname in glob.glob('/home/swadhin/project/radiosonde_data/in/pb/*.txt'):
    df = pd.read_csv(fname, skiprows=1, delimiter='\s+',
                     names=['LVLpTYP', 'ETIME', 'PRESSURE', 'GPH', 'TEMP', 'RH', 'DPDP', 'WDIR', 'WSPD'])
    p1 = df['PRESSURE'].to_numpy(dtype=np.float64)
    t1 = df['TEMP'].to_numpy(dtype=np.float64)
    pressure.append(p1)
    temp.append(t1)
    dfs.append(df)
p = []
for i in pressure:
    a = np.ma.masked_equal(i, -9999.)  # masking the fill_values
    p.append(a)
p = [i/100 for i in p]  # converting the pressure to hPa
t = []
for j in temp:
    b = np.ma.masked_equal(j, -9999.)
    c = np.ma.masked_equal(b, -8888.)
    t.append(c)
t = [i/10 for i in t]  # converting the temp to the appropriate unit
zipped = zip(p, t)
z_c = [np.polyfit(x, y, 2) for x, y in zipped]
p_array = np.linspace(1000, 0, num=101)
for i in range(len(p)):
    x = p[i]
    y = t[i]
    z = z_c[i]
    xp = p_array[i]
    p = np.poly1d(z)
    plt.subplot(1, 2, i+1)
    plt.plot(y, x, '.', p(xp), xp, '-')
    plt.gca().invert_yaxis()
But I am not getting any plots.
Earlier to plot the pressure and temperature from the observations I did this and got the following plot:
for i in range(len(p)):
    plt.plot(t[i], p[i])
plt.gca().invert_yaxis()
plt.ylim(bottom=1010)
plt.ylabel('Pressure(hPa)')
plt.xlabel('Temperature($^\circ$C)')
The pressure and temperature arrays have an inhomogeneous structure.
I am attaching the datafiles for reference:
Pressure data = https://drive.google.com/file/d/13e7u8iBZWvmHAj0yt9MXEtB1eniI9xOR/view?usp=sharing
Temp data = https://drive.google.com/file/d/13dysYQlutg0_a9aJnm2U3_lCecxSObFN/view?usp=sharing
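As a hedged sketch of the plotting step (not the asker's code): the main pitfalls above are that p, the list of pressure profiles, is overwritten by the poly1d object, and that xp picks a single scalar from p_array instead of the whole grid. Keeping the same variable names, and assuming each profile gets its own subplot (note np.polyfit does not honor masks, so masked entries may still need compressing before fitting), it might look like this:
p_array = np.linspace(1000, 0, num=101)
fig, axes = plt.subplots(1, len(p), figsize=(5 * len(p), 5))
for i, ax in enumerate(np.atleast_1d(axes)):
    poly = np.poly1d(z_c[i])              # fitted temperature-vs-pressure polynomial
    ax.plot(t[i], p[i], '.')              # observations
    ax.plot(poly(p_array), p_array, '-')  # evaluate the fit over the full pressure grid
    ax.invert_yaxis()
    ax.set_xlabel('Temperature ($^\circ$C)')
    ax.set_ylabel('Pressure (hPa)')
plt.show()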

efficient frontier/stock analyze

Consider the following task. Using a 10-year period, I should calculate the portfolio weights in January and then use these weights in February to calculate the portfolio return and standard deviation. The program should then continue by calculating the weights in February and using those weights to calculate the portfolio return and standard deviation in March. This should be done through all 131 months in the data, meaning I should only start calculating weights from the first month of the dataset.
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import yfinance as yf
p_ret = [] # Define an empty array for portfolio returns
p_vol = [] # Define an empty array for portfolio volatility
tickers = ['AAPL', 'AMZN', 'XOM']
start_date = datetime.date(2010, 1, 2)
end_date = datetime.date(2020, 12, 31)
daily_data = yf.download(tickers, start=start_date, end=end_date)  # download the dataset
daily_data = daily_data['Adj Close'].dropna()
Vector_of_ones = np.array([1,1,1])
frames = [v for _, v in daily_data.groupby(pd.Grouper(freq='M'))]
rf = 0.01 # risk free asset
weights = []
df = pd.DataFrame(columns=tickers)
for w in frames:
    # corr_matrix = w.pct_change().apply(lambda x: np.log(1 + x)).corr()
    mu = (w.resample('D').last().pct_change().sum())
    individual_asset_return = np.subtract(np.transpose(mu), np.dot(Vector_of_ones, rf))
    # individual_asset_return = daily_data.pct_change().mean()  # find the mean
    df.loc[+1] = [individual_asset_return[tickers[0]], individual_asset_return[tickers[1]],
                  individual_asset_return[tickers[2]]]
    df.index = df.index - 1
    df = df.sort_index()
    for d in range(len(df)):
        cov_matrix = w.pct_change().apply(lambda x: np.log(1 + x)).cov()
        liste = df.iloc[d].tolist()
        a = np.dot(np.linalg.inv(cov_matrix), np.transpose(np.array(liste)))
        omega_weights = a / (np.dot(np.transpose(Vector_of_ones), a))  # expression to find weights
        weights.append(omega_weights)

for afkast in frames[1:]:  # loop to find the portfolio returns and standard deviation
    cov_matrix1 = afkast.pct_change().apply(lambda x: np.log(1 + x)).cov()
    # corr_matrix1 = afkast.pct_change().apply(lambda x: np.log(1 + x)).corr()
    df1 = df.iloc[1:, :]
    for d1 in range(len(df)):
        liste1 = df.iloc[d1].tolist()
        portfolio_return = np.dot(np.transpose(omega_weights), mu)
        p_ret.append(portfolio_return)
        volatility_portfolio = np.sqrt(np.dot(np.transpose(omega_weights), np.dot(cov_matrix1, omega_weights)))
        p_vol.append(volatility_portfolio)

data = {'Returns': p_ret, 'Volatility': p_vol}
for counter, symbol in enumerate(afkast.columns.tolist()):
    # print(counter, symbol)
    data[symbol + ' weight'] = [w[counter] for w in weights]
portfolios = pd.DataFrame(data)  # create a dataframe sorted so the lowest volatility is on top
portfolios['Date'] = pd.date_range(start=start_date, periods=len(portfolios), freq='M')
portfolios.plot(x='Date', y='Returns', kind='line')
# portfolios.plot(x='Date', y='Volatility', kind='line')
plt.show()
print(portfolios.head())
As you can probably see, I'm not an advanced coder, but I would appreciate some pointers on where my code goes wrong, if anything is wrong.
I really appreciate any help you can provide.
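As a hedged sketch of the rolling procedure described above (compute weights from month t, apply them to month t+1), using the same tangency-style weight formula; this is an illustrative restructuring under the question's own variables (frames, rf, Vector_of_ones), not a drop-in fix for the code above:
# Sketch: weights from month t are applied to the returns of month t+1.
p_ret, p_vol, weights = [], [], []
for t in range(len(frames) - 1):
    this_month, next_month = frames[t], frames[t + 1]
    mu = this_month.pct_change().mean() - rf          # excess mean daily return in month t
    cov = this_month.pct_change().cov()
    a = np.linalg.inv(cov) @ mu.values
    w_t = a / (Vector_of_ones @ a)                    # weights normalized to sum to one
    weights.append(w_t)
    mu_next = next_month.pct_change().mean()          # realized returns in month t+1
    cov_next = next_month.pct_change().cov()
    p_ret.append(w_t @ mu_next.values)
    p_vol.append(np.sqrt(w_t @ cov_next.values @ w_t))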

Cluster groups by direction and magnitude - Python

I'm hoping to cluster vectors based on direction and magnitude using Python. I've found limited examples using R but none for Python. Not to be confused with standard k-means on scatter points; I'm actually trying to cluster whole vectors.
The following takes two sets of xy points to generate a vector. I'm then hoping to cluster these vectors based on the length and direction.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
df = pd.DataFrame(np.random.randint(0,20,size=(100, 4)), columns=list('ABCD'))
plt.rcParams['image.cmap'] = 'Paired'
fig,ax = plt.subplots()
ax.set_xlim(-5, 25)
ax.set_ylim(-5, 25)
A = df['A']
B = df['B']
C = df['C']
D = df['D']
ax.quiver(A, B, (C-A), (D-B), angles = 'xy', scale_units = 'xy', scale = 1, alpha = 0.5)
X_1 = np.array(df[['A','B','C','D']])
model = KMeans(n_clusters = 20)
model.fit(X_1)
cluster_labels = model.predict(X_1)
df['n_cluster'] = cluster_labels
centroids_1 = pd.DataFrame(data = model.cluster_centers_, columns = ['start_x', 'start_y', 'end_x', 'end_y'])
cc = model.cluster_centers_
a = cc[:, 0]
b = cc[:, 1]
c = cc[:, 2]
d = cc[:, 3]
lc1 = ax.quiver(a, b, (c-a), (d-b), angles = 'xy', scale_units = 'xy', scale = 1, alpha = 0.8)
The following figure displays an example
What about this:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import hdbscan
df = pd.DataFrame(np.random.randint(0,20,size=(100, 4)), columns=list('ABCD'))
plt.rcParams['image.cmap'] = 'Paired'
A = df['A'] #X start
B = df['B'] #Y start
C = df['C'] #X arrive
D = df['D'] #Y arrive
clusterer = hdbscan.HDBSCAN()
df['LENGTH'] = np.sqrt(np.square(df.C-df.A) + np.square(df.D-df.B))
df['DIRECTION'] = np.degrees(np.arctan2(df.D-df.B, df.C-df.A))
coords = df[['LENGTH', 'DIRECTION']].values
clusterer.fit_predict(coords)
cluster_labels = clusterer.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.DataFrame(
[(coords[cluster_labels==n], len(coords[cluster_labels==n])) for n in range(num_clusters)],
columns=["points", "weight"]
)
colors = {0:"green", 1:"blue", 2:"red", 3:"yellow", 4:"pink"}
df['CLUSTER'] = np.nan
for x, (cluster, weight) in enumerate(clusters[clusters.weight>0].values.tolist()):
    df_this_cluster = pd.DataFrame(cluster, columns=['LENGTH', 'DIRECTION'])
    df_this_cluster['TEMP'] = x
    df = df.merge(df_this_cluster, on=['LENGTH', 'DIRECTION'], how='left')
    ix = df[df.TEMP.notnull()].index
    df.loc[ix, "CLUSTER"] = df.loc[ix, "TEMP"]
    df.drop("TEMP", axis=1, inplace=True)
df['COLOR'] = df['CLUSTER'].map(colors).fillna('black')
fig,ax = plt.subplots()
ax.set_xlim(-5, 25)
ax.set_ylim(-5, 25)
ax.quiver(df.A, df.B, (df.C-df.A), (df.D-df.B), angles='xy', scale_units='xy', scale=1, alpha=0.5, color=df.COLOR)
This clusters on length and direction (direction being converted to degrees; radians' small range didn't match the model very well on my first try).
I don't think this is a very "cartesian" solution, as the two values being analysed by the model are not on the same scale... but the visual results are not so bad.
I also tried another fit based on the 4 coordinates, which is more rigorous. But it (quite expectably) clusters the vectors by subareas of the space (when there are any):
coords = df[['A', 'B', 'C', 'D']].values
clusterer.fit_predict(coords)
cluster_labels = clusterer.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.DataFrame(
[(coords[cluster_labels==n], len(coords[cluster_labels==n])) for n in range(num_clusters)],
columns=["points", "weight"]
)
colors = {0:"green", 1:"blue", 2:"red", 3:"yellow", 4:"pink"}
df['CLUSTER'] = np.nan
for x, (cluster, weight) in enumerate(clusters[clusters.weight>0].values.tolist()):
    df_this_cluster = pd.DataFrame(cluster, columns=['A', 'B', 'C', 'D'])
    df_this_cluster['TEMP'] = x
    df = df.merge(df_this_cluster, on=['A', 'B', 'C', 'D'], how='left')
    ix = df[df.TEMP.notnull()].index
    df.loc[ix, "CLUSTER"] = df.loc[ix, "TEMP"]
    df.drop("TEMP", axis=1, inplace=True)
df['COLOR'] = df['CLUSTER'].map(colors).fillna('black')
EDIT
I gave it another try, based on the (very good) suggestion that angles are not a good variable given the discontinuity around 0/2π; so I chose to use both sines and cosines instead. I also scaled the length (to have matching scales for the three variables):
So the result would be :
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import robust_scale
import hdbscan
df = pd.DataFrame(np.random.randint(0,20,size=(100, 4)), columns=list('ABCD'))
plt.rcParams['image.cmap'] = 'Paired'
A = df['A'] #X start
B = df['B'] #Y start
C = df['C'] #X arrive
D = df['D'] #Y arrive
clusterer = hdbscan.HDBSCAN()
df['LENGTH'] = robust_scale(np.sqrt(np.square(df.C-df.A) + np.square(df.D-df.B)))
df['DIRECTION'] = np.arctan2(df.D-df.B, df.C-df.A)
df['COS'] = np.cos(df['DIRECTION'])
df['SIN'] = np.sin(df['DIRECTION'])
columns = ['LENGTH', 'COS', 'SIN']
clusterer = hdbscan.HDBSCAN()
values = df[columns].values
clusterer.fit_predict(values)
cluster_labels = clusterer.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.DataFrame(
[(values[cluster_labels==n], len(values[cluster_labels==n])) for n in range(num_clusters)],
columns=["points", "weight"]
)
def get_cmap(n, name='hsv'):
    '''
    Returns a function that maps each index in 0, 1, ..., n-1 to a distinct
    RGB color; the keyword argument name must be a standard mpl colormap name.
    Credits to @Ali
    https://stackoverflow.com/questions/14720331/how-to-generate-random-colors-in-matplotlib#answer-25628397
    '''
    return plt.cm.get_cmap(name, n)
cmap = get_cmap(num_clusters+1)
colors = {x:cmap(x) for x in range(num_clusters)}
df['CLUSTER'] = np.nan
for x, (cluster, weight) in enumerate(clusters[clusters.weight>0].values.tolist()):
    df_this_cluster = pd.DataFrame(cluster, columns=columns)
    df_this_cluster['TEMP'] = x
    df = df.merge(df_this_cluster, on=columns, how='left')
    df.reset_index(drop=True, inplace=True)
    ix = df[df.TEMP.notnull()].index
    df.loc[ix, "CLUSTER"] = df.loc[ix, "TEMP"]
    df.drop("TEMP", axis=1, inplace=True)
df['CLUSTER'] = df['CLUSTER'].fillna(num_clusters-1)
df['COLOR'] = df['CLUSTER'].map(colors)
print("Number of clusters : ", num_clusters-1)
nrows = num_clusters//2 if num_clusters%2==0 else num_clusters//2 + 1
fig,axes = plt.subplots(nrows=nrows, ncols=2)
axes = [y for row in axes for y in row]
for k, ax in enumerate(axes):
    ax.set_xlim(-5, 25)
    ax.set_ylim(-5, 25)
    ax.set_aspect('equal', adjustable='box')
    if k+1 < num_clusters:
        ax.set_title(f"CLUSTER #{k+1}", fontsize=10)
    this_df = df[df.CLUSTER==k]
    ax.quiver(
        this_df.A,               # X
        this_df.B,               # Y
        (this_df.C-this_df.A),   # X component of vector
        (this_df.D-this_df.B),   # Y component of vector
        angles='xy',
        scale_units='xy',
        scale=1,
        color=this_df.COLOR
    )
The results are way better (though they depend a lot on the input dataset); the last subplot refers to the vectors that were not found to be inside any cluster:
Edit #2
If by "direction" you mean angle in the [0..pi[ interval (ie undirected vectors), you will want to include the following code before computing the cosinuses/sinuses :
ix = df[df.DIRECTION<0].index
df.loc[ix, "DIRECTION"] += np.pi
Maybe you can also cluster the angles (besides the vector norms) by the projections of a normalized vector onto the two unit vectors (1,0) and (0,1), using the function below. By handling the projections directly (which essentially encode the angles), we won't run into trouble caused by the periodicity of the cosine function.
def get_norm_and_angle(e1):
    e1_norm = np.linalg.norm(e1, axis=1)
    e1 = e1 / e1_norm[:, None]
    e2 = np.array([1, 0])
    e3 = np.array([0, 1])
    return np.stack((e1_norm, e1 @ e2, e1 @ e3), axis=1)
Based on this function, here is one possible solution where there is no constraint on how many clusters we want to find. In the script below, five features are used for clustering:
Vector norm
Vector projections on the x and y axes (2 features)
Vector starting points (2 features)
KMeans is then run on these five features.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.cluster import KMeans
def get_norm_and_angle(e1):
    e1_norm = np.linalg.norm(e1, axis=1)
    e1 = e1 / e1_norm[:, None]
    e2 = np.array([1, 0])
    e3 = np.array([0, 1])
    return np.stack((e1_norm, e1 @ e2, e1 @ e3), axis=1)
data = np.cumsum(np.random.randint(0,10,size=(50, 4)),axis=0)
df = pd.DataFrame(data, columns=list('ABCD'))
A = df['A'];B = df['B']
C = df['C'];D = df['D']
starting_points = np.stack((A,B),axis=1)
vectors = np.stack((C,D),axis=1) - np.stack((A,B),axis=1)
different_view = get_norm_and_angle(vectors)
different_view = np.hstack((different_view,starting_points))
num_clusters = 8
model = KMeans(n_clusters=num_clusters)
model.fit(different_view)
cluster_labels = model.predict(different_view)
df['n_cluster'] = cluster_labels
cluster_centers = model.cluster_centers_
cluster_offsets = cluster_centers[:,0][:,None] * cluster_centers[:,1:3]
cluster_starts = np.vstack([np.mean(starting_points[cluster_labels==ind],axis=0) for ind in range(num_clusters)])
main_streams = np.hstack((cluster_starts,cluster_starts+cluster_offsets))
a,b,c,d = main_streams.T
fig,ax = plt.subplots(figsize=(8,8))
ax.set_xlim(-np.max(data)*.1,np.max(data)*1.1)
ax.set_ylim(-np.max(data)*.1,np.max(data)*1.1)
colors = sns.color_palette(n_colors=num_clusters)
lc1 = ax.quiver(a, b, (c-a), (d-b), angles = 'xy', scale_units = 'xy', color = colors, scale = 1, alpha = 0.8, zorder=100)
lc2 = ax.quiver(A, B, (C-A), (D-B), angles = 'xy', scale_units = 'xy', scale = .6, alpha = 0.2)
start_colors = [colors[ind] for ind in cluster_labels]
ax.scatter(starting_points[:,0],starting_points[:,1],c=start_colors)
plt.show()
A sample output is
As you can see in the figure, vectors with close starting points are clustered into the same group.
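If the goal is to cluster strictly by magnitude and direction (as asked) rather than also by position, a hedged variant is to drop the starting points from the feature matrix; everything else in the script above stays the same:
# Variant: cluster on norm and direction projections only (no starting points).
different_view = get_norm_and_angle(vectors)  # columns: norm, x-projection, y-projection
model = KMeans(n_clusters=num_clusters)
cluster_labels = model.fit_predict(different_view)
df['n_cluster'] = cluster_labels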

Interpolation using ExponentialSmoothing from stats models

I am using ExponentialSmoothing from statsmodels to run the Holt-Winters method on a time series.
I get forecasted values but cannot extract the fitted (calculated) values and compare them with the observed values.
import numpy as np
from pandas import Series
from scipy import stats
import statsmodels.api as sm
from statsmodels.tsa.api import ExponentialSmoothing
modelHW = ExponentialSmoothing(np.asarray(passtrain_df['n_passengers']), seasonal_periods=12, trend='add', seasonal='mul').fit()
y_hat_avg['Holt_Winter'] = modelHW.forecast(prediction_size)
So here, prediction_size = number of forecasted datapoints (4 in my case)
passtrain_df is a dataframe with observations (140 datapoints) on which the Holt-Winters model is built (regression).
I can easily display 4 forecasted values.
How do I extract 140 calculated values?
Tried to use:
print(ExponentialSmoothing.predict(np.asarray(passtrain_df), start=0, end=139))
But I probably have a syntax error somewhere
Thank you!
Edit:
Replaced synthetic dataset with sample data from OP
Fixed function that builds new forecast period
Fixed x-axis date format as per OP's request
Answer:
If you're looking for calculated values within your estimation period, you should use modelHW.fittedvalues and not modelHW.forecast(). The latter will give you just what it says: forecasts. And it's pretty awesome. Let me show you how to do both things:
Plot 1 - Model within estimation period
Plot 2 - Forecasts
Code:
#imports
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from statsmodels.tsa.api import ExponentialSmoothing
import matplotlib.dates as mdates
#%%
#
# Load data
pass_df = pd.read_csv('https://raw.githubusercontent.com/dacatay/time-series-analysis/master/data/passengers.csv', sep=';')
pass_df = pass_df.set_index('month')
type(pass_df.index)
df = pass_df.copy()
# Model
modelHW = ExponentialSmoothing(np.asarray(df['n_passengers']), seasonal_periods=12, trend='add', seasonal='mul',).fit()
modelHW.summary()
# Model, fitted values
model_values = modelHW.fittedvalues
model_period = df.index
df_model = pd.concat([df['n_passengers'], pd.Series(model_values, index = model_period)], axis = 1)
df_model.columns = ['n_passengers', 'HWmodel']
df_model = df_model.set_index(pd.DatetimeIndex(df_model.index))
# Model, plot
fig, ax = plt.subplots()
myFmt = mdates.DateFormatter('%Y-%m')
df_model.plot(ax = ax, x_compat=True)
ax.xaxis.set_major_formatter(myFmt)
# Forecasts
prediction_size = 10
forecast_values = modelHW.forecast(prediction_size)
# Forecasts, build new period
forecast_start = df.index[-1]
forecast_start = pd.to_datetime(forecast_start, format='%Y-%m-%d')
forecast_period = pd.period_range(forecast_start, periods=prediction_size+1, freq='M')
forecast_period = forecast_period[1:]
# Forecasts, create dataframe
df_forecast = pd.Series(forecast_values, index = forecast_period.values).to_frame()
df_forecast.columns = ['HWforecast']
# merge input and forecast dataframes
df_all = pd.merge(df,df_forecast, how='outer', left_index=True, right_index=True)
#df_all = df_all.set_index(pd.DatetimeIndex(df_all.index.values))
ix = df_all.index
ixp = pd.PeriodIndex(ix, freq = 'M')
df_all = df_all.set_index(ixp)
# Forecast, plot
fig, ax = plt.subplots()
myFmt = mdates.DateFormatter('%Y-%m')
df_all.plot(ax = ax, x_compat=True)
ax.xaxis.set_major_formatter(myFmt)
Previous attempts:
# imports
import pandas as pd
import numpy as np
from statsmodels.tsa.api import ExponentialSmoothing
# Data that matches your setup, but with a random
# seed to make it reproducible
np.random.seed(42)
# Time
date = pd.to_datetime("1st of Jan, 2019")
dates = date+pd.to_timedelta(np.arange(140), 'D')
# Data
n_passengers = np.random.normal(loc=0.0, scale=5.0, size=140).cumsum()
n_passengers = n_passengers.astype(int) + 100
df = pd.DataFrame({'n_passengers':n_passengers},index=dates)
1. How to plot observed vs. estimated values within the estimation period:
The following snippet will extract all fitted values and plot it against your observed values.
Snippet 2:
# Model
modelHW = ExponentialSmoothing(np.asarray(df['n_passengers']), seasonal_periods=12, trend='add', seasonal='mul',).fit()
modelHW.summary()
# Model, fitted values
model_values = modelHW.fittedvalues
model_period = df.index
df_model = pd.concat([df['n_passengers'], pd.Series(model_values, index = model_period)], axis = 1)
df_model.columns = ['n_passengers', 'HWmodel']
df_model.plot()
Plot 1:
2. How to produce and plot model forecasts of a certain length:
The following snippet will produce 10 forecasts from your model and plot them as an extended period next to your observed values.
Snippet 3:
# Forecast
prediction_size = 10
forecast_values = modelHW.forecast(prediction_size)
forecast_period = df.index[-1] + pd.to_timedelta(np.arange(prediction_size+1), 'D')
forecast_period = forecast_period[1:]
df_forecast = pd.concat([df['n_passengers'], pd.Series(forecast_values, index = forecast_period)], axis = 1)
df_forecast.columns = ['n_passengers', 'HWforecast']
df_forecast.plot()
Plot 2:
And here's the whole thing for an easy copy&paste:
# imports
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from statsmodels.tsa.api import ExponentialSmoothing
# Data that matches your setup, but with a random
# seed to make it reproducible
np.random.seed(42)
# Time
date = pd.to_datetime("1st of Jan, 2019")
dates = date+pd.to_timedelta(np.arange(140), 'D')
# Data
n_passengers = np.random.normal(loc=0.0, scale=5.0, size=140).cumsum()
n_passengers = n_passengers.astype(int) + 100
df = pd.DataFrame({'n_passengers':n_passengers},index=dates)
# Model
modelHW = ExponentialSmoothing(np.asarray(df['n_passengers']), seasonal_periods=12, trend='add', seasonal='mul',).fit()
modelHW.summary()
# Model, fitted values
model_values = modelHW.fittedvalues
model_period = df.index
df_model = pd.concat([df['n_passengers'], pd.Series(model_values, index = model_period)], axis = 1)
df_model.columns = ['n_passengers', 'HWmodel']
df_model.plot()
# Forecast
prediction_size = 10
forecast_values = modelHW.forecast(prediction_size)
forecast_period = df.index[-1] + pd.to_timedelta(np.arange(prediction_size+1), 'D')
forecast_period = forecast_period[1:]
df_forecast = pd.concat([df['n_passengers'], pd.Series(forecast_values, index = forecast_period)], axis = 1)
df_forecast.columns = ['n_passengers', 'HWforecast']
df_forecast.plot()
@vestland - here is the code and error:
y_train = passtrain_df.copy(deep=True)
model_HW = ExponentialSmoothing(np.asarray(y_train['n_passengers']), seasonal_periods=12, trend='add', seasonal='mul',).fit()
model_values = model_HW.fittedvalues
model_period = y_train.index
hw_model = pd.concat([y_train['n_passengers'], pd.Series(model_values, index = model_period)], axis = 1)
hw_model.columns = ['Observed Passengers', 'Holt-Winters']
plt.figure(figsize=(18,12))
hw_model.plot()
forecast_values = model_HW.forecast(prediction_size)
forecast_period = y_train.index[-1] + pd.to_timedelta(np.arange(prediction_size+1),'D')
forecast_period = forecast_period[1:]
hw_forecast = pd.concat([y_train['n_passengers'], pd.Series(forecast_values, index = forecast_period)], axis = 1)
hw_forecast.columns = ['Observed Passengers', 'HW-Forecast']
hw_forecast.plot()
Error:
NullFrequencyError Traceback (most recent call last)
<ipython-input-25-5f37a0dd0cfa> in <module>()
17
18 forecast_values = model_HW.forecast(prediction_size)
---> 19 forecast_period = y_train.index[-1] + pd.to_timedelta(np.arange(prediction_size+1),'D')
20 forecast_period = forecast_period[1:]
21
/anaconda3/lib/python3.6/site-packages/pandas/core/indexes/datetimelike.py in __radd__(self, other)
879 def __radd__(self, other):
880 # alias for __add__
--> 881 return self.__add__(other)
882 cls.__radd__ = __radd__
883
/anaconda3/lib/python3.6/site-packages/pandas/core/indexes/datetimelike.py in __add__(self, other)
842 # This check must come after the check for np.timedelta64
843 # as is_integer returns True for these
--> 844 result = self.shift(other)
845
846 # array-like others
/anaconda3/lib/python3.6/site-packages/pandas/core/indexes/datetimelike.py in shift(self, n, freq)
1049
1050 if self.freq is None:
-> 1051 raise NullFrequencyError("Cannot shift with no freq")
1052
1053 start = self[0] + n * self.freq
NullFrequencyError: Cannot shift with no freq
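The NullFrequencyError comes from adding a Timedelta range to a DatetimeIndex that has no freq set. A hedged workaround (assuming daily spacing, as in the snippets above) is to build the forecast period with pd.date_range instead of shifting the index:
# Build the forecast period without relying on the index having a freq.
forecast_period = pd.date_range(start=y_train.index[-1], periods=prediction_size + 1, freq='D')[1:]
hw_forecast = pd.concat([y_train['n_passengers'],
                         pd.Series(forecast_values, index=forecast_period)], axis=1)
hw_forecast.columns = ['Observed Passengers', 'HW-Forecast']
hw_forecast.plot()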

calculating slope for a series trendline in Pandas

Is there an idiomatic way of getting the slope of a linear trend line fitted to the values in a DataFrame column? The data is indexed with a DatetimeIndex.
This should do it:
import numpy as np
import pandas as pd
df = pd.DataFrame(np.random.rand(100, 5), pd.date_range('2012-01-01', periods=100))
def trend(df):
    df = df.copy().sort_index()
    dates = df.index.to_julian_date().values[:, None]
    x = np.concatenate([np.ones_like(dates), dates], axis=1)
    y = df.values
    # least-squares fit (X'X)^-1 X'y, one intercept/slope pair per column
    return pd.DataFrame(np.linalg.pinv(x.T.dot(x)).dot(x.T).dot(y).T,
                        df.columns, ['Constant', 'Trend'])
trend(df)
Using the same df above for its index:
df_sample = pd.DataFrame((df.index.to_julian_date() * 10 + 2) + np.random.rand(100) * 1e3,
df.index)
coef = trend(df_sample)
df_sample['trend'] = (coef.iloc[0, 1] * df_sample.index.to_julian_date() + coef.iloc[0, 0])
df_sample.plot(style=['.', '-'])
