Related
From the code given here, I have developed another code which uses Matplotlib in place of Seaborn (The data are plotted on several figures and subplots, and so are now more readable and I am closer to the point I want to reach: the user by putting the cursor over a point has access to all the information of the point, in particular the datetime.)
Here it is:
import pandas as pd
import numpy as np
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import random
from datetime import datetime
# size of the database
n = 1000
nA = 4
nB = 9
no = np.arange(n)
date = np.random.randint(1e9, size=n).astype('datetime64[s]')
A = [''.join(['A',str(random.randint(1, nA))]) for j in range(n)]
B = [''.join(['B',str(random.randint(1, nB))]) for j in range(n)]
Epsilon1 = np.random.random_sample((n,))
Epsilon2 = np.random.random_sample((n,))
Epsilon3 = np.random.random_sample((n,))
data = pd.DataFrame({'no':no,
'Date':date,
'A':A,
'B':B,
'Epsilon1':Epsilon1,
'Epsilon2':Epsilon2,
'Epsilon3':Epsilon3})
def format_coord(x, y):
string_x = datetime.utcfromtimestamp(x).strftime("%m/%d/%Y, %H:%M:%S")
return 'x={}, y={:.4f}'.format(string_x,y)
def plot_Epsilon_matplotlib():
for A in data['A'].sort_values().drop_duplicates().to_list():
n_col = 2
fig, axes = plt.subplots(np.ceil(nB/n_col).astype(int),n_col)
for j, B in enumerate(data['B'].sort_values().drop_duplicates().to_list()):
df = data.loc[(data['A']==A) & (data['B']==B)]
df = df.sort_values("Date", ascending=True)
axes.flatten()[j].plot(df["Date"],df['Epsilon1'],marker='x',c='b',label="Epsilon1")
axes.flatten()[j].plot(df["Date"],df['Epsilon2'],marker='x',c='r',label="Epsilon2")
axes.flatten()[j].plot(df["Date"],df['Epsilon3'],marker='x',c='g',label="Epsilon3")
axes.flatten()[j].format_coord = format_coord
if __name__ == '__main__':
plot_Epsilon_matplotlib()
The goal is that when the user puts the cursor over a point, he gets access to the full datetime of the data.
I have first tried to change the major formatter (as here):
axes.flatten()[j].xaxis.set_major_formatter(mdates.DateFormatter('%Y/%m/%d %H:%M:%S'))
but then the x ticks are not readable (especially if the user zooms on a subplot)
I then tried the define my own format_coord as here. My first try is given in the full code given above. The format of the datetime in Matplotlib figure status bar is good but the date remains in 1970 !
After reading this discussion, I realized this problem relates on Numpy datetime64 to Datetime conversion. I then coded this new version of format_coord (strongly inspired from this answer):
def format_coord_bis(x,y):
dt64 = np.datetime64(datetime.utcfromtimestamp(x))
unix_epoch = np.datetime64(0, 's')
one_second = np.timedelta64(1, 's')
seconds_since_epoch = (dt64 - unix_epoch) / one_second
string_x = datetime.utcfromtimestamp(seconds_since_epoch).strftime("%m/%d/%Y, %H:%M:%S")
return 'x={}, y={:.4f}'.format(string_x,y)
but the date given in the status bar remains the 01/01/1970...
I have found the solution from this answer.
The function format_coord() should be defined as follows:
def format_coord(x, y):
string_x = matplotlib.dates.num2date(x).strftime('%Y-%m-%d %H:%M:%S')
return 'x={}, y={:.4f}'.format(string_x,y)
I'm plotting the counts of a variable grouped by time as a heatmap. However, when including both hour and minute, the counts are quite low so the resulting heatmap doesn't really provide any real insight. Is it possible to group the counts in a bigger block of time? I'm hoping to test some different periods (5, 10 mins).
I'm also hoping to plot time on the x-axis. Similar to the output attached.
import seaborn as sns
import pandas as pd
from datetime import datetime
from datetime import timedelta
start = datetime(1900,1,1,10,0,0)
end = datetime(1900,1,1,13,0,0)
seconds = (end - start).total_seconds()
step = timedelta(minutes = 1)
array = []
for i in range(0, int(seconds), int(step.total_seconds())):
array.append(start + timedelta(seconds=i))
array = [i.strftime('%Y-%m-%d %H:%M%:%S') for i in array]
df2 = pd.DataFrame(array).rename(columns = {0:'Time'})
df2['Count'] = np.random.uniform(0.0, 0.5, size = len(df2))
df2['Count'] = df2['Count'].round(1)
df2['Time'] = pd.to_datetime(df2['Time'])
df2['Hour'] = df2['Time'].dt.hour
df2['Min'] = df2['Time'].dt.minute
g = df2.groupby(['Hour','Min','Count'])
count_df = g['Count'].nunique().unstack()
count_df.fillna(0, inplace = True)
sns.heatmap(count_df)
To deal with such cases, I think it would be easy to use data downsampling. It is also easy to change the thresholds. The axis labels in the output graph will need to be modified, but we recommend this method.
import seaborn as sns
import pandas as pd
import numpy as np
from datetime import datetime
from datetime import timedelta
start = datetime(1900,1,1,10,0,0)
end = datetime(1900,1,1,13,0,0)
seconds = (end - start).total_seconds()
step = timedelta(minutes = 1)
array = []
for i in range(0, int(seconds), int(step.total_seconds())):
array.append(start + timedelta(seconds=i))
array = [i.strftime('%Y-%m-%d %H:%M:%S') for i in array]
df2 = pd.DataFrame(array).rename(columns = {0:'Time'})
df2['Count'] = np.random.uniform(0.0, 0.5, size = len(df2))
df2['Count'] = df2['Count'].round(1)
df2['Time'] = pd.to_datetime(df2['Time'])
df2['Hour'] = df2['Time'].dt.hour
df2['Min'] = df2['Time'].dt.minute
df2.set_index('Time', inplace=True)
count_df = df2.resample('10min')['Count'].value_counts().unstack()
count_df.fillna(0, inplace = True)
sns.heatmap(count_df.T)
The way you could achieve this is by creating a column with numbers that have repeating elements for the number of minutes.
For example:
minutes = 3
x = [0,1,2]
np.repeat(x, repeats=minutes, axis=0)
>>>> [0,0,0,1,1,1,2,2,2]
and then group your data using this column.
So your code would look like:
...
minutes = 5
x = [i for i in range(int(df2.shape[0]/5))]
df2['group'] = np.repeat(x, repeats=minutes, axis=0)
g = df2.groupby(['Min', 'Count'])
count_df = g['Count'].nunique().unstack()
count_df.fillna(0, inplace = True)
How do I print the dataframe, where the population is within 5% of the mean? (2.5% below and 2.5% above)
Here is what I've tried:
mean = df['population'].mean()
minimum = mean - (0.025*mean)
maximum = mean + (0.025*mean)
df[df.population < maximum]
Use:
df.loc[(df['population'] > minimum) & (df['population'] < maximum)]
import pandas as pd
df = pd.read_csv("fileName.csv")
#suppose this dataFrame contains the population in the int format
mean = df['population'].mean()
minimum = mean - (0.025*mean)
maximum = mean + (0.025*mean)
ans = df.loc[(df['population']>minimum) & (df['population'] <maximum)]
ans
you can use this
I built this dataframe for testing.
import numpy as np
import pandas as pd
random_data = np.random.randint(1_000_000, 100_000_000, 200)
random_df = pd.DataFrame(random_data, columns=['population'])
random_df
Here's the answer to specifically what you were asking for.
pop = random_df.population
top_boundary = pop.mean() + pop.mean() * 0.025
low_boundary = pop.mean() - pop.mean() * 0.025
criteria_boundary_limits = random_df.population.between(low_boundary, top_boundary)
criteria_boundary_df = random_df.loc[criteria_boundary_limits]
criteria_boundary_df
But, maybe, another answer could be had by using quantiles. I used 40 quantiles because 1/40 = 0.025.
groups_list = list(range(1,41))
random_df['groups'] = pd.qcut(random_df['population'], 40, labels = groups_list)
criteria_groups_limits = random_df.groups.between(20,21)
criteria_groups_df = random_df.loc[criteria_groups_limits]
criteria_groups_df
I am trying to find and annotate the intersection point of two-line using Plotly. I know we can use (plt.plot(*intersection.xy,'ko')) to get the intersection point in Mathplotlib, but how can do it in plotly or if it can be done.
import numpy as np
import pandas as pd
import plotly.express as px
test = pd.merge(ps,uts,on = 'Time')
print(test)
time = test['Time']
omfr = test['Orifice Mass Flow Rate']
qab = test['Qab']
fig = px.line(test, x=time, y=[omfr,qab])
fig.update_layout( title='Pipe stress and UTS (MPa)',xaxis_title="Time (s)",yaxis_title="Pipe stress and UTS (MPa)",
hovermode='x')
Output of the code:
Intersection point:
Annotating and showing the intersection is easy. Finding it is the hard part, and my suggestion in that regard builds directly on the contributions in the post How do I compute the intersection point of two lines. I'll include a few lines on the details in my suggestion when I find the time. For now, the complete code snippet at the end of my answer will produce the following figure using this dataset:
Data
x y1 y2
0 1 1 11.00
1 2 8 14.59
2 3 27 21.21
3 4 64 31.11
Plot
Edit - Annotation
If you'd like to change the text annotation, just change
text="intersect"
... to something like:
text = 'lines intersect at x = ' + str(round(x[0], 2)) + ' and y = ' + str(round(y[0], 2))
Result:
Complete code
import pandas as pd
import plotly.graph_objects as go
import numpy as np
# import dash
# sample dataframe
df = pd.DataFrame()
df['x'] = np.arange(4) +1
df['y1'] = df['x']**3
df['y2'] = [10+val**2.2 for val in df['x']]
# intersection stuff
def _rect_inter_inner(x1,x2):
n1=x1.shape[0]-1
n2=x2.shape[0]-1
X1=np.c_[x1[:-1],x1[1:]]
X2=np.c_[x2[:-1],x2[1:]]
S1=np.tile(X1.min(axis=1),(n2,1)).T
S2=np.tile(X2.max(axis=1),(n1,1))
S3=np.tile(X1.max(axis=1),(n2,1)).T
S4=np.tile(X2.min(axis=1),(n1,1))
return S1,S2,S3,S4
def _rectangle_intersection_(x1,y1,x2,y2):
S1,S2,S3,S4=_rect_inter_inner(x1,x2)
S5,S6,S7,S8=_rect_inter_inner(y1,y2)
C1=np.less_equal(S1,S2)
C2=np.greater_equal(S3,S4)
C3=np.less_equal(S5,S6)
C4=np.greater_equal(S7,S8)
ii,jj=np.nonzero(C1 & C2 & C3 & C4)
return ii,jj
def intersection(x1,y1,x2,y2):
ii,jj=_rectangle_intersection_(x1,y1,x2,y2)
n=len(ii)
dxy1=np.diff(np.c_[x1,y1],axis=0)
dxy2=np.diff(np.c_[x2,y2],axis=0)
T=np.zeros((4,n))
AA=np.zeros((4,4,n))
AA[0:2,2,:]=-1
AA[2:4,3,:]=-1
AA[0::2,0,:]=dxy1[ii,:].T
AA[1::2,1,:]=dxy2[jj,:].T
BB=np.zeros((4,n))
BB[0,:]=-x1[ii].ravel()
BB[1,:]=-x2[jj].ravel()
BB[2,:]=-y1[ii].ravel()
BB[3,:]=-y2[jj].ravel()
for i in range(n):
try:
T[:,i]=np.linalg.solve(AA[:,:,i],BB[:,i])
except:
T[:,i]=np.NaN
in_range= (T[0,:] >=0) & (T[1,:] >=0) & (T[0,:] <=1) & (T[1,:] <=1)
xy0=T[2:,in_range]
xy0=xy0.T
return xy0[:,0],xy0[:,1]
# plotly figure
x,y=intersection(np.array(df['x'].values.astype('float')),np.array(df['y1'].values.astype('float')),
np.array(df['x'].values.astype('float')),np.array(df['y2'].values.astype('float')))
fig = go.Figure(data=go.Scatter(x=df['x'], y=df['y1'], mode = 'lines'))
fig.add_traces(go.Scatter(x=df['x'], y=df['y2'], mode = 'lines'))
fig.add_traces(go.Scatter(x=x, y=y,
mode = 'markers',
marker=dict(line=dict(color='black', width = 2),
symbol = 'diamond',
size = 14,
color = 'rgba(255, 255, 0, 0.6)'),
name = 'intersect'),
)
fig.add_annotation(x=x[0], y=y[0],
# text="intersect",
text = 'lines intersect at x = ' + str(round(x[0], 2)) + ' and y = ' + str(round(y[0], 2)),
font=dict(family="sans serif",
size=18,
color="black"),
ax=0,
ay=-100,
showarrow=True,
arrowhead=1)
fig.show()
I am using ExponentialSmoothing from statsmodels to run Holt-Winters method on time series.
I get forecasted values but can not extract calculated values and compare them with observed values.
from pandas import Series
from scipy import stats
import statsmodels.api as sm
from statsmodels.tsa.api import ExponentialSmoothing
modelHW = ExponentialSmoothing(np.asarray(passtrain_df['n_passengers']), seasonal_periods=12, trend='add', seasonal='mul',).fit()
y_hat_avg['Holt_Winter'] = modelHW.forecast(prediction_size)
So here, prediction_size = number of forecasted datapoints (4 in my case)
passtrain_df is a dataframe with observations (140 datapoints) based on which Holt_Winter model is built (regression).
I can easily display 4 forecasted values.
How do I extract 140 calculated values?
Tried to use:
print(ExponentialSmoothing.predict(np.asarray(passtrain_df), start=0, end=139))
But I probably have a syntax error somewhere
Thank you!
Edit:
Replaced synthetic dataset with sample data from OP
Fixed function that builds new forecast period
Fixed x-axis date format as per OPs request
Answer:
If you're looking for calculated values within your estimation period, you should use modelHW.fittedvalues and not modelHW.forecast(). The latter will give you just what it says; forecasts. And it's pretty awesome. Let me show you how to do both things:
Plot 1 - Model within estimation period
Plot 2 - Forecasts
Code:
#imports
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from statsmodels.tsa.api import ExponentialSmoothing
import matplotlib.dates as mdates
#%%
#
# Load data
pass_df = pd.read_csv('https://raw.githubusercontent.com/dacatay/time-series-analysis/master/data/passengers.csv', sep=';')
pass_df = pass_df.set_index('month')
type(pass_df.index)
df = pass_df.copy()
# Model
modelHW = ExponentialSmoothing(np.asarray(df['n_passengers']), seasonal_periods=12, trend='add', seasonal='mul',).fit()
modelHW.summary()
# Model, fitted values
model_values = modelHW.fittedvalues
model_period = df.index
df_model = pd.concat([df['n_passengers'], pd.Series(model_values, index = model_period)], axis = 1)
df_model.columns = ['n_passengers', 'HWmodel']
df_model = df_model.set_index(pd.DatetimeIndex(df_model.index))
# Model, plot
fig, ax = plt.subplots()
myFmt = mdates.DateFormatter('%Y-%m')
df_model.plot(ax = ax, x_compat=True)
ax.xaxis.set_major_formatter(myFmt)
# Forecasts
prediction_size = 10
forecast_values = modelHW.forecast(prediction_size)
# Forecasts, build new period
forecast_start = df.index[-1]
forecast_start = pd.to_datetime(forecast_start, format='%Y-%m-%d')
forecast_period = pd.period_range(forecast_start, periods=prediction_size+1, freq='M')
forecast_period = forecast_period[1:]
# Forecasts, create dataframe
df_forecast = pd.Series(forecast_values, index = forecast_period.values).to_frame()
df_forecast.columns = ['HWforecast']
# merge input and forecast dataframes
df_all = pd.merge(df,df_forecast, how='outer', left_index=True, right_index=True)
#df_all = df_all.set_index(pd.DatetimeIndex(df_all.index.values))
ix = df_all.index
ixp = pd.PeriodIndex(ix, freq = 'M')
df_all = df_all.set_index(ixp)
# Forecast, plot
fig, ax = plt.subplots()
myFmt = mdates.DateFormatter('%Y-%m')
df_all.plot(ax = ax, x_compat=True)
ax.xaxis.set_major_formatter(myFmt)
Previous attempts:
# imports
import pandas as pd
import numpy as np
from statsmodels.tsa.api import ExponentialSmoothing
# Data that matches your setup, but with a random
# seed to make it reproducible
np.random.seed(42)
# Time
date = pd.to_datetime("1st of Jan, 2019")
dates = date+pd.to_timedelta(np.arange(140), 'D')
# Data
n_passengers = np.random.normal(loc=0.0, scale=5.0, size=140).cumsum()
n_passengers = n_passengers.astype(int) + 100
df = pd.DataFrame({'n_passengers':n_passengers},index=dates)
1. How to plot observed vs. estimated values within the estimation period:
The following snippet will extract all fitted values and plot it against your observed values.
Snippet 2:
# Model
modelHW = ExponentialSmoothing(np.asarray(df['n_passengers']), seasonal_periods=12, trend='add', seasonal='mul',).fit()
modelHW.summary()
# Model, fitted values
model_values = modelHW.fittedvalues
model_period = df.index
df_model = pd.concat([df['n_passengers'], pd.Series(model_values, index = model_period)], axis = 1)
df_model.columns = ['n_passengers', 'HWmodel']
df_model.plot()
Plot 1:
2. How to produce and plot model forecasts of a certain length:
The following snippet will produce 10 forecasts from your model, and plot it as an extended period compared to your observer values.
Snippet 3:
# Forecast
prediction_size = 10
forecast_values = modelHW.forecast(prediction_size)
forecast_period = df.index[-1] + pd.to_timedelta(np.arange(prediction_size+1), 'D')
forecast_period = forecast_period[1:]
df_forecast = pd.concat([df['n_passengers'], pd.Series(forecast_values, index = forecast_period)], axis = 1)
df_forecast.columns = ['n_passengers', 'HWforecast']
df_forecast.plot()
Plot 2:
And here's the whole thing for an easy copy&paste:
# imports
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from statsmodels.tsa.api import ExponentialSmoothing
# Data that matches your setup, but with a random
# seed to make it reproducible
np.random.seed(42)
# Time
date = pd.to_datetime("1st of Jan, 2019")
dates = date+pd.to_timedelta(np.arange(140), 'D')
# Data
n_passengers = np.random.normal(loc=0.0, scale=5.0, size=140).cumsum()
n_passengers = n_passengers.astype(int) + 100
df = pd.DataFrame({'n_passengers':n_passengers},index=dates)
# Model
modelHW = ExponentialSmoothing(np.asarray(df['n_passengers']), seasonal_periods=12, trend='add', seasonal='mul',).fit()
modelHW.summary()
# Model, fitted values
model_values = modelHW.fittedvalues
model_period = df.index
df_model = pd.concat([df['n_passengers'], pd.Series(model_values, index = model_period)], axis = 1)
df_model.columns = ['n_passengers', 'HWmodel']
df_model.plot()
# Forecast
prediction_size = 10
forecast_values = modelHW.forecast(prediction_size)
forecast_period = df.index[-1] + pd.to_timedelta(np.arange(prediction_size+1), 'D')
forecast_period = forecast_period[1:]
df_forecast = pd.concat([df['n_passengers'], pd.Series(forecast_values, index = forecast_period)], axis = 1)
df_forecast.columns = ['n_passengers', 'HWforecast']
df_forecast.plot()
#vestland - here is the code and error:
y_train = passtrain_df.copy(deep=True)
model_HW = ExponentialSmoothing(np.asarray(y_train['n_passengers']), seasonal_periods=12, trend='add', seasonal='mul',).fit()
model_values = model_HW.fittedvalues
model_period = y_train.index
hw_model = pd.concat([y_train['n_passengers'], pd.Series(model_values, index = model_period)], axis = 1)
hw_model.columns = ['Observed Passengers', 'Holt-Winters']
plt.figure(figsize=(18,12))
hw_model.plot()
forecast_values = model_HW.forecast(prediction_size)
forecast_period = y_train.index[-1] + pd.to_timedelta(np.arange(prediction_size+1),'D')
forecast_period = forecast_period[1:]
hw_forecast = pd.concat([y_train['n_passengers'], pd.Series(forecast_values, index = forecast_period)], axis = 1)
hw_forecast.columns = ['Observed Passengers', 'HW-Forecast']
hw_forecast.plot()
Error:
NullFrequencyError Traceback (most recent call last)
<ipython-input-25-5f37a0dd0cfa> in <module>()
17
18 forecast_values = model_HW.forecast(prediction_size)
---> 19 forecast_period = y_train.index[-1] + pd.to_timedelta(np.arange(prediction_size+1),'D')
20 forecast_period = forecast_period[1:]
21
/anaconda3/lib/python3.6/site- packages/pandas/core/indexes/datetimelike.py in __radd__(self, other)
879 def __radd__(self, other):
880 # alias for __add__
--> 881 return self.__add__(other)
882 cls.__radd__ = __radd__
883
/anaconda3/lib/python3.6/site- packages/pandas/core/indexes/datetimelike.py in __add__(self, other)
842 # This check must come after the check for np.timedelta64
843 # as is_integer returns True for these
--> 844 result = self.shift(other)
845
846 # array-like others
/anaconda3/lib/python3.6/site-packages/pandas/core/indexes/datetimelike.py in shift(self, n, freq)
1049
1050 if self.freq is None:
-> 1051 raise NullFrequencyError("Cannot shift with no freq")
1052
1053 start = self[0] + n * self.freq
NullFrequencyError: Cannot shift with no freq