I am writing code to remove plateau outliers from time series data. I followed earlier advice to use np.diff, but ran into a problem: plateaus are only detected when consecutive values are exactly equal, not when they are merely close.
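For reference, one way around the exact-equality limitation is to threshold the absolute difference instead of requiring it to be zero; a minimal sketch, with tol as an assumed noise tolerance:

import numpy as np

x = np.array([1.0, 1.001, 0.999, 1.0, 5.0, 6.0, 1.0])
tol = 0.01  # assumed tolerance; tune to the noise level of the data
almost_flat = np.abs(np.diff(x)) < tol  # True where consecutive samples are nearly equal

Consecutive True values can then be grouped into candidate plateau ranges, which is essentially what the gradient-based function below does in smoothed form.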
def find_plateaus(F, min_length=200, tolerance=0.75, smoothing=15):
    import numpy as np
    from scipy.ndimage import uniform_filter1d  # scipy.ndimage.filters is deprecated

    # calculate smoothed gradients
    smoothF = uniform_filter1d(F, size=smoothing)
    dF = uniform_filter1d(np.gradient(smoothF), size=smoothing)
    d2F = uniform_filter1d(np.gradient(dF), size=smoothing)

    def zero_runs(x):
        # return (start, end) index pairs for runs of zeros in x
        iszero = np.concatenate(([0], np.equal(x, 0).view(np.int8), [0]))
        absdiff = np.abs(np.diff(iszero))
        ranges = np.where(absdiff == 1)[0].reshape(-1, 2)
        return ranges

    # Find ranges where the second derivative is zero
    # Values under eps are assumed to be zero.
    eps = np.quantile(abs(d2F), tolerance)
    smalld2F = (abs(d2F) <= eps)

    # Find repetitions in the mask "smalld2F" (i.e. ranges where d2F is roughly zero)
    p = zero_runs(np.diff(smalld2F))

    # np.diff(p) gives the length of each range found.
    # Only accept plateaus of at least min_length.
    plateaus = p[(np.diff(p) > min_length).flatten()]
    return plateaus
plateaus = find_plateaus(test, min_length=5, tolerance = 0.02, smoothing=11)
plateaus = np.ravel(plateaus, order = 'A')
plateaus = plateaus.tolist()
print(plateaus)
test2['T&F'] = np.nan
for i in test2.index:
    if i in plateaus:
        test2.loc[i, ['T&F']] = test2.loc[i, 'data']
    else:
        test2.loc[i, ['T&F']] = 0
fig, ax = plt.subplots(figsize=(15,6))
ax.plot(test2.index, test2['data'], color='black', label = 'time_series')
ax.scatter(test2.index,test2['T&F'], color='red', label = 'D910')
plt.legend()
plt.show();
Do you know any libraries or methods that can be used?
I want to recognize the parts marked in the attached figure.
Still in progress, but I found an answer.
First, reshape the NumPy array into overlapping windows.
e.g. time_step = 3
.....
Then compute the standard deviation of each window with np.std().
After inspecting the values, you can choose a standard-deviation threshold that captures the plateau ranges.
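A minimal sketch of that idea, assuming the series is in a 1-D NumPy array x and using sliding_window_view (NumPy 1.20+); time_step and threshold are illustrative values:

import numpy as np

time_step = 3
threshold = 0.05  # assumed std threshold; tune to the data

# build overlapping windows of length time_step and compute the std of each
windows = np.lib.stride_tricks.sliding_window_view(x, time_step)
window_std = windows.std(axis=1)

# indices whose window varies by less than the threshold are treated as plateau points
plateau_idx = np.where(window_std < threshold)[0]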
I want to test the low volatility factor for some market other than equities. Contradicting Finance 101, it has been shown that low-volatility stocks outperform high-volatility stocks (see, for example, Baker, Malcolm, Brendan Bradley, and Jeffrey Wurgler (2011), “Benchmarks as Limits to Arbitrage: Understanding the Low-Volatility Anomaly”, Financial Analysts Journal, Vol. 67, No. 1, pp. 40–54).
So what I want to do is construct the low-volatility factor following the methodology of Jegadeesh and Titman (1993): rank stocks according to their historical volatility over the previous j months, short the top 30% (the most volatile), go long the bottom 30% (the least volatile), and hold that long-short portfolio for k periods. A 3-3 j-k portfolio therefore means ranking on the past 3 months of historical volatility (j) and holding the portfolio for the following 3 months (k).
I have written some code, and the j part can easily be managed by simply increasing or decreasing the window of the rolling volatility calculation. The part I am struggling with is the k part, i.e. how the holding period could be implemented. Unfortunately, I couldn't find many examples online.
In addition, I was wondering whether my code is correct or whether I made any mistakes, since it surprisingly did not work regardless of the dataset I used. I am not sure whether this is the right place to ask, but if someone could take a look, that would be great and might be helpful to others planning to implement a strategy like this as well.
Below is a simple working example with just 10 stocks. As I said, I want to implement it for some other assets, but this code should work. You just have to use your own API key in line 16. Thanks a lot!
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import quandl
import pickle
import scipy.optimize as sco
from scipy.ndimage import shift  # scipy.ndimage.interpolation is deprecated
##################
# Low volatility #
##################
quandl.ApiConfig.api_key = 'Your key here'
stocks = ['MSFT','AAPL','AMZN','FB','BRK.B','JPM','GOOG','JNJ','V','PG','XOM']
data = quandl.get_table('WIKI/PRICES', ticker = stocks,
qopts = { 'columns': ['date', 'ticker', 'adj_close'] },
date = { 'gte': '2016-1-1', 'lte': '2019-11-3' }, paginate=True)
# with open("data.pkl", "wb") as pickle_file:
# pickle.dump(data, pickle_file)
# with open("data.pkl", "rb") as pickle_file:
# data = pickle.load(pickle_file)
data = data.pivot_table(index='date', columns='ticker', values='adj_close')
data = data.groupby(pd.Grouper(freq="M")).mean() # convert from daily to monthly prices
returns = (np.log(data) - np.log(data.shift(1))).dropna()
stds = returns.rolling(12).std()
stds = stds.values # convert to numpy array
breakpoints = []  # avoid shadowing the built-in name "list"
for x in range(0, stds.shape[0]):  # for each row in the std matrix, create decile buckets (dec -> breakpoint to the next bucket)
    for y in range(0, 100, 10):
        dec = np.percentile(stds[x], y)
        breakpoints.append(dec)
breakpoints = np.array(breakpoints)  # convert list to numpy array
breakpoints = np.reshape(breakpoints, (stds.shape[0], -1))  # reshape so it has the same number of rows as returns (here: (26, 10))
inds = []
for x in range(0, stds.shape[0]):  # if volatility is in the lower 30%, allocate a 1 (long); if in the upper 30%, allocate a -1 (short); 0 otherwise
    ind = np.digitize(stds[x], breakpoints[x])
    for z in range(0, ind.shape[0]):  # separate loop variable instead of reusing x
        if ind[z] <= 3:
            ind[z] = 1
        elif ind[z] >= 8:
            ind[z] = -1
        else:
            ind[z] = 0
    inds.append(ind)
inds = np.array(inds)
inds = inds.astype(np.float32)
for x in inds:  # divide the -1s, 1s and 0s by their respective counts, so the weights sum to -1 and 1 (beta-neutral long-short)
    ones = np.count_nonzero(x == 1)  # count the number of 1
    minus_ones = np.count_nonzero(x == -1)  # count the number of -1
    zeros = np.count_nonzero(x == 0)  # count the number of 0
    for y in range(0, inds.shape[1]):
        if x[y] == 1:
            x[y] = x[y] / ones
        elif x[y] == -1:
            x[y] = x[y] / minus_ones
        else:
            x[y] = x[y] / zeros
returns = returns.shift(periods=-1).values # shift returns one period back, and create numpy array
pf_returns = np.sum((inds*returns), axis=1) # multiply returns with weights, and sum up
pf_returns = pd.DataFrame(pf_returns)
print("---")
print(pf_returns.describe())
# Plot
pf_returns_indexed = 100 * (1 + pf_returns).cumprod()
pf_returns_indexed = pf_returns_indexed.plot(linewidth=1.2) # change line width
plt.show()
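Regarding the k part asked about above: one common way to hold a Jegadeesh-Titman style portfolio for k periods is to use overlapping portfolios, i.e. average the weight matrices of the last k formation dates so that 1/k of the book is rolled over each month. A sketch under that assumption, reusing the inds and returns arrays from the code above (hold_k_periods is an illustrative helper, not from the question):

import numpy as np

def hold_k_periods(weights, k):
    # average the weights formed over the last k dates; each date therefore
    # holds 1/k of k overlapping long-short portfolios
    weights = np.asarray(weights, dtype=float)
    held = np.full_like(weights, np.nan)
    for t in range(k - 1, weights.shape[0]):
        held[t] = weights[t - k + 1:t + 1].mean(axis=0)
    return held

held_weights = hold_k_periods(inds, k=3)                   # 3-month holding period
pf_returns_k = np.nansum(held_weights * returns, axis=1)   # next-period returns, as above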
I'm trying to run a rolling volatility (GARCH) analysis using this Python code:
import pandas as pd
import numpy as np
from matplotlib import style
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
class monte_carlo:
def __init__(self,S,mu,sigma,c):
self.S=S #The start value of the portfolio
self.mu=mu #The expected return calculated by CAPM
self.sigma=sigma #Volatility
self.c=c #Confidence level
    def brownian_motion(self, num_sim, pred_days):  # Main function to run the MCS
last_price = self.S #Collect the start value from init function
# There is a need of a dataframe to create the plots
simulation_df = pd.DataFrame()
        # Loop over the number of required simulations
        for x in range(num_sim):
            # Reset the counter so each path starts from the initial value
count = 0
#Create empty list
prices = []
#Append the start value to the list at position 0
prices.append(last_price)
for i in range(pred_days): #The actual MCS
if count == 251: #Restricting the simulation to one year
break
shock = ((self.mu-0.5*self.sigma**2)+self.sigma*np.random.normal())
# The brownian motion
price = prices[count] * np.exp(shock)
#Calculate price after shock
prices.append(price)
#Append price to the list
count += 1 #next count
simulation_df[x] = prices
#When loop is done, add the prices to the data frame
self.simulation_df = simulation_df
self.predicted_days = pred_days
def plot(self):
pred_days = self.predicted_days
simulation_df = self.simulation_df
last_price = self.S
#recall values
plot_line = plt.figure() #call lot function
style.use('bmh') #set style
title = "Monte Carlo Simulation: " + str(pred_days) + " Days"
plt.plot(simulation_df) #plot all the price paths
plot_line.suptitle(title,fontsize=18, fontweight='bold')
plt.xlabel('Day')
plt.ylabel('Price ($USD)')
plt.grid(True,color='grey')
plt.axhline(y=last_price, color='r', linestyle='-')
#ˆCreate right font
plt.show()
    def VaR(self):  # function to calculate VaR
        simulation_df = self.simulation_df
        # Recall the data frame of simulated price paths
        price_array = simulation_df.iloc[-1, :]  # final simulated prices
        price_array = sorted(price_array)  # sort final prices
        percentile = np.percentile(price_array, (1 - self.c) * 100)
        # Use the percentile function to find the percentile at
        # the given confidence level
        Value_at_Risk = 1 - (percentile / self.S)
        # Calculate the actual value at risk (as a fraction of the start value)
        print("Value at Risk: ", Value_at_Risk)
'''#fit = stats.norm.pdf(price_array, np.mean(price_array), np.std(price_array))
#plt.plot(price_array,fit,'-o')
        plt.hist(price_array, density=True)
plt.xlabel('Price')
plt.ylabel('Probability')
plt.title(r'Histogram of Speculated Stock Prices', fontsize=18, fontweight='bold')
plt.legend(loc="upper right")
plt.show()'''
if __name__== "__main__":
S = 100000000
c = 0.99
mu = 0.00024
sigma = 0.02
sim = monte_carlo(S,mu,sigma,c)
sim.brownian_motion(1000, 10)
#sim.plot()
sim.VaR()
#sim.key_stats()
#symbols = ['AAPL', 'KO', 'HD', 'PM']
#weights = [1000,1000,2000,3000]
#sim.get_portfolio(symbols, weights)
#sim.get_asset('AAPL')
Can someone help me with this implementation?
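On the GARCH part of the question: the simulation above uses a constant sigma, and one way to get a conditional (rolling) volatility estimate to plug in instead is the arch package. A minimal sketch, assuming returns is a pandas Series of daily returns (the variable names here are illustrative):

import numpy as np
from arch import arch_model

# arch prefers returns scaled to roughly percent units for numerical stability
am = arch_model(returns * 100, vol='Garch', p=1, q=1)
res = am.fit(disp='off')
cond_vol = res.conditional_volatility / 100   # back to decimal daily volatility

# e.g. feed the latest estimate into the simulation instead of a fixed sigma
sim = monte_carlo(S=100000000, mu=returns.mean(), sigma=cond_vol.iloc[-1], c=0.99)
sim.brownian_motion(1000, 10)
sim.VaR()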
I am a Python beginner and wrote a function for a simple moving-average strategy. I created a portfolio DataFrame inside the function and now I want to use this DataFrame outside of the function for plotting some graphs. My attempted solution is return portfolio, but this does not work. Can anybody help me?
This is my code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Import a data source - FSE-Data with Index 'Date'
all_close_prices = pd.read_csv('FSE_daily_close.csv')
all_close_prices = all_close_prices.set_index('Date')
# Fill NaN Values with the last available stock price - except for Zalando
all_close_prices = all_close_prices.ffill()  # fillna(method='ffill') is deprecated in newer pandas
# Import ticker symbols
ticker_list = list(all_close_prices)
# Zalando 'FSE/ZO1_X' (position row 99) - doesn't begin in 2004
# Drop Zalando
all_close_prices = all_close_prices.drop('FSE/ZO1_X', axis=1)  # drop() returns a new DataFrame, so assign it back
# Also from the ticker list
ticker_list.remove('FSE/ZO1_X')
# Create an empty signal dataframe with datetime index equivalent to the stocks
signals = pd.DataFrame(index=all_close_prices.index)
def ma_strategy(ticker, long_window, short_window):
    # Calculate the moving averages
    moving_avg_long = all_close_prices.rolling(window=long_window, min_periods=1).mean()
    moving_avg_short = all_close_prices.rolling(window=short_window, min_periods=1).mean()
# Add the two MAs for the stocks in the ticker_list to the signals dataframe
for i in ticker_list:
signals['moving_avg_short_' + i] = moving_avg_short[i]
signals['moving_avg_long_' + i] = moving_avg_long[i]
# Set up the signals
for i in ticker_list:
signals['signal_' + i] = np.where(signals['moving_avg_short_' + i] > signals['moving_avg_long_' + i], 1, 0)
signals['positions_' + i] = signals['signal_' + i].diff(periods=1)
#Backtest
initial_capital = float(100000)
    # Create a DataFrame `positions` with the same index as signals
    positions = pd.DataFrame(index=signals.index)
    # Create a new column in the positions DataFrame
    # On days when the signal is 1 (short moving average crosses above the long moving average), buy 100 shares.
    # On days when the signal is 0, the result is 0 (100 * signals['signal'])
    positions = 100 * signals[['signal_' + ticker]]
# Store the portfolio value owned with the stock
# DataFrame.multiply(other, axis='columns', fill_value=None) - Multiplication of dataframe and other, element-wise
# Store the difference in shares owned - same like position column in signals
pos_diff = positions.diff()
# Add `holdings` to portfolio
portfolio = pd.DataFrame(index=all_close_prices.index)
portfolio['holdings'] = (positions.multiply(all_close_prices[ticker], axis=0)).sum(axis=1)
# Add `cash` to portfolio
portfolio['cash'] = initial_capital - (pos_diff.multiply(all_close_prices[ticker], axis=0)).sum(
axis=1).cumsum()
# Add `total` to portfolio
portfolio['total'] = portfolio['cash'] + portfolio['holdings']
# Add `returns` to portfolio
portfolio['return'] = portfolio['total'].pct_change()
portfolio['return_cum'] = portfolio['total'].pct_change().cumsum()
return portfolio
ma_strategy('FSE/VOW3_X',20,5)
# Visualize the total value of the portfolio
portfolio_value = plt.figure(figsize=(12, 8))
ax1 = portfolio_value.add_subplot(1, 1, 1, ylabel='Portfolio value in $')
# Plot the equity curve in dollars
portfolio['total'].plot(ax=ax1, lw=2.)
You need to assign your function return value to a variable. The line which says
ma_strategy('FSE/VOW3_X',20,5)
probably needs to change to
portfolio = ma_strategy('FSE/VOW3_X',20,5)
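With that assignment in place, the plotting block at the end of the question then works on the returned DataFrame, for example:

portfolio = ma_strategy('FSE/VOW3_X', 20, 5)

portfolio_value = plt.figure(figsize=(12, 8))
ax1 = portfolio_value.add_subplot(1, 1, 1, ylabel='Portfolio value in $')
portfolio['total'].plot(ax=ax1, lw=2.)
plt.show()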
I am trying to model a time series in Python (2.7.11) using the excellent statsmodels.tsa package. My data consists of hourly measurements of traffic intensity over several weeks. Thus, the data has multiple seasonal components: days form a 24-hour period; weeks form a 168-hour period.
At this point, the modeling options in statsmodels.tsa are not set up to handle multiple seasonality, as they only allow for the specification of one seasonal factor. However, I came across the work of Rob Hyndman on multiple seasonality in R. He advocates modeling the seasonal components of a time series using Fourier series, including a Fourier series in the model for the frequencies corresponding to each of the seasonal periods.
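As a concrete illustration of that approach (a sketch, not the code from the question): deterministic Fourier regressors for the 24- and 168-hour periods can be generated directly and passed as exogenous variables to SARIMAX; y is assumed to be the hourly series, and fourier_terms is an illustrative helper with K sine/cosine pairs per period:

import numpy as np
import pandas as pd
import statsmodels.api as sm

def fourier_terms(n_obs, period, K):
    # K sine/cosine pairs for one seasonal period (Hyndman-style regressors)
    t = np.arange(n_obs)
    cols = {}
    for k in range(1, K + 1):
        cols['sin_%d_%d' % (period, k)] = np.sin(2 * np.pi * k * t / period)
        cols['cos_%d_%d' % (period, k)] = np.cos(2 * np.pi * k * t / period)
    return pd.DataFrame(cols)

exog = pd.concat([fourier_terms(len(y), 24, 3), fourier_terms(len(y), 168, 3)], axis=1)
exog.index = y.index
model = sm.tsa.statespace.SARIMAX(y, exog=exog, order=(2, 0, 0))
result = model.fit(disp=False)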
I've used Welch's method to obtain the power spectral density of the signal in my observed time series, extracted the peaks in the spectrum which correspond to the frequencies at which I expect my seasonal effects, and used the frequency and amplitude to generate a sine wave pattern corresponding to the seasonal trends I expect in my data. As an aside, I think this allows me to bypass Hyndman's step of selecting the value of k based on the AIC, because I am using the signal inherent in the observed data.
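For reference, a compact version of that extraction step with scipy.signal.welch and a simple argsort over the spectrum; x is assumed to hold the hourly observations as a NumPy array:

import numpy as np
from scipy.signal import welch

fs = 1.0 / 3600                       # hourly sampling
f, Pxx = welch(x, fs=fs, nperseg=len(x) // 2, scaling='spectrum')
top = np.argsort(Pxx)[-2:]            # indices of the two strongest spectral peaks
peak_freqs = f[top]                   # expected near 1/86400 Hz (24 h) and 1/604800 Hz (168 h)
peak_amps = np.sqrt(Pxx[top])         # amplitude estimates from the spectrum power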
To ensure that the sine waves match the occurrence of the seasonal pattern in the data, I match the peak of both sine wave patterns to the peaks in the observed data by visually selecting a peak within one of the 24-hour periods, and matching the hour of its occurrence to the highest value of the variable representing the sine wave. Prior to this, I have checked that the daily peaks occur at the same hour consistently.
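That visual peak-matching step can also be automated; a small sketch of the same idea, assuming observed and sine_wave are hourly arrays covering at least one 24-hour period (peak_offset is an illustrative helper):

import numpy as np

def peak_offset(observed, sine_wave, period=24):
    # find where each series peaks within one period and return the offset in hours,
    # i.e. how far the sine wave has to be shifted to line up with the data
    obs_peak = int(np.argmax(observed[:period]))
    sine_peak = int(np.argmax(sine_wave[:period]))
    return obs_peak - sine_peak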
So far, so good it seems - plots of the sine waves constructed with the obtained frequencies and amplitudes roughly correspond to the observed data. I then fit an ARIMA(2,0,0) model, including both of the decomposition-based variables as exogenous variables. At this point, I want to test the predictive utility of the model. However, this is where things get complicated.
When I am using ARIMA from the statsmodels package, the estimates I get from fitting the model form a pattern which replicates the sine waves, but with a range of values matching my observation. There is still a lot of variance in the observations which is not explained by the seasonal trends, leading me to believe that somewhere in the model fitting procedure something is not going the way it is supposed to.
Unfortunately, I am not sufficiently well-versed in the art of time series modeling to know if my unexpected results are due to the nature of exogenous variables I am including, statsmodels functionality that I should be using, but am omitting, or wrongful assumptions about the concept of seasonal trends.
Some concrete questions I have are:
is it possible to include multiple seasonal trends (i.e. Fourier- or decomposition-based) in an ARIMA model using statsmodels in python?
could reconstruction of the seasonal trend using sine waves cause difficulties when the sine waves are included as exogenous variables in the model as specified above and in the code below?
why does the model specified in the code below not yield predictions that match the observed data more closely?
Any help is much appreciated!
Best wishes, and thanks in advance,
Evert
p.s.: Sorry if my code sample and data file are overly long - as I am not sure what causes the unexpected results I thought I'd post the whole thing. Also, apologies for not following PEP8 at times - I'm still learning :)
Code sample:
import os
import re
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
from scipy.signal import welch
import operator
# Function which plots rolling mean of data set in order to estimate stationarity
# 'timeseries' = Data to be used for ARIMA modeling
#
def plotmean(timeseries, show=0, path=''):
rolmean = pd.rolling_mean(timeseries, window=12)
rolstd = pd.rolling_std(timeseries, window=12)
fig = plt.figure(figsize=(12, 8))
orig = plt.plot(timeseries, color='blue', label='Observed scores')
mean = plt.plot(rolmean, color='red', label='Rolling mean')
std = plt.plot(rolstd, color='black', label='Rolling SD')
plt.legend(loc='best')
plt.title('Rolling Mean & Standard Deviation')
if show != 0:
plt.show()
if path != '':
plt.savefig(path, format='png', bbox_inches='tight')
plt.clf()
#
# Function to decompose a function over time f(t) into a spectrum of signal amplitude and frequency
# 'dta' = The dataset used
# 'show' = Whether or not to show plot
# 'path' = Where to store plot, if desirable
#
# Output:
# frequency range and spectral density range
#
def runwelch(dta, show, path):
nps = (len(dta) / 2) + 8
nov = nps / 2
fft = nps
fs_temp = .0002778
# Set to 1/3600 because of hourly sampling
f, Pxx_den = welch(dta, fs=fs_temp, nperseg=nps, noverlap=nov, nfft=fft, scaling="spectrum")
plt.plot(f, Pxx_den)
plt.ylim([0.5e-7, 10])
plt.xlabel('frequency [Hz]')
plt.ylabel('PSD [V**2/Hz]')
if show != 0:
plt.show()
if path != '':
plt.savefig(path, format='png', bbox_inches='tight')
plt.clf()
return f, Pxx_den
#
# Function which gets amplitude and frequency of n most important periodical cycles, and provides plot
# to visually inspect if they correspond to expected seasonal components.
# 'freq' = output of Welch decomposition
# 'density' = output of Welch decomposition
# 'n' = desired number of peaks to extract
# 'show' = whether to show plots of corresponding sine functions
def getsines(n_obs, freq, density, n, show):
ftemp = freq
dtemp = density
fstore = []
dstore = []
astore = []
fs_temp = .0002778
# Set to 1/3600 because of hourly sampling
samplespace = n_obs * 3600
for a in range(0, n, 1):
max_index, max_value = max(enumerate(dtemp), key=operator.itemgetter(1))
dstore.append(max_value)
fstore.append(ftemp[max_index])
astore.append(np.sqrt(max_value))
dtemp[max_index] = 0
if show == 1:
for b in range(0, len(fstore), 1):
            sound_sine = sine(fstore[b], samplespace, fs_temp, astore[b])
plt.plot(sound_sine)
plt.show()
plt.clf()
return fstore, astore
def sine(freq, time_interval, rate, amp):
w = 2. * np.pi * freq
t = np.linspace(0, time_interval, time_interval * rate)
y = amp * np.sin(w * t)
return y
#
# Function which adapts the calculated sine waves for the returned sines for k = 1 through k = kmax
# 'dta' = Data set
def buildFterms(dta, fstore, astore):
n = len(fstore)
n_obs = len(dta)
fs_temp = .0002778
# Set to 1/3600 because of hourly sampling
samplespace = n_obs * 3600 + (24 * 3600)
# Add one excess day for later fitting of sine waves to peaks
store = []
for i in range(0, n, 1):
tmp = sine(fstore[i], samplespace, 0.0002778, astore[i])
store.append(tmp)
k_168_store = store[0]
k_24_store = store[1]
k_24 = np.transpose(k_24_store)
k_168 = np.transpose(k_168_store)
k_24 = pd.Series(k_24)
k_168 = pd.Series(k_168)
dta_ind, dta_val = max(enumerate(dta.iloc[120:143]), key=operator.itemgetter(1))
# Visually inspect mean plot, select interval which has clear and representative peak, use to determine index.
k_24_ind, k_24_val = max(enumerate(k_24.iloc[0:23]), key=operator.itemgetter(1))
# peak in sound level at index 1 is matched by peak in sine wave at index 7. Thus, sound level[0] corresponds to\
# sine waves[6]
# print dta_ind, dta_val, k_24_ind, k_24_val
k_24_sel = k_24[6:1014]
k_168_sel = k_168[6:1014]
exog = pd.concat([k_24_sel, k_168_sel], axis=1)
return exog
#
# Function which takes data, makes a plot of the ACF and PACF, and saves the plot, if needed
# 'x' = Time series data, time indexed, over which to plot the ACF and PACF.
# 'show' = Whether or not to show the resulting plot (0 = don't show [default], 1 = show)
# 'path' = A full file path specification indicating whether or not the file should be saved (default = 0, don't save)
# Use output plot to visually interpret necessary parameters p, d, q, and seasonal component for SARIMAX procedure
#
def plotpacf(x, show=0, path=''):
dflength = len(x)
    nlags = int(dflength * .80)  # number of lags must be an integer
fig = plt.figure(figsize=(12, 8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(x.squeeze(), lags=nlags, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(x, lags=nlags, ax=ax2)
if show != 0:
plt.show()
if path != '':
plt.savefig(path, format='png', bbox_inches='tight')
plt.clf()
#
# Function to calculate the Dickey-Fuller test of stationarity
# 'dta' = Time series data, time indexed, over which to test for stationarity using the Dickey-Fuller test.
#
def dftest(dta):
print 'Results of Dickey-Fuller Test:'
dftest = sm.tsa.stattools.adfuller(dta, autolag='AIC')
dfoutput = pd.Series(dftest[0:4], index=['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used'])
for key, value in dftest[4].items():
dfoutput['Critical Value (%s)' % key] = value
if dfoutput[0] < dfoutput[4]:
dfoutput['Stationary'] = 'True'
else:
dfoutput['Stationary'] = 'False'
print dfoutput
#
# Function to difference the time series, in order to determine optimal value of d for ACF and PACF
# 'dta' = Data, time series indexed, to be differenced
# 'd' = Order of differencing to be applied
# 'show' = Whether or not to show the resulting plot (0 = don't show [default], 1 = show)
# 'path' = A full file path specification indicating whether or not the file should be saved (default = 0, don't save)
#
def diffit(dta, d, show, path=''):
templist = []
for i in range(0, (len(dta) - d), 1):
tempval = dta[i] - dta[i + d]
templist.append(tempval)
y = templist[d:len(templist)]
y = pd.Series(y)
plotpacf(y, show, path)
return y
#
# Function to fit the ARIMA model based on parameters obtained from the ACF / PACF plot
# 'dta' = Time series data, time indexed, over which to fit a SARIMAX model.
# 'exog' = Exogenous variables used in ARIMA model
# 'p' = Number of AutoRegressive lags, initially based on the cutoff point of the ACF
# 'd' = Order of differencing based on visual examination of ACF and PACF plots
# 'q' = Number of Moving Average lags, initially based on the cutoff point of the PACF
# 'show' = Whether or not to show the resulting plot (0 = don't show [default], 1 = show)
# 'path' = A full file path specification indicating whether or not the file should be saved (default = 0, don't save)
#
def runARIMA(dta, exogvar, p, d, q, show=0, path=''):
mod = sm.tsa.ARIMA(dta, (p, d, q), exogvar)
results = mod.fit()
resids = results.resid.values
summarised = results.summary()
print summarised
plotpacf(resids, show, path)
return results
#
# Function to use fitted ARIMA for prediction of observed data, compare predicted to observed
# 'dta' = Data used in ARIMA prediction
# 'exog' = Exogenous variables fitted in the model
# 'arima' = Result from correctly fitted ARIMA model, likely on the residuals of a decomposed time series
# 'datrng' = Range of dates used for original time series definition, used for specifying predictions
# 'show' = Whether or not to show the resulting plot (0 = don't show [default], 1 = show)
# 'path' = A full file path specification indicating whether or not the file should be saved (default = 0, don't save)
#
def ARIMAcompare(dta, exogvar, arima, datrng, show=0, path=''):
dflength = len(datrng) - 1
observation = dta
prediction = arima.predict(start=3, end=dflength, exog=exogvar, dynamic=True)
df = pd.concat([prediction, observation], axis=1)
df.columns = ['predicted', 'observed']
plt.plot(prediction)
plt.plot(observation)
if show != 0:
plt.show()
if path != '':
plt.savefig(path, format='png', bbox_inches='tight')
plt.clf()
return df
#
# Function use fitted ARIMA model for predictions
# 'pred_hours' = number of hours we want to predict scores for
# 'firsttime' = last timestamp in observations
# 'df' = data frame containing data on which the ARIMA model was previously fitted
# 'results' = output of the modeling procedure
# 'freq' = Frequency of seasonal cycle that was used in decomposition
# 'decomp' = Output of the time series decomposition step
# 'mark' = Amount of hours included in the graph prior to prediction. Set at as close to 2 weeks as possible.
# 'show' = Whether or not to show the resulting plot (0 = don't show [default], 1 = show)
# 'path' = A full file path specification indicating whether or not the file should be saved (default = 0, don't save)
#
# Output: A dataframe with observed and predicted values. Note that predictions > 5 time units are considered unreliable
# by modeling standards.
#
def pred(pred_hours, k, df, arima, show=0, path=''):
n_obs = len(df.index)
lastdt = df.index[n_obs - 1]
lastdt = lastdt.to_datetime()
datrng = pd.date_range(lastdt, periods=(pred_hours + 1), freq='H')
future = pd.DataFrame(index=datrng, columns=df.columns)
df = pd.concat([df, future])
lendf = len(df.index)
df['predicted'] = arima.predict(start=n_obs, end=lendf, exog=k, dynamic=True)
print df
marked = 2 * pred_hours
df[['predicted', 'observed']].ix[-marked:].plot(figsize=(12, 8))
if show != 0:
plt.show()
if path != '':
plt.savefig(path, format='png', bbox_inches='tight')
plt.clf()
return df[['predicted', 'observed']].ix[-marked:]
dirnow = os.getcwd()
fpath = dirnow + '/sounds_full2.csv'
fhand = open(fpath)
dta = pd.read_csv(fhand, sep=',')
dta_sel = dta.iloc[1248:2256, 2]
#
#
#
# Extract start and end date of measurements from sound data, adding one hour because
# the last hour of the last day is not counted
#
sound_start = dta.iloc[1248, 0]
# The above .iloc value needs to be changed depending on the length of the sound data set being read in.
#
# Establish start date
sound_start = re.sub('-', '/', sound_start)
sound_start = re.sub('_', ' ', sound_start)
sound_start = sound_start + ':00'
sound_start = pd.to_datetime(sound_start, format='%d/%m/%Y %H:%M:%S')
#
# Establish end date
indexer = len(dta.index) - 1
sound_end = dta.iloc[indexer, 0]
sound_end = re.sub('-', '/', sound_end)
sound_end = re.sub('_', ' ', sound_end)
sound_end = sound_end + ':00'
sound_end = pd.to_datetime(sound_end, format='%d/%m/%Y %H:%M:%S')
sound_diff = sound_end - sound_start
#
# Derive number of periods and create data set
num_observed = (sound_diff.days * 24) + ((sound_diff.seconds + 3600) / 3600)
usedates3 = pd.date_range(sound_start, periods=num_observed, freq='H')
usedates3 = pd.Series(usedates3)
usedates3.index = dta_sel.index
timedfreq = pd.concat([usedates3, dta_sel], axis=1)
timedfreq.index = timedfreq.iloc[:, 0]
freqset = pd.Series(timedfreq.iloc[:, 1])
filepath = dirnow + '/Sound_RollingMean.png'
plotmean(freqset, 0, filepath)
# Plotted mean shows recurring (seasonal) trends at periods of 24 hours and 168 hours.
# This means a seasonal model is needed that accounts for both of these influences
# To do so, Fourier series representing the 24- and 168 hour seasonal trends can be added to the ARIMA-model
#
#
#
#
# Check for stationarity of data
#
dftest(freqset)
# Time series can be considered stationary
#
#
#
# Establish frequencies and amplitudes with which to fit ARIMA model
#
# Decompose signal into frequency and amplitude
#
filepath = dirnow + "/Welch.png"
f, Pxx_den = runwelch(freqset, 0, filepath)
#
# Obtain sine wave parameters, optionally view test plots to check periodicity
freqs, amplitudes = getsines(len(freqset), f, Pxx_den, 2, 0)
#
# Use parameters to build Fourier series for observed data with varying values for k
exog_sel = buildFterms(freqset, freqs, amplitudes)
exog_sel.index = freqset.index
#
# fit ARIMA model, plot ACF and PACF for fitted model, check for effects orders of differencing on residuals
#
filepath = dirnow + '/Sound_resid_ACFPACF.png'
Sound_ARIMA = runARIMA(freqset, exog_sel, 1, 0, 0, show=0, path=filepath)
sound_residuals = Sound_ARIMA.resid
#
# Plot various acf / pacf plots of differencing given model residuals
filepath = dirnow + '/Sound_resid_ACFPACF_d1.png'
tempdta_d1 = diffit(sound_residuals, 1, 0, filepath)
filepath = dirnow + '/Sound_resid_ACFPACF_d2.png'
tempdta_d2 = diffit(sound_residuals, 2, 0, filepath)
# Of the two differenced models, one order of differencing seems to yield the best results
# Visual inspection of plots and model output suggests model with p = 2, d = 0 or p = 1, d = 1 to be optimal.
#
#
#
# Find optimal form of model
filepath = dirnow + '/Sound_resid_ACFPACF_200.png'
Sound_ARIMA_200 = runARIMA(freqset, exog_sel, 2, 0, 0, show=0, path=filepath)
filepath = dirnow + '/Sound_resid_ACFPACF_110.png'
Sound_ARIMA_110 = runARIMA(freqset, exog_sel, 1, 1, 0, show=0, path=filepath)
# Based on model output and ACF / PACF plot comparison for 'Sound_resid_ACFPACF_110.png' and \
# 'Sound_resid_ACFPACF_200.png', the model parameters for p = 2, d = 0, q = 0 are closer to optimal.
#
# Use selected model to predict observed values
filepath = dirnow + '/Sound_PredictObserved.png'
sound_comparison = ARIMAcompare(freqset, exog_sel, Sound_ARIMA_200, usedates3, 0, filepath)
#
# Predict values and store for Sound dataset
filepath = dirnow + '/Sound_PredictFuture.png'
sound_storepred = pred(168, exog_sel.iloc[0:170, :], sound_comparison, Sound_ARIMA_200, 0, filepath)
Data file