Python Pandas code takes over a minute to process the data - python

I have two issues with the below code:
(a) It takes over a minute to process a record
(b) I receive a Warning [A value is trying to be set on a copy of a slice from a DataFrame. See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy]
# Question's original per-stock RSI computation (the "..." lines are elided in the post).
# NOTE(review): every write below is chained indexing (df[col][x] = ...), the pattern
# the SettingWithCopyWarning quoted in the question warns about.
for stock in stocklist:
...
...
# Working columns, pre-filled with NaN so single cells can be assigned later.
df['Up Move'] = np.nan
df['Down Move'] = np.nan
df['Average Up'] = np.nan
df['Average Down'] = np.nan
df['RS'] = np.nan
df['RSI'] = np.nan
# Split each close-to-close change into an up move and a (positive) down move.
for x in range(1, len(df)):
df['Up Move'][x] = 0
df['Down Move'][x] = 0
if df['Close'][x] > df['Close'][x-1]:
df['Up Move'][x] = df['Close'][x] - df['Close'][x-1]
if df['Close'][x] < df['Close'][x-1]:
df['Down Move'][x] = abs(df['Close'][x] - df['Close'][x-1])
#Calculate initial Average Up & Down, RS and RSI
# Seed values at row 14: plain mean of the moves in rows 1..14.
df['Average Up'][14] = df['Up Move'][1:15].mean()
df['Average Down'][14] = df['Down Move'][1:15].mean()
df['RS'][14] = df['Average Up'][14] / df['Average Down'][14]
df['RSI'][14] = 100 - (100/(1+df['RS'][14]))
#Calculate rest of Average Up, Average Down, RS, RSI
# Recursive smoothing: avg[x] = (avg[x-1] * 13 + move[x]) / 14.
for x in range(15, len(df)):
df['Average Up'][x] = (df['Average Up'][x-1]*13+df['Up Move'][x])/14
df['Average Down'][x] = (df['Average Down'][x-1]*13+df['Down Move'][x])/14
df['RS'][x] = df['Average Up'][x] / df['Average Down'][x]
df['RSI'][x] = 100 - (100/(1+df['RS'][x]))
# Remove the helper columns in place; RS and RSI stay on the frame.
df.drop(['Up Move', 'Down Move', 'Average Up', 'Average Down'], axis = 1, inplace = True)

Try to avoid using a for loop to process a DataFrame.
The code:
#Calculate rest of Average Up, Average Down, RS, RSI
# Each row depends on the previous row's average:
# avg[x] = (avg[x-1] * 13 + move[x]) / 14, starting from the seed stored at row 14.
for x in range(15, len(df)):
df['Average Up'][x] = (df['Average Up'][x-1]*13+df['Up Move'][x])/14
df['Average Down'][x] = (df['Average Down'][x-1]*13+df['Down Move'][x])/14
df['RS'][x] = df['Average Up'][x] / df['Average Down'][x]
df['RSI'][x] = 100 - (100/(1+df['RS'][x]))
may be rewritten as:
# The averages are recursive (each row needs the previous row's result), so they
# cannot be expressed with shift(). An exponentially weighted mean with
# alpha = 1/14 and adjust=False computes exactly
#     avg[x] = (avg[x-1] * 13 + move[x]) / 14
# once the series is seeded with the initial average stored at row 14.
# Assumes df has the default integer index, as the loop version does.
for move_col, avg_col in (('Up Move', 'Average Up'), ('Down Move', 'Average Down')):
    seeded = df.loc[14:, move_col].copy()
    seeded.loc[14] = df.loc[14, avg_col]  # first value becomes the recursion's seed
    df.loc[14:, avg_col] = seeded.ewm(alpha=1 / 14, adjust=False).mean()
df.loc[14:, 'RS'] = df.loc[14:, 'Average Up'] / df.loc[14:, 'Average Down']
df.loc[14:, 'RSI'] = 100 - (100 / (1 + df.loc[14:, 'RS']))

Just found the code that gives me the same right result in a couple of seconds. The earlier code would take about one minute to three minutes.
# Compute RSI
def computeRSI(data, time_window):
    """Return the Relative Strength Index of a price series.

    Args:
        data: pandas Series of prices.
        time_window: look-back length. It is passed to ``ewm`` as
            ``com=time_window - 1`` (decay ``alpha = 1 / time_window``) and as
            ``min_periods``, so the first ``time_window - 1`` results are NaN.

    Returns:
        A Series of RSI values indexed like the one-step differences of
        ``data`` with NaN differences dropped (so the first row of ``data``
        is absent).
    """
    # One-step price change; the leading NaN is dropped.
    diff = data.diff(1).dropna()
    # Gains keep the positive changes, losses the magnitude of the negative
    # ones; everything else is zero, so both keep the shape of diff.
    up_chg = diff.clip(lower=0)
    down_chg = diff.clip(upper=0).abs()
    # See the pandas documentation for ewm:
    # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.ewm.html
    up_chg_avg = up_chg.ewm(com=time_window - 1, min_periods=time_window).mean()
    down_chg_avg = down_chg.ewm(com=time_window - 1, min_periods=time_window).mean()
    rs = up_chg_avg / down_chg_avg
    rsi = 100 - 100 / (1 + rs)
    return rsi
# Calculate the RSI values using our function and add them to the dataframe.
# The 14 is the look-back window passed as computeRSI's time_window.
df['RSI'] = computeRSI(df['Adj Close'], 14)
# Quick visual check of the first and last rows.
print(df.head())
print(df.tail())

Related

Efficient summation without looping

I need to calculate entries for two long series consisting of sums. Currently I'm looping, which is rather inefficient. I was wondering whether there exists a pandas method to exploit. It follows an MWE.
import numpy as np
import pandas as pd
# Question's reference implementation: both sums are computed with explicit loops
# that re-sum a slice of df for every time step.
T = range(1, 8761) # year in hours
T0 = range(6, 8761 + 24, 24) # time instants at which the storage is empty
e2p = 2 # energy-to-power; maximal duration [h] such that storage can be emptied
# Input data:
# on: maximal storage inflow
# off: maximal storage outflow
df = pd.DataFrame(
np.random.randint(0, 100, size=(8760, 2)),
index=T,
columns=['on', 'off']
)
# Output
# sum1: maximal storage level for storage with fixed instants T0
# sum2: maximal storage level for e2p storage
df['sum1'] = np.nan
df['sum2'] = np.nan
# Summation 1
# Everything that flows in within s in [t, t + 23] for t in T0
# must flow out until s = t + 24
# For each s in the block [t0, t]: inflow summed over t0..s, outflow summed over s+1..t.
for t in T0:
t0 = max(1, t - 23)
t = min(t, 8760)
for s in range(t0, t + 1):
sum_on = df.loc[t0: s, 'on'].sum()
sum_off = df.loc[s + 1: t, 'off'].sum()
df.loc[s, 'sum1'] = min(sum_on, sum_off)
# Summation 2
# Everything that flowed in until t in T, *can* flow out until t+e2p
# Inflow window is the last e2p hours up to t; outflow window is the next e2p hours.
for t in T:
t0 = max(1, t - e2p + 1)
tf = min(t + e2p, 8760)
sum_on = df.loc[t0: t, 'on'].sum()
sum_off = df.loc[t + 1: tf, 'off'].sum()
df.loc[t, 'sum2'] = min(sum_on, sum_off)
Solution Sum 1
# Vectorised "Summation 1": one cumulative sum per block [t0, t] instead of
# re-summing a slice for every s inside the block.
for t in T0:
    t0 = max(1, t - 23)  # block start, same bounds as the loop version
    t = min(t, 8760)
    # Inflow accumulated from t0 up to and including each s.
    sum_on = df.loc[t0: t, 'on'].cumsum()
    # Outflow over s+1..t: cumulative sum taken from the end of the block,
    # then shifted by one row so row s excludes its own 'off' value.
    sum_off = (df.loc[t0: t, 'off']
               .iloc[::-1]
               .cumsum()
               .iloc[::-1]
               .shift(periods=-1, fill_value=0))
    df.loc[t0: t, 'sum1'] = pd.concat([sum_on, sum_off], axis=1).min(axis=1)
Solution Sum 2
# Inflow over the trailing window [t - e2p + 1, t].
sum_on = df['on'].rolling(window=e2p, min_periods=1).sum()
# Outflow over the leading window [t + 1, t + e2p]: move off[t + 1] onto row t,
# take a trailing rolling sum on the reversed series, then flip it back.
off_next = df['off'].shift(periods=-1, fill_value=0)
off_next_reversed = off_next.iloc[::-1]
sum_off = off_next_reversed.rolling(window=e2p, min_periods=1).sum().iloc[::-1]
# Row-wise minimum of the two sums.
df['sum2'] = pd.concat([sum_on, sum_off], axis=1).min(axis=1)

efficient frontier/stock analyze

Consider the following task. Using a 10-year period, I should calculate the portfolio weights in January and then use these weights in February to calculate the portfolio return and standard deviation. The program should then continue by calculating the weights in February and using those weights to calculate the portfolio return and standard deviation in March. This should be done through all the 131 months in the data, meaning that for the first month of the dataset I should only calculate the weights.
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import yfinance as yf
# Question's script: monthly portfolio weights, returns and volatility for three tickers.
# NOTE(review): indentation was lost in this listing, so which statements sit inside
# which loop cannot be determined from here; the notes below are hedged accordingly.
p_ret = [] # Define an empty array for portfolio returns
p_vol = [] # Define an empty array for portfolio volatility
tickers = ['AAPL', 'AMZN', 'XOM']
start_date = datetime.date(2010, 1, 2)
end_date = datetime.date(2020, 12, 31)
daily_data = yf.download(tickers, start=start_date, end=end_date) # define the dataset
daily_data = daily_data['Adj Close'].dropna()
Vector_of_ones = np.array([1,1,1])
# One DataFrame of daily prices per calendar month.
frames = [v for _, v in daily_data.groupby(pd.Grouper(freq='M'))]
rf = 0.01 # risk free asset
weights = []
df = pd.DataFrame(columns=tickers)
# Per month: summed daily returns minus the risk-free rate, stored as one row of df.
for w in frames:
#corr_matrix = w.pct_change().apply(lambda x: np.log(1 + x)).corr()
mu = (w.resample('D').last().pct_change().sum())
individual_asset_return = np.subtract(np.transpose(mu), np.dot(Vector_of_ones,rf))
# individual_asset_return = daily_data.pct_change().mean() # finds the mean
df.loc[+1] = [individual_asset_return[tickers[0]], individual_asset_return[tickers[1]],
individual_asset_return[tickers[2]]]
df.index = df.index - 1
df = df.sort_index()
# Weights per stored row: inv(cov) @ excess_return, normalised to sum to one.
# NOTE(review): if this loop runs after the loop over frames, w is the last month for
# every row, so every row would share one covariance matrix - confirm intended nesting.
for d in range(len(df)):
cov_matrix = w.pct_change().apply(lambda x: np.log(1 + x)).cov()
liste = df.iloc[d].tolist()
a = np.dot(np.linalg.inv(cov_matrix), np.transpose(np.array(liste)))
omega_weights = a / (np.dot(np.transpose(Vector_of_ones), a)) # expression to find weights
weights.append(omega_weights)
for afkast in frames[1:]: #loop to find the portfolio returns and standard deviation
cov_matrix1 = afkast.pct_change().apply(lambda x: np.log(1 + x)).cov()
#corr_matrix1 = afkast.pct_change().apply(lambda x: np.log(1 + x)).corr()
df1 = df.iloc[1:, :]
# NOTE(review): liste1 and df1 are assigned but not read in the lines shown, and
# omega_weights / mu below are whatever the earlier loops left behind - verify that
# the per-month weights in `weights` were meant to be used here instead.
for d1 in range(len(df)):
liste1 = df.iloc[d1].tolist()
portfolio_return = np.dot(np.transpose(omega_weights),
mu)
p_ret.append(portfolio_return)
volatility_portfolio = np.sqrt(np.dot(np.transpose(omega_weights), np.dot(cov_matrix1, omega_weights)))
p_vol.append(volatility_portfolio)
data = {'Returns': p_ret, 'Volatility': p_vol}
# One "<ticker> weight" column per asset, read from the collected weight vectors.
for counter, symbol in enumerate(afkast.columns.tolist()):
# print(counter, symbol)
data[symbol + ' weight'] = [w[counter] for w in weights]
portfolios = pd.DataFrame(data) # builds the dataframe; original note says it is sorted with the lowest volatility on top, but no sort is applied here
portfolios['Date'] = pd.date_range(start=start_date, periods=len(portfolios), freq='M')
portfolios.plot(x='Date', y='Returns', kind='line')
# portfolios.plot(x = 'Date', y = 'Volatility', kind = 'line')
plt.show()
print(portfolios.head())
As you can probably see, I’m not an advanced coder, but I hope I could get some help finding where my code goes wrong, if anything is wrong.
I really appreciate any help you can provide.

Index Error when performing simple calculation

Goal is to pull stock tickers from Wikipedia, use yfinance to grab the historical closing prices, and perform the Relative Strength Index (RSI) calculation for each of the tickers listed in the S&P500. Once this is accomplished, the tickers will be grouped into a 'buy', 'sell', or 'donothing' category based on their respective calculated RSI values. However, I am receiving an Index Error that refers to [ if RSI[245]>=30 and RSI[245-10]<30: ] claiming 245 is out of bounds for axis 0 with size 126, with size 126 being the number of days of closing prices collected.
import yfinance as yf
import pandas as pd
# Question's script: RSI per S&P 500 ticker, then sort tickers into buy / sell / donothing.
# Read and print the stock tickers that make up S&P500
tickers = pd.read_html(
'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]
print(tickers.head())
# Get the data for this tickers from yahoo finance
df = yf.download(tickers.Symbol.to_list(),'2021-9-24','2022-3-24', auto_adjust=True)['Close']
# NOTE(review): no variable named data is defined in the lines shown; the frame is df.
print(data.head())
#-------------------------------------------------------
df2 = df
#print(df2)
total_columns = df2.shape[1]
#print(total_columns)
print(df2)
#-------------------------------------------------------
buy = []
sell = []
donothing = []
for i in range(total_columns):
#company_name = header
# NOTE(review): selecting one column with iloc[:, i] yields a Series, so the
# df_RSI['...'] = ... assignments below add entries to that Series rather than
# columns to a frame.
df_RSI = df2.iloc[:,i]
#print(df_RSI)
df_RSI['diff']=df_RSI.diff(1)
#print(df_RSI['diff'])
# Calculate Avg. Gains/Losses
df_RSI['gain'] = df_RSI['diff'].clip(lower=0).round(2)
df_RSI['loss'] = df_RSI['diff'].clip(upper=0).abs().round(2)
#print(df_RSI['gain'])
window_length = 14
df_RSI['avg_gain'] = df_RSI['gain'].rolling(window=window_length,min_periods=window_length).mean()
#print(df_RSI['avg_gain'][:30]) #yay working!
df_RSI['avg_loss'] = df_RSI['loss'].rolling(window=window_length, min_periods=window_length).mean()
#print(df_RSI['avg_loss'][:30]) #yay working!
#print(df_RSI.name) #prints out the tickers wooooo
# Get WMS averages
# Average Gains
# From position window_length + 1 on, each average is
# (previous average * (window_length - 1) + current gain) / window_length.
for k, row in enumerate(df_RSI['avg_gain'].iloc[window_length+1:]):
df_RSI['avg_gain'].iloc[k + window_length + 1] =\
(df_RSI['avg_gain'].iloc[k + window_length] *
(window_length - 1) +
df_RSI['gain'].iloc[k + window_length + 1])\
/ window_length
# Average Losses
# Same recursion as above, applied to the losses.
for j, row in enumerate(df_RSI['avg_loss'].iloc[window_length+1:]):
df_RSI['avg_loss'].iloc[j + window_length + 1] =\
(df_RSI['avg_loss'].iloc[j + window_length] *
(window_length - 1) +
df_RSI['loss'].iloc[j + window_length + 1])\
/ window_length
# View initial results
#print(df_RSI[:,window_length-1:window_length+5])
# Calculate RS Values
df_RSI['rs'] = df_RSI['avg_gain'] / df_RSI['avg_loss']
#print(df_RSI['rs'][:30])
#print(df_RSI['rs'][30:]) #yay working!
# Calculate RSI
df_RSI['rsi'] = 100 - (100 / (1.0 + df_RSI['rs']))
# View Result RSI
RSI = (df_RSI['rsi'])
#print(RSI[200:])
#print(RSI[199])
# NOTE(review): positions 245/240/235 are hard-coded; the question reports only 126
# rows of prices, which is the reported out-of-bounds IndexError.
if RSI[245]>=30 and RSI[245-10]<30:
buy.append(df_RSI.name)
elif RSI[240]<=70 and RSI[245-10]>70:
sell.append(df_RSI.name)
else:
donothing.append(df_RSI.name)
print(buy)
print(sell)
print(donothing)
I have made a few changes to your code so that it works (I received an error because you didn't set df_RSI to a dataframe initially, and I changed your if statements at the end - explained below code).
import yfinance as yf
import pandas as pd
# RSI per S&P 500 ticker, then sort tickers into buy / sell / donothing.
# Read and print the stock tickers that make up S&P500
tickers = pd.read_html(
    'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]
print(tickers.head())
# Get the data for this tickers from yahoo finance
df = yf.download(tickers.Symbol.to_list(), '2021-9-24', '2022-3-24', auto_adjust=True)['Close']
print(df.head())
#-------------------------------------------------------
df2 = df
total_columns = df2.shape[1]
print(df2)
#-------------------------------------------------------
buy = []
sell = []
donothing = []
window_length = 14
for i in range(total_columns):
    # One ticker's closes as a single-column frame, so helper columns can be added.
    df_RSI = df2.iloc[:, i].to_frame()
    df_RSI['diff'] = df_RSI.iloc[:, 0].diff(1)
    # Calculate Avg. Gains/Losses
    df_RSI['gain'] = df_RSI['diff'].clip(lower=0).round(2)
    df_RSI['loss'] = df_RSI['diff'].clip(upper=0).abs().round(2)
    df_RSI['avg_gain'] = df_RSI['gain'].rolling(window=window_length, min_periods=window_length).mean()
    df_RSI['avg_loss'] = df_RSI['loss'].rolling(window=window_length, min_periods=window_length).mean()
    # Get WMS averages: from position window_length + 1 on, each average is
    # (previous average * (window_length - 1) + current move) / window_length.
    # The recursion runs on plain arrays and is written back as a whole column;
    # assigning through df_RSI[col].iloc[...] is chained indexing and is not
    # guaranteed to modify df_RSI.
    for move_col, avg_col in (('gain', 'avg_gain'), ('loss', 'avg_loss')):
        moves = df_RSI[move_col].to_numpy()
        averages = df_RSI[avg_col].to_numpy(copy=True)
        for pos in range(window_length + 1, len(averages)):
            averages[pos] = (averages[pos - 1] * (window_length - 1) + moves[pos]) / window_length
        df_RSI[avg_col] = averages
    # Calculate RS Values
    df_RSI['rs'] = df_RSI['avg_gain'] / df_RSI['avg_loss']
    # Calculate RSI
    df_RSI['rsi'] = 100 - (100 / (1.0 + df_RSI['rs']))
    RSI = df_RSI['rsi']
    # Compare the last row with the row 10 before it. .iloc is used because the
    # Series is indexed by date, so RSI[-1] would be a label lookup, not "last row".
    latest = RSI.iloc[-1]
    ten_rows_earlier = RSI.iloc[-1 - 10]
    if latest >= 30 and ten_rows_earlier < 30:
        buy.append(df2.columns[i])
    elif latest <= 70 and ten_rows_earlier > 70:
        sell.append(df2.columns[i])
    else:
        donothing.append(df2.columns[i])
print(buy)
print(sell)
print(donothing)
The changes to your if statement:
df_RSI.name was changed to df2.columns[i], as I assume you wanted to append the ticker to buy, sell or donothing (and I changed df_RSI to a dataframe at the start of the for loop, so that you were appending columns in the rest of the loop, rather than additional rows to a series - and changing to_frame means that there is no longer a name for df_RSI).
The if statement itself was changed, to if RSI[-1]>=30.... This is because you are currently trying to find the index values: 245, 235 (in if), 240 and 235 (in elif), but the length of the series RSI is only 126 (so, because the index starts at 0, the very last row is RSI[125]). I changed this to be the last row RSI[-1], and the row 10 before that. Let me know if this is not what you were looking for, and I can change it (if you need more help with it).

Getting RSI in python

I've been trying to calculate the 14 RSI of stocks and I managed to get it to work, somewhat, it gives me inaccurate numbers
import pandas as pd
import datetime as dt
import pandas_datareader as web
# Question's script: 14-day RSI for one ticker from Yahoo data.
ticker = 'TSLA'
start = dt.datetime(2018, 1, 1)
end = dt.datetime.now()
data = web.DataReader(ticker, 'yahoo', start, end)
# Day-over-day change of the adjusted close; the leading NaN is dropped.
delta = data['Adj Close'].diff(1)
delta.dropna(inplace=True)
# positive keeps gains (losses zeroed); negative keeps losses (gains zeroed).
positive = delta.copy()
negative = delta.copy()
positive[positive < 0] = 0
negative[negative > 0] = 0
days = 14
# Plain rolling means over the last `days` changes.
# NOTE(review): the mismatch with the reference value reported below may come from
# the reference using a recursive (Wilder-style) average instead - confirm.
average_gain = positive.rolling(window=days).mean()
average_loss = abs(negative.rolling(window=days).mean())
relative_strenght = average_gain / average_loss
rsi = 100.0 - (100.0 / (1.0 + relative_strenght))
# Prints the ticker followed by the whole RSI series.
print(ticker + str(rsi))
It ends up giving me 77.991564 (14-day RSI) when I should be getting 70.13 (14-day RSI). Does anyone know what I'm doing wrong?
Also, yes, I've read "Calculating RSI in Python", but it doesn't help me with what I need.
Here is one way to calculate the RSI yourself. The code could be optimized, but I prefer to keep it easy to understand and then let you optimize it.
For the example, we assume that you've got a DataFrame called df, with a column called 'Close', for the close prices. By the way, notice that if you compare results of the RSI with a station, for example, you should be sure that you compare the same values. For example, if in the station, you've got the bid close, and that you calculate by your own on the mid or the ask, it will not be the same result.
Let's see the code :
def rsi(df, _window=14, _plot=0, _start=None, _end=None, _ticker=None):
    """Add a simple-moving-average RSI column to a copy of *df*.

    Args:
        df: DataFrame with a column 'Close' holding the close prices.
        _window: the look-back window (default 14).
        _plot: 1 to plot the close and the RSI (default 0).
        _start: if _plot=1, start of the plotted range; None means no lower bound.
        _end: if _plot=1, end of the plotted range; None means no upper bound.
        _ticker: optional symbol used in the plot title; when given and df has
            a 'Symbol' column, only rows for that symbol are plotted.

    Returns:
        A new DataFrame with the original columns plus 'RSI_<_window>'.
        The input frame is not modified.
    """
    rsi_col = 'RSI_' + str(_window)
    # Work on a copy so the caller's frame does not change.
    df = df.copy()
    ##### Difference between the previous close and the current one
    diff = df['Close'].diff()
    ##### 'Up' keeps the positive differences, 'Down' the size of the negative ones
    up = diff.clip(lower=0)
    down = diff.clip(upper=0).abs()
    ##### Moving average on Up & Down
    avg_up = up.rolling(window=_window).mean()
    avg_down = down.rolling(window=_window).mean()
    ##### RS is the ratio of the means of Up & Down
    rs = avg_up / avg_down
    ##### RSI Calculation: 100 - (100/(1 + RS))
    df[rsi_col] = 100 - (100 / (1 + rs))
    ##### If asked, plot it!
    if _plot == 1:
        # Plotting libraries are only needed on this path.
        import matplotlib.pyplot as plt
        import seaborn as sns

        shown = df
        if _start is not None:
            shown = shown[shown.index >= _start]
        if _end is not None:
            shown = shown[shown.index <= _end]
        if _ticker is not None and 'Symbol' in shown.columns:
            shown = shown[shown.Symbol == _ticker.replace('/', '')]
        title_prefix = _ticker + " " if _ticker is not None else ""

        sns.set()
        fig = plt.figure(facecolor='white', figsize=(30, 5))
        ax0 = plt.subplot2grid((6, 4), (1, 0), rowspan=4, colspan=4)
        ax0.plot(shown['Close'])
        ax0.set_facecolor('ghostwhite')
        ax0.legend(['Close'], ncol=3, loc='upper left', fontsize=15)
        plt.title(title_prefix + "Close from " + str(_start) + ' to ' + str(_end), fontsize=20)
        ax1 = plt.subplot2grid((6, 4), (5, 0), rowspan=1, colspan=4, sharex=ax0)
        ax1.plot(shown[rsi_col], color='blue')
        ax1.legend([rsi_col], ncol=3, loc='upper left', fontsize=12)
        ax1.set_facecolor('silver')
        plt.subplots_adjust(left=.09, bottom=.09, right=1, top=.95, wspace=.20, hspace=0)
        plt.show()
    return df
To call the function, you just have to type
df = rsi(df)
if you keep it with default values, or to change _window and/or _plot for the arg.
Notice that if you input _plot=1, you'll need to feed starting and ending of the plot, with a string or a date time.

Fetching date of high and low prices for week based on daily high low prices

First of all I will share objective of running python code.
Getting Daily High and Low Prices for a stock from Yahoo.
Converting the daily high and lows to Weekly High/Lows, monthly High Lows, Yearly High Lows.
Getting exact dates of Weekly or Monthly High Lows from a daily dataframe
Finally after fetching Dates for Weekly(or Monthly)High & lows, I want to arrange the data of what occured first High or Low during the week. for eg. during week ending 12th December, 2020, I get High of the week is 100 and low of week is 97(after completing step 2) and also High date and low date from daily dataframe (from step 3), I want to arrange Prices in order of occurence. so if High happened on 9th December and Low happened on 12th December. The prices will be arranged as 100 in row 1 and then 97 in row 2 and this process repeats for entire data frame.
What I have been able to achieve.
I have completed step 1 and step 2. I am struggling with step 3 as of now.
Have accomplished Step 1 by
import pandas as pd
import yfinance as yf
Ticker = '^NSEI'
# Download the full daily history and discard every price column except High and Low.
f = yf.download(Ticker, period="max").drop(
    ['Adj Close', 'Open', 'Close', 'Volume'], axis=1)
# Turn the Date index into a regular column and tag every row with the ticker.
f = f.reset_index()
f.insert(0, 'Ticker', Ticker)
Step 2 by
def _high_low_by(freq):
    """Aggregate the daily frame f to the max High / min Low per ticker and period."""
    per_period = f.groupby(['Ticker', pd.Grouper(key='Date', freq=freq)])
    summary = per_period.agg(
        High=pd.NamedAgg(column='High', aggfunc='max'),
        Low=pd.NamedAgg(column='Low', aggfunc='min'),
    )
    return summary.reset_index()


# Weekly, monthly, quarterly and yearly High/Low tables.
fw = _high_low_by('W')
fm = _high_low_by('M')
fq = _high_low_by('Q')
fy = _high_low_by('Y')
I am struggling with step 3. I used pd.merge, pd.join and pd.concat, but I was unable to combine the weekly dataframe with the dataframe of highs and lows. The number of weekly records increases when performing the merge, and drop_duplicates also didn't work properly when specifying keep='last'.
So if you all can help me in step 3 and 4 would be grateful. Thanks
Solved the query which i posted above. Hope this help others. Thanks
import pandas as pd
import yfinance as yf
import datetime as dt
import numpy as np
Ticker = '^NSEI'
df = yf.download(Ticker, period='max')
df = df.drop(['Open', 'Close', 'Adj Close', 'Volume'], axis=1).reset_index()
# Period labels for every daily row: week end, month end, half-year and year end.
df['WkEnd'] = df.Date.dt.to_period('W').apply(lambda r: r.start_time) + dt.timedelta(days=6)
df['MEnd'] = (df.Date.dt.to_period('M').apply(lambda r: r.end_time)).dt.date
df['6Mend'] = np.where(df.Date.dt.month <= 6, (df.Date.dt.year).astype(str)+'-1H', (df['Date'].dt.year).astype(str)+'-2H')
df['YEnd'] = (df.Date.dt.to_period('Y').apply(lambda r: r.end_time)).dt.date


def _ordered_extremes(daily, period_col):
    """Return each period's High and Low as rows ordered by the day they occurred.

    Args:
        daily: frame with 'Date', 'High', 'Low' and the column named by period_col.
        period_col: name of the column that labels the period of each daily row.

    Returns:
        Frame with columns [period_col, 'Date', 'Price'], two rows per period
        (its highest High and its lowest Low), sorted by 'Date'.
    """
    # idxmax/idxmin need at least one real value per period.
    daily = daily.dropna(subset=['High', 'Low'])
    by_period = daily.groupby(period_col)
    # Take the row where each extreme first occurs. Looking the date up by
    # merging on (period, price) instead returns several rows when the same
    # price repeats within a period, which misaligns the dates.
    highs = daily.loc[by_period['High'].idxmax(), [period_col, 'Date', 'High']]
    lows = daily.loc[by_period['Low'].idxmin(), [period_col, 'Date', 'Low']]
    stacked = pd.concat([
        highs.rename(columns={'High': 'Price'}),
        lows.rename(columns={'Low': 'Price'}),
    ])
    # Stable sort: when High and Low fall on the same day, High is listed first.
    return stacked.sort_values(by=['Date'], kind='stable').reset_index(drop=True)


# Weekly, monthly, half-yearly and yearly tables in order of occurrence.
dw = _ordered_extremes(df, 'WkEnd')
dm = _ordered_extremes(df, 'MEnd')
d6m = _ordered_extremes(df, '6Mend')
dy = _ordered_extremes(df, 'YEnd')

Categories

Resources