More Efficient For-Loop Calculation? - python

Is there a more efficient way of writing the following? I currently have this set up to calculate using a for-loop, and at this pace it will take a few days to finish running.
I am forecasting demand over a period of 6 years on a weekly basis (52 weeks) broken down by product type (586 types) and zip code (892 unique ZIPs). The rand arrays are the parameter demand shares for each year drawn from a normal distribution and have dimensions [#weeks/#types/#zips x 6]. The demand growth array is the annual demand for each year.
I ultimately need to produce a data frame that has the following:
Year | Week of the Year | Product | Zip Code | Qty
This is what I currently have
# Annual demand, one entry per forecast year.
demand_growth = [10,15,20,23,26,30]
# Demand shares per week / product / ZIP code; one column per year.
# NOTE(review): the question text says these are drawn from a normal
# distribution, but np.random.rand samples uniformly from [0, 1) -- confirm.
rand_week_total = np.random.rand(52,6)
rand_product_total = np.random.rand(586,6)
rand_zipcode_total = np.random.rand(892,6)
# Parallel output lists: entry n of each list describes one forecast row.
forecast_year = []
forecast_week = []
forecast_product = []
forecast_ZIP = []
forecast_qty = []
# NOTE: indentation was lost when this snippet was pasted; each `for` below is
# presumably nested inside the previous one, with the `if` in the innermost loop.
# `years`, `week`, `product` and `zipcode` are not defined in this snippet.
# With the sizes described in the question (6 x 52 x 586 x 892) the innermost
# body would run roughly 163 million times.
for i in range(len(years)):
for j in range(len(week)):
for k in range(len(product)):
for l in range(len(zipcode)):
# Quantity = annual demand x week share x product share x ZIP share, rounded
# to the nearest integer.
a = np.rint(demand_growth[i]*rand_week_total[j,i]*rand_product_total[k,i]*rand_zipcode_total[l,i])
# Only combinations with a non-zero rounded quantity are recorded.
if a !=0:
forecast_year.append(years[i])
forecast_week.append(week[j])
forecast_product.append(product[k])
forecast_ZIP.append(zipcode[l])
forecast_qty.append(a)
'''
Edited: included examples of the arrays being multiplied
Any recommendations would be greatly appreciated!

I think you can do better than this by studying how to use arrays and/or threading. For now, the best I got was about 3x faster. I used smaller array sizes so that the benchmark would not take all night.
import numpy as np
import timeit
def f1(n_products=23, n_zipcodes=43):
    """Baseline variant: recompute the full four-factor product per combination.

    Random week/product/ZIP share tables are drawn with ``np.random.rand`` and
    every (year, week, product, ZIP) combination is visited with four nested
    Python loops. Combinations whose rounded quantity is zero are skipped.

    Args:
        n_products: Number of product types (rows of the product share table).
        n_zipcodes: Number of ZIP codes (rows of the ZIP share table).

    Returns:
        A tuple ``(forecast_year, forecast_week, forecast_product,
        forecast_ZIP, forecast_qty)`` of parallel lists, one entry per
        non-zero forecast row, ordered year -> week -> product -> ZIP.
    """
    demand_growth = np.array([10, 15, 20, 23, 26, 30])
    rand_week_total = np.random.rand(52, 6)
    rand_product_total = np.random.rand(n_products, 6)
    rand_zipcode_total = np.random.rand(n_zipcodes, 6)

    forecast_year = []
    forecast_week = []
    forecast_product = []
    forecast_ZIP = []
    forecast_qty = []

    # NOTE: only five years are iterated although the inputs carry six yearly
    # columns; the last column is never used. Kept as-is so that timings stay
    # comparable with the other variants.
    years = np.arange(2015, 2020)
    weeks = np.arange(52)
    product = np.arange(n_products)
    zipcode = np.arange(n_zipcodes)

    for i, year in enumerate(years):
        for j, week in enumerate(weeks):
            for k, prod in enumerate(product):
                for z, zip_code in enumerate(zipcode):
                    # Deliberately naive: all four factors are multiplied
                    # again for every single combination.
                    a = np.rint(
                        demand_growth[i]
                        * rand_week_total[j, i]
                        * rand_product_total[k, i]
                        * rand_zipcode_total[z, i]
                    )
                    if a != 0:
                        forecast_year.append(year)
                        forecast_week.append(week)
                        forecast_product.append(prod)
                        forecast_ZIP.append(zip_code)
                        forecast_qty.append(a)

    return forecast_year, forecast_week, forecast_product, forecast_ZIP, forecast_qty
def f2(n_products=23, n_zipcodes=43):
    """Variant with partial products hoisted out of the inner loops.

    Same computation as the naive four-loop version, but the
    ``demand * week`` and ``demand * week * product`` factors are computed
    once per outer iteration instead of once per innermost iteration.

    Args:
        n_products: Number of product types (rows of the product share table).
        n_zipcodes: Number of ZIP codes (rows of the ZIP share table).

    Returns:
        A tuple ``(forecast_year, forecast_week, forecast_product,
        forecast_ZIP, forecast_qty)`` of parallel lists, one entry per
        non-zero forecast row, ordered year -> week -> product -> ZIP.
    """
    demand_growth = np.array([10, 15, 20, 23, 26, 30])
    rand_week_total = np.random.rand(52, 6)
    rand_product_total = np.random.rand(n_products, 6)
    rand_zipcode_total = np.random.rand(n_zipcodes, 6)

    forecast_year = []
    forecast_week = []
    forecast_product = []
    forecast_ZIP = []
    forecast_qty = []

    # NOTE: only five years are iterated although the inputs carry six yearly
    # columns; kept as-is so timings stay comparable with the other variants.
    years = np.arange(2015, 2020)
    weeks = np.arange(52)
    product = np.arange(n_products)
    zipcode = np.arange(n_zipcodes)

    for i, year in enumerate(years):
        for j, week in enumerate(weeks):
            # Invariant for every product/ZIP under this (year, week).
            temp_ij = demand_growth[i] * rand_week_total[j, i]
            for k, prod in enumerate(product):
                # Invariant for every ZIP under this (year, week, product).
                temp_ikj = temp_ij * rand_product_total[k, i]
                for z, zip_code in enumerate(zipcode):
                    a = np.rint(temp_ikj * rand_zipcode_total[z, i])
                    if a != 0:
                        forecast_year.append(year)
                        forecast_week.append(week)
                        forecast_product.append(prod)
                        forecast_ZIP.append(zip_code)
                        forecast_qty.append(a)

    return forecast_year, forecast_week, forecast_product, forecast_ZIP, forecast_qty
def f3(n_products=23, n_zipcodes=43):
    """Variant that vectorises the year axis with NumPy.

    The year loop is replaced by element-wise array arithmetic: for each
    (week, product, ZIP) combination the quantities of all years are computed
    in one expression, and only the scan for non-zero entries is done per
    year in Python.

    Args:
        n_products: Number of product types (rows of the product share table).
        n_zipcodes: Number of ZIP codes (rows of the ZIP share table).

    Returns:
        A tuple ``(forecast_year, forecast_week, forecast_product,
        forecast_ZIP, forecast_qty)`` of parallel lists, one entry per
        non-zero forecast row, ordered week -> product -> ZIP -> year.
    """
    demand_growth = np.array([10, 15, 20, 23, 26, 30])
    rand_week_total = np.random.rand(52, 6)
    rand_product_total = np.random.rand(n_products, 6)
    rand_zipcode_total = np.random.rand(n_zipcodes, 6)

    forecast_year = []
    forecast_week = []
    forecast_product = []
    forecast_ZIP = []
    forecast_qty = []

    # NOTE: only five years are read back although six yearly values are
    # computed per combination; kept as-is so timings stay comparable with
    # the other variants.
    years = np.arange(2015, 2020)
    weeks = np.arange(52)
    product = np.arange(n_products)
    zipcode = np.arange(n_zipcodes)

    for j, week in enumerate(weeks):
        # One value per year for this week.
        temp_j = demand_growth * rand_week_total[j, :]
        for k, prod in enumerate(product):
            temp_jk = temp_j * rand_product_total[k, :]
            for z, zip_code in enumerate(zipcode):
                a = np.rint(temp_jk * rand_zipcode_total[z, :])
                for i, year in enumerate(years):
                    if a[i] != 0:
                        forecast_year.append(year)
                        forecast_week.append(week)
                        forecast_product.append(prod)
                        forecast_ZIP.append(zip_code)
                        forecast_qty.append(a[i])

    return forecast_year, forecast_week, forecast_product, forecast_ZIP, forecast_qty
# Print the total time taken by five calls of each variant (timeit's
# `number` argument is 5), so the three implementations can be compared.
print(timeit.Timer(f1).timeit(5))
print(timeit.Timer(f2).timeit(5))
print(timeit.Timer(f3).timeit(5))

Related

How can I get weekly data with this code?

Do you know how I can get weekly data with this code? The output is only daily, and I have not found the answer.
start = datetime(2018, 1, 1)
end = datetime(2022, 1, 16)
stocks = ['DUK']


def ImportDataClose(name, start, end):
    """Download the "Close" series of each ticker and combine them column-wise.

    Args:
        name: Sequence of ticker symbols; also used as the column labels of
            the result, in the same order.
        start: Start of the requested period, passed to ``web.DataReader``.
        end: End of the requested period, passed to ``web.DataReader``.

    Returns:
        A DataFrame with one "Close" column per ticker, aligned on the union
        of the downloaded indexes. An empty DataFrame if ``name`` is empty.
    """
    closes = [
        web.DataReader(ticker, 'yahoo', start, end)["Close"]
        for ticker in name
    ]
    if not closes:
        # pd.concat refuses an empty list; keep the "no tickers" result empty.
        return pd.DataFrame()
    # Concatenate once instead of growing the frame inside the loop.
    ptf = pd.concat(closes, axis=1)
    ptf.columns = name
    return ptf


portfolio = ImportDataClose(stocks, start, end)
portfolio
regards !

Infinity loop issue using for loops

import pandas as pd
import time
import yfinance as yf
import money_18
import talib
def backtest(df, us_code, profit_target, stop_loss, macd_diff):
    """Run a long-only MACD-histogram backtest over ``df``.

    Rows are visited by index label ``1 .. len(df) - 1`` (row 0 is skipped),
    so ``df`` must be indexed 0..n-1, e.g. after ``reset_index()``. Only the
    ``'Open'``, ``'Close'`` and ``'MACD_Hist'`` columns are read.

    Per row, in this order:
      1. Mark equity to this row's open and update the drawdown statistics.
      2. If flat, not on the last row, and ``MACD_Hist > macd_diff``: buy as
         many whole lots as the capital allows at this row's close.
      3. If in a position: sell at this row's open when the move from the
         entry price exceeds ``profit_target * entry``, falls below
         ``stop_loss * entry``, or this is the last row. This check also runs
         on the row where the position was just opened.

    Args:
        df: Price/indicator frame as described above.
        us_code: Ticker label; returned unchanged.
        profit_target: Take-profit threshold as a fraction of the entry price.
        stop_loss: Stop-loss threshold as a (negative) fraction of the entry
            price.
        macd_diff: MACD-histogram level that must be exceeded to open.

    Returns:
        Tuple ``(us_code, profit_target, stop_loss, total_profit,
        num_of_trade, mdd_dollar, mdd_pct, macd_diff)``. The profit, trade
        count and drawdown figures are 0 when no row is processed.
    """
    capital = 100000  # starting capital; grows/shrinks with realised P&L
    pos_opened = False
    open_price = 0
    num_of_lot = 0
    pnl_list = []

    peak_equity = None  # highest marked-to-open equity seen so far
    mdd_dollar = 0
    mdd_pct = 0

    last_row = len(df) - 1
    for i in range(1, len(df)):
        now_open = df.at[i, 'Open']
        now_close = df.at[i, 'Close']
        now_macd_hist = df.at[i, 'MACD_Hist']

        # ----- equity curve -----
        # Running maxima replace max() over ever-growing lists, which made
        # the loop quadratic in the number of rows.
        equity_value = round(capital + (now_open - open_price) * num_of_lot)
        if peak_equity is None or equity_value > peak_equity:
            peak_equity = equity_value
        dd_dollar = peak_equity - equity_value
        mdd_dollar = max(dd_dollar, mdd_dollar)
        mdd_pct = max(dd_dollar / peak_equity, mdd_pct)

        # ----- open position -----
        if not pos_opened and i < last_row and now_macd_hist > macd_diff:
            pos_opened = True
            open_price = now_close
            num_of_lot = capital // open_price

        # ----- profit taking and stop loss -----
        if pos_opened:
            move = now_open - open_price
            if (
                move > profit_target * open_price
                or move < stop_loss * open_price
                or i == last_row  # never carry a position past the data
            ):
                pnl = move * num_of_lot
                pnl_list.append(pnl)
                capital = capital + pnl
                pos_opened = False
                open_price = 0
                num_of_lot = 0

    total_profit = sum(pnl_list)
    num_of_trade = len(pnl_list)
    return (us_code, profit_target, stop_loss, total_profit, num_of_trade,
            mdd_dollar, mdd_pct, macd_diff)
# NOTE: indentation was lost when this snippet was pasted; everything below
# presumably belongs to the `if __name__ == '__main__':` body.
if __name__ == '__main__':
# Parameter grid: every combination of these four lists is backtested.
us_code_list = ['TSLA', 'AAPL']
macd_diff_list = [0, 0.05]
profit_target_list = [0.03, 0.06]
stop_loss_list = [-0.01, -0.02, -0.03]
start_date = '2020-01-01'
end_date = '2020-12-31'
# Price/indicator frame per ticker, keyed by ticker symbol.
df_dict = {}
for us_code in us_code_list:
# Fetch the price history and keep only rows with traded volume.
df= yf.Ticker(us_code).history(start=start_date, end=end_date)
df= df[df['Volume'] > 0]
df = df[['Open', 'High', 'Low', 'Close']]
# Attach RSI, Bollinger-band and MACD columns computed from the close.
df['RSI'] = talib.RSI(df['Close'], timeperiod=14)
df['Upper_Band'], df['Middle_Band'], df['Lower_Band'] = talib.BBANDS(df['Close'], 20, 2, 2)
df['MACD'], df['MACD_Signal'], df['MACD_Hist'] = talib.MACD(df['Close'], fastperiod=12, slowperiod=26,
signalperiod=9)
# Drop rows where the MACD histogram is missing, then renumber the rows
# 0..n-1 because backtest() addresses them with df.loc[i, ...].
df = df[df['MACD_Hist'].notna()]
df = df.reset_index()
df_dict[us_code] = df
# Placeholders for the values unpacked from each backtest() call.
save_us_code = ''
save_macd_diff = 0
save_profit_target = 0
save_stop_loss = 0
total_profit = 0
num_of_trade = 0
mdd_dollar = 0
mdd_pct = 0
# Result columns: one entry is appended to each list per backtest run.
save_us_code_list = []
save_macd_diff_list = []
save_profit_target_list = []
save_stop_loss_list = []
total_profit_list = []
num_of_trade_list = []
mdd_dollar_list = []
mdd_pct_list = []
result_dict = {}
# Grid search over (ticker, macd_diff, profit_target, stop_loss).
for us_code in us_code_list:
for macd_diff in macd_diff_list:
for profit_target in profit_target_list:
for stop_loss in stop_loss_list:
print(us_code, macd_diff, profit_target, stop_loss) ## the problem should be starting from here##
# NOTE(review): `df` is whatever frame the download loop above left behind
# (the last ticker); df_dict[us_code] is filled but never read, so every
# ticker is backtested on the same data.
save_us_code, save_profit_target, save_stop_loss, total_profit, num_of_trade, mdd_dollar, mdd_pct, macd_diff = backtest(df, us_code, profit_target, stop_loss, macd_diff)
save_us_code_list.append(save_us_code)
save_profit_target_list.append(save_profit_target)
save_stop_loss_list.append(save_stop_loss)
total_profit_list.append(total_profit)
num_of_trade_list.append(num_of_trade)
mdd_dollar_list.append(mdd_dollar)
mdd_pct_list.append(mdd_pct)
# BUG: this appends to macd_diff_list, the very list that the enclosing
# `for macd_diff in macd_diff_list` loop is iterating, so that loop never
# runs out of items. save_macd_diff_list (created above, never used) is the
# list that was presumably meant here.
macd_diff_list.append(macd_diff)
I am working on an algo-trading backtest and created nested for loops to feed each parameter combination into my backtest function. However, the loops never stop.
I think the error starts at "for macd_diff in macd_diff_list:", because when I print the result below that row, the output already goes on indefinitely.
Now that you've shown the full code, your problem is obvious. Your original example didn't show the issue because you didn't include all relevant code. Here's your example with the relevant code that's causing the issue:
for us_code in us_code_list:
for macd_diff in macd_diff_list:
for profit_target in profit_target_list:
for stop_loss in stop_loss_list:
... # irrelevant code not shown
macd_diff_list.append(macd_diff)
The issue is that you're looping through each item in macd_diff_list, but then for each loop iteration, you add an item to that list. So of course the loop will be infinite. You need to be looping through a different list, or adding items to a different list.

Generate synthetic time series data from existing sample data

Are there any good libraries/tools in Python for generating synthetic time series data from existing sample data? For example, I have sales data from January-June and would like to generate synthetic time series samples for July-December (keeping time series factors such as trend and seasonality intact).
Leaving the question about the quality of such data aside, here is a simple approach: you can use a Gaussian distribution to generate synthetic data based on a sample. Below is the critical part.
import numpy as np
x # original sample np.array of features
feature_means = np.mean(x, axis=1)
feature_std = np.std(x, axis=1)
random_normal_feature_values = np.random.normal(feature_means, feature_std)
Here is a fully functioning code I used,
# Generate synthetic rows from `sample_dataset`: the sample is cut into windows,
# and new values are drawn from a normal distribution parameterised by the
# mean and standard deviation computed over each window.
#
# Parameters:
#   sample_dataset  - DataFrame to imitate; its columns are reused for the output.
#   window_mean,
#   window_std      - when fixed_window is None, the window length is drawn from
#                     a normal distribution with these parameters and cut to int.
#   fixed_window    - window length to use instead of a random one.
#   variance_range  - divisor applied to the per-window standard deviation.
#   sythesize_ratio - number of synthesis passes; also caps the output at
#                     len(sample_dataset) * sythesize_ratio rows.
#   forced_reverse  - not referenced anywhere in the body.
# Returns the accumulated synthetic DataFrame, with an extra "synthesis_seq"
# column holding the pass number.
#
# NOTE: indentation was lost when this snippet was pasted, so the nesting of
# the statements below can only be inferred.
# NOTE(review): `configuration` is not defined in this snippet.
def generate_synthetic_data(sample_dataset, window_mean, window_std, fixed_window=None, variance_range =1 , sythesize_ratio = 2, forced_reverse = False):
# Empty result frame with the sample's columns plus a trailing "synthesis_seq".
synthetic_data = pd.DataFrame(columns=sample_dataset.columns)
synthetic_data.insert(len(sample_dataset.columns), "synthesis_seq", [], True)
for k in range(sythesize_ratio):
# Stop once enough synthetic rows have been produced.
if len(synthetic_data) >= len(sample_dataset) * sythesize_ratio:
break;
#this loop generates a set that resembles the entire dataset
country_synthetic = pd.DataFrame(columns=synthetic_data.columns)
# Window length: fixed if given, otherwise random.
if fixed_window != None:
input_sequence_len = fixed_window
else:
input_sequence_len = int(np.random.normal(window_mean, window_std))
#population data change
country_data_i = sample_dataset
# A sample shorter than one window cannot be processed in this pass.
if len(country_data_i) < input_sequence_len :
continue
feature_length = configuration['feature_length'] #number of features to be randomized
# Keep the first `feature_length` columns; x ends up with one row per sample
# row and one column per feature.
country_data_array = country_data_i.to_numpy()
country_data_array = country_data_array.T[:feature_length]
country_data_array = country_data_array.reshape(feature_length,len(country_data_i))
x = country_data_array[:feature_length].T
# Randomly decide whether to process the rows in reverse order.
# NOTE: this name shadows the builtin `reversed`.
reversed = np.random.normal(0,1)>0
if reversed:
x = x[::-1]
sets =0
# x_list is never used after this assignment.
x_list = []
dict_x = dict()
# For each start offset i, cut x[i:] into consecutive windows.
for i in range(input_sequence_len):
# i plus the largest multiple of the window length that fits in len(x) - i,
# so x[i:array_len] splits into whole windows.
array_len = ((len(x) -i) - ((len(x)-i)%input_sequence_len))+i
if array_len <= 0:
continue
# Number of whole windows available from offset i.
sets = int( array_len/ input_sequence_len)
if sets <= 0:
continue
x_temp = x[i:array_len].T.reshape(sets,feature_length,input_sequence_len)
# Keys i, i + window, i + 2*window, ... -- presumably the start row of each window.
uniq_keys = np.array([i+(input_sequence_len*k) for k in range(sets)])
x_temp = x_temp.reshape(feature_length,sets,input_sequence_len)
arrays_split = np.hsplit(x_temp,sets)
dict_x.update(dict(zip(uniq_keys, arrays_split)))
# Collect the windows in key order and compute per-window statistics.
temp_x_list = [dict_x[i].T for i in sorted(dict_x.keys())]
temp_x_list = np.array(temp_x_list).squeeze()
feature_means = np.mean(temp_x_list, axis=1)
feature_std = np.std(temp_x_list, axis=1) /variance_range
# Draw the synthetic values, round them to whole numbers and clamp negatives to 0.
random_normal_feature_values = np.random.normal(feature_means, feature_std).T
random_normal_feature_values = np.round(random_normal_feature_values,0)
random_normal_feature_values[random_normal_feature_values < 0] = 0
# Undo the earlier reversal so the output is in the sample's row order.
if reversed:
random_normal_feature_values = random_normal_feature_values.T[::-1]
random_normal_feature_values = random_normal_feature_values.T
# Copy one array row into each output column, then tag the pass number.
for i in range(len(random_normal_feature_values)):
country_synthetic[country_synthetic.columns[i]] = random_normal_feature_values[i]
country_synthetic['synthesis_seq'] = k
# NOTE(review): DataFrame.append was removed in pandas 2.0; on current pandas
# this line needs pd.concat([synthetic_data, country_synthetic], ignore_index=True).
synthetic_data = synthetic_data.append(country_synthetic, ignore_index=True)
return synthetic_data
# Example driver: one synthesis run per value of i (a single run here).
# NOTE(review): `source_path`, `original_data` and `synthetic_data` are not
# defined in this snippet; `synthetic_data` is presumably a list of results.
for i in range(1):
directory_name = '/synthetic_'+str(i)
mypath = source_path+ '/cleaned'+directory_name
# Create the output directory on first use.
if os.path.exists(mypath) == False:
os.mkdir(mypath)
# Fixed window of 2 rows; the standard deviation is divided by 10**i.
data = generate_synthetic_data(original_data, window_mean = 0, window_std= 0, fixed_window=2 ,variance_range = 10**i, sythesize_ratio = 1)
synthetic_data.append(data)
#data.to_csv(mypath+'/synthetic_'+str(i)+'_dt31_05_.csv', index=False )
print('synth step : ', i, ' len : ', len(synthetic_data))
Good luck!

Script in python/pandas works but doesn't work when placed in side a function

I have this script I'm running to try to create a dataframe to summarize some statistics:
# Monthly frames (defined elsewhere) and the label used for each result column.
month = [may, june, july, august, sept]
month_str = [5, 6, 7, 8, 9]

# One mean per month for each statistic; zeros are treated as "no value" and
# excluded, and credit scores of 99999 are excluded as well.
avg_age = [m[m['Age'] != 0]['Age'].mean() for m in month]
avg_use = [
    m[m['AverageBilledUsage'] != 0]['AverageBilledUsage'].mean() for m in month
]
avg_kwh = [m[m['AverageKWH'] != 0]['AverageKWH'].mean() for m in month]
avg_coll = [m[m['Total Collected'] != 0]['Total Collected'].mean() for m in month]
avg_cred = [
    m[(m['credit_score'] != 0) & (m['credit_score'] != 99999)]['credit_score'].mean()
    for m in month
]

# Rows are statistics, columns are months.
pd.DataFrame(
    data=[avg_age, avg_use, avg_kwh, avg_coll, avg_cred],
    columns=month_str,
    index=['Age', 'Usage', 'kwh', 'collected', 'creditscore'],
)
It returns exactly what I want to see. But when I place it inside a function I get the following error:
AssertionError: 5 columns passed, passed data had 1 columns
Here is the code inside the function:
# NOTE: indentation was lost when this snippet was pasted. The reported
# AssertionError ("5 columns passed, passed data had 1 columns") is consistent
# with the `this_df = ...` line sitting inside the for loop, where each avg_*
# list holds a single value on the first pass -- presumably that is how the
# original was indented.
def get_nums():
# Monthly frames (defined elsewhere) and the label used for each result column.
months = [may,june,july,august,sept]
month_str = [5,6,7,8,9]
avg_age = []
avg_use = []
avg_kwh = []
avg_coll = []
avg_cred = []
for i in months:
# Mean of each statistic for this month, ignoring zero entries (and credit
# scores of 99999).
avg_age.append(i[i['Age']!=0]['Age'].mean())
avg_use.append(i[i['AverageBilledUsage']!=0]['AverageBilledUsage'].mean())
avg_kwh.append(i[i['AverageKWH']!=0]['AverageKWH'].mean())
avg_coll.append(i[i['Total Collected']!=0]['Total Collected'].mean())
avg_cred.append(i[(i['credit_score']!=0) & (i['credit_score']!=99999)]['credit_score'].mean())
# Rows are statistics, columns are months.
this_df = pd.DataFrame(data = [avg_age,avg_use,avg_kwh,avg_coll,avg_cred],columns = month_str,index = ['Age','Usage','kwh','collected','creditscore'])
return this_df
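You have a problem with the indentation of the last line of the for loop in the function: this_df is being defined inside the loop, i.e. on every iteration, so on the first pass each list holds only one value while five columns are expected. Build this_df once, after the loop has finished.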
The corrected code is below.
def get_nums():
    """Summarise the monthly frames as a statistic-by-month DataFrame.

    For each monthly frame the mean of every statistic is taken over its
    non-zero entries (credit scores of 99999 are excluded as well).

    Returns:
        A DataFrame whose rows are the statistics ('Age', 'Usage', 'kwh',
        'collected', 'creditscore') and whose columns are the month numbers.
    """
    months = [may, june, july, august, sept]
    month_str = [5, 6, 7, 8, 9]
    avg_age = []
    avg_use = []
    avg_kwh = []
    avg_coll = []
    avg_cred = []
    for month_df in months:
        avg_age.append(month_df[month_df['Age'] != 0]['Age'].mean())
        avg_use.append(
            month_df[month_df['AverageBilledUsage'] != 0]['AverageBilledUsage'].mean()
        )
        avg_kwh.append(month_df[month_df['AverageKWH'] != 0]['AverageKWH'].mean())
        avg_coll.append(
            month_df[month_df['Total Collected'] != 0]['Total Collected'].mean()
        )
        valid_score = (month_df['credit_score'] != 0) & (month_df['credit_score'] != 99999)
        avg_cred.append(month_df[valid_score]['credit_score'].mean())

    # Build the frame only after the loop: inside the loop each list would
    # still hold a single value while five columns are declared.
    this_df = pd.DataFrame(
        data=[avg_age, avg_use, avg_kwh, avg_coll, avg_cred],
        columns=month_str,
        index=['Age', 'Usage', 'kwh', 'collected', 'creditscore'],
    )
    return this_df
Based on my understanding, you do not need the for loop here:
month = [may, june, july, august, sept]
month_str = [5, 6, 7, 8, 9]
# Stack the monthly frames under an outer index level holding the month number.
df = pd.concat(month, keys=month_str)
# Blank out the "no value" markers before averaging. The parentheses are
# required: `|` binds tighter than `==`, so `df==0|df==99999` is parsed as the
# chained comparison `df == (0 | df) == 99999` and raises ValueError.
# NOTE: this masks 99999 in every column, not only in credit_score.
df = df.mask((df == 0) | (df == 99999))
# Mean per month (outer index level), transposed so months become columns.
df.groupby(level=0).mean().T

List of named lists

I need to create a list of named lists in a python script.
What I want to do is create a mklist method that will take strings in a list and create lists named for each of the strings. So, from here:
a = "area"
for i in range(1, 37):
x = str(a) + str("%02d" % (i,))
' '.join(x.split())
I want to get the following:
area01 = []
area02 = []
area03 = []
area04 = []
area05 = []
area06 = []
area07 = []
area08 = []
area09 = []
area10 = []
area11 = []
area12 = []
area13 = []
area14 = []
area15 = []
area16 = []
area17 = []
area18 = []
area19 = []
area20 = []
area21 = []
area22 = []
area23 = []
area24 = []
area25 = []
area26 = []
area27 = []
area28 = []
area29 = []
area30 = []
area31 = []
area32 = []
area33 = []
area34 = []
area35 = []
area36 = []
Any advice? I can't seem to get it. Thanks!
E
This calls for either a list of lists:
area = [[] for i in range(37)]
Or a dict of lists:
area = {i: [] for i in range(1, 37)} # Python 2.7+
area = dict((i, []) for i in range(1, 37)) # Python 2.6 or earlier
Then you can access each item with:
area[1]
area[2]
...
area[36]
See this question.
a = "area"
for i in range(1, 37):
x = str(a) + str("%02d" % (i,))
locals()[x] = []
Or use globals() if you want the lists to be global.
That would give you empty list variables area01 to area36.
You should note though that just because it can be done, doesn't mean it should. A better/more readable solution would be:
area = [[] for i in range(37)]
(see John's solution)
Something like:
a = "area"
x = [ "%s%02d = []" % (a, i) for i in range(1,37) ]
print('\n'.join(x))
If you want your lists defined in the current Python session, substitute the last line with
for i in range(0, 36):
exec(x[i])
And check with
# print() as a function, matching the print('\n'.join(x)) call used earlier
# in this answer; the bare `print area01` statement is a SyntaxError on Python 3.
print(area01)
print(area35)
Note however that this is not advisable

Categories

Resources