How to speed up appending to a multi-index DataFrame? (python)

I'm using apply on a multi-index groupby with Numba and found that the slowest part is adding the results back to the original frame. How can I achieve this at a practical speed? The indexes in the original df are unique.
Import and generate some data:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import numba
times = np.arange(datetime(2017, 1, 1), datetime(2020, 2, 1), timedelta(minutes=60)).astype(np.datetime64)
tlen = len(times)
A, Z = np.array(['A', 'Z']).view('int32')
symbol_names = np.random.randint(low=A, high=Z, size=50 * 7, dtype='int32').view(f'U{7}')
times = np.concatenate([times] * 50)
names = np.array([y for x in [[s] * tlen for s in symbol_names] for y in x])
open_column = np.random.randint(low=40, high=60, size=len(times), dtype='uint32')
high_column = np.random.randint(low=50, high=70, size=len(times), dtype='uint32')
low_column = np.random.randint(low=30, high=50, size=len(times), dtype='uint32')
close_column = np.random.randint(low=40, high=60, size=len(times), dtype='uint32')
df = pd.DataFrame({'open': open_column, 'high': high_column, 'low': low_column, 'close': close_column}, index=[names, times])
df.index = df.index.set_names(['Symbol', 'Date'])
df['entry'] = np.select([df.open > df.open.shift(), False], (df.close, -1), np.nan)
df['exit'] = df.close.where(df.high > df.open * 1.33, np.nan)
Define the numba function:
@numba.jit(nopython=True)
def nb_func(arr, limit=0, stop=0, tbe=0):
    # Column order matches the groupby selection below:
    # 0 = open, 1 = high, 2 = low, 3 = close, 4 = entry, 5 = exit
    is_active = 0
    bars_held = 0
    limit_target = np.inf
    stop_target = -np.inf
    result = np.empty(arr.shape[0], dtype=np.float32)  # numba needs a dtype object, not a string
    for n in range(arr.shape[0]):
        ret = 0
        if is_active == 1:
            bars_held += 1
            if arr[n][2] < stop_target:      # low crossed the stop
                ret = stop_target
                is_active = 0
            elif arr[n][1] > limit_target:   # high crossed the limit
                ret = limit_target
                is_active = 0
            elif bars_held >= tbe:           # time-based exit
                ret = arr[n][3]
                is_active = 0
            elif arr[n][5] > 0:              # exit signal
                ret = arr[n][3]
                is_active = 0
        if is_active == 0:
            if arr[n][4] > 0:                # entry signal
                is_active = 1
                bars_held = 0
                if stop != 0:
                    stop_target = arr[n][3] * stop
                if limit != 0:
                    limit_target = arr[n][3] * limit
        result[n] = ret
    return result
Define the applied function with additional arguments and create the groupby object:
def process(group):
    group['result'] = nb_func(group.values, limit=1.10, stop=0.50, tbe=5)
    return group['result']

group = df[['open', 'high', 'low', 'close', 'entry', 'exit']].groupby('Symbol')
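The ">> ... function took ... s" lines below come from a timing wrapper the post does not show; a minimal hypothetical sketch that would produce the same output:

import time
from functools import wraps

def timed(func):
    # hypothetical timing decorator; prints in the same format as the
    # ">> ... function took ... s" lines below
    @wraps(func)
    def wrapper(*args, **kwargs):
        t0 = time.perf_counter()
        result = func(*args, **kwargs)
        print(f'{func.__name__} function took {time.perf_counter() - t0:.3f} s')
        return result
    return wrapper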
Using regular concat the result is extremely slow:
def concat(group):
    ret = group.apply(process).droplevel(0)
    df2 = pd.concat([df, ret], axis=1)

concat(group)
>> concat function took 20.292 s
Adding the result back as a column is faster but still slow:
def simple_readd(group):
    ret = group.apply(process).droplevel(0)
    df['result'] = ret

simple_readd(group)
>> simple_readd function took 7.141 s
Calculating the results without adding them back to the original frame is fast and I'd like to get the total time to around 1 s:
def series_return(group):
    ret = group.apply(process).droplevel(0)

series_return(group)
>> series_return function took 0.905 s
UPDATE:
This is the biggest time improvement I've managed, from 7 seconds down to about 1.8. I'm still interested in doing better, since the indexes and columns are completely identical; perhaps there is a NumPy solution?
def series_merge(group):
    ret = group.apply(process).droplevel(0)
    merged = df.reset_index().merge(ret, on=['Symbol', 'Date'], how='left').reset_index(drop=True)
    merged = merged.set_index(['Symbol', 'Date'])
    return merged

series_merge(group)
>> series_merge function took 1.892 s
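Since ret and df cover exactly the same (Symbol, Date) pairs, one sketch worth trying (an assumption: every index of df appears in ret exactly once) is to align once with reindex and then assign the bare NumPy array, so pandas skips its per-assignment index alignment:

def numpy_readd(group):
    ret = group.apply(process).droplevel(0)
    # reindex orders ret to match df.index; .to_numpy() hands pandas a
    # plain array, so no further index alignment is performed
    df['result'] = ret.reindex(df.index).to_numpy()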

Related

Pandas query is very slow - how to enhance performance?

How can I quickly find the first row that meets the conditions and terminate the search early? The search runs from right to left, up to a maximum of findLimit rows.
Expected time per call <= 0.001 s,
current time per call >= 0.03 s.
import time
import numpy as np
import pandas as pd

def test_Find(df: pd.DataFrame, findLimit: int = 365):
    success = False
    current = pd.DataFrame()
    start = len(df) - 1
    loopCount = 0
    # Compare from back to front
    for i in range(start, -1, -1):
        loopCount += 1
        if loopCount > findLimit:
            return pd.DataFrame()
        current = df.iloc[i]
        success = Find(i, df)
        if success:
            return current
    return pd.DataFrame()

# Whether the last value is greater than the current value
def LastPassCurrent(curi: int, df: pd.DataFrame):
    current = df.iloc[curi]
    last = df.iloc[-1]
    result = last.c > current["c"]
    return result

# Whether the maximum of the remaining values is less than or equal to the current value
def RemainNoPassCurrent(curi: int, df: pd.DataFrame) -> bool:
    cur = df.iloc[curi]
    remain = df.iloc[curi+1:-1]
    maxC = remain["c"].max()
    if np.isnan(maxC):
        maxC = 0
    remainNoPassCurrent = maxC <= cur["c"]
    return remainNoPassCurrent

# Qualified search
def Find(curi, df):
    current = df.iloc[curi]
    result = (current.a == 8) and \
             RemainNoPassCurrent(curi, df) and \
             LastPassCurrent(curi, df)
    return result

# Test data
dfs = []
for i in range(0, 4000):
    dfs.append(pd.DataFrame(np.arange(365*3).reshape(365, 3), columns=list('abc')))

# Test time collection
df = None
for i in range(0, 4000):
    df = dfs[i]
    start_time = time.time()
    data = test_Find(df, 365)
    end_time = time.time()
    result = end_time - start_time
    print(f'loop {i} Empty:{data.empty} time is %.3fs' % result)

# Current inefficient timing:
# loop 0 time is 0.367s ...
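Not part of the original question, but a sketch of how these checks can be vectorized with NumPy (using the imports above, and assuming non-negative 'c' values as in the test data, so the empty-slice case maps to 0 exactly as in RemainNoPassCurrent):

def test_find_fast(df: pd.DataFrame, findLimit: int = 365) -> pd.DataFrame:
    a = df['a'].to_numpy()
    c = df['c'].to_numpy()
    n = len(df)
    # suffix_max[i] = max(c[i+1:n-1]), i.e. the max over df.iloc[i+1:-1];
    # empty slices become 0, matching the np.isnan branch above
    suffix_max = np.zeros(n)
    body = c[1:n-1]
    if body.size:
        suffix_max[:n-2] = np.maximum.accumulate(body[::-1])[::-1]
    cond = (a == 8) & (suffix_max <= c) & (c[-1] > c)
    lo = max(n - findLimit, 0)         # only the last findLimit rows
    hits = np.nonzero(cond[lo:])[0]
    if hits.size:
        return df.iloc[lo + hits[-1]]  # right-most match, like the loop
    return pd.DataFrame()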

Best approach to iterate and append a custom function's output to a new dataframe

I have the following custom function that generates a row with EMA data for a specific asset based on the current time.
Here's the complete code for the function:
def find_ema(futures_symbol):
    def fetch_ohlc(symbol, timeframe, timesymbol):
        # fetch data - binance api
        candlestick_url = 'https://fapi.binance.com/fapi/v1/continuousKlines?pair=' + symbol + '&contractType=PERPETUAL&interval=' + str(timeframe) + timesymbol + '&limit=1500'
        candlestick_chart = requests.get(candlestick_url).json()
        candlestick_df = pd.DataFrame(candlestick_chart)
        candlestick_df = candlestick_df.iloc[:, 1:7]
        candlestick_df.columns = ['open', 'high', 'low', 'close', 'volume', 'date']
        candlestick_df['date'] = pd.to_datetime(candlestick_df['date'], unit='ms').round('1s')
        candlestick_df.insert(0, 'date', candlestick_df.pop('date'))
        # reset to midnight
        candlestick_df.date = pd.to_datetime(candlestick_df.date)
        min_date = candlestick_df.date.min()
        NextDay_Date = (min_date + datetime.timedelta(days=1)).replace(hour=0, minute=0, second=0, microsecond=0)
        candlestick_df = candlestick_df[candlestick_df.date >= NextDay_Date].copy()
        candlestick_df = candlestick_df.set_index('date')
        candlestick_df['symbol'] = symbol
        ohlc_data = candlestick_df
        cols = ['open', 'high', 'low', 'close', 'volume']
        ohlc_data[cols] = ohlc_data[cols].apply(pd.to_numeric, errors='coerce')
        ohlc_data[cols] = ohlc_data[cols].round(decimals=2)
        return ohlc_data

    # separate df for limited candlestick data
    ohlc_smaller = fetch_ohlc(futures_symbol, 5, 'm')
    ohlc_larger = fetch_ohlc(futures_symbol, 1, 'h')
    ema_df = ohlc_smaller
    # calculating ema with 200 row data
    ema_df['15m'] = ohlc_smaller.resample('15T').apply({'close': 'last'}).ewm(span=200, min_periods=200).mean()
    ema_df['30m'] = ohlc_smaller.resample('30T').apply({'close': 'last'}).ewm(span=200, min_periods=200).mean()
    ema_df['1h'] = ohlc_larger.resample('60T').apply({'close': 'last'}).ewm(span=200, min_periods=200).mean()
    ema_df['2h'] = ohlc_larger.resample('120T').apply({'close': 'last'}).ewm(span=200, min_periods=200).mean()
    ema_df['4h'] = ohlc_larger.resample('240T').apply({'close': 'last'}).ewm(span=200, min_periods=200).mean()
    # forward fill larger tf data to smaller tf
    ema_df = ema_df.fillna(method='ffill').tail(1)
    ema_df.insert(0, 'symbol', ema_df.pop('symbol'))
    ema_df = ema_df.drop(['high', 'low', 'close', 'volume'], axis=1)
    return ema_df
When I apply this function to a single symbol it returns the dataframe perfectly. For example:
[screenshot: working example on a single symbol]
However, I now have a list of symbols on which I want to apply this function and create a new dataframe.
Here's how I am generating my list of symbols:
symbols_url = 'https://fapi.binance.com/fapi/v1/ticker/price'
symbols_data = requests.get(symbols_url).json()
symbols_df = pd.DataFrame(symbols_data)
symbols_df = symbols_df[symbols_df['symbol'].str.contains('USDT')]
futures_tickers_binance = list(symbols_df['symbol'])
# some end with numbers (e.g. quarterly contracts), hence filter:
futures_tickers_binance = list(filter(lambda x: x.endswith('USDT'), futures_tickers_binance))
Here's what I thought would work:
for symbol in futures_tickers_binance:
    for j in range(len(futures_tickers_binance)):
        df = df.append(find_ema(futures_tickers_binance[j]))
df = df.drop_duplicates()
However, this raises a ValueError:
ValueError: If using all scalar values, you must pass an index
Is there a way to apply this function and generate a new dataframe with the values for the complete list in a faster way?
Thank you in advance for your patience in reading this!
The final result would look something like this, however my loop is not working the way it is supposed to:
[screenshot: expected (almost) perfect result]
Here's my complete code if needed:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import datetime
import requests

symbols_url = 'https://fapi.binance.com/fapi/v1/ticker/price'
symbols_data = requests.get(symbols_url).json()
symbols_df = pd.DataFrame(symbols_data)
symbols_df = symbols_df[symbols_df['symbol'].str.contains('USDT')]
futures_tickers_binance = list(symbols_df['symbol'])
# some end with numbers (e.g. quarterly contracts), hence filter:
futures_tickers_binance = list(filter(lambda x: x.endswith('USDT'), futures_tickers_binance))
# find_ema() repeated here verbatim; see the definition above
for symbol in futures_tickers_binance:
    for j in range(len(futures_tickers_binance)):
        df = df.append(find_ema(futures_tickers_binance[j]))
df = df.drop_duplicates()
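Not from the original post, but a sketch of the usual fix: call find_ema once per symbol (a single loop), collect the one-row frames in a list, and concatenate once at the end instead of appending to an uninitialized df:

frames = [find_ema(symbol) for symbol in futures_tickers_binance]
df = pd.concat(frames).drop_duplicates()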

Infinite loop issue using for loops

import pandas as pd
import time
import yfinance as yf
import money_18
import talib

def backtest(df, us_code, profit_target, stop_loss, macd_diff):
    pos_opened = False
    open_price = 0
    close_price = 0
    pnl = 0
    pnl_list = []
    original_capital = 100000
    temp_capital = original_capital
    num_of_lot = 0
    equity_value = 0
    equity_value_list = []
    dd_dollar = 0
    dd_dollar_list = []
    dd_pct = 0
    dd_pct_list = []
    mdd_dollar = 0
    mdd_pct = 0
    total_profit = 0
    num_of_trade = 0
    for i in range(1, len(df)):
        now_date = df.loc[i, 'Date']
        now_open = df.loc[i, 'Open']
        now_high = df.loc[i, 'High']
        now_low = df.loc[i, 'Low']
        now_close = df.loc[i, 'Close']
        now_rsi = df.loc[i, 'RSI']
        now_upper_band = df.loc[i, 'Upper_Band']
        now_middle_band = df.loc[i, 'Middle_Band']
        now_lower_band = df.loc[i, 'Lower_Band']
        now_macd = df.loc[i, 'MACD']
        now_macd_signal = df.loc[i, 'MACD_Signal']
        now_macd_hist = df.loc[i, 'MACD_Hist']
        ##### equity curve #####
        equity_value = round(temp_capital + (now_open - open_price) * num_of_lot)
        equity_value_list.append(equity_value)
        temp_max_equity = max(equity_value_list)
        dd_dollar = temp_max_equity - equity_value
        dd_dollar_list.append(dd_dollar)
        mdd_dollar = max(dd_dollar_list)
        dd_pct = (temp_max_equity - equity_value) / temp_max_equity
        dd_pct_list.append(dd_pct)
        mdd_pct = max(dd_pct_list)
        ##### open position #####
        if (pos_opened == False) and (i < len(df) - 1) and now_macd_hist > macd_diff:
            pos_opened = True
            open_price = now_close
            num_of_lot = temp_capital // open_price
        ##### profit taking and stop loss #####
        if (pos_opened == True) and ((now_open - open_price > profit_target * open_price) or (now_open - open_price < stop_loss * open_price) or (i == len(df) - 1)):
            pos_opened = False
            close_price = now_open
            pnl = (close_price - open_price) * num_of_lot
            pnl_list.append(pnl)
            open_price = 0
            num_of_lot = 0
            temp_capital = temp_capital + pnl
    if len(pnl_list) > 0:
        total_profit = sum(pnl_list)
        num_of_trade = len(pnl_list)
    return us_code, profit_target, stop_loss, total_profit, num_of_trade, mdd_dollar, mdd_pct, macd_diff

if __name__ == '__main__':
    us_code_list = ['TSLA', 'AAPL']
    macd_diff_list = [0, 0.05]
    profit_target_list = [0.03, 0.06]
    stop_loss_list = [-0.01, -0.02, -0.03]
    start_date = '2020-01-01'
    end_date = '2020-12-31'
    df_dict = {}
    for us_code in us_code_list:
        df = yf.Ticker(us_code).history(start=start_date, end=end_date)
        df = df[df['Volume'] > 0]
        df = df[['Open', 'High', 'Low', 'Close']]
        df['RSI'] = talib.RSI(df['Close'], timeperiod=14)
        df['Upper_Band'], df['Middle_Band'], df['Lower_Band'] = talib.BBANDS(df['Close'], 20, 2, 2)
        df['MACD'], df['MACD_Signal'], df['MACD_Hist'] = talib.MACD(df['Close'], fastperiod=12, slowperiod=26, signalperiod=9)
        df = df[df['MACD_Hist'].notna()]
        df = df.reset_index()
        df_dict[us_code] = df
    save_us_code = ''
    save_macd_diff = 0
    save_profit_target = 0
    save_stop_loss = 0
    total_profit = 0
    num_of_trade = 0
    mdd_dollar = 0
    mdd_pct = 0
    save_us_code_list = []
    save_macd_diff_list = []
    save_profit_target_list = []
    save_stop_loss_list = []
    total_profit_list = []
    num_of_trade_list = []
    mdd_dollar_list = []
    mdd_pct_list = []
    result_dict = {}
    for us_code in us_code_list:
        for macd_diff in macd_diff_list:
            for profit_target in profit_target_list:
                for stop_loss in stop_loss_list:
                    print(us_code, macd_diff, profit_target, stop_loss)  ## the problem should be starting from here ##
                    save_us_code, save_profit_target, save_stop_loss, total_profit, num_of_trade, mdd_dollar, mdd_pct, macd_diff = backtest(df, us_code, profit_target, stop_loss, macd_diff)
                    save_us_code_list.append(save_us_code)
                    save_profit_target_list.append(save_profit_target)
                    save_stop_loss_list.append(save_stop_loss)
                    total_profit_list.append(total_profit)
                    num_of_trade_list.append(num_of_trade)
                    mdd_dollar_list.append(mdd_dollar)
                    mdd_pct_list.append(mdd_pct)
                    macd_diff_list.append(macd_diff)
I am working on an algo-trading backtest. I created for loops to feed my parameters into my backtest function, but the loops keep running non-stop.
I think the error starts from "for macd_diff in macd_diff_list:", because when I try to print the result below that row, it already repeats indefinitely.
Now that you've shown the full code, your problem is obvious. Your original example didn't show the issue because you didn't include all relevant code. Here's your example with the relevant code that's causing the issue:
for us_code in us_code_list:
    for macd_diff in macd_diff_list:
        for profit_target in profit_target_list:
            for stop_loss in stop_loss_list:
                ...  # irrelevant code not shown
                macd_diff_list.append(macd_diff)
The issue is that you're looping through each item in macd_diff_list, but then for each loop iteration, you add an item to that list. So of course the loop will be infinite. You need to be looping through a different list, or adding items to a different list.
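A minimal sketch of the fix, assuming the intent was to record the macd_diff used in each run: append it to the separate save_macd_diff_list (already created above) and leave macd_diff_list untouched while iterating over it:

for us_code in us_code_list:
    for macd_diff in macd_diff_list:
        for profit_target in profit_target_list:
            for stop_loss in stop_loss_list:
                ...  # backtest call and other appends as before
                save_macd_diff_list.append(macd_diff)  # not macd_diff_list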

How to find Average directional movement for stocks using Pandas?

I have a dataframe of OHLCV data. Does anyone know of a tutorial or a way of computing the ADX (average directional movement index) using pandas?
import pandas as pd
import yfinance as yf
import matplotlib.pyplot as plt
import datetime as dt
import numpy as nm

start = dt.datetime.today() - dt.timedelta(59)
end = dt.datetime.today()
df = pd.DataFrame(yf.download("MSFT", start=start, end=end))
The average directional index, or ADX, is the primary technical indicator among the five indicators that make up a technical trading system developed by J. Welles Wilder, Jr. and is calculated using the other indicators that make up the trading system. The ADX is primarily used as an indicator of momentum, or trend strength, but the total ADX system is also used as a directional indicator.
Directional movement is calculated by comparing the difference between two consecutive lows with the difference between their respective highs.
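As a minimal sketch of that definition (my illustration; it matches the np.where logic in the function below):

up_move = df['High'].diff()
down_move = -df['Low'].diff()
plus_dm = up_move.where((up_move > down_move) & (up_move > 0), 0.0)       # +DM
minus_dm = down_move.where((down_move > up_move) & (down_move > 0), 0.0)  # -DM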
For the Excel calculation of ADX, this is a really good video:
https://www.youtube.com/watch?v=LKDJQLrXedg&t=387s
I was playing with this a little bit and found something that can help you with the issue:
import numpy as np
import pandas as pd

def ADX(data: pd.DataFrame, period: int):
    """
    Computes the ADX indicator.
    """
    df = data.copy()
    alpha = 1/period
    # TR
    df['H-L'] = df['High'] - df['Low']
    df['H-C'] = np.abs(df['High'] - df['Close'].shift(1))
    df['L-C'] = np.abs(df['Low'] - df['Close'].shift(1))
    df['TR'] = df[['H-L', 'H-C', 'L-C']].max(axis=1)
    del df['H-L'], df['H-C'], df['L-C']
    # ATR
    df['ATR'] = df['TR'].ewm(alpha=alpha, adjust=False).mean()
    # +-DX
    df['H-pH'] = df['High'] - df['High'].shift(1)
    df['pL-L'] = df['Low'].shift(1) - df['Low']
    df['+DX'] = np.where(
        (df['H-pH'] > df['pL-L']) & (df['H-pH'] > 0),
        df['H-pH'],
        0.0
    )
    df['-DX'] = np.where(
        (df['H-pH'] < df['pL-L']) & (df['pL-L'] > 0),
        df['pL-L'],
        0.0
    )
    del df['H-pH'], df['pL-L']
    # +- DMI
    df['S+DM'] = df['+DX'].ewm(alpha=alpha, adjust=False).mean()
    df['S-DM'] = df['-DX'].ewm(alpha=alpha, adjust=False).mean()
    df['+DMI'] = (df['S+DM']/df['ATR'])*100
    df['-DMI'] = (df['S-DM']/df['ATR'])*100
    del df['S+DM'], df['S-DM']
    # ADX
    df['DX'] = (np.abs(df['+DMI'] - df['-DMI'])/(df['+DMI'] + df['-DMI']))*100
    df['ADX'] = df['DX'].ewm(alpha=alpha, adjust=False).mean()
    del df['DX'], df['ATR'], df['TR'], df['-DX'], df['+DX'], df['+DMI'], df['-DMI']
    return df
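For example, applied to the MSFT frame loaded in the question (assuming yfinance's capitalized column names):

df_adx = ADX(df, period=14)
print(df_adx['ADX'].tail())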
At the beginning the values aren't correct (as always with the EWM approach) but after several computations it converges to the correct value.
Math was taken from here.
def ADX(df):
    def getCDM(df):
        dmpos = df["High"][-1] - df["High"][-2]
        dmneg = df["Low"][-2] - df["Low"][-1]
        if dmpos > dmneg:
            return dmpos
        else:
            return dmneg

    def getDMnTR(df):
        DMpos = []
        DMneg = []
        TRarr = []
        n = round(len(df)/14)
        idx = n
        while n <= (len(df)):
            dmpos = df["High"][n-1] - df["High"][n-2]
            dmneg = df["Low"][n-2] - df["Low"][n-1]
            DMpos.append(dmpos)
            DMneg.append(dmneg)
            a1 = df["High"][n-1] - df["High"][n-2]
            a2 = df["High"][n-1] - df["Close"][n-2]
            a3 = df["Low"][n-1] - df["Close"][n-2]
            TRarr.append(max(a1, a2, a3))
            n = idx + n
        return DMpos, DMneg, TRarr

    def getDI(df):
        DMpos, DMneg, TR = getDMnTR(df)
        CDM = getCDM(df)
        POSsmooth = (sum(DMpos) - sum(DMpos)/len(DMpos) + CDM)
        NEGsmooth = (sum(DMneg) - sum(DMneg)/len(DMneg) + CDM)
        DIpos = (POSsmooth / (sum(TR)/len(TR))) * 100
        DIneg = (NEGsmooth / (sum(TR)/len(TR))) * 100
        return DIpos, DIneg

    def getADX(df):
        DIpos, DIneg = getDI(df)
        dx = (abs(DIpos - DIneg) / abs(DIpos + DIneg)) * 100
        ADX = dx/14
        return ADX

    return getADX(df)

print(ADX(df))
This gives you exactly the same numbers as TradingView and thinkorswim.
import numpy as np

def ema(arr, periods=14, weight=1, init=None):
    leading_na = np.where(~np.isnan(arr))[0][0]
    arr = arr[leading_na:]
    alpha = weight / (periods + (weight-1))
    alpha_rev = 1 - alpha
    n = arr.shape[0]
    pows = alpha_rev**(np.arange(n+1))
    out1 = np.array([])
    if 0 in pows:
        out1 = ema(arr[:int(len(arr)/2)], periods)
        arr = arr[int(len(arr)/2) - 1:]
        init = out1[-1]
        n = arr.shape[0]
        pows = alpha_rev**(np.arange(n+1))
    scale_arr = 1/pows[:-1]
    if init:
        offset = init * pows[1:]
    else:
        offset = arr[0]*pows[1:]
    pw0 = alpha*alpha_rev**(n-1)
    mult = arr*pw0*scale_arr
    cumsums = mult.cumsum()
    out = offset + cumsums*scale_arr[::-1]
    out = out[1:] if len(out1) > 0 else out
    out = np.concatenate([out1, out])
    out[:periods] = np.nan
    out = np.concatenate(([np.nan]*leading_na, out))
    return out

def atr(highs, lows, closes, periods=14, ema_weight=1):
    hi = np.array(highs)
    lo = np.array(lows)
    c = np.array(closes)
    tr = np.vstack([np.abs(hi[1:]-c[:-1]),
                    np.abs(lo[1:]-c[:-1]),
                    (hi-lo)[1:]]).max(axis=0)
    atr = ema(tr, periods=periods, weight=ema_weight)
    atr = np.concatenate([[np.nan], atr])
    return atr

def adx(highs, lows, closes, periods=14):
    highs = np.array(highs)
    lows = np.array(lows)
    closes = np.array(closes)
    up = highs[1:] - highs[:-1]
    down = lows[:-1] - lows[1:]
    up_idx = up > down
    down_idx = down > up
    updm = np.zeros(len(up))
    updm[up_idx] = up[up_idx]
    updm[updm < 0] = 0
    downdm = np.zeros(len(down))
    downdm[down_idx] = down[down_idx]
    downdm[downdm < 0] = 0
    _atr = atr(highs, lows, closes, periods)[1:]
    updi = 100 * ema(updm, periods) / _atr
    downdi = 100 * ema(downdm, periods) / _atr
    zeros = (updi + downdi == 0)
    downdi[zeros] = .0000001
    adx = 100 * np.abs(updi - downdi) / (updi + downdi)
    adx = ema(np.concatenate([[np.nan], adx]), periods)
    return adx
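A possible driver using the question's yfinance data (my assumption; the answer itself shows no usage code):

adx_values = adx(df['High'].to_numpy(), df['Low'].to_numpy(),
                 df['Close'].to_numpy(), periods=14)
df['ADX'] = adx_values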

Generate synthetic time series data from existing sample data

Are there any good libraries/tools in Python for generating synthetic time series data from existing sample data? For example, I have sales data from January-June and would like to generate synthetic time series samples for July-December (keeping time series factors intact, like trend, seasonality, etc.).
Leaving the question about the quality of such data aside, here is a simple approach: you can use a Gaussian distribution to generate synthetic data based off a sample. Below is the critical part.
import numpy as np
x # original sample np.array of features
feature_means = np.mean(x, axis=1)
feature_std = np.std(x, axis=1)
random_normal_feature_values = np.random.normal(feature_means, feature_std)
Here is a fully functioning code I used,
def generate_synthetic_data(sample_dataset, window_mean, window_std, fixed_window=None, variance_range =1 , sythesize_ratio = 2, forced_reverse = False):
synthetic_data = pd.DataFrame(columns=sample_dataset.columns)
synthetic_data.insert(len(sample_dataset.columns), "synthesis_seq", [], True)
for k in range(sythesize_ratio):
if len(synthetic_data) >= len(sample_dataset) * sythesize_ratio:
break;
#this loop generates a set that resembles the entire dataset
country_synthetic = pd.DataFrame(columns=synthetic_data.columns)
if fixed_window != None:
input_sequence_len = fixed_window
else:
input_sequence_len = int(np.random.normal(window_mean, window_std))
#population data change
country_data_i = sample_dataset
if len(country_data_i) < input_sequence_len :
continue
feature_length = configuration['feature_length'] #number of features to be randomized
country_data_array = country_data_i.to_numpy()
country_data_array = country_data_array.T[:feature_length]
country_data_array = country_data_array.reshape(feature_length,len(country_data_i))
x = country_data_array[:feature_length].T
reversed = np.random.normal(0,1)>0
if reversed:
x = x[::-1]
sets =0
x_list = []
dict_x = dict()
for i in range(input_sequence_len):
array_len = ((len(x) -i) - ((len(x)-i)%input_sequence_len))+i
if array_len <= 0:
continue
sets = int( array_len/ input_sequence_len)
if sets <= 0:
continue
x_temp = x[i:array_len].T.reshape(sets,feature_length,input_sequence_len)
uniq_keys = np.array([i+(input_sequence_len*k) for k in range(sets)])
x_temp = x_temp.reshape(feature_length,sets,input_sequence_len)
arrays_split = np.hsplit(x_temp,sets)
dict_x.update(dict(zip(uniq_keys, arrays_split)))
temp_x_list = [dict_x[i].T for i in sorted(dict_x.keys())]
temp_x_list = np.array(temp_x_list).squeeze()
feature_means = np.mean(temp_x_list, axis=1)
feature_std = np.std(temp_x_list, axis=1) /variance_range
random_normal_feature_values = np.random.normal(feature_means, feature_std).T
random_normal_feature_values = np.round(random_normal_feature_values,0)
random_normal_feature_values[random_normal_feature_values < 0] = 0
if reversed:
random_normal_feature_values = random_normal_feature_values.T[::-1]
random_normal_feature_values = random_normal_feature_values.T
for i in range(len(random_normal_feature_values)):
country_synthetic[country_synthetic.columns[i]] = random_normal_feature_values[i]
country_synthetic['synthesis_seq'] = k
synthetic_data = synthetic_data.append(country_synthetic, ignore_index=True)
return synthetic_data
for i in range(1):
directory_name = '/synthetic_'+str(i)
mypath = source_path+ '/cleaned'+directory_name
if os.path.exists(mypath) == False:
os.mkdir(mypath)
data = generate_synthetic_data(original_data, window_mean = 0, window_std= 0, fixed_window=2 ,variance_range = 10**i, sythesize_ratio = 1)
synthetic_data.append(data)
#data.to_csv(mypath+'/synthetic_'+str(i)+'_dt31_05_.csv', index=False )
print('synth step : ', i, ' len : ', len(synthetic_data))
Good luck!
