The following function takes market data (OHLCV candles) and tries to find periods where the price oscillates between two bounds (consolidation zones). I wrote it by translating an open-source indicator I found on TradingView from Pine Script to Python.
The function works, in the sense that it finds consolidation zones correctly. The problem is performance, mainly due to the for loop at the end: with 30K candles, the code before the loop takes only ~5 seconds, while the loop itself takes over 2 minutes.
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

def _find_zz(row: Series):
    if pd.notnull(row['hb']) and pd.notnull(row['lb']):
        if row['dir'] == 1:
            return row['hb']
        else:
            return row['lb']
    else:
        return row['hb'] if pd.notnull(row['hb']) else row['lb'] if pd.notnull(row['lb']) else np.nan

def consolidation_zones(dataframe: DataFrame, timeperiod: int = 100,
                        minlength: int = 20) -> DataFrame:
    rolling = dataframe.rolling(timeperiod, min_periods=1)
    idxmax = rolling['high'].apply(lambda x: x.idxmax()).astype(int)
    idxmin = rolling['low'].apply(lambda x: x.idxmin()).astype(int)
    highest = pd.concat({'value': dataframe['high'], 'offset': dataframe.index - idxmax}, axis=1)
    lowest = pd.concat({'value': dataframe['low'], 'offset': dataframe.index - idxmin}, axis=1)
    hb = highest.apply(lambda x: x['value'] if x['offset'] == 0 else np.nan, axis=1)
    lb = lowest.apply(lambda x: x['value'] if x['offset'] == 0 else np.nan, axis=1)
    direction = (pd.concat({'hb': hb, 'lb': lb}, axis=1)
                 .apply(lambda x: 1 if pd.notnull(x['hb']) and pd.isnull(x['lb'])
                        else -1 if pd.isnull(x['hb']) and pd.notnull(x['lb'])
                        else np.nan, axis=1)
                 .ffill().fillna(0).astype(int))
    zz = pd.concat({'hb': hb, 'lb': lb, 'dir': direction}, axis=1).apply(_find_zz, axis=1)
    group = direction.ne(direction.shift()).cumsum()
    zzdir = pd.concat({'zz': zz, 'dir': direction, 'group': group}, axis=1)
    zzdir['min'] = zzdir.groupby('group')['zz'].cummin().ffill()
    zzdir['max'] = zzdir.groupby('group')['zz'].cummax().ffill()
    zzdir['pp'] = np.nan
    # keep the dataframe's index so the loop below sees the right labels
    pp = Series(np.where(zzdir['dir'] == 1, zzdir['max'],
                         np.where(zzdir['dir'] == -1, zzdir['min'], zzdir['pp'])),
                index=dataframe.index)
    H = dataframe.rolling(minlength, min_periods=1)['high'].max()
    L = dataframe.rolling(minlength, min_periods=1)['low'].min()
    prevpp = np.nan
    conscnt = 0
    condhigh = np.nan
    condlow = np.nan
    zones = DataFrame(index=dataframe.index, columns=['upper_bound', 'lower_bound'])
    indexes = []  # will keep indexes of candles that are part of the consolidation
    #----------------
    for index, value in pp.items():
        # pp is a value computed before: when it changes, it *may* be the end of a consolidation zone
        if value != prevpp:
            if conscnt > 0 and condlow <= value <= condhigh:
                # if condlow <= pp <= condhigh, we are still in consolidation
                conscnt = conscnt + 1
                indexes.append(index)
            else:  # end of consolidation
                conscnt = 0
                indexes = []
        else:
            conscnt = conscnt + 1
            indexes.append(index)
        if conscnt >= minlength:
            if conscnt == minlength:
                # initially, condhigh/low is equal to the highest/lowest value in the last minlength candles
                condhigh = H.get(index)
                condlow = L.get(index)
            else:
                # update condhigh/low with the new high/low
                condhigh = max(condhigh, dataframe.loc[index, 'high'])
                condlow = min(condlow, dataframe.loc[index, 'low'])
            zones.loc[zones.index.isin(indexes), 'upper_bound'] = condhigh
            zones.loc[zones.index.isin(indexes), 'lower_bound'] = condlow
        prevpp = value
    #----------------
    return zones
I don't know how to write the last part of the code (delimited by comments) without iterating over all the rows.
This is the original Pine Script: Consolidation Zones - Live - TradingView
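One way to speed this up (a sketch of my own, not from the original post): the loop cannot be vectorized directly, because conscnt, condhigh and condlow carry state from one candle to the next, but that kind of sequential logic compiles well with numba. Assuming numba is installed and the dataframe has a default RangeIndex (so positions and labels coincide), the loop body can run over plain NumPy arrays; _consolidation_loop and its arguments are names I made up:

import numpy as np
from numba import njit  # assumption: numba is available

@njit(cache=True)
def _consolidation_loop(pp, high, low, H, L, minlength):
    n = pp.shape[0]
    upper = np.full(n, np.nan)
    lower = np.full(n, np.nan)
    prevpp = np.nan
    conscnt = 0
    condhigh = np.nan
    condlow = np.nan
    for i in range(n):
        value = pp[i]
        if value != prevpp:
            if conscnt > 0 and condlow <= value <= condhigh:
                conscnt += 1
            else:
                conscnt = 0
        else:
            conscnt += 1
        if conscnt >= minlength:
            if conscnt == minlength:
                condhigh = H[i]
                condlow = L[i]
            else:
                condhigh = max(condhigh, high[i])
                condlow = min(condlow, low[i])
            # the consolidation run is exactly the last `conscnt` candles
            # (in the original loop, len(indexes) == conscnt at all times)
            upper[i - conscnt + 1:i + 1] = condhigh
            lower[i - conscnt + 1:i + 1] = condlow
        prevpp = value
    return upper, lower

# inside consolidation_zones, replacing the Python loop:
# upper, lower = _consolidation_loop(
#     pp.to_numpy(np.float64),
#     dataframe['high'].to_numpy(np.float64),
#     dataframe['low'].to_numpy(np.float64),
#     H.to_numpy(np.float64), L.to_numpy(np.float64), minlength)
# zones = DataFrame({'upper_bound': upper, 'lower_bound': lower},
#                   index=dataframe.index)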
Related
I have the following DataFrame:
df = pd.DataFrame(
{
'date': ['2020-12-05', '2020-12-06', '2020-12-07'],
'day': ['Saturday', 'Sunday', 'Monday'],
'score': [2, 3, 0]
}
)
df
In the DataFrame above, I want to update the score on Monday if the scores on the weekend were non-zero values. For the DataFrame above, Monday's score would be 2.5. But it should work for other, longer DataFrames as well.
I know I can use the following:
df.score.loc[(df.day == 'Monday') & (df.score != 0) & (df.score.shift(1) != 0) & (df.score.shift(2) != 0)] = (df.score + df.score.shift(1)+df.score.shift(2))/3
df.score.loc[(df.day == 'Monday') & (df.score != 0) & (df.score.shift(1) != 0) & (df.score.shift(2) == 0)] = (df.score + df.score.shift(1))/2
df.score.loc[(df.day == 'Monday') & (df.score != 0) & (df.score.shift(1) == 0) & (df.score.shift(2) != 0)] = (df.score + df.score.shift(2))/2
df.score.loc[(df.day == 'Monday') & (df.score == 0) & (df.score.shift(1) != 0) & (df.score.shift(2) != 0)] = (df.score.shift(1) + df.score.shift(2))/2
df.score.loc[(df.day == 'Monday') & (df.score == 0) & (df.score.shift(1) != 0) & (df.score.shift(2) == 0)] = df.score.shift(1)
df.score.loc[(df.day == 'Monday') & (df.score == 0) & (df.score.shift(1) == 0) & (df.score.shift(2) != 0)] = df.score.shift(2)
but this is too lengthy. I think I need to iterate through the DataFrame, something like this:
for index, row in df.iterrows():
    if row.day == 'Monday':
        non_zeros = []
        if row.score != 0:
            non_zeros.append(row.score)
        if row.score.shift(1) != 0:
            non_zeros.append(row.score.shift(1))
        if row.score.shift(2) != 0:
            non_zeros.append(row.score.shift(2))
        mon_score = sum(non_zeros) / len(non_zeros)
        df.at[index, 'score'] = mon_score
The code above doesn't work because I get an error:
AttributeError: 'float' object has no attribute 'shift'
So, it seems that shift() isn't correct.
How would I access the previous row and how would I access the score in the previous row? Is there a better way than manually listing the combinations of conditions, like I've done above?
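(As a side note before the answers, a minimal vectorized sketch of my own, not taken from the thread: all six cases above reduce to "set Monday's score to the mean of the non-zero values among Monday, Sunday and Saturday", which a masked mean expresses in a few lines.)

import pandas as pd

# stack Monday's score with the two previous days' scores
vals = pd.concat({'mon': df.score, 'sun': df.score.shift(1),
                  'sat': df.score.shift(2)}, axis=1)
# mean over non-zero entries only: zeros are masked to NaN and skipped
nonzero_mean = vals.where(vals != 0).mean(axis=1)
mondays = df.day == 'Monday'
# rows where all three values are zero produce NaN and are left unchanged
df.loc[mondays & nonzero_mean.notna(), 'score'] = nonzero_mean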
How would I access the previous row
Keep the previous row in a variable - you actually want to see the previous two rows.
rows = df.iterrows()
index, minus2 = next(rows)
index, minus1 = next(rows)
for index, current in rows:
    if current.day == 'Monday':
        print(f'Saturday:{minus2.date}, Sunday:{minus1.date}, Monday:{current.date}')
        print(f'Sat score:{minus2.score}, Sun score:{minus1.score}, Mon score:{current.score}')
        print('*********')
    minus2, minus1 = minus1, current
Here is another way to do it.
Setup
import pandas as pd
from numpy.random import default_rng
rng = default_rng()
dates = pd.date_range("2020-12-04", periods=60, freq="D")
days = dates.day_name()
score = rng.choice([0, 0, 0, 0, 0, 0, 0.1, 0.2, 0.3], size=60)
df = pd.DataFrame({"date": dates, "day": days, "score": score})
print(df.head(10))
Group the data into Saturday/Sunday/Monday runs: if there is a non-zero score on the weekend, calculate the new Monday score, then use the group's Monday index to assign the new value back to the DataFrame.
weekends = df.day.str.contains(r"Saturday|Sunday|Monday")
days_of_interest = df[weekends]
gb = df.groupby((days_of_interest.day == "Saturday").cumsum())
for k, g in gb:
    if (g.iloc[:2].score != 0).any():
        monday = g.iloc[-1].name
        new_score = g.score.mean()
        # print(new_score)
        df.loc[monday, "score"] = new_score
    # print(g)
print(df)
It assumes the data does not break/stop/start on a Saturday/Sunday/Monday boundary - a necessary assumption because it uses .iloc slicing/indexing and groups starting on 'Saturday'. Extra logic or alternate selection methods would be needed to accommodate the edge cases if that assumption does not hold.
I wasn't too sure how the Monday score was to be updated; it looked like you were averaging over the non-zero scores (sum(non_zeros)/len(non_zeros)), including Monday's(?). Maybe it should have been:
# add average of weekend scores to monday score
weekend_score = g.iloc[:2].score.mean()
df.loc[monday,'score'] += weekend_score
# or just the sum of all the scores?
df.loc[monday,'score'] = g.score.sum()
If I understand correctly, this might be the solution you have been looking for. I have assumed your data looks like the dummy data (no nulls or other randomized rows). Try the following if you want to use shift().
import pandas as pd
import math

df = pd.DataFrame(
    {
        'date': ['2020-11-30', '2020-12-05', '2020-12-06', '2020-12-07'],
        'day': ['Monday', 'Saturday', 'Sunday', 'Monday'],
        'score': [4, 2, 3, 0]
    }
)

def update(row, df1, df2):
    score_Mon = row['score']
    score_Sun = df1.iloc[row.name]['score']
    score_Sat = df2.iloc[row.name]['score']
    # Base condition: only update if previous Saturday/Sunday values are
    # available, else keep the default value
    if row['day'] == 'Monday' and not math.isnan(score_Sun) and not math.isnan(score_Sat):
        # Non-zero condition
        if score_Mon != 0 and score_Sun > 0 and score_Sat > 0:
            row['score'] = (score_Mon + score_Sun + score_Sat) / 3
        # Zero condition
        else:
            row['score'] = (score_Sun + score_Sat) / 2
    return row

df = df.apply(update, axis=1, args=[df.shift(1), df.shift(2)])
df
df = pd.DataFrame(
    {
        'date': ['2020-12-05', '2020-12-06', '2020-12-07', '2020-12-05', '2020-12-06', '2020-12-07'],
        'day': ['Saturday', 'Sunday', 'Monday', 'Saturday', 'Sunday', 'Monday'],
        'score': [-0.2, 0, 0.0, -0.3, 0, 0.0]
    }
)
Based on the question, if we have to access previous rows relative to the row in question, we can use the following:
mondays = df['day'] == 'Monday'
sundays = df.score.shift(1)[mondays]
saturdays = df.score.shift(2)[mondays]
# row_mask_for_upd = mondays & (sundays.astype(bool) | saturdays.astype(bool))  # if either sundays or saturdays has to be non-zero
row_mask_for_upd = mondays & sundays.astype(bool) & saturdays.astype(bool)  # if both sundays and saturdays have to be non-zero
if True in set(row_mask_for_upd):
    df.loc[row_mask_for_upd, "score"] = (sundays + saturdays) / 2
Dataset: https://raw.githubusercontent.com/Kuntal-G/Machine-Learning/master/R-machine-learning/data/banknote-authentication.csv
How can I calculate the conditional entropy and find the best information gain from a dataset like this?
The code for calculating entropy:
def entropy(column):
    """Calculates the entropy"""
    values, counts = np.unique(column, return_counts=True)
    entropy_val = 0
    for i in range(len(counts)):
        entropy_val += (
            (-counts[i] / sum(counts)) * math.log2(counts[i] / sum(counts))
        )
    return entropy_val
where 'column' is a feature in the dataframe, for example df[0].
I'm a little stuck as to where to go from here... Can anyone point me in the right direction? My end goal is finding the best information gain.
entropy_vals = {}
entropy_vals = entropy(X[0]), entropy(X[1]), entropy(X[2]), entropy(X[3]), entropy(y)
print(entropy_vals)
df = pd.read_csv('data_banknote_authentication.txt', header=None)
print(df)
y = df.iloc[:, -1]
X = df.iloc[:, :4]
def count_labels(rows):
    """Counts number of each unique value in selected column."""
    counts = {}
    for row in rows:
        label = row
        if label not in counts:
            counts[label] = 1
        else:
            counts[label] += 1
    return counts

def entropy(column):
    """Calculates the entropy"""
    values, counts = np.unique(column, return_counts=True)
    entropy_val = 0
    for i in range(len(counts)):
        entropy_val += (
            (-counts[i] / sum(counts)) * math.log2(counts[i] / sum(counts))
        )
    return entropy_val

entropy_vals = {}
entropy_vals = entropy(X[0]), entropy(X[1]), entropy(X[2]), entropy(X[3]), entropy(y)
print(entropy_vals)

def check_unique(data):
    label_col = data[data.columns[-1]]
    print(label_col)
    unique_features = np.unique(label_col)
    if len(unique_features) == 1:
        return True
    else:
        return False

def categorize_data(data):
    label_col = data[data.columns[-1]]
    values, counts = np.unique(label_col, return_counts=True)
    print(values, counts)
    index = counts.argmax()
    category = values[index]
    return category

def split(data):
    x_less = data[data <= np.mean(data)]
    x_greater = data[data > np.mean(data)]
    return x_less, x_greater
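No answer was included for this question, so here is a hedged sketch of my own. Information gain is the parent's entropy minus the size-weighted entropies of the children; this builds on the entropy() helper and the mean-threshold split() idea above (information_gain is a name I chose, and the sketch assumes both children are non-empty):

import numpy as np

def information_gain(feature, labels):
    """IG = H(labels) - sum over children of (|child| / |labels|) * H(child),
    splitting `labels` on `feature` at its mean."""
    threshold = np.mean(feature)
    left = labels[feature <= threshold]
    right = labels[feature > threshold]
    weighted = (len(left) * entropy(left) + len(right) * entropy(right)) / len(labels)
    return entropy(labels) - weighted

# evaluate every feature against the class column and keep the best split
gains = {col: information_gain(X[col], y) for col in X.columns}
print(gains)
print('best feature:', max(gains, key=gains.get))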
I want to make a new column based on the date ranges of an existing column:
df['block']= np.where((df['transacted_date']> '2016-06-01') & (df['transacted_date']< '2016-09-01') ,0,'None')
df['block']= np.where((df['transacted_date']> '2016-09-01') & (df['transacted_date']< '2016-12-01') ,1,'None')
Is there a way to do this with an if/elif statement?
Try using np.select:
m1 = (df['transacted_date'] > '2016-06-01') & (df['transacted_date'] < '2016-09-01')
m2 = (df['transacted_date'] > '2016-09-01') & (df['transacted_date'] < '2016-12-01')
df['block'] = np.select(condlist=[m1, m2],
                        choicelist=[0, 1],
                        default=None)
Use numpy.select with Series.between:
# inclusive='neither' excludes both endpoints (this was inclusive=False in older pandas)
m1 = df['transacted_date'].between('2016-06-01', '2016-09-01', inclusive='neither')
m2 = df['transacted_date'].between('2016-09-01', '2016-12-01', inclusive='neither')
df['block'] = np.select([m1, m2], [0, 1], default=None)
If need if-else solution:
def f(x):
    if (x > pd.Timestamp('2016-06-01')) and (x < pd.Timestamp('2016-09-01')):
        return 0
    elif (x > pd.Timestamp('2016-09-01')) and (x < pd.Timestamp('2016-12-01')):
        return 1
    else:
        return None

df['block'] = df['transacted_date'].apply(f)
If you need a more general solution, use cut with numpy.where, because cut cannot create None or NaN labels:
b = pd.to_datetime([pd.Timestamp.min,'2016-06-01','2016-09-01','2016-12-01',pd.Timestamp.max])
s = pd.cut(df['transacted_date'], bins=b, labels=[-2, 0, 1, -1])
df['block1'] = np.where(s.astype(int) >= 0, s, np.nan)
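To make the cut-based variant concrete, here is a quick check on sample dates I made up (same bins and labels as above):

import numpy as np
import pandas as pd

df = pd.DataFrame({'transacted_date': pd.to_datetime(
    ['2016-07-15', '2016-10-01', '2017-01-05'])})
b = pd.to_datetime([pd.Timestamp.min, '2016-06-01', '2016-09-01',
                    '2016-12-01', pd.Timestamp.max])
s = pd.cut(df['transacted_date'], bins=b, labels=[-2, 0, 1, -1])
df['block1'] = np.where(s.astype(int) >= 0, s, np.nan)
print(df)
# expected: 2016-07-15 -> 0, 2016-10-01 -> 1, 2017-01-05 -> NaN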
I am trying to solve USACO's Milking Cows problem. The problem statement is here: https://train.usaco.org/usacoprob2?S=milk2&a=n3lMlotUxJ1
Given a series of intervals in the form of a 2d array, I have to find the longest interval and the longest interval in which no milking was occurring.
Ex. Given the array [[500,1200],[200,900],[100,1200]], the longest interval would be 1100 as there is continuous milking and the longest interval without milking would be 0 as there are no rest periods.
I have tried looking at whether utilizing a dictionary would decrease run times but I haven't had much success.
f = open('milk2.in', 'r')
w = open('milk2.out', 'w')

# getting the input
farmers = int(f.readline().strip())
schedule = []
for i in range(farmers):
    schedule.append(f.readline().strip().split())
#schedule = data

minvalue = 0
maxvalue = 0
# getting the minimums and maximums of the data
for time in range(farmers):
    schedule[time][0] = int(schedule[time][0])
    schedule[time][1] = int(schedule[time][1])
    if minvalue == 0:
        minvalue = schedule[time][0]
    if maxvalue == 0:
        maxvalue = schedule[time][1]
    minvalue = min(schedule[time][0], minvalue)
    maxvalue = max(schedule[time][1], maxvalue)

filled_thistime = 0
filled_max = 0
empty_max = 0
empty_thistime = 0
# goes through all the possible points in between the minimum and the maximum
for point in range(minvalue, maxvalue):
    isfilled = False
    # goes through all the data for each point value in order to find the best values
    for check in range(farmers):
        if point >= schedule[check][0] and point < schedule[check][1]:
            filled_thistime += 1
            empty_thistime = 0
            isfilled = True
            break
    if isfilled == False:
        filled_thistime = 0
        empty_thistime += 1
    if filled_max < filled_thistime:
        filled_max = filled_thistime
    if empty_max < empty_thistime:
        empty_max = empty_thistime

print(filled_max)
print(empty_max)
if filled_max < filled_thistime:
    filled_max = filled_thistime
w.write(str(filled_max) + " " + str(empty_max) + "\n")
f.close()
w.close()
The program works fine, but I need to decrease the time it takes to run.
A less pretty but more efficient approach would be to solve this like a free list, though it is a bit more tricky since the ranges can overlap. This method only requires looping through the input list a single time.
def insert(start, end):
    for existing in times:
        existing_start, existing_end = existing
        # New time is a subset of existing time
        if start >= existing_start and end <= existing_end:
            return
        # New time ends during existing time
        elif end >= existing_start and end <= existing_end:
            times.remove(existing)
            return insert(start, existing_end)
        # New time starts during existing time
        elif start >= existing_start and start <= existing_end:
            # existing[1] = max(existing_end, end)
            times.remove(existing)
            return insert(existing_start, end)
        # New time is a superset of existing time
        elif start <= existing_start and end >= existing_end:
            times.remove(existing)
            return insert(start, end)
    times.append([start, end])

data = [
    [500, 1200],
    [200, 900],
    [100, 1200]
]

times = [data[0]]
for start, end in data[1:]:
    insert(start, end)

longest_milk = 0
longest_gap = 0
for i, time in enumerate(times):
    duration = time[1] - time[0]
    if duration > longest_milk:
        longest_milk = duration
    if i != len(times) - 1 and times[i + 1][0] - times[i][1] > longest_gap:
        longest_gap = times[i + 1][0] - times[i][1]

print(longest_milk, longest_gap)
As stated in the comments, the complexity could be O(n) if the input is sorted; if it's not, we need to sort it first, and the complexity is O(n log n):
lst = [ [300,1000],
        [700,1200],
        [1500,2100] ]

from itertools import groupby

longest_milking = 0
longest_idle = 0

l = sorted(lst, key=lambda k: k[0])
for v, g in groupby(zip(l[::1], l[1::1]), lambda k: k[1][0] <= k[0][1]):
    l = [*g][0]
    if v:
        mn, mx = min(i[0] for i in l), max(i[1] for i in l)
        if mx - mn > longest_milking:
            longest_milking = mx - mn
    else:
        mx = max((i2[0] - i1[1] for i1, i2 in zip(l[::1], l[1::1])))
        if mx > longest_idle:
            longest_idle = mx

# corner case, N=1 (only one interval)
if len(lst) == 1:
    longest_milking = lst[0][1] - lst[0][0]

print(longest_milking)
print(longest_idle)
Prints:
900
300
For input:
lst = [ [500,1200],
[200,900],
[100,1200] ]
Prints:
1100
0
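For comparison, the canonical approach to this problem (a sketch of mine, not one of the posted answers) is to sort the intervals by start time and merge overlaps in a single pass, which is O(n log n) in the number of intervals and independent of the size of the time range:

def longest_intervals(schedule):
    """Merge overlapping intervals; return (longest milked, longest idle)."""
    intervals = sorted(schedule)
    merged = [list(intervals[0])]
    for start, end in intervals[1:]:
        if start <= merged[-1][1]:  # overlaps or touches the last merged block
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])
    longest_milked = max(end - start for start, end in merged)
    longest_idle = max(
        (nxt[0] - cur[1] for cur, nxt in zip(merged, merged[1:])), default=0)
    return longest_milked, longest_idle

print(longest_intervals([[500, 1200], [200, 900], [100, 1200]]))  # (1100, 0)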
I have just started to learn Python and don't have much of a dev background. Here is the code I have written while learning.
I now want to make a function which does exactly what my for loop is doing, but which computes a different exp column (exp, exp1, etc.) based on a different num column (num, num1, etc.).
How can I do this?
import pandas as pd

index = [0, 1]
s = pd.Series(['a', 'b'], index=index)
t = pd.Series([1, 2], index=index)
t1 = pd.Series([3, 4], index=index)
df = pd.DataFrame(s, columns=["str"])
df["num"] = t
df['num1'] = t1
print(df)

exp = []
for index, row in df.iterrows():
    if row['str'] == 'a':
        row['mul'] = -1 * row['num']
        exp.append(row['mul'])
    else:
        row['mul'] = 1 * row['num']
        exp.append(row['mul'])
df['exp'] = exp
print(df)
This is what I was trying to do, which gives wrong results:
import pandas as pd

index = [0, 1]
s = pd.Series(['a', 'b'], index=index)
t = pd.Series([1, 2], index=index)
t1 = pd.Series([3, 4], index=index)
df = pd.DataFrame(s, columns=["str"])
df["num"] = t
df['num1'] = t1

def f(x):
    exp = []
    for index, row in df.iterrows():
        if row['str'] == 'a':
            row['mul'] = -1 * x
            exp.append(row['mul'])
        else:
            row['mul'] = 1 * x
            exp.append(row['mul'])
    return exp

df['exp'] = df['num'].apply(f)
df['exp1'] = df['num1'].apply(f)
df
Per the suggestion below, I would do:
df['exp']=np.where(df.str=='a',df['num']*-1,df['num']*1)
df['exp1']=np.where(df.str=='a',df['num1']*-1,df['num1']*1)
I think you are looking for np.where
df['exp'] = np.where(df.str == 'a', df['num'] * -1, df['num'] * 1)
df
Out[281]:
  str  num  num1  exp
0   a    1     3   -1
1   b    2     4    2
Normal dataframe operation:
df["exp"] = df.apply(lambda x: x["num"] * (-1 if x["str"] == "a" else 1), axis=1)
Mathematical dataframe operation:
df["exp"] = ((df["str"] != 'a') - 0.5) * 2 * df["num"]