Create boolean flag in pandas from signal's crossings - python

I would like to create a flag column by applying a function to one column of a pandas DataFrame.
The intention of the function is to set the value to 1 when the signal crosses upwards over -1, and to reset it to 0 when the signal crosses downwards under 1.
Here is my code example; I just can't get the function to work:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
x = np.arange(0, 10, 0.01)
x2 = np.arange(0, 20, 0.02)
sin1 = np.sin(x)
sin2 = np.sin(x2)
x2 /= 2
sin3 = sin1 + sin2
df = pd.DataFrame(sin3)
#name signal column
df.columns = ['signal']
df.signal.plot()
def my_flag(x):
    # cross over -1
    ok1 = (x.iloc[-1] > -1) * 1
    ok2 = (x.iloc[-2] < -1) * 1
    activate = (ok1 * ok2) > 0.5
    if activate:
        flag_activate = 1
    # OFF
    # cross under 1
    ok3 = (x.iloc[-1] < 1) * 1
    ok4 = (x.iloc[-2] > 1) * 1
    inactivate = (ok3 * ok4) > 0.5
    if inactivate:
        flag_activate = 0
    # add to df
    return flag_activate
df['the_flag'] = df['signal'].apply(my_flag)
#I have set the flag to 0 for plotting purposes for demo,
# should be replaced when my_flag function works
df['the_flag'] = 0
fig, (ax1,ax2) = plt.subplots(2)
ax1.plot(df['signal'])
ax1.set_title('signal')
y1 = -1
y2 = 1
ax1.axhline(y1,color='r')
I have made a "cartoon picture" of what I would like the flag to look like for a sine signal:

We can first detect the -1 and +1 crossings, requiring that the signal crosses up and down over them, respectively. This can be done by shifting the signal left and right by 1 and comparing against -1/+1 with the crossing behaviour in mind:
neg_1_crossings = np.where((sin3[:-1] < -1) & (sin3[1:] > -1))[0]
pos_1_crossings = np.where((sin3[:-1] > +1) & (sin3[1:] < +1))[0]
For the -1 cross-ups: the first mask requires the previous values to be less than -1, the second requires the next values to be greater than -1. Similar for the +1 crossings, except with the operators flipped.
Now we have:
>>> neg_1_crossings
array([592], dtype=int64)
>>> pos_1_crossings
array([157, 785], dtype=int64)
I'd run for loops here to get the flag:
flag = np.zeros_like(sin3)
for neg_cross in neg_1_crossings:
    # a `neg_cross` raises the flag
    flag[neg_cross:] = 1
    for pos_cross in pos_1_crossings:
        if pos_cross > neg_cross:
            # once we hit a `pos_cross` later on, restrict the flag's ON
            # periods to be between the `neg_cross` and this `pos_cross`
            flag[pos_cross:] = 0
            # we are done with this `neg_cross`
            break
which gives the desired on/off flag.
Overall:
def get_flag(col):
    """
    `col` is a pd.Series
    """
    # signal in numpy domain; also its shifted versions
    signal = col.to_numpy()
    sig_shifted_left = signal[1:]
    sig_shifted_right = signal[:-1]
    # detect crossings
    neg_1_crossings = np.where((sig_shifted_right < -1) & (sig_shifted_left > -1))[0]
    pos_1_crossings = np.where((sig_shifted_right > +1) & (sig_shifted_left < +1))[0]
    # form the `flag` signal
    flag = np.zeros_like(signal)
    for neg_cross in neg_1_crossings:
        # a `neg_cross` raises the flag
        flag[neg_cross:] = 1
        for pos_cross in pos_1_crossings:
            if pos_cross > neg_cross:
                # once we hit a `pos_cross` later on, restrict the flag's ON
                # periods to be between the `neg_cross` and this `pos_cross`
                flag[pos_cross:] = 0
                # we are done with this `neg_cross`
                break
    return flag
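Assuming the DataFrame from the question, applying it is then (a sketch):
df['the_flag'] = get_flag(df['signal'])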

You can use shift and query to find where the signal crosses your interval boundaries:
df["shifted"] = df.signal.shift(-1)
start = df.query("shifted <= -1 and signal >= -1")
stop = df.query("shifted <= 1 and signal >= 1")
Then you can use these crossings to set your flag column; there is probably a more compact way to do this in pandas (one possibility is sketched after the loop):
df["flag"] = False
# pair each left boundary with the closest right one, if any
for l in start.index.values:
try:
r = stop.index.values[stop.index.values > l][0]
df.loc[l:r, "flag"] = True
except:
continue
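For the more compact variant (a sketch, assuming the same start/stop definitions as above), record the state at each crossing and forward-fill it:
state = pd.Series(np.nan, index=df.index)
state[start.index] = 1.0  # the flag turns on at a start crossing
state[stop.index] = 0.0   # ...and off again at a stop crossing
df["flag"] = state.ffill().fillna(0).astype(bool)
Note this keeps the flag on after a start with no later stop, whereas the try/except loop leaves it off.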
Let's see if this works:
df.signal.plot()
start.signal.plot(marker="o", lw=0)
stop.signal.plot(marker="o", lw=0)
df.flag.astype(int).plot()

Related

How to plot 3 or more values in plot.bar()

I managed to use plot.bar() with 2 values by putting them in a list, but I'm unable to plot 3 values.
I tried plot.bar(x, y, z), but it didn't work.
ce_data = ce_data.drop(
    ['pchangeinOpenInterest', 'totalTradedVolume', 'impliedVolatility',  # this removes unnecessary items
     'pChange', 'totalBuyQuantity', 'totalSellQuantity', 'bidQty',
     'bidprice', 'askQty', 'askPrice', 'askQty', 'identifier', 'lastPrice', 'change', 'expiryDate',
     'underlying'], axis=1)[
    ['openInterest', 'changeinOpenInterest', 'strikePrice', 'underlyingValue']]
style.use('ggplot')
ce_data.to_csv('kumar.csv')
df = pd.read_csv('kumar.csv', parse_dates=True, index_col=0)
pivot = df.iloc[2, 3]  # this selects the strike price
pivot_round = round(pivot, -2)  # round off the price
x = df['strikePrice'].tolist()
y = df['changeinOpenInterest'].tolist()
z = df['openInterest'].tolist()
for i in range(len(x)):
    if int(x[i]) >= pivot_round - 400:
        xleftpos = i
        break
for i in range(len(x)):
    if int(x[i]) >= pivot_round + 400:
        xrightpos = i
        break
x = x[xleftpos:xrightpos]
y = y[xleftpos:xrightpos]
z = z[xleftpos:xrightpos]
plot.bar([value for value in range(len(x))], y)
plot.set_xticks([idx + 0.5 for idx in range(len(x))])
plot.set_xticklabels(x, rotation=35, ha='right', size=10)
I am expecting strike price on the x-axis, with y and z (change in OI and OI) as bars.
IIUC, here's how I'd do it. This should give a single x-axis with 'strikePrice' and two bars, for 'changeinOpenInterest' and 'openInterest':
disp_df = df.set_index('strikePrice')[['changeinOpenInterest', 'openInterest']]
disp_df.plot(kind='bar')
You can add the bells and whistles you want to the plot, but this avoids a lot of the manipulation you did above.
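For example, the labelling could look like this (a sketch; the label text is my own choice):
ax = disp_df.plot(kind='bar', figsize=(10, 6))
ax.set_xlabel('strikePrice')
ax.set_ylabel('open interest')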

Random walk series between start-end values and within minimum/maximum limits

How can I generate random walk data between given start and end values,
while not passing over the maximum value and not going under the minimum value?
Here is my attempt to do this, but for some reason the series sometimes goes over the max or under the min value. The start and the end values are respected, but not the minimum and the maximum. How can this be fixed? I would also like to specify the standard deviation of the fluctuations but don't know how; I use randomPerc for the fluctuation, but this is wrong, as I would like to specify the std instead.
import numpy as np
import matplotlib.pyplot as plt
def generateRandomData(length, randomPerc, min, max, start, end):
    data_np = (np.random.random(length) - randomPerc).cumsum()
    data_np *= (max - min) / (data_np.max() - data_np.min())
    data_np += np.linspace(start - data_np[0], end - data_np[-1], len(data_np))
    return data_np

randomData = generateRandomData(length=1000, randomPerc=0.5, min=50, max=100, start=66, end=80)
## print values
print("Max Value",randomData.max())
print("Min Value",randomData.min())
print("Start Value",randomData[0])
print("End Value",randomData[-1])
print("Standard deviation",np.std(randomData))
## plot values
plt.figure()
plt.plot(range(randomData.shape[0]), randomData)
plt.show()
plt.close()
Here is a simple loop which checks for series that go under the minimum or over the maximum value. This is exactly what I am trying to avoid. The series should stay between the given min and max limits.
## generate 1000 series and check if any values go over the maximum limit or under the minimum limit
for i in range(1000):
    randomData = generateRandomData(length=1000, randomPerc=0.5, min=50, max=100, start=66, end=80)
    if randomData.min() < 50:
        print(i, "Value Lower than Min limit")
    if randomData.max() > 100:
        print(i, "Value Higher than Max limit")
As you impose conditions on your walk, it cannot be considered purely random. Anyway, one way is to generate the walk iteratively and check the boundaries on each iteration. But if you wanted a vectorized solution, here it is:
def bounded_random_walk(length, lower_bound, upper_bound, start, end, std):
    assert (lower_bound <= start) and (lower_bound <= end)
    assert (start <= upper_bound) and (end <= upper_bound)

    bounds = upper_bound - lower_bound
    rand = (std * (np.random.random(length) - 0.5)).cumsum()
    rand_trend = np.linspace(rand[0], rand[-1], length)
    rand_deltas = (rand - rand_trend)
    rand_deltas /= np.max([1, (rand_deltas.max() - rand_deltas.min()) / bounds])

    trend_line = np.linspace(start, end, length)
    upper_bound_delta = upper_bound - trend_line
    lower_bound_delta = lower_bound - trend_line

    upper_slips_mask = (rand_deltas - upper_bound_delta) >= 0
    upper_deltas = rand_deltas - upper_bound_delta
    rand_deltas[upper_slips_mask] = (upper_bound_delta - upper_deltas)[upper_slips_mask]

    lower_slips_mask = (lower_bound_delta - rand_deltas) >= 0
    lower_deltas = lower_bound_delta - rand_deltas
    rand_deltas[lower_slips_mask] = (lower_bound_delta + lower_deltas)[lower_slips_mask]

    return trend_line + rand_deltas

randomData = bounded_random_walk(1000, lower_bound=50, upper_bound=100, start=50, end=100, std=10)
You can see it as the solution of a geometric problem. The trend_line connects your start and end points, with margins defined by lower_bound and upper_bound. rand is your random walk, rand_trend is its trend line, and rand_deltas is its deviation from that trend line. We collocate the trend lines and want to make sure the deltas do not exceed the margins: when rand_deltas exceeds the allowed margin, we "fold" the excess back towards the bounds.
At the end you add the resulting random deltas to the start => end trend line, thus obtaining the desired bounded random walk.
The std parameter controls the amount of variation of the random walk.
Update: fixed the assertions. In this version "std" is not promised to be the "interval".
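A quick sanity check, mirroring the loop from the question (my own sketch):
for i in range(1000):
    walk = bounded_random_walk(1000, lower_bound=50, upper_bound=100, start=66, end=80, std=10)
    if walk.min() < 50 or walk.max() > 100:
        print(i, "bounds violated")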
I noticed you used built-in functions as argument names (min and max), which is not recommended, so I changed these to min_1 and max_1. Other than this, your code should work as expected:
def generateRandomData(length, randomPerc, min_1, max_1, start, end):
    data_np = (np.random.random(length) - randomPerc).cumsum()
    data_np *= (max_1 - min_1) / (data_np.max() - data_np.min())
    data_np += np.linspace(start - data_np[0], end - data_np[-1], len(data_np))
    return data_np

randomData = generateRandomData(1000, 0.5, 50, 100, 66, 80)
If you are willing to modify your code this will work:
import random

for_fill = []
# generate 1000 samples within the specified range and save them in for_fill
for x in range(1000):
    generate_rnd_df = random.uniform(50, 100)
    for_fill.append(generate_rnd_df)
# set starting and end point manually
for_fill[0] = 60
for_fill[999] = 80
Here is one way, very crudely expressed in code.
>>> import random
>>> steps = 1000
>>> start = 66
>>> end = 80
>>> step_size = (50,100)
Generate 1,000 steps assured to be within the required range.
>>> crude_walk_steps = [random.uniform(*step_size) for _ in range(steps)]
>>> import numpy as np
Turn these steps into a walk but notice that they fail to meet the requirements.
>>> crude_walk = np.cumsum(crude_walk_steps)
>>> min(crude_walk)
57.099056617839288
>>> max(crude_walk)
75048.948693623403
Calculate a simple linear transformation to scale the steps.
>>> from sympy import *
>>> var('a b')
(a, b)
>>> solve([57.099056617839288*a+b-66,75048.948693623403*a+b-80])
{b: 65.9893403510312, a: 0.000186686954219243}
Scale the steps.
>>> walk = [0.000186686954219243*_+65.9893403510312 for _ in crude_walk]
Verify that the scaled walk now stays within the intended range.
>>> min(walk)
65.999999999999986
>>> max(walk)
79.999999999999986
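If you prefer to avoid sympy, the same two-point fit can be computed directly (a sketch; like the solve above, it maps the walk's observed min to start and its max to end):
>>> lo, hi = min(crude_walk), max(crude_walk)
>>> a = (end - start) / (hi - lo)
>>> b = start - a * lo
>>> walk = [a * v + b for v in crude_walk]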
You can also generate a stream of random walks and filter out those that do not meet your constraints. Just be aware that by filtering they are not really 'random' anymore.
The code below creates an infinite stream of 'valid' random walks. Be careful with very tight constraints; the 'next' call might take a while ;).
import itertools
import numpy as np

def make_random_walk(first, last, min_val, max_val, size):
    # Generate a sequence of random steps of length `size-2`
    # that will be taken between the start and stop values.
    steps = np.random.normal(size=size - 2)
    # The walk is the cumsum of those steps.
    walk = steps.cumsum()
    # Performing the walk from the start value gives you your series.
    series = walk + first
    # Compare the target min and max values with the observed ones.
    target_min_max = np.array([min_val, max_val])
    observed_min_max = np.array([series.min(), series.max()])
    # Calculate the absolute 'overshoot' for min and max values.
    f = np.array([-1, 1])
    overshoot = (observed_min_max * f - target_min_max * f)
    # Calculate the scale factor to constrain the walk within the
    # target min/max values. Don't upscale.
    correction_base = [walk.min(), walk.max()][np.argmax(overshoot)]
    scale = min(1, (correction_base - overshoot.max()) / correction_base)
    # Generate the scaled series.
    new_steps = steps * scale
    new_walk = new_steps.cumsum()
    new_series = new_walk + first
    # Check the size of the final step necessary to reach the target endpoint.
    last_step_size = abs(last - new_series[-1])  # step needed to reach the desired end
    # Is it larger than the largest previously observed step?
    if last_step_size > np.abs(new_steps).max():
        # If so, consider this series invalid.
        return None
    else:
        # Else, we found a valid series that meets the constraints.
        return np.concatenate((np.array([first]), new_series, np.array([last])))
start = 66
stop = 80
max_val = 100
min_val = 50
size = 1000

# Create an infinite stream of candidate series.
candidate_walks = (
    (i, make_random_walk(first=start, last=stop, min_val=min_val, max_val=max_val, size=size))
    for i in itertools.count()
)
# Filter out the invalid ones.
valid_walks = ((i, w) for i, w in candidate_walks if w is not None)

idx, walk = next(valid_walks)  # Get the next valid series
print(
    "Walk #{}: min/max({:.2f}/{:.2f})"
    .format(idx, walk.min(), walk.max())
)
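Since valid_walks is an infinite generator, you can also pull several walks in a row, e.g. with itertools.islice (a sketch):
# take the next three valid walks from the stream
for idx, walk in itertools.islice(valid_walks, 3):
    print("Walk #{}: min/max({:.2f}/{:.2f})".format(idx, walk.min(), walk.max()))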

Pandas DataFrame Logic - Python

Trying to backtest trading logic for fun, but I can't seem to figure out how to utilize numpy to make decisions. For example, I want to set df['position'] = 1 or -1 based on whether the data is below or above the upper and lower lines. If Data <= the lower line, I want to set position = 1 and keep it at 1 until Data >= the upper line. Once Data >= the upper line, I want to set position = -1, keep it at -1, and then repeat.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
data = np.random.standard_normal((5, 100)).flatten()
data = data.cumsum()
df = pd.DataFrame({'Data': data})
df['std'] = df['Data'].rolling(50).std()
df['SMA'] = df['Data'].rolling(50).mean()
df['upper'] = df['SMA'] + (2 * df['std'])
df['lower'] = df['SMA'] - (2 * df['std'])
df[['Data', 'SMA', 'upper', 'lower']].plot(figsize=(10, 6))
df['position'] = 0
plt.show()
Here I try to do just that but fail because I don't know how to do this properly.
df['islower'] = np.where(df['Data'] < df['lower'], 1, 0)
df['isupper'] = np.where(df['Data'] > df['upper'], 1, 0)
df['position'] = np.where(df['isupper']==1, -1, 0) | np.where(df['islower']==1, 1, 0)
I think what you want for the crossing masks is simply:
df['islower'] = (df['Data'] <= df['lower']).astype(int)
df['isupper'] = (df['Data'] >= df['upper']).astype(int)
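For the hold-until-the-opposite-crossing behaviour described in the question, one option (a sketch, not the only way) is to mark only the crossing rows and forward-fill the position in between:
df['position'] = np.where(df['Data'] >= df['upper'], -1.0,
                          np.where(df['Data'] <= df['lower'], 1.0, np.nan))
df['position'] = df['position'].ffill().fillna(0)  # 0 before the first crossing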

Creating Univariate Moran Scatterplot in PySal

I'm trying to create a Moran's scatterplot using PySAL -- the one with HH/HL/LH/LL quadrants -- and think I've got there but would like to check my understanding/interpretation/code. The code below uses the built-in North Carolina SIDS data set and row-standardisation.
import os
import numpy as np
import pysal as ps
import matplotlib.pyplot as plt
import matplotlib.cm as cm
# shpdir is wherever the PySAL example data are installed
col = 'SIDR74'
w = ps.open(os.path.join(shpdir, "sids2.gal")).read()
f = ps.open(os.path.join(shpdir, "sids2.dbf"))
y = np.array(f.by_col(col))
w.transform = 'r'
### Are these next three steps right? ###
# Calculate the spatial lag
yl = ps.lag_spatial(w, y)
# Z-Score standardisation
yt = (y - y.mean())/y.std()
ylt = (yl - yl.mean())/yl.std()
# Elements of a Moran's I Scatterplot
# X-axis = z-standardised attribute values
# Y-axis = z-standardised lagged attribute values
# Quadrants = HH=1, LH=2, LL=3, HL=4
#
# So from that it follows that:
# HH == ylt > 0 and yt > 0 = 1
# LH == ylt > 0 and yt < 0 = 2
# LL == ylt < 0 and yt < 0 = 3
# HL == ylt < 0 and yt > 0 = 4
# Initialise an array with a default
# value to hold the quadrant information
quad = np.zeros(yt.shape)
quad[np.bitwise_and(ylt > 0, yt > 0)]=1 # HH
quad[np.bitwise_and(ylt > 0, yt < 0)]=2 # LH
quad[np.bitwise_and(ylt < 0, yt < 0)]=3 # LL
quad[np.bitwise_and(ylt < 0, yt > 0)]=4 # HL
plt.scatter(yt, ylt, c=quad, cmap=cm.summer)
plt.suptitle("Moran Scatterplot?")
plt.show()
That produces something that seems reasonable, but I think I've thought myself into knots on the basis that I've not actually calculated Moran's I yet (via ps.Moran_Local(...)) and this is called a Moran scatterplot...
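One way to sanity-check the quadrant assignment is to compare it against the q attribute of ps.Moran_Local, which uses the same 1=HH, 2=LH, 3=LL, 4=HL coding (a sketch, assuming the y and w from above; points lying exactly on an axis may differ):
lm = ps.Moran_Local(y, w)
print(np.array_equal(lm.q, quad.astype(int)))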

How can I vectorize this for loop in numpy?

The code is below:
import numpy as np
X = np.array(range(15)).reshape(5, 3)  # X's element values are meaningless
flag = np.random.randn(5, 4)
y = np.array([0, 1, 2, 3, 0])  # y's values are in range(flag.shape[1]) and y.shape[0] equals X.shape[0]
dW = np.zeros((3, 4))  # dW.shape equals (X.shape[1], flag.shape[1])
for i in range(5):
    for j in range(4):
        if flag[i, j] > 0:
            dW[:, j] += X[i, :].T
            dW[:, y[i]] -= X[i, :].T
To compute dW more efficiently, how can I vectorize this for loop?
Here's how I'd do it:
# has shape (X.shape[1],) + flag.shape
masked = np.where(flag > 0, X.T[...,np.newaxis], 0)
# sum over the i index
dW = masked.sum(axis=1)
# sum over the j index
np.subtract.at(dW, np.s_[:,y], masked.sum(axis=2))
# dW[:,y] -= masked.sum(axis=2) does not work here
See the documentation of ufunc.at for an explanation of that last comment.
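The buffering matters because y contains duplicates (y = [0, 1, 2, 3, 0], so column 0 must be hit twice). A tiny illustration of the difference (my own example):
a = np.zeros(4)
a[[0, 0]] -= 1                 # buffered: index 0 is applied only once -> a[0] == -1.0
np.subtract.at(a, [0, 0], 1)   # unbuffered: applied twice more -> a[0] == -3.0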
Here's a vectorized approach based upon np.add.reduceat -
# --------------------- Setup output array ----------------------------------
dWOut = np.zeros((X.shape[1], flag.shape[1]))
# ------ STAGE #1 : Vectorize calculations for "dW[:,j] += X[i,:].T" --------
# Get indices where flag's transposed version has > 0
idx1 = np.argwhere(flag.T > 0)
# Row-extended version of X using idx1's col2 that corresponds to i-iterator
X_ext1 = X[idx1[:,1]]
# Get the indices at which the columns change
shift_idx1 = np.append(0,np.where(np.diff(idx1[:,0])>0)[0]+1)
# Use the changing indices as boundaries for add.reduceat to add
# groups of rows from extended version of X
dWOut[:,np.unique(idx1[:,0])] += np.add.reduceat(X_ext1,shift_idx1,axis=0).T
# ------ STAGE #2 : Vectorize calculations for "dW[:,y[i]] -= X[i,:].T" -------
# Repeat the same philosophy for this second stage, except we need to index into y.
# So, that would involve sorting and also the iterator involved is just "i".
idx2 = idx1[idx1[:,1].argsort()]
cols_idx1 = y[idx2[:,1]]
X_ext2 = X[idx2[:,1]]
sort_idx = (y[idx2[:,1]]).argsort()
X_ext2 = X_ext2[sort_idx]
shift_idx2 = np.append(0,np.where(np.diff(cols_idx1[sort_idx])>0)[0]+1)
dWOut[:,np.unique(cols_idx1)] -= np.add.reduceat(X_ext2,shift_idx2,axis=0).T
You can do this:
ff = (flag > 0) * 1
ff = ff.reshape((5, 4, 1, 1))
XX = ff * X
[ii, jj] = np.meshgrid(np.arange(5), np.arange(4))
dW[:, jj] += XX[ii, jj, ii, :].transpose((2, 0, 1))
dW[:, y[ii]] -= XX[ii, jj, ii, :].transpose((2, 0, 1))
You can further merge and fold these expressions to get a one-liner but it won't add any more performance.
Update #1: Yep, sorry, this is not giving correct results; I had a typo in my check.
