find start end index of bouts of consecutive equal values - python

given a dataframe df
df = pandas.DataFrame(data=[1,0,0,1,1,1,0,1,0,1,1,1],columns = ['A'])
df
Out[20]:
A
0 1
1 0
2 0
3 1
4 1
5 1
6 0
7 1
8 0
9 1
10 1
11 1
I would like to find the start and end index of interval of ones larger than 3.
In this case what I expect is
(3,5 and 9,11)

Use the shifting cumsum trick to mark consecutive groups, then use groupby to get indices and filter by your conditions.
v = (df['A'] != df['A'].shift()).cumsum()
u = df.groupby(v)['A'].agg(['all', 'count'])
m = u['all'] & u['count'].ge(3)
df.groupby(v).apply(lambda x: (x.index[0], x.index[-1]))[m]
A
3 (3, 5)
7 (9, 11)
dtype: object

I don't explicitly know Pandas, but I do know Python, and took this as a small challenge:
def find_sub_in_list(my_list, sublist, greedy=True):
matches = []
results = []
for item in range(len(my_list)):
aux_list = my_list[item:]
if len(sublist) > len(aux_list) or len(aux_list) == 0:
break
start_match = None
end_pos = None
if sublist[0] == my_list[item]:
start_match = item
for sub_item in range(len(sublist)):
if sublist[sub_item] != my_list[item+sub_item]:
end_pos = False
if end_pos == None and start_match != None:
end_pos = start_match+len(sublist)
matches.append([start_match, end_pos])
if greedy:
results = []
for match in range(len(matches)-1):
if matches[match][1] > matches[match+1][0]:
results.append([matches[match][0], matches[match+1][1]])
else:
results.append(matches[match])
else:
results = matches
return results
my_list = [1,1,1,0,1,1,0,1,1,1,1]
interval = 3
sublist = [1]*interval
matches = find_sub_in_list(my_list, sublist)
print(matches)

Related

How can I count the number of elements within a range in each row

I want to count the number of elements in a row that fall within a range and then print a new column with the results. After looking around I came up with the following solution, however the results are not consistent. Is the solution too simplistic for what I want to accomplish?
I have the following DataFrame.
import pandas as pd
import numpy as np
df = pd.DataFrame(np.random.randint(0,10,size=(10, 3)), columns=list('ABC'))
def val_1(row):
val_1 = 0
if row.loc['A'] and row.loc['B'] and row.loc["C"] in range(1,3):
val_1 = 3
elif row.loc['A'] and row.loc['B'] in range(1,3):
val_1 = 2
elif row.loc['A'] in range(1,3):
val_1 = 1
return val_1
def val_2(row):
val_2 = 0
if row.loc['A'] and row.loc['B'] and row.loc["C"] in range(3,6) :
val_2 = 3
elif row.loc['A'] and row.loc['B'] in range(3,6) :
val_2 = 2
elif row.loc['A'] in range(3,6) :
val_2 = 1
return val_2
def val_3(row):
val_3 = 0
if row.loc['A'] and row.loc['B'] and row.loc["C"] in range(6,10) :
val_3 = 3
elif row.loc['A'] and row.loc['B'] in range(6,10) :
val_3 = 2
elif row.loc['A']in range(6,10) :
val_3 = 1
return val_3
def results():
df['Val_1'] = df.apply(val_1, axis=1)
df['Val_2'] = df.apply(val_2, axis=1)
df['Val_3'] = df.apply(val_3, axis=1)
print(df)
results()
A B C Val_1 Val_2 Val_3
0 9 0 0 0 0 1
1 6 1 0 2 0 1
2 8 5 5 0 3 1
3 9 7 0 0 0 2
4 4 6 2 3 1 2
5 1 5 5 1 3 0
6 8 1 7 2 0 3
7 4 8 5 0 3 2
8 0 6 0 0 0 0
9 3 0 3 0 1 0
Thanks for your help.
This is shorter version of your code.
The problem is in your code:
It is checking the range only for last element
row.loc['A'] and row.loc['B'] and row.loc["C"] in range(start,end)
Only checking range logic for C column (Not for A & B)
def get_val(row,start,end):
val = 0
if row.loc['A'] and row.loc['B'] and row.loc["C"] in range(start,end) :
val = 3
elif row.loc['A'] and row.loc['B'] in range(start,end) :
val = 2
elif row.loc['A'] in range(start,end) :
val = 1
return val
df = pd.DataFrame(np.random.randint(0,10,size=(10, 3)), columns=list('ABC'))
df['Val_1'] = df.apply(lambda x:get_val(x,1,3), axis=1)
df['Val_2'] = df.apply(lambda x:get_val(x,3,6), axis=1)
df['Val_3'] = df.apply(lambda x:get_val(x,6,10), axis=1)
My Solution
def query_build(start, end):
query1 = f'A>={start} and A<={end} and B>={start} and B<={end} and C>={start} and C<={end}'
query2 = f'A>={start} and A<={end} and B>={start} and B<={end}'
query3 = f'A>={start} and A<={end}'
return {query1: 3,
query2: 2,
query3: 1}
df = pd.DataFrame(np.random.randint(0,10,size=(10, 3)), columns=list('ABC'))
df['val1'] = 0
df['val2'] = 0
df['val3'] = 0
val_range = {'val1':(1,2),'val2':(3,5),'val3':(6,9)}
for name, r_range in val_range.items():
query_set = query_build(*r_range)
for query, val in query_set.items():
length = len(df.query(query))
if length:
df[name][df.query(query).index] = val
print(df)
Another approach which is using numpy.select (recommended in the Pandas User Guide here):
import numpy as np
for n, (start, stop) in enumerate([(1, 3), (3, 6), (6, 10)], start=1):
m = df.isin(range(start, stop))
condlist = [m.all(axis=1), m[["A", "B"]].all(axis=1), m["A"]]
df[f"Val_{n}"] = np.select(condlist, [3, 2, 1])
For each iteration build a mask m on df that checks if the df values are in the resp. range(start, stop).
Based on m build a condition list condlist: The 1. entry checks if all values are in the range, the 2. checks if the values in columns A and B are in the range, and the 3. checks if the value in column A is in the range - all checks are done row-wise.
Based on condlist set the corresponding values from [3, 2, 1] (called choicelist in the numpy.select docs) in the new column, and 0 (standard default) if no condition is met.
The selection follows the preferences in you code - see the numpy.select documentation:
When multiple conditions are satisfied, the first one encountered in condlist is used.
Your question actually sounds a bit different to what you are doing:
I want to count the number of elements in a row that fall within a range and then print a new column with the results.
If that is your real goal, then you could try something simpler:
for n, (start, stop) in enumerate([(1, 3), (3, 6), (6, 10)], start=1):
df[f"Val_{n}"] = df.isin(range(start, stop)).sum(axis=1)
Why your approach fails (in addition to #Mazhar's explanation): This
if row.loc['A'] and row.loc['B'] and row.loc["C"] in range(1,3):
isn't how logical operators like and work: You have to fully specify each part of the condition, like:
if (row.loc['A'] in range(1,3)) and (row.loc['B'] in range(1,3)) and (row.loc["C"] in range(1,3)):
The way you have used it actually resolves to
if bool(row.loc['A']) and bool(row.loc['B']) and row.loc["C"] in range(1,3):
which here means (since the values are numbers)
if (row.loc['A'] != 0) and (row.loc['B'] != 0) and (row.loc["C"] in range(1,3)):

how to merge rows summing over each column without iteration

I have the following df:
prevent _p _n _id
0 1 0 0 83135
0 0 1 0 83135
0 0 1 0 82238
I would like to merge all rows having the same column _idby summing over each column for
the desired output in a dataframe, final (please note that if thee sum is greater than 1, the value should just be 1):
prevent _p _n _id
0 1 1 0 83135
0 0 1 0 82238
I can easily do this using the following code iterating over the dataframe:
final = pd.DataFrame()
for id_ in _ids:
out = df[df._id == id_]
prevent = 0
_p = 0
_n = 0
d = {}
if len(out) > 0:
for row in out.itertuples():
if prevent == 0:
prevent += row.prevent
if _p == 0:
_p += row._p
if _n == 0:
_n += row._n
d['_p'] = _p
d['_n'] = _n
d['prevent'] = prevent
t=pd.DataFrame([d])
t['_id'] = id_
final=pd.concat([final, t])
I have several hundred thousand rows, so this will be very inefficient. Is there a way to vectorize this?
Treat 0 and 1 as boolean with any, then convert them back to integers:
df.groupby("_id").any().astype("int").reset_index()
Check groupby
out = df.groupby('_id',as_index=False).sum()

In Python Pandas , searching where there are 4 consecutive rows where values going up

I am trying to figure out how I can mark the rows where the price are part of 4 increase prices .
the "is_consecutive" is actually the mark .
I managed to do the diff between the rows :
df['diff1'] = df['Close'].diff()
But I didn't managed to find out which row is a part of 4 increase prices .
I had a thought to use df.rolling() .
The exmple df,
On rows 0-3 , we need to get an output of 'True' on the ["is_consecutive"] column , because the ['diff1'] on this consecutive rows is increase for 4 rows .
On rows 8-11 , we need to get an output of 'False' on the ["is_consecutive"] column , because the ['diff1'] on this consecutive rows is zero .
Date Price diff1 is_consecutive
0 1/22/20 0 0 True
1 1/23/20 130 130 True
2 1/24/20 144 14 True
3 1/25/20 150 6 True
4 1/27/20 60 -90 False
5 1/28/20 95 35 False
6 1/29/20 100 5 False
7 1/30/20 50 -50 False
8 2/01/20 100 0 False
9 1/02/20 100 0 False
10 1/03/20 100 0 False
11 1/04/20 100 0 False
12 1/05/20 50 -50 False
general example :
if
price = [30,55,60,65,25]
the different form the consecutive number on the list will be :
diff1 = [0,25,5,5,-40]
So when the diff1 is plus its actually means the consecutive prices are increase .
I need to mark(in the df) the rows that have 4 consecutive that go up.
Thank You for help (-:
Try: .rolling with window of size 4 and min periods 1:
df["is_consecutive"] = (
df["Price"]
.rolling(4, min_periods=1)
.apply(lambda x: (x.diff().fillna(0) >= 0).all())
.astype(bool)
)
print(df)
Prints:
Date Price is_consecutive
0 1/22/20 0 True
1 1/23/20 130 True
2 1/24/20 144 True
3 1/25/20 150 True
4 1/26/20 60 False
5 1/26/20 95 False
6 1/26/20 100 False
7 1/26/20 50 False
Assuming the dataframe is sorted. One way is based on the cumsum of the differences to identify the first time an upward Price move succeeding a 3 days upwards trend (i.e. 4 days of upward trend).
quant1 = (df['Price'].diff().apply(np.sign) == 1).cumsum()
quant2 = (df['Price'].diff().apply(np.sign) == 1).cumsum().where(~(df['Price'].diff().apply(np.sign) == 1)).ffill().fillna(0).astype(int)
df['is_consecutive'] = (quant1-quant2) >= 3
note that the above takes into account only strictly increasing Prices (not equal).
Then we override also the is_consecutive tag for the previous 3 Prices to be also TRUE using the win_view self defined function:
def win_view(x, size):
if isinstance(x, list):
x = np.array(x)
if isinstance(x, pd.core.series.Series):
x = x.values
if isinstance(x, np.ndarray):
pass
else:
raise Exception('wrong type')
return np.lib.stride_tricks.as_strided(
x,
shape=(x.size - size + 1, size),
strides=(x.strides[0], x.strides[0])
)
arr = win_view(df['is_consecutive'], 4)
arr[arr[:,3]] = True
Note that we inplace replace the values to be True.
EDIT 1
Inspired by the self defined win_view function, I realized that the solution it can be obtained simply by win_view (without the need of using cumsums) as below:
df['is_consecutive'] = False
arr = win_view(df['Price'].diff(), 4)
arr_ind = win_view(list(df['Price'].index), 4)
mask = arr_ind[np.all(arr[:, 1:] > 0, axis=1)].flatten()
df.loc[mask, 'is_consecutive'] = True
We maintain 2 arrays, 1 for the returns and 1 for the indices. We collect the indices where we have 3 consecutive positive return np.all(arr[:, 1:] > 0, axis=1 (i.e. 4 upmoving prices) and we replace those in our original df.
The function will return columns named "consecutive_up" which represents all rows that are part of the 5 increase series and "consecutive_down" which represents all rows that are part of the 4 decrees series.
def c_func(temp_df):
temp_df['increase'] = temp_df['Price'] > temp_df['Price'].shift()
temp_df['decrease'] = temp_df['Price'] < temp_df['Price'].shift()
temp_df['consecutive_up'] = False
temp_df['consecutive_down'] = False
for ind, row in temp_df.iterrows():
if row['increase'] == True:
count += 1
else:
count = 0
if count == 5:
temp_df.iloc[ind - 5:ind + 1, 4] = True
elif count > 5:
temp_df.iloc[ind, 4] = True
for ind, row in temp_df.iterrows():
if row['decrease'] == True:
count += 1
else:
count = 0
if count == 4:
temp_df.iloc[ind - 4:ind + 1, 5] = True
elif count > 4:
temp_df.iloc[ind, 5] = True
return temp_df

Multiple conditional statements on list comprehension

So this is my code and I want to know if I can use list comprehension to execute the same operation (count the clusters within rows and output a list of length df.shape[0]). There are at least two rows for the same cluster number, but it can be more and they cycles. I tried but couldn't figure it out.
Any suggestions?
My code:
import pandas as pd
cluster_global = 0
cluster_relativo = 0
cluster_index = []
for index, row in df.iterrows():
if row['cluster'] == cluster_relativo:
cluster_index.append(cluster_global)
elif row['cluster'] == (cluster_relativo + 1):
cluster_global += 1
cluster_relativo += 1
cluster_index.append(cluster_global)
elif row['cluster'] == 0:
cluster_global += 1
cluster_relativo = 0
cluster_index.append(cluster_global)
The DataFrame looks like
index
cluster
0
0
1
0
2
1
3
1
4
1
5
2
6
2
7
0
8
0
...
...
n
m<40
Do you want this?
from itertools import groupby
result = [0 if index == 0 and key == 0
else index
for index, (key, group) in enumerate(groupby(my_values))
for _ in group
]
print(result)
Replace my_values in the list comprehension via - df['cluster'].values. to test

Why is Poisonous plants stack solution giving TLE?

Trying to solve hackerrank problem.
There are plants in a garden. Each of these plants has been added with some amount of pesticide. After each day, if any plant has more pesticide than the plant at its left, being weaker than the left one, it dies. You are given the initial values of the pesticide in each plant. Print the number of days after which no plant dies, i.e. the time after which there are no plants with more pesticide content than the plant to their left.
I have used stacks to solve this problem. Example below:
a = 6 5 8 4 7 10 9
10 > 9 a = 6 5 8 4 7 10 b = 9
7 > 10 a = 6 5 8 4 7 b = 9
4 > 7 a = 6 5 8 4 b = 9
8 > 4 a = 6 5 8 b = 9 4
5 > 8 a = 6 5 b = 9 4
6 > 5 a = 6 5 b = 9 4
after this just make a new list with a = a + b.reverse(). Start the process again and exit when list is sorted in reverse order.
This still is giving me time exceeded. Any idea?
n = int(input())
first_list = input().split()
first_list = [int(i) for i in first_list]
count = 0
second_list = []
data_1, data_2 = 0, 0
while True:
b = []
if sorted(first_list, reverse=True) == first_list:
break
data_1 = first_list.pop()
for i in range(len(first_list)-1):
data_2 = first_list.pop()
if data_1 > data_2:
pass
elif data_1 < data_2:
second_list.append(data_1)
elif data_1 == data_2:
second_list.append(data_1)
second_list.append(data_2)
data_1 = data_2
if len(first_list)>=1 and data_1 < first_list[0]:
first_list.append(data_1)
second_list.reverse()
first_list = first_list + second_list
count += 1
print(count)
Edited code:
n = int(input())
input = [int(i) for i in input().split()]
count, t_length = 0, 0
cmp = input[0]
while True:
length = len(input)
for i in range(1, length):
if input[i] == -1:
t_length += 1
continue
if input[i] > cmp:
cmp = input[i]
input[i] = -1
else:
t_length += 1
cmp = input[i]
if t_length+1 == length:
break
count += 1
cmp, t_length = input[0], 0
print(count)
I agree with Woot4Moo that something doesn't look right, and I suggest you focus more on stack use (instead of doubly linked lists). See this link to the Stock Span problem which helps detail an O(N) solution for tracking differences in a list of prices. This can be extended with a condition for pesticide.
For example, it'd be filling in the gaps of:
import sys
ps = [int(s) for s in list(sys.stdin)[1].strip().split()]
stack = [ps[0]]
max_days = 0
for i in range(1, len(ps)):
days[i] = 1 if ps[i] > ps[i-1] else 0
# TODO - Logic to update days[i]
while len(stack) > 0 and ps[i] <= stack[-1][1]:
(ps_other, days_other) = stack.pop()
stack.append((ps[i], days[i])
max_days = max(max_days, days[i])
print(max_days)
I quickly implemented in O(N^2), and found 80% of tests pass (the rest timing out), so more cleverly using the stack according to the link above should do the job.
import collections
import sys
ps = [int(s) for s in list(sys.stdin)[1].strip().split()]
ps_prev = []
days = 0
while collections.Counter(ps_prev) != collections.Counter(ps):
ps_prev = ps[:]
i = len(ps) - 1
while i > 0:
if ps[i] > ps[i-1]:
ps.pop(i)
i -= 1
days += 1
print(days - 1)
Edit: note that the sketchy sys.stdin use is to cater for the HackerRank test input.
Sorting is N log N and your data structure seems wrong to me. Why would you not use a doubly linked list since the requirement is that it is the left most neighbor. You would simply dereference the pointers that died.

Categories

Resources