Sampling pandas DF to match a second DF within error - python

Suppose I have two DFs, say df1 and df2, as follows:
import pandas as pd
import numpy as np
df1 = pd.DataFrame([[0,1,100],[1,1.1,120],[2,0.8,102]],columns=['id','a','b'])
df2 = pd.DataFrame([[0,0.5,110],[1,1.05,94],[2,0.96,145],[3,0.86,112],[4,1.3,97]],
columns=['id','a','b'])
print(df1)
   id    a    b
0   0  1.0  100
1   1  1.1  120
2   2  0.8  102
print(df2)
   id     a    b
0   0  0.50  110
1   1  1.05   94
2   2  0.96  145
3   3  0.86  112
4   4  1.30   97
Now, suppose I choose some interval sizes da, db. For each row in df1, I want to pick a random row from df2 such that abs(a1-a2) < da and abs(b1-b2) < db. What I am currently doing is very brute force:
da = 0.2
db = 25
df2_list=[]
nbad = 0
for rid, row in df1.iterrows():
    ca = row['a']
    cb = row['b']
    c_df2 = df2[np.abs(df2['a'] - ca) < da][np.abs(df2['b'] - cb) < db]
    if len(c_df2) == 0:
        nbad += 1
        continue
    c_df2 = c_df2.sample()
    df2_list.append(c_df2['id'].values[0])
matched_df = df2[df2['id'].isin(df2_list)]
print(matched_df)
   id     a    b
1   1  1.05   94
3   3  0.86  112
4   4  1.30   97
However, for my real purpose, where my DF is really big, this is very slow.
Is there a faster way to achieve this result?

Here's a solution:
da = 0.2
db = 25
res = pd.merge(df1.assign(dummy = 1), df2.assign(dummy = 1), on = "dummy").drop("dummy", axis = 1)
res = res[(np.abs(res.a_x - res.a_y) < da) & (np.abs(res.b_x - res.b_y) < db)]
res = res.groupby("id_x").apply(lambda x: x.sample(1))[["id_y", "a_y", "b_y"]]
res.index = res.index.droplevel(1)
print(res)
The output is:
      id_y   a_y  b_y
id_x
0        1  1.05   94
1        4  1.30   97
2        3  0.86  112
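On newer pandas (1.2+ for how="cross", 1.1+ for GroupBy.sample) the dummy column and the groupby.apply can be skipped. A sketch along the same lines, reusing df1, df2, da and db from above; note the cross join still materializes len(df1) * len(df2) rows, so it trades memory for speed:
# cross join, keep pairs within both tolerances, then draw one random match per df1 row
res = df1.merge(df2, how="cross", suffixes=("_x", "_y"))
res = res[(res.a_x - res.a_y).abs().lt(da) & (res.b_x - res.b_y).abs().lt(db)]
res = res.groupby("id_x").sample(n=1)[["id_x", "id_y", "a_y", "b_y"]]
print(res)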

Related

How to combine rows in groupby with several conditions?

I want to combine rows in pandas df with the following logic:
dataframe is grouped by users
rows are ordered by start_at_min
rows are combined when:
Case A: if start_at_min <= 200:
    combine when row2[start_at_min] - row1[stop_at_min] < 5
    (e.g. 101 - 100 = 1 -> combine; 200 - 100 = 100 -> don't combine)
Case B: if 200 < start_at_min < 400:
    the threshold changes to 3
Case C: if start_at_min > 400:
    never combine
Example df
   user  start_at_min  stop_at_min
0     1           100          150
1     1           152          201   # row0 with row1: combine
2     1           205          260   # row1 with row2: NO -> start_at_min above 200 -> threshold = 3
3     2            65          100   # no
4     2           200          265   # no
5     2           300          451   # no
6     2           452          460   # no -> start_at_min above 400 -> never combine
Expected output:
   user  start_at_min  stop_at_min
0     1           100          201   # row0 and row1 combined
2     1           205          260   # row1 with row2: NO -> start_at_min above 200 -> threshold = 3
3     2            65          100   # no
4     2           200          265   # no
5     2           300          451   # no
6     2           452          460   # no -> start_at_min above 400 -> never combine
I have written the function combine_rows, which takes in 2 Series and applies this logic:
def combine_rows(s1: pd.Series, s2: pd.Series):
    # take 2 rows and combine them if start_at_min row2 - stop_at_min row1 < 5
    if s2['start_at_min'] - s1['stop_at_min'] < 5:
        return pd.Series({
            'user': s1['user'],
            'start_at_min': s1['start_at_min'],
            'stop_at_min': s2['stop_at_min']
        })
    else:
        return pd.concat([s1, s2], axis=1).T
However, I am unable to apply this function to the dataframe.
This was my attempt:
df.groupby('user').sort_values(by=['start_at_min']).apply(combine_rows)  # this is not working
Here is the full code:
import pandas as pd
import numpy as np
df = pd.DataFrame({
    "user": [1, 1, 2, 2],
    'start_at_min': [60, 101, 65, 200],
    'stop_at_min': [100, 135, 100, 265]
})

def combine_rows(s1: pd.Series, s2: pd.Series):
    # take 2 rows and combine them if start_at_min row2 - stop_at_min row1 < 5
    if s2['start_at_min'] - s1['stop_at_min'] < 5:
        return pd.Series({
            'user': s1['user'],
            'start_at_min': s1['start_at_min'],
            'stop_at_min': s2['stop_at_min']
        })
    else:
        return pd.concat([s1, s2], axis=1).T

df.groupby('user').sort_values(by=['start_at_min']).apply(combine_rows)  # this is not working
version 1: one condition
Perform a custom groupby.agg:
threshold = 5
# if the successive stop/start per group are above threshold
# start a new group
group = (df['start_at_min']
         .sub(df.groupby('user')['stop_at_min'].shift())
         .ge(threshold).cumsum()
        )

# groupby.agg
out = (df.groupby(['user', group], as_index=False)
         .agg({'start_at_min': 'min',
               'stop_at_min': 'max'})
      )
Output:
   user  start_at_min  stop_at_min
0     1            60          135
1     2            65          100
2     2           200          265
Intermediate:
(df['start_at_min']
 .sub(df.groupby('user')['stop_at_min'].shift())
)

0      NaN
1      1.0     # below threshold, this will be merged
2      NaN
3    100.0     # above threshold, keep separate
dtype: float64
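And the resulting group labels after .ge(threshold).cumsum() on the same 4-row df, shown here for reference:
print(group.tolist())
# [0, 0, 0, 1]  -> rows 0 and 1 merge for user 1; row 3 starts a new interval for user 2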
version 2: multiple conditions
# define variable threshold
threshold = np.where(df['start_at_min'].le(200), 5, 3)
# array([5, 5, 3, 5, 5, 3, 3])
# compute the new starts of group like in version 1
# but using the now variable threshold
m1 = (df['start_at_min']
      .sub(df.groupby('user')['stop_at_min'].shift())
      .ge(threshold)
     )

# add a second restart condition (>400)
m2 = df['start_at_min'].gt(400)

# if either mask is True, start a new group
group = (m1 | m2).cumsum()

# groupby.agg
out = (df.groupby(['user', group], as_index=False)
         .agg({'start_at_min': 'min',
               'stop_at_min': 'max'})
      )
Output:
   user  start_at_min  stop_at_min
0     1           100          201
1     1           205          260
2     2            65          100
3     2           200          265
4     2           300          451
5     2           452          460
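Note that the version 2 output above corresponds to the question's 7-row example, not the 4-row df in the asker's full code; to reproduce it, build that frame first:
df = pd.DataFrame({
    'user':         [1, 1, 1, 2, 2, 2, 2],
    'start_at_min': [100, 152, 205, 65, 200, 300, 452],
    'stop_at_min':  [150, 201, 260, 100, 265, 451, 460],
})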

How do I create a while loop for this df that has moving average in every stage? [duplicate]

This question already has an answer here:
For loop that adds and deducts from pandas columns
(1 answer)
Closed 1 year ago.
So I want to spread the shipments per ID in the group one unit at a time, looking at the average (BAL/SALES) to determine which store to give each unit to.
Here's my dataframe:
ID  STOREID  BAL  SALES  SHIP
 1     STR1   50      5    18
 1     STR2    6      7    18
 1     STR3   74      4    18
 2     STR1   35      3   500
 2     STR2    5      4   500
 2     STR3   54      7   500
While SHIP (grouped by ID) is greater than 0, calculate AVG (BAL/SALES), and for the row with the lowest AVG in each group add 1 to its BAL column and 1 to its Final column. Then repeat the process until SHIP is 0. The AVG is different at every stage, which is why I wanted it to be a while loop.
Sample output of the first round is below. Do this until SHIP is 0 and the sum of Final per ID equals SHIP:
ID  STOREID  BAL  SALES  SHIP    AVG  Final
 1     STR1   50      5    18  10.00      0
 1     STR2    6      4    18   1.50      1
 1     STR3    8      4    18   2.00      0
 2     STR1   35      3   500  11.67      0
 2     STR2    5      4   500   1.25      1
 2     STR3   54      7   500   7.71      0
I've tried a couple of ways in SQL, and I thought it would be better to do it in Python, but I haven't been doing a great job with my loop. Here's what I tried so far:
df['AVG'] = 0
df['FINAL'] = 0

for i in df.groupby(["ID"])['SHIP']:
    if i > 0:
        df['AVG'] = df['BAL'] / df['SALES']
        df['SHIP'] = df.groupby(["ID"])['SHIP'] - 1
        total = df.groupby(["ID"])["FINAL"].transform("cumsum")
        df['FINAL'] = + 1
        df['A'] = + 1
    else:
        df['FINAL'] = 0
This was challenging because more than one row in the group can have the same average calculation, which then throws off the allocation.
This works on the example dataframe, if I understood you correctly:
d = {'ID': [1, 1, 1, 2, 2, 2],
     'STOREID': ['str1', 'str2', 'str3', 'str1', 'str2', 'str3'],
     'BAL': [50, 6, 74, 35, 5, 54],
     'SALES': [5, 7, 4, 3, 4, 7],
     'SHIP': [18, 18, 18, 500, 500, 500]}
df = pd.DataFrame(data=d)
df['AVG'] = 0
df['FINAL'] = 0

def calc_something(x):
    # print(x.iloc[0]['SHIP'])
    for i in range(x.iloc[0]['SHIP'])[0:500]:
        x['AVG'] = x['BAL'] / x['SALES']
        x['SHIP'] = x['SHIP'] - 1
        x = x.sort_values('AVG').reset_index(drop=True)
        # print(x.iloc[0, 2])
        x.iloc[0, 2] = x['BAL'][0] + 1    # BAL of the lowest-AVG row
        x.iloc[0, 6] = x['FINAL'][0] + 1  # FINAL of the lowest-AVG row
    return x

df_final = df.groupby('ID').apply(calc_something).reset_index(drop=True).sort_values(['ID', 'STOREID'])
df_final
   ID STOREID  BAL  SALES  SHIP     AVG  FINAL
1   1    STR1   50      5     0  10.000      0
0   1    STR2   24      7     0   3.286     18
2   1    STR3   74      4     0  18.500      0
4   2    STR1  127      3     0  42.333     92
5   2    STR2  170      4     0  42.500    165
3   2    STR3  297      7     0  42.286    243
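One way to sanity-check the allocation is to confirm that FINAL sums back to the original SHIP per ID (using the original df, since SHIP has been decremented to 0 in df_final):
alloc = df_final.groupby('ID')['FINAL'].sum()
ship = df.groupby('ID')['SHIP'].first()
assert alloc.eq(ship).all()  # 18 for ID 1, 500 for ID 2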

How to assign running values to each columns with for loops in Pandas?

I have two dataframes, both have same shapes.
dfA
   2008LG  2007LG  2006LG  2005LG
0      44      65      30      20
1      10      16      56      70
2      65      30      20     122
3     0.0    0.00     679     158
4     0.0    0.00      30      20
dfB
   2008Net  2007Net  2006Net  2005Net
0        0        0        0      452
1        0        0        0      365
2        0        0        0      778
3        0        0        0       78
4        0        0        0       60
The calculation logic is: for each row in dfB, start from the last column (2005Net), compute 2005LG - 2005Net, and assign the result to the next column to process, 2006Net.
For example, in the first iteration 2005LG - 2005Net = 20 - 452 = -432, and -432 is assigned to 2006Net. The second iteration then computes 2006LG - 2006Net = 30 - (-432) = 462 and assigns it to 2007Net.
Below is my code, but it is not working. What exactly is wrong here?
import pandas as pd
import numpy as np
from tqdm import tqdm
for index in tqdm(range(dfA.shape[0])):
    for col_index in reversed(range(4)):
        the_value = 0
        the_value = dfA[dfA.columns[col_index]].iloc[index] - dfB[dfB.columns[col_index]].iloc[index]
        dfB[dfB.columns[col_index-1]].iloc[index] = the_value
Try something like this:
for index in reversed(range(1, 4)):
    dfB.iloc[:, index - 1] = dfA.iloc[:, index] - dfB.iloc[:, index]
This assumes that the columns you want to subtract have the same length.
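For reference, here is a minimal sketch of that idea on the first row of the question's data (assuming the columns are ordered 2008 ... 2005 as shown); it reproduces the -432 and 462 values from the description:
import pandas as pd

dfA = pd.DataFrame({'2008LG': [44], '2007LG': [65], '2006LG': [30], '2005LG': [20]})
dfB = pd.DataFrame({'2008Net': [0], '2007Net': [0], '2006Net': [0], '2005Net': [452]})

# walk the columns right to left; each result feeds the column to its left
for col_index in reversed(range(1, 4)):
    dfB.iloc[:, col_index - 1] = dfA.iloc[:, col_index].values - dfB.iloc[:, col_index].values

print(dfB)  # 2008Net = -397, 2007Net = 462, 2006Net = -432, 2005Net = 452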

Summing by string names Pandas

I'm working with a data frame like this, but bigger and with more zones. I am trying to sum the values of the rows by their zone names. The total sum of the R or C zones goes in the total column, while the total sum of the M zones goes in total1.
Input:
total, total1 are the desired output.
ID  Zone1  CHC1  Value1  Zone2  CHC2  Value2  Zone3  CHC3  Value3  total  total1
 1    R5B   100      10     C2     0      20   R10A     2       5     35       0
 1     C2    95      20   M2-6     5       6    R5B     7       3     23       6
 3     C2    40       4     C4    60       6      0     6       0     10       0
 3     C1   100       8      0     0       0      0   100       0      8       0
 5   M1-5    10       6   M2-6    86      15      0     0       0      0      21
You can use filter to build separate DataFrames for the Zone and Value columns:
z = df.filter(like='Zone')
v = df.filter(like='Value')
Then create boolean DataFrames with str.contains via apply if you want to check substrings:
m1 = z.apply(lambda x: x.str.contains('R|C'))
m2 = z.apply(lambda x: x.str.contains('M'))
#for check strings
#m1 = z == 'R2'
#m2 = z.isin(['C1', 'C4'])
Finally, mask v with where and sum per row (the mask's .values is used because its Zone column names differ from v's Value column names):
df['t'] = v.where(m1.values).sum(axis=1).astype(int)
df['t1'] = v.where(m2.values).sum(axis=1).astype(int)
print (df)
   ID Zone1  CHC1  Value1 Zone2  CHC2  Value2 Zone3  CHC3  Value3   t  t1
0   1   R5B   100      10    C2     0      20  R10A     2       5  35   0
1   1    C2    95      20  M2-6     5       6   R5B     7       3  23   6
2   3    C2    40       4    C4    60       6     0     6       0  10   0
3   3    C1   100       8     0     0       0     0   100       0   8   0
4   5  M1-5    10       6  M2-6    86      15     0     0       0   0  21
Solution1 (simpler code but slower and less flexible)
total = []
total1 = []
for i in range(df.shape[0]):
    temp = df.iloc[i].tolist()
    if "R2" in temp:
        total.append(temp[temp.index("R2") + 1])
    else:
        total.append(0)
    if ("C1" in temp) & ("C4" in temp):
        total1.append(temp[temp.index("C1") + 1] + temp[temp.index("C4") + 1])
    else:
        total1.append(0)
df["Total"] = total
df["Total1"] = total1
Solution2 (faster than solution1 and easier to customize but possibly memory intensive)
# columns to use
cols = df.columns.tolist()
zones = [x for x in cols if x.startswith('Zone')]
vals = [x for x in cols if x.startswith('Value')]

# you can customize here
bucket1 = ['R2']
bucket2 = ['C1', 'C4']
thresh = 2  # "OR": 1, "AND": 2

original = df.copy()

# bucket1 check
for zone in zones:
    df.loc[~df[zone].isin(bucket1), cols[cols.index(zone) + 1]] = 0
original['Total'] = df[vals].sum(axis=1)
df = original.copy()

# bucket2 check
for zone in zones:
    df.loc[~df[zone].isin(bucket2), cols[cols.index(zone) + 1]] = 0
df['Check_Bucket'] = df[zones].stack().reset_index().groupby('level_0')[0].apply(list)
df['Check_Bucket'] = df['Check_Bucket'].apply(lambda x: len([y for y in x if y in bucket2]))
df['Total1'] = df[vals].sum(axis=1)
df.loc[df.Check_Bucket < thresh, 'Total1'] = 0
df.drop('Check_Bucket', axis=1, inplace=True)
When I expanded the original dataframe to 100k rows, solution 1 took 11.4 s ± 82.1 ms per loop, while solution 2 took 3.53 s ± 29.8 ms per loop. The difference is because solution 2 does not loop over the rows.

Pandas DataFrame use previous row value for complicated 'if' conditions to determine current value

I want to know if there is any faster way to do the following loop. Maybe use apply or a rolling apply function to achieve this?
Basically, I need to access previous row's value to determine current cell value.
df.ix[0] = (np.abs(df.ix[0]) >= So) * np.sign(df.ix[0])
for i in range(1, len(df)):
    for col in list(df.columns.values):
        if (df[col].ix[i] > 1.25) & (df[col].ix[i-1] == 0):
            df[col].ix[i] = 1
        elif (df[col].ix[i] < -1.25) & (df[col].ix[i-1] == 0):
            df[col].ix[i] = -1
        elif ((df[col].ix[i] <= -0.75) & (df[col].ix[i-1] < 0)) | ((df[col].ix[i] >= 0.5) & (df[col].ix[i-1] > 0)):
            df[col].ix[i] = df[col].ix[i-1]
        else:
            df[col].ix[i] = 0
As you can see, in the function I am updating the dataframe; I need to access the most recently updated previous row, so using shift will not work.
For example:
Input:
  A     B    C
1.3  -1.5  0.7
1.1  -1.4  0.6
1.0  -1.3  0.5
0.4   1.4  0.4
Output:
A   B  C
1  -1  0
1  -1  0
1  -1  0
0   1  0
You can use the .shift() function to access previous or next values:
previous value for col column:
df['col'].shift()
next value for col column:
df['col'].shift(-1)
Example:
In [38]: df
Out[38]:
   a  b  c
0  1  0  5
1  9  9  2
2  2  2  8
3  6  3  0
4  6  1  7

In [39]: df['prev_a'] = df['a'].shift()

In [40]: df
Out[40]:
   a  b  c  prev_a
0  1  0  5     NaN
1  9  9  2     1.0
2  2  2  8     9.0
3  6  3  0     2.0
4  6  1  7     6.0

In [43]: df['next_a'] = df['a'].shift(-1)

In [44]: df
Out[44]:
   a  b  c  prev_a  next_a
0  1  0  5     NaN     9.0
1  9  9  2     1.0     2.0
2  2  2  8     9.0     6.0
3  6  3  0     2.0     6.0
4  6  1  7     6.0     NaN
I am surprised there isn't a native pandas solution to this as well, because shift and rolling do not get it done. I have devised a way to do this using standard pandas syntax, but I am not sure if it performs any better than your loop... My purposes just required this for consistency (not speed).
import pandas as pd

df = pd.DataFrame({'a': [0, 1, 2], 'b': [0, 10, 20]})
new_col = 'c'

def apply_func_decorator(func):
    prev_row = {}
    def wrapper(curr_row, **kwargs):
        val = func(curr_row, prev_row)
        prev_row.update(curr_row)
        prev_row[new_col] = val
        return val
    return wrapper

@apply_func_decorator
def running_total(curr_row, prev_row):
    return curr_row['a'] + curr_row['b'] + prev_row.get('c', 0)

df[new_col] = df.apply(running_total, axis=1)
print(df)
# Output will be:
#    a   b   c
# 0  0   0   0
# 1  1  10  11
# 2  2  20  33
Disclaimer: I used pandas 0.16 but with only slight modification this will work for the latest versions too.
Others had similar questions and I posted this solution on those as well:
Reference previous row when iterating through dataframe
Reference values in the previous row with map or apply
@maxU has it right with shift; I think you can even compare dataframes directly, something like this:
df_prev = df.shift()
df_out = pd.DataFrame(index=df.index, columns=df.columns)
df_out[(df > 1.25) & (df_prev == 0)] = 1
df_out[(df < -1.25) & (df_prev == 0)] = -1
df_out[(df < -.75) & (df_prev < 0)] = df_prev
df_out[(df > .5) & (df_prev > 0)] = df_prev
The syntax may be off, but if you provide some test data I think this could work.
It saves you having to loop at all, though note it compares against the raw shifted values rather than the values updated during the pass, which is the limitation the question points out.
EDIT - Update based on comment below
I would try my absolute best not to loop through the DF itself. You're better off going column by column, sending to a list and doing the updating, then just importing back again. Something like this:
df.ix[0] = (np.abs(df.ix[0]) >= 1.25) * np.sign(df.ix[0])
for col in df.columns.tolist():
    currData = df[col].tolist()
    for currRow in range(1, len(currData)):
        if currData[currRow] > 1.25 and currData[currRow-1] == 0:
            currData[currRow] = 1
        elif currData[currRow] < -1.25 and currData[currRow-1] == 0:
            currData[currRow] = -1
        elif currData[currRow] <= -.75 and currData[currRow-1] < 0:
            currData[currRow] = currData[currRow-1]
        elif currData[currRow] >= .5 and currData[currRow-1] > 0:
            currData[currRow] = currData[currRow-1]
        else:
            currData[currRow] = 0
    df[col] = currData
