Pandas: create a new column containing calculated results, applied onto multiple columns - python

I need some help to modify my function and how to apply it in order to iterate an ifelse condition through multiple features.
Suppose we have the following table t1
import pandas as pd
# Sample data: one name column plus three numeric feature columns.
names = {'name': ['Jon','Bill','Maria','Emma']
,'feature1': [2,3,4,5]
,'feature2': [1,2,3,4]
,'feature3': [1,2,3,4]}
# Column order is fixed explicitly so the examples below print consistently.
t1 = pd.DataFrame(names,columns=['name','feature1','feature2','feature3'])
I want to create 3 new columns based on an ifelse condition. Here is how I am doing it for the first feature:
# Define the conditions
def ifelsefunction(row):
    """Score a single row on 'feature1': 1 if >= 3, 2 if == 2, else 0.

    (Indentation restored — the scrape had stripped it, making the
    snippet non-runnable.)
    """
    if row['feature1'] >= 3:
        return 1
    elif row['feature1'] == 2:
        return 2
    else:
        return 0
# Apply the condition
t1['ft1'] = t1.apply(ifelsefunction, axis=1)
I would like to write the function into something iterable like this
def ifelsefunction(row, feature):
    """Score a single row on the given *feature* column: 1 if >= 3, 2 if == 2, else 0.

    (Indentation restored — the scrape had stripped it.)
    """
    if row[feature] >= 3:
        return 1
    elif row[feature] == 2:
        return 2
    else:
        return 0
# NOTE(review): the original called ifelsefunction(row, 'feature1') directly,
# but `row` is undefined at that point — apply() must supply it, so wrap the
# call in a lambda (or use t1.apply(ifelsefunction, axis=1, args=('feature1',))).
t1['ft1_score'] = t1.apply(lambda row: ifelsefunction(row, 'feature1'), axis=1)
t1['ft2_score'] = t1.apply(lambda row: ifelsefunction(row, 'feature2'), axis=1)
t1['ft3_score'] = t1.apply(lambda row: ifelsefunction(row, 'feature3'), axis=1)
---- EDIT ----
Thanks for the answers, I may have over-simplified the actual problem.
How do I do the same for this conditions?
def ifelsefunction(var1, var2):
    """Vectorised score of var1 against var2.

    Returns 0.7*var1 where (var1 >= 3) and (var1 < var2),
    var1*var2 where var1 == 2, and 0 elsewhere.
    """
    # Use '&' for elementwise AND — Python's 'and' raises
    # "truth value of an array is ambiguous" on array/Series input.
    mask1 = (var1 >= 3) & (var1 < var2)
    mask2 = var1 == 2
    return np.select([mask1, mask2], [var1 * 0.7, var1 * var2], default=0)

I think it is best to avoid loops here: use numpy.select to test and assign the masks, only for columns selected from a list. To pass the function an input DataFrame, use DataFrame.pipe:
# Define the conditions
def ifelsefunction(df):
    """Vectorised scoring of a whole DataFrame/array: 1 where >= 3, 2 where == 2, else 0.

    (Indentation restored — the scrape had stripped it.)
    """
    m1 = df >= 3
    m2 = df == 2
    # First matching condition wins; cells matching neither become 0.
    return np.select([m1, m2], [1, 2], default=0)
cols = ['feature1','feature2','feature3']
# Overwrite the selected columns in place with their scores; pipe() just
# passes the sub-frame t1[cols] to the function.
t1[cols] = t1[cols].pipe(ifelsefunction)
#alternative
#t1[cols] = ifelsefunction(t1[cols])
print (t1)
name feature1 feature2 feature3
0 Jon 2 0 0
1 Bill 1 2 2
2 Maria 1 1 1
3 Emma 1 1 1
For new columns use:
# Define the conditions
def ifelsefunction(df):
    """Vectorised scoring: 1 where df >= 3, 2 where df == 2, else 0.

    (Indentation restored — the scrape had stripped it.)
    """
    m1 = df >= 3
    m2 = df == 2
    return np.select([m1, m2], [1, 2], default=0)
cols = ['feature1','feature2','feature3']
# Build parallel '<feature>_score' names so the original columns are kept.
new = [f'{x}_score' for x in cols]
t1[new] = t1[cols].pipe(ifelsefunction)
#alternative
#t1[new] = ifelsefunction(t1[cols])
print (t1)
name feature1 feature2 feature3 feature1_score feature2_score \
0 Jon 2 1 1 2 0
1 Bill 3 2 2 1 2
2 Maria 4 3 3 1 1
3 Emma 5 4 4 1 1
feature3_score
0 0
1 2
2 1
3 1
EDIT:
You can change function like:
def ifelsefunction(df, var1, var2):
    """Score df[var1] against df[var2] as a numpy array.

    Returns 0.7*var1 where (var1 >= 3) and (var1 < var2),
    var1*var2 where var1 == 2, and 0 elsewhere.
    (Indentation restored — the scrape had stripped it.)
    """
    mask1 = (df[var1] >= 3) & (df[var1] < df[var2])
    mask2 = df[var1] == 2
    return np.select([mask1, mask2], [df[var1] * 0.7, df[var1] * df[var2]], default=0)
# var1='feature3', var2='feature1' — note the argument order.
t1['new'] = ifelsefunction(t1, 'feature3','feature1')
print (t1)
name feature1 feature2 feature3 new
0 Jon 2 1 1 0.0
1 Bill 3 2 2 6.0
2 Maria 4 3 3 2.1
3 Emma 5 4 4 2.8

Try using apply
# Same threshold logic per column: 1 if >= 3, else 2 if == 2, else 0.
# (The three lines differ only in the source column.)
t1["ft1_score"] = t1.feature1.apply(lambda x: 1 if x >= 3 else (2 if x == 2 else 0))
t1["ft2_score"] = t1.feature2.apply(lambda x: 1 if x >= 3 else (2 if x == 2 else 0))
t1["ft3_score"] = t1.feature3.apply(lambda x: 1 if x >= 3 else (2 if x == 2 else 0))

Related

How can I fill empty DataFrame based on conditions?

I have following dataframe called condition:
[0] [1] [2] [3]
1 0 0 1 0
2 0 1 0 0
3 0 0 0 1
4 0 0 0 1
For easier reproduction:
import numpy as np
import pandas as pd
n=4
t=3
condition = pd.DataFrame([[0,0,1,0], [0,1,0,0], [0,0,0, 1], [0,0,0, 1]], columns=['0','1', '2', '3'])
condition.index=np.arange(1,n+1)
Further I have several dataframes that should be filled in a foor loop
df = pd.DataFrame([],index = range(1,n+1),columns= range(t+1) ) #NaN DataFrame
# df_2 also starts as all-NaN; df_3 is the constant 3 everywhere.
df_2 = pd.DataFrame([],index = range(1,n+1),columns= range(t+1) )
df_3 = pd.DataFrame(3,index = range(1,n+1),columns= range(t+1) )
# NOTE(review): the asker states below that this loop is intentionally NOT
# valid Python — it only sketches the desired logic. Visible problems:
#  - `for i,t in range(t,-1,-1)` tries to unpack one int into two names
#    (TypeError) and clobbers the outer `t`;
#  - `if condition[t]==1` compares a whole column to 1 (ambiguous truth value);
#  - the `elif (...)` line and `....` are pseudo-code, not syntax.
for i,t in range(t,-1,-1):
if condition[t]==1:
df.loc[:,t] = df_3.loc[:,t]**2
df_2.loc[:,t]=0
elif (condition == 0 and no 1 in any column after t)
df.loc[:,t] = 2.5
....
else:
df.loc[:,t] = 5
df_2.loc[:,t]= df.loc[:,t+1]
I am aware that this for loop is not correct, but what I wanted to do is to check the condition elementwise (recursively) and, if it is 1 (in condition), to fill dataframe df with the squared values of df_3. If it is 0 in condition, I should differentiate two cases.
In the first case, there is no 1 after the 0 (rows 1 and 2 in condition), so df = 2.5.
In the second case, there was a 1 after it, so fill df with 5 (rows 3 and 4).
So the dataframe df should look something like this
[0] [1] [2] [3]
1 5 5 9 2.5
2 5 9 2.5 2.5
3 5 5 5 9
4 5 5 5 9
The code should include for loop.
Thanks!
I am not sure if this is what you want, but based on your desired output you can do this with only masking operations (which is more efficient than looping over the rows anyway). Your code could look like this:
# Cells that are exactly 1 in the condition matrix.
is_one = condition.astype(bool)
# cumsum(axis=1) marks every cell from the first 1 onward; subtracting
# condition removes the 1-cells themselves, leaving only cells strictly
# AFTER a 1 in their row.
is_after_one = (condition.cumsum(axis=1) - condition).astype(bool)
# Start from the default value 5 everywhere; df_2/df_3 hold the fill values.
df = pd.DataFrame(5, index=condition.index, columns=condition.columns)
df_2 = pd.DataFrame(2.5, index=condition.index, columns=condition.columns)
df_3 = pd.DataFrame(3, index=condition.index, columns=condition.columns)
# Where condition is 1: replace with df_3 squared (3*3 = 9).
df.where(~is_one, other=df_3 * df_3, inplace=True)
# Where a 1 occurred earlier in the row: replace with 2.5.
df.where(~is_after_one, other=df_2, inplace=True)
which yields:
0 1 2 3
1 5 5 9.0 2.5
2 5 9 2.5 2.5
3 5 5 5.0 9.0
4 5 5 5.0 9.0
EDIT after comment:
If you really want to loop explicitly over the rows and columns, you could do it like this with the same result:
# Explicit cell-by-cell version of the masking solution above (same result,
# much slower). Indentation was stripped by the scrape; code left byte-identical.
n_rows = condition.index.size
n_cols = condition.columns.size
for row_index in range(n_rows):
for col_index in range(n_cols):
cond = condition.iloc[row_index, col_index]
# rest_row = the condition values to the RIGHT of the current cell,
# used to detect whether a 1 still occurs later in the row.
if col_index < n_cols - 1:
rest_row = condition.iloc[row_index, col_index + 1:].to_list()
else:
rest_row = []
if cond == 1:
df.iloc[row_index, col_index] = df_3.iloc[row_index, col_index] ** 2
elif cond == 0 and 1 not in rest_row:
# fill whole row at once
df.iloc[row_index, col_index:] = 2.5
# stop iterating over the rest
break
else:
df.iloc[row_index, col_index] = 5
# NOTE(review): when col_index is the last column, df.iloc[:, col_index + 1]
# would raise IndexError — it never triggers on the example data, but verify
# before reusing this on other inputs.
df_2.loc[:, col_index] = df.iloc[:, col_index + 1]
The result is the same, but this is much more inefficient and ugly, so I would not recommend doing it like this.

how to pass row values from a column based on bool value from several other columns

I have the following df :
df = data.frame("T" = c(1,2,3,5,1,3,2,5), "A" = c("0","0","1","1","0","1","0","1"), "B" = c("0","0","0","1","0","0","0","1"))
df
T A B
1 1 0 0
2 2 0 0
3 3 1 0
4 5 1 1
5 1 0 0
6 3 1 0
7 2 0 0
8 5 1 1
Column A & B were the results of as follow:
df['A'] = [int(x) for x in total_df["T"] >= 3]
df['B'] = [int(x) for x in total_df["T"] >= 5]
I have a data spilt
train_size = 0.6
training = df.head(int(train_size * df.shape[0]))
test = df.tail(int((1 - train_size) * df.shape[0]))
Here is the question:
How can I pass row values from "T" to a list called 'tr_return' from 'training' where both columns "A" & "B" are == 1?
I tried this:
tr_returns = training[training['A' and 'B'] == 1]['T'] or
tr_returns = training[training['A'] == 1 and training['B'] == 1]['T']
But neither one works :( Any help will be appreciated!

Create a multiple if statement and a function to replace values in columns

I am working with a large dataset, but in order to simplify, I will use a simpler example (whose some rows have been deleted to clean the dataset) in order to illustrate the problem.
Imagine I have this dataset:
Code_0 Code_1 Code_3 Code_4 Code_5 Code_6 Code_7
3 1 1 3 0 0 1 1
9 0 0 0 0 0 0 1
10 4 2 3 1 1 0 0
15 0 3 0 5 1 1 1
So, what I want to do in every row is: if Code_5 is equal to one and Code_0 is bigger than 0, then one will be added to Code_0. On the other hand, if Code_6 == 1 and Code_1 is bigger than 0, then one will be added to Code_1. Finally, if Code_7 == 1 and Code_3 is bigger than 0, then 1 will be added to Code_3. So I want to have this:
Code_0 Code_1 Code_3 Code_4 Code_5 Code_6 Code_7
3 1 2 4 0 0 1 1
9 0 0 0 0 0 0 1
10 5 2 3 1 1 0 0
15 0 4 0 5 1 1 1
I did this code but it is not working. Anyway I think that there is maybe a better option.
# NOTE(review): question code, known not to work.  `x` is a scalar supplied by
# .apply, yet it is compared to whole Series (df_data['Code_5'] etc., giving a
# Series of booleans where a single bool is needed), the function mutates the
# global df_data instead of returning a value (so the three assignments below
# set the columns to None), and 'Code_2' is referenced although the sample
# data has no such column. Indentation was also stripped by the scrape.
def add_one(x):
if x == df_data['Code_5']:
if x == 1 and df_data['Code_0'] > 0:
df_data['Code_0'] = df_data['Code_0'] + 1
if x == df_data['Code_6']:
if x == 1 and df_data['Code_1'] > 0:
df_data['Code_1'] = df_data['Code_1'] + 1
if x == df_data['Code_7']:
if x == 1 and df_data['Code_2'] > 0:
df_data['Code_2'] = df_data['Code_2'] + 1
df_data['Code_0'] = df_data['Code_5'].apply(add_one)
df_data['Code_1'] = df_data['Code_6'].apply(add_one)
df_data['Code_2'] = df_data['Code_7'].apply(add_one)
Anyone can help me, please?
You can simplify by passing the complete row:
In [163]: def add_one(row):
...: if row['Code_5'] == 1 and row['Code_0'] > 0:
...: row['Code_0'] = row['Code_0'] + 1
...: if row['Code_6'] == 1 and row['Code_1'] > 0:
...: row['Code_1'] = row['Code_1'] + 1
...: if row['Code_7'] == 1 and row['Code_2'] > 0:
...: row['Code_2'] = row['Code_2'] + 1
...: return row
...:
In [164]: add_one
Out[164]: <function __main__.add_one(row)>
In [165]: df =df.apply(lambda x: add_one(x), axis=1)
axis=1 means the function is applied row by row (each row is passed across the columns)

Create new pandas column based on start of text string from other column

I have a pandas dataframe with a text column.
I'd like to create a new column in which values are conditional on the start of the text string from the text column.
So if the 30 first characters of the text column:
== 'xxx...xxx' then return value 1
== 'yyy...yyy' then return value 2
== 'zzz...zzz' then return value 3
if none of the above return 0
It is possible to use multiple numpy.where calls, but if there are more conditions, use apply:
To select the start of each string, use indexing with .str.
# (Console output that the scrape interleaved with the code has been moved
# into comments so the snippet runs as-is.)
df = pd.DataFrame({'A':['xxxss','yyyee','zzzswee','sss'],
                   'B':[4,5,6,8]})
#          A  B
# 0    xxxss  4
# 1    yyyee  5
# 2  zzzswee  6
# 3      sss  8

#check first 3 values
a = df.A.str[:3]
# Nested np.where: first matching prefix wins, anything else gets 0.
df['new'] = np.where(a == 'xxx', 1,
            np.where(a == 'yyy', 2,
            np.where(a == 'zzz', 3, 0)))
print (df)
#          A  B  new
# 0    xxxss  4    1
# 1    yyyee  5    2
# 2  zzzswee  6    3
# 3      sss  8    0
def f(x):
    """Map a 3-char prefix to its score: 'xxx'->1, 'yyy'->2, 'zzz'->3, else 0.

    (Indentation restored — the scrape had stripped it.)
    """
    #print (x)
    if x == 'xxx':
        return 1
    elif x == 'yyy':
        return 2
    elif x == 'zzz':
        return 3
    else:
        return 0
# Score the first three characters of column A with f.
df['new'] = df.A.str[:3].apply(f)
print (df)
A B new
0 xxxss 4 1
1 yyyee 5 2
2 zzzswee 6 3
3 sss 8 0
EDIT:
If length is different, only need:
# Prefixes of different lengths: slice each length separately.
df['new'] = np.where(df.A.str[:3] == 'xxx', 1,
np.where(df.A.str[:2] == 'yy', 2,
np.where(df.A.str[:1] == 'z', 3, 0)))
print (df)
A B new
0 xxxss 4 1
1 yyyee 5 2
2 zzzswee 6 3
3 sss 8 0
EDIT1:
Thanks for idea to Quickbeam2k1 use str.startswith for check starts of each string:
# str.startswith avoids hard-coding the prefix lengths.
df['new'] = np.where(df.A.str.startswith('xxx'), 1,
np.where(df.A.str.startswith('yy'), 2,
np.where(df.A.str.startswith('z'), 3, 0)))
print (df)
A B new
0 xxxss 4 1
1 yyyee 5 2
2 zzzswee 6 3
3 sss 8 0
A different and slower solution:
However, the advantage is that the mapping from patterns is a function parameter (with implicit default 0 value)
def map_starts_with(pat_map):
    """Return a mapper t -> pat_map[p] for the first pattern p that prefixes t, else 0.

    The "first" pattern follows dict insertion order of *pat_map*.
    (Indentation restored — the scrape had stripped it.)
    """
    def map_string(t):
        pats = [pat for pat in pat_map.keys() if t.startswith(pat)]
        # get only value of "first" pattern if at least one pattern is found
        return pat_map.get(pats[0]) if len(pats) > 0 else 0
    return map_string
df = pd.DataFrame({'col':[ 'xx', 'aaaaaa', 'c']})
col
0 xx
1 aaaaaa
2 c
mapping = { 'aaa':4 ,'c':3}
# map_starts_with(mapping) already returns a callable, so the lambda wrapper
# is redundant: df.col.apply(map_starts_with(mapping)) is equivalent.
df.col.apply(lambda x: map_starts_with(mapping)(x))
0 0
1 4
2 3
Note that we also used currying here. I'm wondering if this approach can be implemented using additional pandas or numpy functionality.
Note that the "first" pattern match may depend on the traversal order of the dict keys. This is irrelevant if there is no overlap in the keys. (Jezrael's solution, or a direct generalization thereof, will also choose one element for the match, but in a more predictable manner.)

Pandas DataFrame use previous row value for complicated 'if' conditions to determine current value

I want to know if there is any faster way to do the following loop? Maybe use apply or rolling apply function to realize this
Basically, I need to access previous row's value to determine current cell value.
# NOTE(review): question code — `So` is a threshold defined elsewhere, and
# `.ix` is the legacy pandas indexer (removed in pandas 1.0; modern code
# would use .iloc/.loc). Indentation restored; the dangling "| :" on the
# first condition (a syntax error in the scrape) has been removed.
df.ix[0] = (np.abs(df.ix[0]) >= So) * np.sign(df.ix[0])
for i in range(1, len(df)):
    for col in list(df.columns.values):
        if ((df[col].ix[i] > 1.25) & (df[col].ix[i-1] == 0)):
            df[col].ix[i] = 1
        elif ((df[col].ix[i] < -1.25) & (df[col].ix[i-1] == 0)):
            df[col].ix[i] = -1
        elif ((df[col].ix[i] <= -0.75) & (df[col].ix[i-1] < 0)) | ((df[col].ix[i] >= 0.5) & (df[col].ix[i-1] > 0)):
            # Hold the previous (already-updated) state while inside the band.
            df[col].ix[i] = df[col].ix[i-1]
        else:
            df[col].ix[i] = 0
As you can see, in the function, I am updating the dataframe, I need to access the most updated previous row, so using shift will not work.
For example:
Input:
A B C
1.3 -1.5 0.7
1.1 -1.4 0.6
1.0 -1.3 0.5
0.4 1.4 0.4
Output:
A B C
1 -1 0
1 -1 0
1 -1 0
0 1 0
you can use .shift() function for accessing previous or next values:
previous value for col column:
df['col'].shift()
next value for col column:
df['col'].shift(-1)
Example:
In [38]: df
Out[38]:
a b c
0 1 0 5
1 9 9 2
2 2 2 8
3 6 3 0
4 6 1 7
In [39]: df['prev_a'] = df['a'].shift()
In [40]: df
Out[40]:
a b c prev_a
0 1 0 5 NaN
1 9 9 2 1.0
2 2 2 8 9.0
3 6 3 0 2.0
4 6 1 7 6.0
In [43]: df['next_a'] = df['a'].shift(-1)
In [44]: df
Out[44]:
a b c prev_a next_a
0 1 0 5 NaN 9.0
1 9 9 2 1.0 2.0
2 2 2 8 9.0 6.0
3 6 3 0 2.0 6.0
4 6 1 7 6.0 NaN
I am surprised there isn't a native pandas solution to this as well, because shift and rolling do not get it done. I have devised a way to do this using the standard pandas syntax but I am not sure if it performs any better than your loop... My purposes just required this for consistency (not speed).
import pandas as pd

df = pd.DataFrame({'a': [0, 1, 2], 'b': [0, 10, 20]})
new_col = 'c'

def apply_func_decorator(func):
    """Give *func* access to the previously processed row, including its result."""
    prev_row = {}
    def wrapper(curr_row, **kwargs):
        val = func(curr_row, prev_row)
        # Remember this row (and the value just computed under new_col)
        # for the next call.
        prev_row.update(curr_row)
        prev_row[new_col] = val
        return val
    return wrapper

@apply_func_decorator  # the scrape showed '#' here; it must be the '@' decorator
def running_total(curr_row, prev_row):
    return curr_row['a'] + curr_row['b'] + prev_row.get('c', 0)

df[new_col] = df.apply(running_total, axis=1)
print(df)
# Output will be:
# a b c
# 0 0 0 0
# 1 1 10 11
# 2 2 20 33
Disclaimer: I used pandas 0.16 but with only slight modification this will work for the latest versions too.
Others had similar questions and I posted this solution on those as well:
Reference previous row when iterating through dataframe
Reference values in the previous row with map or apply
#maxU has it right with shift, I think you can even compare dataframes directly, something like this:
# NOTE(review): the original used df.shift(-1), which is the NEXT row —
# "previous row" is df.shift() — and assigned 1 in the < -1.25 case where the
# question's expected output requires -1. Both fixed below.
df_prev = df.shift()
df_out = pd.DataFrame(index=df.index, columns=df.columns)
df_out[(df > 1.25) & (df_prev == 0)] = 1
df_out[(df < -1.25) & (df_prev == 0)] = -1
# Carry the previous value while the signal stays inside its holding band.
df_out[(df < -.75) & (df_prev < 0)] = df_prev
df_out[(df > .5) & (df_prev > 0)] = df_prev
The syntax may be off, but if you provide some test data I think this could work.
Saves you having to loop at all.
EDIT - Update based on comment below
I would try my absolute best not to loop through the DF itself. You're better off going column by column, sending to a list and doing the updating, then just importing back again. Something like this:
# Seed the first row: keep the sign where |value| >= 1.25, else 0.
# NOTE(review): .ix is the legacy pandas indexer (removed in pandas 1.0).
# Indentation was stripped by the scrape; code left byte-identical.
df.ix[0] = (np.abs(df.ix[0]) >= 1.25) * np.sign(df.ix[0])
for col in df.columns.tolist():
# Work on a plain Python list so previous-row reads see updated values.
currData = df[col].tolist()
for currRow in range(1,len(currData)):
# Enter +1/-1 state when crossing +/-1.25 from the flat (0) state ...
if currData[currRow]> 1.25 and currData[currRow-1]== 0:
currData[currRow] = 1
elif currData[currRow] < -1.25 and currData[currRow-1]== 0:
currData[currRow] = -1
# ... hold the previous state while the value stays past the exit band ...
elif currData[currRow] <=-.75 and currData[currRow-1]< 0:
currData[currRow] = currData[currRow-1]
elif currData[currRow]>= .5 and currData[currRow-1]> 0:
currData[currRow] = currData[currRow-1]
# ... otherwise fall back to the flat state.
else:
currData[currRow] = 0
df[col] = currData

Categories

Resources