Trying to apply a function on a Pandas DataFrame in Python - python

I'm trying to apply this function to fill the Age column based on Pclass and Sex columns. But I'm unable to do so. How can I make it work?
def fill_age():
    # Takes no row argument: the three names below are read straight from the
    # global `train`, so each one is a whole column rather than a single value.
    Age = train['Age']
    Pclass = train['Pclass']
    Sex = train['Sex']
    # NOTE(review): with Age being a whole column, this test is presumably what
    # raises the "truth value of a Series is ambiguous" error quoted below.
    if pd.isnull(Age):
        if Pclass == 1:
            return 34.61
        elif (Pclass == 1) and (Sex == 'male'):
            return 41.2813
        elif (Pclass == 2) and (Sex == 'female'):
            return 28.72
        elif (Pclass == 2) and (Sex == 'male'):
            return 30.74
        elif (Pclass == 3) and (Sex == 'female'):
            return 21.75
        elif (Pclass == 3) and (Sex == 'male'):
            return 26.51
        else:
            pass
    else:
        return Age
# fill_age() is *called* here (note the parentheses), so its result, not the
# function itself, is what gets handed to apply.
train['Age'] = train['Age'].apply(fill_age(),axis=1)
I'm getting the following error:
ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

You should wrap each comparison in parentheses (which you already did) and replace the boolean operator `and` with the bitwise operator `&` to avoid this type of error. Also, keep in mind that if you want to use apply, the function needs a parameter x, which the lambda passed to apply will supply:
def fill_age(x):
    """Return the age for one row, substituting a fixed value when it is missing.

    Parameters
    ----------
    x : mapping or pandas.Series
        A single row, indexable by ``'Age'``, ``'Pclass'`` and ``'Sex'``.

    Returns
    -------
    The row's own ``Age`` when it is not null. Otherwise the constant
    associated with the row's (``Pclass``, ``Sex``) pair, or the (still
    missing) ``Age`` when the pair matches none of the six known combinations.
    """
    Age = x['Age']
    Pclass = x['Pclass']
    Sex = x['Sex']
    if not pd.isnull(Age):
        return Age
    # The first test must include the sex: a bare ``Pclass == 1`` would catch
    # every first-class row and make the first-class male branch unreachable.
    if (Pclass == 1) & (Sex == 'female'):
        return 34.61
    elif (Pclass == 1) & (Sex == 'male'):
        return 41.2813
    elif (Pclass == 2) & (Sex == 'female'):
        return 28.72
    elif (Pclass == 2) & (Sex == 'male'):
        return 30.74
    elif (Pclass == 3) & (Sex == 'female'):
        return 21.75
    elif (Pclass == 3) & (Sex == 'male'):
        return 26.51
    # Unknown class/sex combination: hand back the missing value explicitly
    # rather than silently falling off the end of the function.
    return Age
Now, using apply with the lambda:
# Apply on the DataFrame, not on the 'Age' column: only DataFrame.apply with
# axis=1 passes whole rows, which fill_age needs to read 'Pclass' and 'Sex'.
train['Age'] = train.apply(lambda x: fill_age(x), axis=1)
In a sample dataframe:
# Six sample rows; the two np.nan entries in 'Age' are the values to be filled.
df = pd.DataFrame({'Age':[1,np.nan,3,np.nan,5,6],
                   'Pclass':[1,2,3,3,2,1],
                   'Sex':['male','female','male','female','male','female']})
Using the answer provided above:
# axis=1 makes apply call the lambda once per row, so x is a single row.
df['Age'] = df.apply(lambda x: fill_age(x),axis=1)
Output:
Age Pclass Sex
0 1.00 1 male
1 28.72 2 female
2 3.00 3 male
3 21.75 3 female
4 5.00 2 male
5 6.00 1 female

Related

Applying if-else statement to create new column

I have applied an if-else statement in the function change_test, but it results in None values throughout the test column. Here is my code:
def change_test(df):
    # Every branch assigns (or compares) but none returns, so the function
    # always returns None.
    if ((df['product_id'] == 7.99) & (df['refunded'] == 1)):
        df['test'] = 0
    elif (df['product_id'] == 49.99):
        # `==` is a comparison, not an assignment; this line has no effect.
        df['test'] == 49.99
    else :
        df['test'] = df['product_id'] * (df['days_used_app'] / 7)
# The column is built from the function's return values, one per row.
df['test'] = df.apply(change_test, axis = 1)
And here is my dataframe before applying this function
The function you pass to apply should return a value:
def change_test(row):
    """Compute the ``test`` value for a single row.

    ``row`` must be indexable by ``'product_id'``, ``'refunded'`` and (for
    the general case) ``'days_used_app'``.

    Returns 0 for a refunded 7.99 product, 49.99 for a 49.99 product, and
    otherwise ``product_id`` scaled by ``days_used_app / 7``.
    """
    product = row['product_id']
    refunded = row['refunded']  # read up front, as both operands were evaluated before
    if product == 7.99 and refunded == 1:
        return 0
    if product == 49.99:
        return 49.99
    weeks_used = row['days_used_app'] / 7
    return product * weeks_used
# axis = 1 calls change_test once per row; its return values form the column.
df['test'] = df.apply(change_test, axis = 1)

How do I compare values in columns in a pandas dataframe using a function?

I have a pandas dataframe and would like to create a new column based on the below condition:
def confidence_level(row):
    # Written for a single row: each comparison is expected to give one bool
    # that `and` / `if` can evaluate.
    if (row['ctry_one'] == row['ctry_two']) and (row['Market'] == 'yes'):
        return 'H'
    if (row['ctry_one'] == row['ctry_two']) and (row['Market'] == 'no'):
        return 'M'
    if (row['ctry_one'] != row['ctry_two']) and (row['Market'] == 'yes'):
        return 'M'
    if (row['ctry_one'] != row['ctry_two']) and (row['Market'] == 'no'):
        return 'L'
# The whole DataFrame is passed here, not one row at a time.
df['status'] = confidence_level(df)
This is the error I receive:
ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
func_test['Confidence'].value_counts()
Has anyone experienced this before? I tried applying .all() at the end of each argument like below, but this just returns 'None' for everything:
def confidence_level(row):
    # .all() reduces each comparison to one bool that is True only when every
    # compared element matches; if no test passes the function returns None.
    if (row['ctry_one'] == row['ctry_two']).all() and (row['Market'] == 'yes').all():
        return 'H'
    if (row['ctry_one'] == row['ctry_two']).all() and (row['Market'] == 'no').all():
        return 'M'
    if (row['ctry_one'] != row['ctry_two']).all() and (row['Market'] == 'yes').all():
        return 'M'
    if (row['ctry_one'] != row['ctry_two']).all() and (row['Market'] == 'no').all():
        return 'L'
You need to call your function for each row, rather than for the whole dataframe, like this:
# axis=1 passes each row to confidence_level in turn.
df['status'] = df.apply(confidence_level, axis=1)
That said, using np.select like Mayank's solution or using .loc like this will run faster:
def confidence_level(df):
    """Return a copy of ``df`` with a ``'status'`` column filled in.

    The status is 'H' when ``ctry_one`` equals ``ctry_two`` and ``Market`` is
    'yes', 'L' when the countries differ and ``Market`` is 'no', and 'M' for
    the two mixed cases. Rows matching none of the four rules are not
    assigned by this function. The input frame is left unmodified.
    """
    new_df = df.copy()
    # Evaluate each comparison once and reuse the boolean masks.
    same_country = df['ctry_one'] == df['ctry_two']
    different_country = df['ctry_one'] != df['ctry_two']
    market_yes = df['Market'] == 'yes'
    market_no = df['Market'] == 'no'
    new_df.loc[same_country & market_yes, 'status'] = 'H'
    new_df.loc[same_country & market_no, 'status'] = 'M'
    new_df.loc[different_country & market_yes, 'status'] = 'M'
    new_df.loc[different_country & market_no, 'status'] = 'L'
    # Return the modified copy; returning the input would drop the new column.
    return new_df
# Rebind df to whatever frame confidence_level returns.
df = confidence_level(df)
Use numpy.select instead, which is more performant and readable:
import numpy as np
# Evaluate each comparison once and reuse the boolean masks in the rules.
same_country = df['ctry_one'] == df['ctry_two']
different_country = df['ctry_one'] != df['ctry_two']
market_yes = df['Market'] == 'yes'
market_no = df['Market'] == 'no'
# conditions[i] pairs with choices[i]; the first matching condition wins.
conditions = [
    same_country & market_yes,
    same_country & market_no,
    different_country & market_yes,
    different_country & market_no,
]
choices = ['H', 'M', 'M', 'L']
# np.select's implicit default is the integer 0, which has no common dtype
# with string choices (a TypeError on current NumPy). Give rows that match no
# rule an explicit string default instead.
df['status'] = np.select(conditions, choices, default='')

TypeError: 'float' object is not subscriptable (While trying to apply a function on Pandas DataFrame)

I am trying to apply this function on a pandas dataframe, but I am getting this error. I'd like to know what it means and how to fix it.
def fill_age(x):
    # x is subscripted by column name, so it has to be a whole row.
    Age = x['Age']
    Pclass = x['Pclass']
    Sex = x['Sex']
    if pd.isnull(Age):
        # Each branch adds zero-mean normal noise to a fixed base value.
        if Pclass == 1:
            return 34.61 + np.random.normal(loc =0, scale = 13.61)
        elif (Pclass == 1) & (Sex == 'male'):
            return 41.2813 + np.random.normal(loc = 0, scale = 15.14)
        elif (Pclass == 2) & (Sex == 'female'):
            return 28.72 + np.random.normal(loc = 0, scale = 12.87)
        elif (Pclass == 2) & (Sex == 'male'):
            return 30.74 + np.random.normal(loc = 0, scale= 14.79)
        elif (Pclass == 3) & (Sex == 'female'):
            return 21.75 + np.random.normal(loc = 0, scale = 12.73)
        elif (Pclass == 3) & (Sex == 'male'):
            return 26.51 + np.random.normal(loc = 0, scale= 12.16)
        else:
            pass
    else:
        return Age
# NOTE(review): apply is called on the single 'Age' column here, so fill_age
# presumably receives one float per call instead of a row, which would
# explain the "'float' object is not subscriptable" error in the title.
train['Age'] = train['Age'].apply(fill_age)
Note: train is a pandas dataframe
You don't show where:
train['Age'] = train['Age'].apply(fill_age)
is coming from, but I suspect `train` is actually a float, not a dict.

Apply result to dataset after df.iterrows

df = pd.read_csv('./test22.csv')
df.head(5)
# Swap NaN for None so the `== None` tests in the loop can detect missing dates.
df = df.replace(np.nan, None)
for index,col in df.iterrows():
    # Extract only if date1 happened earlier than date2
    # `load` is a plain local rebound on every iteration; nothing in this loop
    # writes it back into df.
    load = 'No'
    if col['date1'] == None or col['date2'] == None:
        load = 'yes'
    # Compare the first 4 characters when either value is 4 characters long.
    elif int(str(col['date1'])[:4]) >= int(str(col['date2'])[:4]) and \
            (len(str(col['date1'])) == 4 or len(str(col['date2'])) == 4):
        load = 'yes'
    # Compare the first 6 characters when either value is 6 characters long.
    elif int(str(col['date1'])[:6]) >= int(str(col['date2'])[:6]) and \
            (len(str(col['date1'])) == 6 or len(str(col['date2'])) == 6):
        load = 'yes'
    # Otherwise compare the first 8 characters.
    elif int(str(col['date1'])[:8]) >= int(str(col['date2'])[:8]):
        load = 'yes'
df.head(5)
The code above preprocesses the dataset with iterrows, but the result is not reflected in the actual dataset. I want the result to be reflected in the actual dataset.
How can I apply it to the actual dataset?
Replace your for loop with a function that returns a boolean, then you can use df.apply to apply it to all rows, and then filter your dataframe by that value:
def should_load(x):
    """Return True when row ``x`` should be kept, False otherwise.

    ``x`` must be indexable by ``'date1'`` and ``'date2'``. The result is
    True when either value is None. Otherwise both values are converted to
    strings and their leading characters are compared as integers: the
    first 4 characters if either string is 4 long, then the first 6 if
    either is 6 long, and finally the first 8. The row is kept as soon as a
    comparison finds ``date1 >= date2``.

    Raises:
        ValueError: if a compared prefix is not an integer literal.
    """
    date1, date2 = x['date1'], x['date2']
    # None is a singleton, so test identity rather than equality.
    if date1 is None or date2 is None:
        return True
    first, second = str(date1), str(date2)
    # Shortened values: compare at that width only when one side actually
    # has exactly that many characters.
    for width in (4, 6):
        if (int(first[:width]) >= int(second[:width])
                and width in (len(first), len(second))):
            return True
    return int(first[:8]) >= int(second[:8])
# The row-wise apply yields one boolean per row; indexing df with it keeps
# only the rows for which should_load returned True.
df[df.apply(should_load, axis=1)].head(5)

How to simplify an IF statement

So I have an IF statement in python which essentially looks to change null values in a dataset to an average based off two other columns.
def impute_age(cols):
    # cols is read positionally: Age, Pclass, Sex in that order.
    Age = cols[0]
    Pclass = cols[1]
    Sex = cols[2]
    if pd.isnull(Age):
        # Each branch returns the mean Age of the rows in the global `train`
        # that share this Pclass / Sex_male combination.
        if Pclass == 1 and Sex == 0:
            return train.loc[(train["Pclass"] == 1)
                             & (train["Sex_male"] == 0)]["Age"].mean()
        if Pclass == 2 and Sex == 0:
            return train.loc[(train["Pclass"] == 2)
                             & (train["Sex_male"] == 0)]["Age"].mean()
        if Pclass == 3 and Sex == 0:
            return train.loc[(train["Pclass"] == 3)
                             & (train["Sex_male"] == 0)]["Age"].mean()
        if Pclass == 1 and Sex == 1:
            return train.loc[(train["Pclass"] == 1)
                             & (train["Sex_male"] == 1)]["Age"].mean()
        if Pclass == 2 and Sex == 1:
            return train.loc[(train["Pclass"] == 2)
                             & (train["Sex_male"] == 1)]["Age"].mean()
        if Pclass == 3 and Sex == 1:
            return train.loc[(train["Pclass"] == 3)
                             & (train["Sex_male"] == 1)]["Age"].mean()
    else:
        return Age
So here I'm trying to fill in NaNs using the average age of males/females in certain passenger classes. I feel like there would be a much better way of writing this, especially if I were to come across a much bigger dataset.
For reference the train df is the main df with all of the data. For some reason I couldn't get this code to work with a subset of train passed through using the cols argument.
The question here is essentially: how can I write this in a much simpler way & is there a way I could write this IF statement if my dataset was MUCH larger?
It appears to me that all you need to do is parameterize your inner if:
# Fragment of the function body: the row's own Pclass and Sex values are used
# directly in the filter, replacing the six hard-coded branches.
if pd.isnull(Age):
    return train.loc[(train["Pclass"] == Pclass)
                     & (train["Sex_male"] == Sex)]["Age"].mean()
# NOTE(review): both lookup tables hold only an empty inner list here; they
# look like placeholders that must be filled in before the lookup can work.
PCLASS_VALUES = [
    [],
]
SEX_VALUES = [
    [],
]
# Fragment of the function body: each table is indexed by [Pclass][Sex] to
# obtain the values used in the filter.
return train.loc[(train["Pclass"] == PCLASS_VALUES[Pclass][Sex]) & (train["Sex_male"] == SEX_VALUES[Pclass][Sex])]["Age"].mean()

Categories

Resources