So I have an IF statement in python which essentially looks to change null values in a dataset to an average based off two other columns.
def impute_age(cols):
Age = cols[0]
Pclass = cols[1]
Sex = cols[2]
if pd.isnull(Age):
if Pclass == 1 and Sex == 0:
return train.loc[(train["Pclass"] == 1)
& (train["Sex_male"] == 0)]["Age"].mean()
if Pclass == 2 and Sex == 0:
return train.loc[(train["Pclass"] == 2)
& (train["Sex_male"] == 0)]["Age"].mean()
if Pclass == 3 and Sex == 0:
return train.loc[(train["Pclass"] == 3)
& (train["Sex_male"] == 0)]["Age"].mean()
if Pclass == 1 and Sex == 1:
return train.loc[(train["Pclass"] == 1)
& (train["Sex_male"] == 1)]["Age"].mean()
if Pclass == 2 and Sex == 1:
return train.loc[(train["Pclass"] == 2)
& (train["Sex_male"] == 1)]["Age"].mean()
if Pclass == 3 and Sex == 1:
return train.loc[(train["Pclass"] == 3)
& (train["Sex_male"] == 1)]["Age"].mean()
else:
return Age
So here i'm trying to fill in nans using the average age of male/females in certain passenger classes. I feel like there would be a much better way of writing this, especially if I was to come across a much bigger dataset.
For reference the train df is the main df with all of the data. For some reason I couldn't get this code to work with a subset of train passed through using the cols argument.
The question here is essentially: how can I write this in a much simpler way & is there a way I could write this IF statement if my dataset was MUCH larger?
It appears to me that all you need to do is parameterize your inner if:
if pd.isnull(Age):
return train.loc[(train["Pclass"] == Pclass)
& (train["Sex_male"] == Sex)]["Age"].mean()
PCLASS_VALUES = [
[],
]
SEX_VALUES = [
[],
]
return train.loc[(train["Pclass"] == PCLASS_VALUES[Pclass][Sex]) & (train["Sex_male"] == SEX_VALUES[Pclass][Sex])]["Age"].mean()
Related
I have a DataFrame df with columns action and pointerID. In the code snippet below I'm iterating through every row which is pretty inefficient because the DataFrame is pretty large. Is there a more efficient way to do this?
annotated = []
curr_pointers = []
count = 1
for index, row in df.iterrows():
action = row["action"]
id = row["pointerID"]
if action == "ACTION_MOVE":
annotated.append(curr_pointers[id])
elif (action == "ACTION_POINTER_DOWN") or (action == "ACTION_DOWN"):
if row["actionIndex"] != id:
continue
if id >= len(curr_pointers):
curr_pointers.append(count)
else:
curr_pointers[id] = count
annotated.append(count)
count = count + 1
elif (action == "ACTION_POINTER_UP") or (action == "ACTION_UP") or (action == "ACTION_CANCEL"):
if row["actionIndex"] != id:
continue
annotated.append(curr_pointers[id])
else:
print("{} unknown".format(action))
I am trying to apply this function on a pandas dataframe. But I am getting this error. I'd like to know what does it mean, and how to rectify it?
def fill_age(x):
Age = x['Age']
Pclass = x['Pclass']
Sex = x['Sex']
if pd.isnull(Age):
if Pclass == 1:
return 34.61 + np.random.normal(loc =0, scale = 13.61)
elif (Pclass == 1) & (Sex == 'male'):
return 41.2813 + np.random.normal(loc = 0, scale = 15.14)
elif (Pclass == 2) & (Sex == 'female'):
return 28.72 + np.random.normal(loc = 0, scale = 12.87)
elif (Pclass == 2) & (Sex == 'male'):
return 30.74 + np.random.normal(loc = 0, scale= 14.79)
elif (Pclass == 3) & (Sex == 'female'):
return 21.75 + np.random.normal(loc = 0, scale = 12.73)
elif (Pclass == 3) & (Sex == 'male'):
return 26.51 + np.random.normal(loc = 0, scale= 12.16)
else:
pass
else:
return Age
train['Age'] = train['Age'].apply(fill_age)
Note: train is a pandas dataframe
You don't show where:
train['Age'] = train['Age'].apply(fill_age)
is coming from, but I suspect ```train`` is actually a float not a dict.
I'm trying to apply this function to fill the Age column based on Pclass and Sex columns. But I'm unable to do so. How can I make it work?
def fill_age():
Age = train['Age']
Pclass = train['Pclass']
Sex = train['Sex']
if pd.isnull(Age):
if Pclass == 1:
return 34.61
elif (Pclass == 1) and (Sex == 'male'):
return 41.2813
elif (Pclass == 2) and (Sex == 'female'):
return 28.72
elif (Pclass == 2) and (Sex == 'male'):
return 30.74
elif (Pclass == 3) and (Sex == 'female'):
return 21.75
elif (Pclass == 3) and (Sex == 'male'):
return 26.51
else:
pass
else:
return Age
train['Age'] = train['Age'].apply(fill_age(),axis=1)
I'm getting the following error:
ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
You should consider using parenthesis to separate the arguments (which you already did) and change the boolean operator and for bitwise opeator & to avoid this type of errors. Also, keep in mind that if you want to use apply then you should use a parameter x for the function which will part of a lambda in the apply function:
def fill_age(x):
Age = x['Age']
Pclass = x['Pclass']
Sex = x['Sex']
if pd.isnull(Age):
if Pclass == 1:
return 34.61
elif (Pclass == 1) & (Sex == 'male'):
return 41.2813
elif (Pclass == 2) & (Sex == 'female'):
return 28.72
elif (Pclass == 2) & (Sex == 'male'):
return 30.74
elif (Pclass == 3) & (Sex == 'female'):
return 21.75
elif (Pclass == 3) & (Sex == 'male'):
return 26.51
else:
pass
else:
return Age
Now, using apply with the lambda:
train['Age'] = train['Age'].apply(lambda x: fill_age(x),axis=1)
In a sample dataframe:
df = pd.DataFrame({'Age':[1,np.nan,3,np.nan,5,6],
'Pclass':[1,2,3,3,2,1],
'Sex':['male','female','male','female','male','female']})
Using the answer provided above:
df['Age'] = df.apply(lambda x: fill_age(x),axis=1)
Output:
Age Pclass Sex
0 1.00 1 male
1 28.72 2 female
2 3.00 3 male
3 21.75 3 female
4 5.00 2 male
5 6.00 1 female
df = pd.read_csv('./test22.csv')
df.head(5)
df = df.replace(np.nan, None)
for index,col in df.iterrows():
# Extract only if date1 happened earlier than date2
load = 'No'
if col['date1'] == None or col['date2'] == None:
load = 'yes'
elif int(str(col['date1'])[:4]) >= int(str(col['date2'])[:4]) and \
(len(str(col['date1'])) == 4 or len(str(col['date2'])) == 4):
load = 'yes'
elif int(str(col['date1'])[:6]) >= int(str(col['date2'])[:6]) and \
(len(str(col['date1'])) == 6 or len(str(col['date2'])) == 6):
load = 'yes'
elif int(str(col['date1'])[:8]) >= int(str(col['date2'])[:8]):
load = 'yes'
df.head(5)
After preprocessing using iterrows in dataset, If you look at the above code (attached code), it will not be reflected in the actual dataset. I want to reflect the result in actual dataset.
How can I apply it to the actual dataset?
Replace your for loop with a function that returns a boolean, then you can use df.apply to apply it to all rows, and then filter your dataframe by that value:
def should_load(x):
if x['date1'] == None or x['date2'] == None:
return True
elif int(str(x['date1'])[:4]) >= int(str(x['date2'])[:4]) and \
(len(str(x['date1'])) == 4 or len(str(x['date2'])) == 4):
return True
elif int(str(x['date1'])[:6]) >= int(str(x['date2'])[:6]) and \
(len(str(x['date1'])) == 6 or len(str(x['date2'])) == 6):
return True
elif int(str(x['date1'])[:8]) >= int(str(x['date2'])[:8]):
return True
return False
df[df.apply(should_load, axis=1)].head(5)
I would like to calculate the mean of an array in python, using different grouping variables. For instance, I want to calculate the mean of all values in column1, for which column2 == 2 and column3 == a + 3.
I've tried a for-loop & if-loop, but it seems extremely complicated and for the dimensions of my data way too confusing. Is there another way to group the data for certain conditions and calculate the mean for each combination of conditions individually?
I am looking for a function like group_by(), summarise() or aggregate() in R, just for python.
This is the loop I tried so far:
for j in range(0,len(e_data)): #iterate for each row in e_data
if e_data[j,6] == 0.0082:
if e_data[j,1] == ped1:
e_data[j,7] = mean(e_data[:,4][np.where((e_data[:,0] == (ped1+0.0082)) & (e_data[:,5] == i))])
elif e_data[j,1] == ped2:
e_data[j,7] = mean(e_data[:,4][np.where((e_data[:,0] == (ped2+0.0082)) & (e_data[:,5] == i))])
elif e_data[j,1] == ped3:
e_data[j,7] = mean(e_data[:,4][np.where((e_data[:,0] == (ped3+0.0082)) & (e_data[:,5] == i))]))
if e_data[j,6] == 0.001:
if e_data[j,1] == ped1:
e_data[j,7] = mean(e_data[:,4][np.where((e_data[:,0] == (ped1+0.001)) & (e_data[:,5] == i))])
elif e_data[j,1] == ped2:
e_data[j,7] = mean(e_data[:,4][np.where((e_data[:,0] == (ped2+0.001)) & (e_data[:,5] == i))])
elif e_data[j,1] == ped3:
e_data[j,7] = mean(e_data[:,4][np.where((e_data[:,0] == (ped3+0.001)) & (e_data[:,5] == i))])
if e_data[j,6] == 0.0235:
if e_data[j,1] == ped1:
e_data[j,7] = mean(e_data[:,4][np.where((e_data[:,0] == (ped1+0.0235)) & (e_data[:,5] == i))])
elif e_data[j,1] == ped2:
e_data[j,7] = mean(e_data[:,4][np.where((e_data[:,0] == (ped2+0.0235)) & (e_data[:,5] == i))])
I would recommend you to checkout Pandas, which does exactly what you need.
In the python world, Pandas is the definitive solution for data analysis tasks like what you have described.
Once you import your data (ideally in .csv format) into pandas:
import pandas as pd
df = pd.read_csv('filepath_to_your_data')
## I want to calculate the mean of all values in column1, for which column2 == 2 and column3 == a + 3
df[(df.column2==2) & (df.column3 == a+3)].loc[:, 'column1'].mean()
Pandas also offers groupby, describe and agg