I applied an if-else statement in the function change_test, but it resulted in None values in every row of the test column. Here is my code:
# Asker's original attempt, kept as posted (indentation restored, comments
# added) because it demonstrates the problem being asked about.
def change_test(df):
    # NOTE(review): when called through ``df.apply(change_test, axis = 1)`` this
    # parameter receives one row at a time, not the whole DataFrame.
    if ((df['product_id'] == 7.99) & (df['refunded'] == 1)):
        df['test'] = 0
    elif (df['product_id'] == 49.99):
        # NOTE(review): ``==`` is a comparison, not an assignment; its result
        # is discarded.
        df['test'] == 49.99
    else :
        df['test'] = df['product_id'] * (df['days_used_app'] / 7)
    # No branch returns anything, so the function implicitly returns None.
# The column is built from the function's return values, hence all None.
df['test'] = df.apply(change_test, axis = 1)
And here is my dataframe before applying this function
You should return a value from the function you pass to apply:
def change_test(row):
    """Compute the ``test`` value for a single DataFrame row.

    Intended for ``df.apply(change_test, axis=1)``: the value returned here
    becomes that row's entry in the resulting Series.

    Args:
        row: Mapping-like row exposing ``product_id``, ``refunded`` and
            ``days_used_app`` by key.

    Returns:
        ``0`` when ``product_id`` is 7.99 and ``refunded`` is 1, ``49.99``
        when ``product_id`` is 49.99, otherwise ``product_id`` multiplied by
        ``days_used_app / 7``.
    """
    product_id = row['product_id']
    # These are scalar comparisons, so use logical ``and`` rather than the
    # bitwise ``&`` that is only needed for element-wise Series masks.
    if product_id == 7.99 and row['refunded'] == 1:
        return 0
    if product_id == 49.99:
        return 49.99
    return product_id * (row['days_used_app'] / 7)
# Call change_test once per row (axis = 1); the returned values form the new 'test' column.
df['test'] = df.apply(change_test, axis = 1)
I have a DataFrame df with columns action and pointerID. In the code snippet below I'm iterating through every row which is pretty inefficient because the DataFrame is pretty large. Is there a more efficient way to do this?
# Label event rows with a running counter: a DOWN event assigns a fresh label
# to its pointer slot; MOVE / UP / CANCEL events reuse the slot's current label.
# NOTE(review): indentation was lost in the paste and has been reconstructed
# from the control flow — confirm against the original script.
annotated = []      # labels, one per row that reaches an append below
curr_pointers = []  # curr_pointers[pointerID] -> label currently held by that pointer
count = 1           # next unused label
for index, row in df.iterrows():
    action = row["action"]
    id = row["pointerID"]  # NOTE(review): shadows the builtin ``id``
    if action == "ACTION_MOVE":
        annotated.append(curr_pointers[id])
    elif (action == "ACTION_POINTER_DOWN") or (action == "ACTION_DOWN"):
        # Rows whose actionIndex differs from the pointer ID are skipped, so
        # nothing is appended to ``annotated`` for them.
        if row["actionIndex"] != id:
            continue
        # NOTE(review): append() stores the label at index len(curr_pointers),
        # which equals ``id`` only if new pointer IDs arrive as 0, 1, 2, ... —
        # TODO confirm.
        if id >= len(curr_pointers):
            curr_pointers.append(count)
        else:
            curr_pointers[id] = count
        annotated.append(count)
        count = count + 1
    elif (action == "ACTION_POINTER_UP") or (action == "ACTION_UP") or (action == "ACTION_CANCEL"):
        if row["actionIndex"] != id:
            continue
        annotated.append(curr_pointers[id])
    else:
        # Unknown actions are reported and also append nothing.
        print("{} unknown".format(action))
I would like to calculate the mean of an array in python, using different grouping variables. For instance, I want to calculate the mean of all values in column1, for which column2 == 2 and column3 == a + 3.
I've tried a for-loop with nested if-statements, but it seems extremely complicated and, given the dimensions of my data, far too confusing. Is there another way to group the data by certain conditions and calculate the mean for each combination of conditions individually?
I am looking for a function like group_by(), summarise() or aggregate() in R, just for python.
This is the loop I tried so far:
# For every row j, pick the branch matching the row's column 6 (0.0082, 0.001
# or 0.0235) and column 1 (ped1, ped2 or ped3), then store in column 7 the mean
# of column 4 over all rows where column 0 == ped + that column-6 value and
# column 5 == i.
# ``e_data``, ``ped1``..``ped3``, ``i`` and ``mean`` are defined outside this
# snippet.
# NOTE(review): all matching relies on exact float equality.
# NOTE(review): the 0.0235 case has no ped3 branch in the snippet as posted —
# possibly truncated; left as is.
for j in range(0,len(e_data)): #iterate for each row in e_data
    if e_data[j,6] == 0.0082:
        if e_data[j,1] == ped1:
            e_data[j,7] = mean(e_data[:,4][np.where((e_data[:,0] == (ped1+0.0082)) & (e_data[:,5] == i))])
        elif e_data[j,1] == ped2:
            e_data[j,7] = mean(e_data[:,4][np.where((e_data[:,0] == (ped2+0.0082)) & (e_data[:,5] == i))])
        elif e_data[j,1] == ped3:
            # Fixed: the posted line ended with an unbalanced extra ")".
            e_data[j,7] = mean(e_data[:,4][np.where((e_data[:,0] == (ped3+0.0082)) & (e_data[:,5] == i))])
    if e_data[j,6] == 0.001:
        if e_data[j,1] == ped1:
            e_data[j,7] = mean(e_data[:,4][np.where((e_data[:,0] == (ped1+0.001)) & (e_data[:,5] == i))])
        elif e_data[j,1] == ped2:
            e_data[j,7] = mean(e_data[:,4][np.where((e_data[:,0] == (ped2+0.001)) & (e_data[:,5] == i))])
        elif e_data[j,1] == ped3:
            e_data[j,7] = mean(e_data[:,4][np.where((e_data[:,0] == (ped3+0.001)) & (e_data[:,5] == i))])
    if e_data[j,6] == 0.0235:
        if e_data[j,1] == ped1:
            e_data[j,7] = mean(e_data[:,4][np.where((e_data[:,0] == (ped1+0.0235)) & (e_data[:,5] == i))])
        elif e_data[j,1] == ped2:
            e_data[j,7] = mean(e_data[:,4][np.where((e_data[:,0] == (ped2+0.0235)) & (e_data[:,5] == i))])
I would recommend that you check out pandas, which does exactly what you need.
In the python world, Pandas is the definitive solution for data analysis tasks like what you have described.
Once you import your data (ideally in .csv format) into pandas:
import pandas as pd
df = pd.read_csv('filepath_to_your_data')
# Mean of column1 over the rows where column2 == 2 and column3 == a + 3:
# select the rows and the column in a single .loc call, then average.
df.loc[(df['column2'] == 2) & (df['column3'] == a + 3), 'column1'].mean()
Pandas also offers groupby, describe and agg
So I have an IF statement in python which essentially looks to change null values in a dataset to an average based off two other columns.
def impute_age(cols):
    """Return an age for one row, replacing a null age with a group mean.

    ``cols`` is read positionally as (Age, Pclass, Sex). When Age is null the
    result is the mean ``Age`` of the rows of the module-level ``train``
    DataFrame whose ``Pclass`` and ``Sex_male`` equal the matched pair;
    otherwise Age is returned unchanged.

    NOTE(review): indentation was lost in the paste; the trailing ``else`` is
    assumed to pair with ``if pd.isnull(Age)`` — confirm. Under that reading,
    a null Age with a (Pclass, Sex) pair outside the six cases below falls
    through every ``if`` and returns None.
    """
    Age = cols[0]
    Pclass = cols[1]
    Sex = cols[2]
    if pd.isnull(Age):
        # The six branches differ only in the literal Pclass / Sex values.
        if Pclass == 1 and Sex == 0:
            return train.loc[(train["Pclass"] == 1)
                             & (train["Sex_male"] == 0)]["Age"].mean()
        if Pclass == 2 and Sex == 0:
            return train.loc[(train["Pclass"] == 2)
                             & (train["Sex_male"] == 0)]["Age"].mean()
        if Pclass == 3 and Sex == 0:
            return train.loc[(train["Pclass"] == 3)
                             & (train["Sex_male"] == 0)]["Age"].mean()
        if Pclass == 1 and Sex == 1:
            return train.loc[(train["Pclass"] == 1)
                             & (train["Sex_male"] == 1)]["Age"].mean()
        if Pclass == 2 and Sex == 1:
            return train.loc[(train["Pclass"] == 2)
                             & (train["Sex_male"] == 1)]["Age"].mean()
        if Pclass == 3 and Sex == 1:
            return train.loc[(train["Pclass"] == 3)
                             & (train["Sex_male"] == 1)]["Age"].mean()
    else:
        return Age
Here I'm trying to fill in NaNs using the average age of males/females in each passenger class. I feel like there must be a much better way of writing this, especially if I were to come across a much bigger dataset.
For reference the train df is the main df with all of the data. For some reason I couldn't get this code to work with a subset of train passed through using the cols argument.
The question here is essentially: how can I write this in a much simpler way & is there a way I could write this IF statement if my dataset was MUCH larger?
It appears to me that all you need to do is parameterize your inner if:
# Fragment meant for the body of impute_age: filter train by the row's own
# Pclass and Sex values instead of spelling out one branch per combination.
if pd.isnull(Age):
    return train.loc[(train["Pclass"] == Pclass)
                     & (train["Sex_male"] == Sex)]["Age"].mean()
# Lookup-table variant: map (Pclass, Sex) to the values to filter train on.
# NOTE(review): both tables are empty placeholders as posted; indexing them
# with [Pclass][Sex] raises IndexError until they are filled in.
PCLASS_VALUES = [
    [],
]
SEX_VALUES = [
    [],
]
# NOTE(review): a bare ``return`` — presumably this line belongs inside
# impute_age's null-Age branch; confirm.
return train.loc[(train["Pclass"] == PCLASS_VALUES[Pclass][Sex]) & (train["Sex_male"] == SEX_VALUES[Pclass][Sex])]["Age"].mean()
Let's start with a Pandas DataFrame df with numerical columns pS, pS0 and pE:
import pandas as pd
# Example frame, written column by column; dict insertion order fixes the
# column order as pS, pE, pS0.
df = pd.DataFrame({
    'pS': [0.1, 0.3, 0.9],
    'pE': [0.2, 0.6, 0.1],
    'pS0': [0.7, 0.1, 0.0],
})
We want to build a column indicating which of the three columns above dominates, i.e. holds the largest value in each row. I achieved it this way:
def class_morph(x):
    """Name the dominant value among ``pE``, ``pS`` and ``pS0`` for one row.

    Args:
        x: Mapping-like row exposing ``pE``, ``pS`` and ``pS0`` by key.

    Returns:
        ``'E'``, ``'S'`` or ``'S0'`` for the key holding the largest value,
        ``'U'`` when the two largest values are equal, or ``None`` if no
        value compares equal to the maximum.
    """
    ranked = sorted((x['pE'], x['pS'], x['pS0']), reverse=True)
    top = ranked[0]
    # A tie for first place means no single column dominates.
    if top == ranked[1]:
        return 'U'
    if x['pE'] == top:
        return 'E'
    if x['pS'] == top:
        return 'S'
    if x['pS0'] == top:
        return 'S0'
    # Only reachable when no value equals the maximum (e.g. NaN inputs).
    return None
# Call class_morph once per row (axis=1) and store the returned labels in 'Morph'.
df['Morph'] = df.apply(class_morph, axis=1)
Which gives the correct result:
But my initial try was the following:
# Asker's first attempt, kept as posted (indentation restored, comments added)
# because the question is about why it misbehaves.
def class_morph(x):
    # NOTE(review): np.max takes ONE array-like plus an optional ``axis``; the
    # second positional argument in each call below is therefore read as
    # ``axis``, not as a second value to compare. A two-value maximum is
    # ``max(a, b)`` or ``np.maximum(a, b)``.
    if (x['pE'] > np.max(x['pS'],x['pS0'])):
        return 'E'
    elif (x['pS'] > np.max(x['pE'],x['pS0'])):
        return 'S'
    elif (x['pS0'] > np.max(x['pS'],x['pE'])):
        return 'S0'
    else:
        return 'U'
Which returned something wrong:
Could somebody explain what my mistake is in that first attempt?