Conditional column in DataFrame: where's the mistake?

Let's start with a Pandas DataFrame df with numerical columns pS, pS0 and pE:
import pandas as pd

df = pd.DataFrame([[0.1, 0.2, 0.7], [0.3, 0.6, 0.1], [0.9, 0.1, 0.0]],
                  columns=['pS', 'pE', 'pS0'])
We want to build a column indicating which of the three columns dominates. I achieved it this way:
def class_morph(x):
    y = [x['pE'], x['pS'], x['pS0']]
    y.sort(reverse=True)
    if y[0] == y[1]:
        return 'U'
    elif x['pE'] == y[0]:
        return 'E'
    elif x['pS'] == y[0]:
        return 'S'
    elif x['pS0'] == y[0]:
        return 'S0'

df['Morph'] = df.apply(class_morph, axis=1)
This gives the correct result.
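As an aside, the same labeling can also be done without a row-wise apply. A minimal vectorized sketch, assuming (as in the function above) that a tie between the two largest values should yield 'U':

import numpy as np

cols = ['pE', 'pS', 'pS0']
labels = {'pE': 'E', 'pS': 'S', 'pS0': 'S0'}

# the two largest values per row; a tie between them means 'U'
top2 = np.sort(df[cols].to_numpy(), axis=1)[:, -2:]
tie = top2[:, 0] == top2[:, 1]
df['Morph'] = np.where(tie, 'U', df[cols].idxmax(axis=1).map(labels))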
But my initial try was the following:
import numpy as np

def class_morph(x):
    if x['pE'] > np.max(x['pS'], x['pS0']):
        return 'E'
    elif x['pS'] > np.max(x['pE'], x['pS0']):
        return 'S'
    elif x['pS0'] > np.max(x['pS'], x['pE']):
        return 'S0'
    else:
        return 'U'
This returned something wrong.
Could somebody explain what my mistake is in the first try?
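For reference, the mistake is almost certainly the np.max calls: np.max takes a single array-like as its first argument, and its second positional argument is the axis, not a second value to compare. A minimal illustration:

import numpy as np

a, b = 0.1, 0.7

# np.max(a, b) passes b as the `axis` argument, so it does NOT
# compute the maximum of a and b. Use one of these instead:
max(a, b)         # 0.7 -- builtin max of two scalars
np.max([a, b])    # 0.7 -- maximum over a sequence
np.maximum(a, b)  # 0.7 -- element-wise maximum of two arguments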

Related

Applying if-else statement to create new column

I have applied an if-else statement in the function change_test, but it results in None values throughout the test column. Here is my code:
def change_test(df):
    if (df['product_id'] == 7.99) & (df['refunded'] == 1):
        df['test'] = 0
    elif df['product_id'] == 49.99:
        df['test'] == 49.99
    else:
        df['test'] = df['product_id'] * (df['days_used_app'] / 7)

df['test'] = df.apply(change_test, axis=1)
And here is my dataframe before applying this function.
You should return a value from the function you pass to apply:
def change_test(row):
    if (row['product_id'] == 7.99) & (row['refunded'] == 1):
        return 0
    elif row['product_id'] == 49.99:
        return 49.99
    else:
        return row['product_id'] * (row['days_used_app'] / 7)

df['test'] = df.apply(change_test, axis=1)
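As an aside, rules this simple can also be vectorized with numpy.select, which avoids the per-row Python call entirely. A sketch, assuming the same column names:

import numpy as np

conditions = [
    (df['product_id'] == 7.99) & (df['refunded'] == 1),
    df['product_id'] == 49.99,
]
choices = [0, 49.99]
# rows matching no condition fall back to the computed value
fallback = df['product_id'] * (df['days_used_app'] / 7)
df['test'] = np.select(conditions, choices, default=fallback)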

How to speed up successive pd.apply with successive pd.DataFrame.loc calls?

from typing import Union

def __link_price(row: pd.Series) -> Union[None, float]:
    if (row['fund'] == 'A') and (row['share_class'] == 'X'):
        return df_hist.loc[row['date'], 'AA']
    elif (row['fund'] == 'A') and (row['share_class'] == 'Y'):
        return df_hist.loc[row['date'], 'AB']
    elif (row['fund'] == 'B') and (row['share_class'] == 'X'):
        return df_hist.loc[row['date'], 'BA']
    elif (row['fund'] == 'B') and (row['share_class'] == 'Y'):
        return df_hist.loc[row['date'], 'BB']
    elif (row['fund'] == 'C') and (row['share_class'] == 'X'):
        return df_hist.loc[row['date'], 'CA']
    elif (row['fund'] == 'C') and (row['share_class'] == 'Y'):
        return df_hist.loc[row['date'], 'CB']
    else:
        return 0

df.loc[:, 'price'] = df.apply(__link_price, axis=1).values
df has 10,000+ rows, so this code is taking a long time. In addition, for each row I'm doing a df_hist.loc call to get the value.
I'm trying to speed up this section of code, and the option I've found so far is using:
df.loc[:, 'price'] = df.apply(__link_price, axis=1, raw=True).values
But this forces me to use index-based selection for the row instead of selection by label:
if (row[0] == 'A') and (row[1] == 'X')
which reduces the readability of the code.
I'm looking for an approach that both speeds up the code and still allows for readability of the code.
In Python, there is a cost to every attribute or item lookup and every function call, and there is no compiler to optimize these away.
Here are some general recommendations:
Try creating a column that combines fund and share_class without using Python functions, and then merge it with df_hist:
# convert history from 'wide' format into 'long' format;
# the join below requires a named Series, hence the rename
hist = df_hist.set_index("date").stack().rename("price")

prices = (
    # create a key column for the join
    df.assign(key=df["fund"] + df["share_class"].replace({"X": "A", "Y": "B"}))
      .set_index(["date", "key"])
      .join(hist)  # join by index
)
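To make the reshape concrete, here is the wide-to-long conversion on a toy df_hist (hypothetical values):

import pandas as pd

df_hist = pd.DataFrame({
    "date": ["2021-01-01", "2021-01-02"],
    "AA": [1.0, 1.1],
    "AB": [2.0, 2.1],
})
hist = df_hist.set_index("date").stack().rename("price")
# hist is now a Series keyed by (date, column):
# 2021-01-01  AA    1.0
#             AB    2.0
# 2021-01-02  AA    1.1
#             AB    2.1
# Name: price, dtype: float64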
If it's not trivial to create a key column, minimize attribute lookups inside the apply function:
def __link_price(row):
    date, fund, share_class = row[["date", "fund", "share_class"]]
    if fund == 'A' and share_class == 'X':
        return df_hist.loc[date, 'AA']
    ...
Optimize the if conditions. For example, six conditions have to be checked in the case where (row['fund'] == 'C') and (row['share_class'] == 'Y'). You can reduce that number to one with a lookup table:
fund_and_share_class_to_key = {
    ("A", "X"): "AA",
    ("A", "Y"): "AB",
    ...
}

key = fund_and_share_class_to_key.get((fund, share_class))
return df_hist.loc[date, key] if key is not None else 0
Pandas itself is pretty slow for non-vectorized, non-arithmetic operations. In your case it's better to use standard Python dicts for faster lookups.
# small benchmark
df = pd.DataFrame({"value": [4, 5, 6]})
d = df.to_dict(orient="index")

%timeit df.loc[1, "value"]  # 8.7 µs
%timeit d[1]["value"]       # 50 ns; ~170 times faster

# convert the dataframe into a dict with the format:
# {<date>: {"AA": <value>, ...}}
history = df_hist.set_index("date").to_dict(orient="index")
def __link_price(row):
    ...
    price = history.get(date, {}).get(key, 0)
    return price
It should be faster to pass history as an apply argument rather than look it up in the enclosing scope. It also makes the code cleaner.
def __link_price(row, history):
    ...

df.apply(__link_price, axis=1, args=(history,))
To summarize, a faster function would be something like this:
history = df_hist.set_index("date").to_dict(orient="index")

# we don't need to create the mapping on every __link_price call
fund_and_share_class_to_key = {
    ("A", "X"): "AA",
    ("A", "Y"): "AB",
    ...
}

def __link_price(row, history, fund_and_share_class_to_key):
    date, fund, share_class = row[["date", "fund", "share_class"]]
    key = fund_and_share_class_to_key.get((fund, share_class))
    return history.get(date, {}).get(key, 0)

df["price"] = df.apply(__link_price, axis=1,
                       args=(history, fund_and_share_class_to_key))

Optimize the code for dataframes in Python

Below is the code for checks on two columns. I know this isn't the proper way of doing it on two columns of a dataframe, but I was hoping to get help doing it in a better way.
for i in range(len(df)):
    if df['Current_Value'][i].lower() == 'false' or df['Current_Value'][i].lower() == '0' and df['_Value'][i].lower() == 'false' or df['_Value'][i].lower() == '0':
        df['CHECK'][i] = True
    elif df['Current_Value'][i].lower() == 'true' or df['Current_Value'][i].lower() == '1' and df['_Value'][i].lower() == 'true' or df['_Value'][i].lower() == '1':
        df['CHECK'][i] = True
    elif df['Current_Value'][i].lower() in df['_Value'][i].lower():
        df['CHECK'][i] = True
    else:
        df['CHECK'][i] = False
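Incidentally, part of the problem here is operator precedence: in Python, and binds more tightly than or, so the unparenthesized conditions above do not group the way they read. A quick illustration:

x, y = 'true', '0'

# without parentheses this groups as:
#   x == 'false' or (x == '0' and y == 'false') or y == '0'
print(x == 'false' or x == '0' and y == 'false' or y == '0')      # True
# the intended grouping needs explicit parentheses:
print((x == 'false' or x == '0') and (y == 'false' or y == '0'))  # False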
You should use a lambda expression for such a check. Although you haven't provided a sample dataset, what you could do is something like this:
First define the function that performs the check:
def fill_check_column(current_value, value):
    # precompute the lowercased values so they are calculated only once
    current_value = current_value.lower()
    value = value.lower()
    if current_value in ['false', '0'] and value in ['false', '0']:
        return True
    elif current_value in ['true', '1'] and value in ['true', '1']:
        return True
    elif current_value in value:
        return True
    else:
        return False
Then use it on the data frame:
df['Check'] = df.apply(lambda row: fill_check_column(current_value=row['Current_Value'],
                                                     value=row['_Value']),
                       axis=1)
You could also tighten fill_check_column further, for example by returning the combined boolean expression directly instead of four branches.
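If performance matters, the first two checks also vectorize cleanly. A sketch, assuming both columns hold strings, with only the substring test left row-wise:

import pandas as pd

cv = df['Current_Value'].str.lower()
v = df['_Value'].str.lower()

false_match = cv.isin(['false', '0']) & v.isin(['false', '0'])
true_match = cv.isin(['true', '1']) & v.isin(['true', '1'])
# the substring containment test has no direct vectorized form
contained = pd.Series([c in w for c, w in zip(cv, v)], index=df.index)

df['CHECK'] = false_match | true_match | contained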

Apply result to dataset after df.iterrows

import numpy as np
import pandas as pd

df = pd.read_csv('./test22.csv')
df.head(5)

df = df.replace(np.nan, None)

for index, col in df.iterrows():
    # extract only if date1 happened earlier than date2
    load = 'No'
    if col['date1'] == None or col['date2'] == None:
        load = 'yes'
    elif int(str(col['date1'])[:4]) >= int(str(col['date2'])[:4]) and \
            (len(str(col['date1'])) == 4 or len(str(col['date2'])) == 4):
        load = 'yes'
    elif int(str(col['date1'])[:6]) >= int(str(col['date2'])[:6]) and \
            (len(str(col['date1'])) == 6 or len(str(col['date2'])) == 6):
        load = 'yes'
    elif int(str(col['date1'])[:8]) >= int(str(col['date2'])[:8]):
        load = 'yes'

df.head(5)
After this preprocessing with iterrows, the result is not reflected in the actual dataset: as the code above shows, load is computed for each row but never written back. How can I apply the result to the actual dataset?
Replace your for loop with a function that returns a boolean, then you can use df.apply to apply it to all rows, and then filter your dataframe by that value:
def should_load(x):
    if x['date1'] == None or x['date2'] == None:
        return True
    elif int(str(x['date1'])[:4]) >= int(str(x['date2'])[:4]) and \
            (len(str(x['date1'])) == 4 or len(str(x['date2'])) == 4):
        return True
    elif int(str(x['date1'])[:6]) >= int(str(x['date2'])[:6]) and \
            (len(str(x['date1'])) == 6 or len(str(x['date2'])) == 6):
        return True
    elif int(str(x['date1'])[:8]) >= int(str(x['date2'])[:8]):
        return True
    return False

df[df.apply(should_load, axis=1)].head(5)
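And if you want the flag reflected in the dataset itself rather than only used as a filter, assign it to a column. For example, mirroring the 'yes'/'No' values from the original loop (the column name load is just illustrative):

df['load'] = df.apply(should_load, axis=1).map({True: 'yes', False: 'No'})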

The truth value of a DataFrame is ambiguous

I am trying to get the rows where brand and manufacturer are the same (e.g. brand == 'J.R. Watkins' and manufacturer == 'J.R.Watkins') in the last elif block, but it gives this error:
ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
My code is:
import csv
import sys

import pandas as pd

class sample:
    def create_df(self, f):
        self.z = pd.read_csv(f)

    def get_resultant_df(self, list_cols):
        self.data_frame = self.z[list_cols[:]]

    def process_df(self, df, conditions):
        resultant_df = self.data_frame
        if conditions[2] == 'equals':
            new_df = resultant_df[resultant_df[conditions[1]] == conditions[3]]
            return new_df
        elif conditions[2] == 'contains':
            new_df = resultant_df[resultant_df[conditions[1]].str.contains(conditions[3])]
            return new_df
        elif conditions[2] == 'not equals':
            new_df = resultant_df[resultant_df[conditions[1]] != conditions[3]]
            return new_df
        elif conditions[2] == 'startswith':
            new_df = resultant_df[resultant_df[conditions[1]].str.startswith(conditions[3])]
            return new_df
        elif conditions[2] == 'in':
            new_df = resultant_df[resultant_df[conditions[1]].isin(resultant_df[conditions[3]])]
            return new_df
        elif conditions[2] == 'not in':
            new_df = resultant_df[~resultant_df[conditions[1]].isin(resultant_df[conditions[3]])]
            return new_df
        elif conditions[2] == 'group':
            new_df = list(resultant_df.groupby(conditions[0])[conditions[1]])
            return new_df
        elif conditions[2] == 'specific':
            new_df = resultant_df.loc[resultant_df[conditions[0]] == conditions[8]]
            return new_df
        elif conditions[2] == 'same':
            if (resultant_df.loc[(resultant_df[conditions[0]] == conditions[8]) & (resultant_df[conditions[1]] == conditions[8])]).all():
                new_df = resultant_df
                return new_df

if __name__ == '__main__':
    sample = sample()
    sample.create_df("/home/purpletalk/GrammarandProductReviews.csv")
    df = sample.get_resultant_df(['brand', 'reviews.id', 'manufacturer', 'reviews.title', 'reviews.username'])
    new_df = sample.process_df(df, ['brand', 'manufacturer', 'same', 'manufacturer', 'size', 'equal', 8, 700, 'J.R. Watkins'])
    print(new_df['brand'])
I am trying to get the values that are related to brand and manufacturer which are same (e.g. brand == J.R. Watkins and manufacturer == J.R.Watkins)
Your logic is overcomplicated. Just apply a filter:
df = df[(df['brand'] == 'J.R. Watkins') & (df['manufacturer'] == 'J.R.Watkins')]
You don't need pd.DataFrame.all(), which appears to be what you are attempting. Nor do you need an inner if statement: if there's no match, you will have an empty dataframe.
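If the underlying goal is every row where the two columns agree, not just one particular brand, you can also compare the columns directly. A sketch (note the question's example values 'J.R. Watkins' vs 'J.R.Watkins' differ by a space, so some normalization may be needed):

# rows where brand and manufacturer hold exactly the same string
same = df[df['brand'] == df['manufacturer']]

# or, tolerating whitespace and case differences:
norm = lambda s: s.str.replace(' ', '', regex=False).str.lower()
same_loose = df[norm(df['brand']) == norm(df['manufacturer'])]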
