Apply a for loop over multiple dataframes for multiple columns? - python

The dataframes are like below, where I want to change a value to 'dead' if the age is more than 100.
import pandas as pd
raw_data = {'age1': [23,45,210],'age2': [10,20,150],'name': ['a','b','c']}
df = pd.DataFrame(raw_data, columns = ['age1','age2','name'])
raw_data = {'age1': [80,90,110],'age2': [70,120,90],'name': ['a','b','c']}
df2 = pd.DataFrame(raw_data, columns = ['age1','age2','name'])
Desired outcome
df=
age1 age2 name
0 23 10 a
1 45 20 b
2 dead dead c
df2=
age1 age2 name
0 80 70 a
1 90 dead b
2 dead 90 c
I was trying something like this:
col_list=['age1','age2']
df_list=[df,df2]
def dead(df):
    for df in df_list:
        if df.columns in col_list:
            if df.columns >=100:
                return 'dead'
            else:
                return df.columns
df.apply(dead)
Error shown:
The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
I am looking for a loop that works on all dataframes.
Please also correct my function, for future learning :)

With your shown samples, please try the following, using the filter and np.where functions of pandas and numpy respectively.
import numpy as np

c = df.filter(regex=r'age\d+').columns
df[c] = np.where(df[c].ge(100), 'dead', df[c])
df
Alternative approach with where:
c = df.filter(like='age').columns
df[c] = df[c].where(~df[c].ge(100), 'dead')
Explanation:
Getting the columns whose names contain age into the c variable.
Then using np.where to check whether the respective (all age) columns are greater than or equal to 100; if yes, set the value to dead, otherwise keep it as it is.
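Since the question involves two dataframes, here is a minimal sketch of the same pattern applied to both; the loop itself is my addition, not part of the answer above, and it assumes df and df2 are the frames built in the question:
import numpy as np

# apply the same filter/np.where pattern to every frame
for frame in (df, df2):
    cols = frame.filter(regex=r'age\d+').columns
    frame[cols] = np.where(frame[cols].ge(100), 'dead', frame[cols])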

I did the following:
col_list=['age1','age2']
df_list=[df,df2]
for d in df_list:
    for c in col_list:
        d.loc[d[c]>100, c] = 'dead'

# inspired by #jib and #ravinder
import numpy as np

col_list=['age1','age2']
df_list=[df,df2]
for d in df_list:
    for c in col_list:
        d[c] = np.where(d[c]>100, 'dead', d[c])
df  # or df2
output:
age1 age2 name
0 23 10 a
1 45 20 b
2 dead dead c

One possible solution is to use Pandas' mask, which is similar to if-else, but vectorized.
def dead(df):
    col_list = ['age1', 'age2']
    df = df.copy()
    temporary = df.filter(col_list)
    temporary = temporary.mask(temporary >= 100, "dead")
    df.loc[:, col_list] = temporary
    return df
Apply function to the dataframe:
df.pipe(dead)
age1 age2 name
0 23 10 a
1 45 20 b
2 dead dead c
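The second frame can be handled the same way (a small usage note; output omitted):
# the same function works on the second frame; reassign to keep the result
df2 = df2.pipe(dead)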

You can do:
def check_more_than_100(x):
    v = None
    try:
        v = int(x)
    except:
        pass
    if v is not None:
        return v > 100
    return False
df['age1'] = df['age1'].apply(lambda x : 'dead' if check_more_than_100(x) else x)
df['age2'] = df['age2'].apply(lambda x : 'dead' if check_more_than_100(x) else x)
df2['age1'] = df2['age1'].apply(lambda x : 'dead' if check_more_than_100(x) else x)
df2['age2'] = df2['age2'].apply(lambda x : 'dead' if check_more_than_100(x) else x)
This should take care of non-int values if any.

I used this answer to a similar question. Basically you can use the where() function from numpy to set values based on the conditional.
import pandas as pd
import numpy as np
raw_data = {'age1': [23,45,210],'age2': [10,20,150],'name': ['a','b','c']}
df = pd.DataFrame(raw_data, columns = ['age1','age2','name'])
raw_data = {'age1': [80,90,110],'age2': [70,120,90],'name': ['a','b','c']}
df2 = pd.DataFrame(raw_data, columns = ['age1','age2','name'])
col_list=['age1','age2']
df_list=[df,df2]
def dead(df_list, col_list):
    for df in df_list:
        for col in col_list:
            df[col] = np.where(df[col] >= 100, "dead", df[col])
    return df_list

dead(df_list, col_list)
df

Extracting numeric columns and then using numpy where -
df_cols = df._get_numeric_data().columns.values
df2_cols = df2._get_numeric_data().columns.values
df[df_cols] = np.where(df[df_cols].to_numpy() > 100, 'dead', df[df_cols])
df2[df2_cols] = np.where(df2[df2_cols].to_numpy() > 100, 'dead', df2[df2_cols])
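Note that _get_numeric_data is a private pandas helper; here is a hedged alternative using the public select_dtypes, which is assumed to pick the same numeric columns for these frames:
import numpy as np

for frame in (df, df2):
    num_cols = frame.select_dtypes(include='number').columns
    frame[num_cols] = np.where(frame[num_cols].to_numpy() > 100, 'dead', frame[num_cols])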

Related

How do I create a function that scans multiple dataframe columns for a value, so that if any of those values are found the new column returns a given figure

I'm trying to build a sheet whereby I have a new column ('column x').
This column would be populated by scanning over three already existing columns (a, b , c).
If a given value is found in any of those columns for the indexed row, the new column will read 'Fail', else it will read 'Pass'.
When I try this on a single column my code works.
example:
df["Column x"] = df["Column a"].apply(lambda val: "Fail" if val == 'T' else "Pass")
When I try it on more than one column it fails no matter how I adjust it.
df['Column x'] = df['Column a'].any(lambda val: 'Fail' if val == 0 else 'Pass') or df['Column b'].apply(lambda val: 'Fail' if val == 'False' else 'Pass')
Any advice is incredibly helpful.
Here is an all-pandas solution:
import pandas as pd
df = pd.DataFrame({
    'col1': ['A','B','C'],
    'col2': ['E','F','G'],
    'col3': ['H','I','J']}
)
df["C"] = pd.Series(
    df['col1'].apply(lambda a: 0 if a == 'C' else 1) &
    df['col2'].apply(lambda a: 0 if a == 'C' else 1) &
    df['col3'].apply(lambda a: 0 if a == 'C' else 1)
).apply(lambda x: "Pass" if x else 'Fail')
print(df)
output
col1 col2 col3 C
0 A E H Pass
1 B F I Pass
2 C G J Fail
Handling an empty data frame:
if df.empty:
    print('DataFrame is empty!')
Check the below code with np.where, checking whether 'J' is present in any column: if yes then 'Fail' else 'Pass'.
import pandas as pd
import numpy as np
df = pd.DataFrame({'col1':['A','B','C'],'col2':['E','F','G'],'col3':['H','I','J']})
df['column_x'] = np.where(((df['col1']=='J')|(df['col2']=='J')|(df['col3']=='J')),'Fail','Pass')
df
Output:
  col1 col2 col3 column_x
0    A    E    H     Pass
1    B    F    I     Pass
2    C    G    J     Fail
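When the number of columns grows, a sketch that checks them all at once instead of chaining | conditions may help; the cols list below is an assumption matching the example frame:
import pandas as pd
import numpy as np

df = pd.DataFrame({'col1':['A','B','C'],'col2':['E','F','G'],'col3':['H','I','J']})

# check every listed column at once instead of chaining | conditions
cols = ['col1', 'col2', 'col3']
df['column_x'] = np.where(df[cols].eq('J').any(axis=1), 'Fail', 'Pass')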

Why does my original dataframe change as well?

For the dataset that I am using, it is available on Kaggle at this link
I am doing this to it:
import pandas as pd
df = pd.read_csv('./survey_results_public.csv')
df = df.dropna(subset=['Salary'], axis = 0).drop(['Respondent','ExpectedSalary','Salary'], axis = 1)
print(df['HoursPerWeek'].mean())
print(sum(df['HoursPerWeek'].isnull()))
# Method 1
df1 = df
df1 = df1.select_dtypes(include=['float']).fillna(df1.mean())
print(df['HoursPerWeek'].mean())
print(sum(df['HoursPerWeek'].isnull()))
print(df1['HoursPerWeek'].mean())
print(sum(df1['HoursPerWeek'].isnull()))
# Method 2
df2 = df
num_vars = df2.select_dtypes(include = ['float']).columns
for col in num_vars:
    df2[col].fillna(df2[col].mean(),inplace = True)
print(df['HoursPerWeek'].mean())
print(sum(df['HoursPerWeek'].isnull()))
print(df2['HoursPerWeek'].mean())
print(sum(df2['HoursPerWeek'].isnull()))
My question is: why does "Method 2" change df as well, as observed in the last 4 print statements, where the mean and number of empty values are the same between df and df2?
When I do something similar with normal variables in Python this does not happen:
a=2
b=a
c=a
print(a,b,c)
b += 2
print(a,b,c)
c += 3
print(a,b,c)
In this example, a is unchanged.
What you want to do is copy the dataframes:
...
# Method 1
df1 = df.copy()
df1 = df1.select_dtypes(include=['float']).fillna(df1.mean())
....
# Method 2
df2 = df.copy()
num_vars = df2.select_dtypes(include = ['float']).columns
...
Hope this helps :D
A good example is lists:
a = [1,2,3]
b = a
a.append(4)
print("b is",b)
# output is 'b is [1, 2, 3, 4]'
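A small sketch of the same effect with DataFrames; the one-column frame here is a made-up example, not the Kaggle data:
import pandas as pd

df = pd.DataFrame({'HoursPerWeek': [40.0, None, 35.0]})

alias = df               # plain assignment: both names point at the same object
independent = df.copy()  # .copy(): a separate object

alias.loc[0, 'HoursPerWeek'] = 99.0
independent['HoursPerWeek'] = independent['HoursPerWeek'].fillna(0)

print(df.loc[0, 'HoursPerWeek'])          # 99.0 -> the change made through alias shows up in df
print(df['HoursPerWeek'].isnull().sum())  # 1    -> fillna on the copy did not touch df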

Pandas check which substring is in column of strings

I'm trying to create a function which will create a new column in a pandas dataframe, where it figures out which substring is in a column of strings, takes that substring and uses it for the new column.
The problem is that the text to find does not appear at the same location in variable x:
df = pd.DataFrame({'x': ["var_m500_0_somevartext","var_m500_0_vartextagain",
"varwithsomeothertext_0_500", "varwithsomext_m150_0_text"], 'x1': [4, 5, 6,8]})
finds = ["m500_0","0_500","m150_0"]
Which of finds is in a given df["x"] row?
I've made a function that works, but is terribly slow for large datasets
import re

def pd_create_substring_var(df, new_var_name="new_var", substring_list=["1"], var_ori="x"):
    df[new_var_name] = "na"
    cols = list(df.columns)
    for ix in range(len(df)):
        for find in substring_list:
            for m in re.finditer(find, df.iloc[ix][var_ori]):
                df.iat[ix, cols.index(new_var_name)] = df.iloc[ix][var_ori][m.start():m.end()]
    return df
df = pd_create_substring_var(df,"t",finds,var_ori="x")
df
x x1 t
0 var_m500_0_somevartext 4 m500_0
1 var_m500_0_vartextagain 5 m500_0
2 varwithsomeothertext_0_500 6 0_500
3 varwithsomext_m150_0_text 8 m150_0
Does this accomplish what you need?
finds = ["m500_0", "0_500", "m150_0"]
df["t"] = df["x"].str.extract(f"({'|'.join(finds)})")
Use pandas str.findall:
df['x'].str.findall("|".join(finds))
0 [m500_0]
1 [m500_0]
2 [0_500]
3 [m150_0]
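If a single value per row is wanted rather than a list, the first match can be pulled out with .str[0]; this assumes at most one of the substrings occurs per row:
df['t'] = df['x'].str.findall("|".join(finds)).str[0]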
Probably not the best way:
df['t'] = df['x'].apply(lambda x: ''.join([i for i in finds if i in x]))
And now:
print(df)
Is:
x x1 t
0 var_m500_0_somevartext 4 m500_0
1 var_m500_0_vartextagain 5 m500_0
2 varwithsomeothertext_0_500 6 0_500
3 varwithsomext_m150_0_text 8 m150_0
And now, just adding to #pythonjokeun's answer, you can do:
df["t"] = df["x"].str.extract("(%s)" % '|'.join(finds))
Or:
df["t"] = df["x"].str.extract("({})".format('|'.join(finds)))
Or:
df["t"] = df["x"].str.extract("(" + '|'.join(finds) + ")")
I don't know how large your dataset is, but you can use the map function like below:
import pandas

def subset_df_test():
    df = pandas.DataFrame({'x': ["var_m500_0_somevartext", "var_m500_0_vartextagain",
                                 "varwithsomeothertext_0_500", "varwithsomext_m150_0_text"],
                           'x1': [4, 5, 6, 8]})
    finds = ["m500_0", "0_500", "m150_0"]
    df['t'] = df['x'].map(lambda x: compare(x, finds))
    print(df)

def compare(x, finds):
    for f in finds:
        if f in x:
            return f
Try this
df["t"] = df["x"].apply(lambda x: [i for i in finds if i in x][0])

create new column from conditional statement without mask pandas

I am looking for a better way to do the following:
A
TRDNumber
ALB2008081610 430
ALB200808167 0
ALB200808168 190
Creating a new column based on the value in another column using a conditional statement
A B
TRDNumber
ALB2008081610 430 z
ALB200808167 0 x
ALB200808168 190 y
The following code works but I know that there must be a better way to do this.
mask = df['A'] == 0
df20 = df[mask]
df20['B'] = 'x'
df20
mask2 = ((df.A != 0) & (df.B <= 200) )
df21 = df[mask2]
df21['B'] = 'y'
df21
pieces = [df20,df21]
pd.concat(pieces)
I think you want to do the following:
#%%
df = pd.DataFrame()
df['A'] = pd.Series([430,0,190], index=['ALB2008081610', 'ALB200808167', 'ALB200808168'])
print(df)
#%%
df['B'] = None
print(df)
#%%
df.loc[(df.A==0), 'B'] = 'x'
print(df)
#%%
df.loc[(df.A!=0) & (df.A<=200), 'B'] = 'y'
print(df)
An explanation about indexing can be found here: http://pandas.pydata.org/pandas-docs/stable/indexing.html
Tip for next time: provide the code for creating the dataframe. Then we can directly play around with the same dataframe you are using.
You can create a function and apply it to your dataset:
>>> def foo(x):
...     if x['A'] == 0:
...         return 'x'
...     elif x['A'] < 200:
...         return 'y'
...     else:
...         return 'z'
...
>>> df['B'] = df.apply(foo, axis=1)
>>> df
A B
TRDNumber
ALB2008081610 430 z
ALB200808167 0 x
ALB200808168 190 y
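As a hedged alternative to the row-wise apply, numpy's np.select expresses the same if/elif/else chain in vectorized form; this reuses the df built in the answer above and mirrors the conditions in foo:
import numpy as np

conditions = [df['A'] == 0, df['A'] < 200]
choices = ['x', 'y']
df['B'] = np.select(conditions, choices, default='z')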

Pandas: Check if row exists with certain values

I have a two dimensional (or more) pandas DataFrame like this:
>>> import pandas as pd
>>> df = pd.DataFrame([[0,1],[2,3],[4,5]], columns=['A', 'B'])
>>> df
A B
0 0 1
1 2 3
2 4 5
Now suppose I have a numpy array like np.array([2,3]) and want to check if there is any row in df that matches the contents of my array. Here the answer should obviously be true, but e.g. np.array([1,2]) should return false, as there is no row with both 1 in column A and 2 in column B.
Surely this is easy, but I don't see it right now.
Turns out it is really easy, the following does the job here:
>>> ((df['A'] == 2) & (df['B'] == 3)).any()
True
>>> ((df['A'] == 1) & (df['B'] == 2)).any()
False
Maybe somebody comes up with a better solution which allows directly passing in the array and the list of columns to match.
Note that the parentheses around df['A'] == 2 are not optional, since the & operator binds more tightly than the == operator.
An easier way is:
a = np.array([2,3])
(df == a).all(1).any()
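To get closer to what the asker wished for, here is a sketch that restricts the comparison to a chosen list of columns; cols and a are illustrative assumptions, not part of the answer above:
import numpy as np

cols = ['A', 'B']
a = np.array([2, 3])
(df[cols].to_numpy() == a).all(axis=1).any()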
If you also want to return the index where the matches occurred:
index_list = df[(df['A'] == 2)&(df['B'] == 3)].index.tolist()
To find rows where a single column equals a certain value:
df[df['column name'] == value]
To find rows where multiple columns equal different values, note the parentheses around each comparison:
df[(df["Col1"] == Value1) & (df["Col2"] == Value2) & ....]
A simple solution with a dictionary:
def check_existance(dict_of_values, df):
    v = df.iloc[:, 0] == df.iloc[:, 0]
    for key, value in dict_of_values.items():
        v &= (df[key] == value)
    return v.any()
import pandas as pd
df = pd.DataFrame([[0,1],[2,3],[4,5]], columns=['A', 'B'])
this_row_exists = {'A':2, 'B':3}
check_existance(this_row_exists, df)
# True
this_row_does_not_exist = {'A':2, 'B':5}
check_existance(this_row_does_not_exist, df)
# False
An answer that works with larger dataframes, so you don't need to manually check each column:
import pandas as pd
import numpy as np
#define variables
df = pd.DataFrame([[0,1],[2,3],[4,5]], columns=['A', 'B'])
a = np.array([2,3])
def check_if_np_array_is_in_df(df, a):
    # transform a into a dataframe
    da = pd.DataFrame(np.expand_dims(a, axis=0), columns=['A','B'])
    # drop duplicates from df
    ddf = df.drop_duplicates()
    result = pd.concat([ddf, da]).shape[0] - pd.concat([ddf, da]).drop_duplicates().shape[0]
    return result
print(check_if_np_array_is_in_df(df, a))
print(check_if_np_array_is_in_df(df, [1,3]))
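Since the function returns a row-count difference rather than a boolean (1 if the row is present, 0 if not), it can be wrapped in bool() if a True/False value is preferred; a small usage sketch:
print(bool(check_if_np_array_is_in_df(df, a)))        # True
print(bool(check_if_np_array_is_in_df(df, [1, 3])))   # False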
If you want to return the row where the matches occurred:
resulting_row = df[(df['A'] == 2)&(df['B'] == 3)].values
