I have a two-dimensional (or more) pandas DataFrame like this:
>>> import pandas as pd
>>> df = pd.DataFrame([[0,1],[2,3],[4,5]], columns=['A', 'B'])
>>> df
A B
0 0 1
1 2 3
2 4 5
Now suppose I have a numpy array like np.array([2,3]) and want to check whether any row in df matches the contents of my array. Here the answer should obviously be True, but e.g. np.array([1,2]) should return False, as there is no row with both 1 in column A and 2 in column B.
Surely this is easy, but I don't see it right now.
Turns out it is really easy, the following does the job here:
>>> ((df['A'] == 2) & (df['B'] == 3)).any()
True
>>> ((df['A'] == 1) & (df['B'] == 2)).any()
False
Maybe somebody comes up with a better solution which allows directly passing in the array and the list of columns to match.
Note that the parentheses around df['A'] == 2 are not optional, since the & operator binds more tightly than the == operator.
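As a quick illustration of that precedence point, a minimal sketch (not part of the original answer):
import pandas as pd
df = pd.DataFrame([[0,1],[2,3],[4,5]], columns=['A', 'B'])
try:
    df['A'] == 2 & df['B'] == 3   # parsed as df['A'] == (2 & df['B']) == 3
except ValueError as err:
    print(err)                    # truth value of a Series is ambiguous
print(((df['A'] == 2) & (df['B'] == 3)).any())   # True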
An easier way is:
a = np.array([2,3])
(df == a).all(1).any()
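Building on that, here is a sketch of a small helper along the lines asked for above, taking the array and the list of columns directly; the name row_exists is made up for illustration:
import numpy as np
import pandas as pd

def row_exists(df, values, columns=None):
    # compare the chosen columns against the values row-wise
    columns = list(df.columns) if columns is None else columns
    return (df[columns] == np.asarray(values)).all(axis=1).any()

df = pd.DataFrame([[0,1],[2,3],[4,5]], columns=['A', 'B'])
print(row_exists(df, [2, 3]))              # True
print(row_exists(df, [1, 2]))              # False
print(row_exists(df, [3], columns=['B']))  # True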
If you also want to return the index where the matches occurred:
index_list = df[(df['A'] == 2) & (df['B'] == 3)].index.tolist()
To find rows where a single column equals a certain value:
df[df['column name'] == value]
To find rows where multiple columns equal different values, note that each comparison needs its own parentheses:
df[(df["Col1"] == Value1) & (df["Col2"] == Value2) & ....]
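For a concrete, runnable instance of that pattern (the column names and values here are placeholders):
import pandas as pd

df = pd.DataFrame({'Col1': [0, 2, 4], 'Col2': [1, 3, 5]})
# each comparison gets its own parentheses before combining with &
matches = df[(df['Col1'] == 2) & (df['Col2'] == 3)]
print(matches)            # the matching row(s)
print(not matches.empty)  # True if at least one row matched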
A simple solution with a dictionary:
def check_existance(dict_of_values, df):
    # start from an all-True mask (each value equals itself; note that a NaN
    # in the first column would make that row False)
    v = df.iloc[:, 0] == df.iloc[:, 0]
    for key, value in dict_of_values.items():
        v &= (df[key] == value)
    return v.any()
import pandas as pd
df = pd.DataFrame([[0,1],[2,3],[4,5]], columns=['A', 'B'])
this_row_exists = {'A':2, 'B':3}
check_existance(this_row_exists, df)
# True
this_row_does_not_exist = {'A':2, 'B':5}
check_existance(this_row_does_not_exist, df)
# False
An answer that works with larger dataframes, so you don't need to check each column manually:
import pandas as pd
import numpy as np
#define variables
df = pd.DataFrame([[0,1],[2,3],[4,5]], columns=['A', 'B'])
a = np.array([2,3])
def check_if_np_array_is_in_df(df, a):
    # transform a into a one-row dataframe
    da = pd.DataFrame(np.expand_dims(a, axis=0), columns=['A', 'B'])
    # drop duplicates from df
    ddf = df.drop_duplicates()
    # if dropping duplicates after concatenation removes a row, a was already in df
    merged = pd.concat([ddf, da])
    result = merged.shape[0] - merged.drop_duplicates().shape[0]
    return result
print(check_if_np_array_is_in_df(df, a))
print(check_if_np_array_is_in_df(df, [1,3]))
If you want to return the row where the matches occurred:
resulting_row = df[(df['A'] == 2) & (df['B'] == 3)].values
I'm trying to build a sheet where I have a new column ('column x').
This column would be populated by scanning over three already existing columns (a, b, c).
If a given value is found in any of those columns for the indexed row, the new column will read 'Fail'; else it will read 'Pass'.
When I try this while scanning a single column, my code works.
example:
df["Column x"] = df["Column a"].apply(lambda val: "Fail" if val == 'T' else "Pass")
When I try it on more than one column it fails, no matter how I adjust it.
df['Column x'] = df['Column a'].any(lambda val: 'Fail' if val == 0 else 'Pass') or df['Column b'].apply(lambda val: 'Fail' if val == 'False' else 'Pass')
Any advice is incredibly helpful.
Here is an all-pandas approach:
import pandas as pd
df = pd.DataFrame({
'col1':['A','B','C'],
'col2':['E','F','G'],
'col3':['H','I','J']}
)
df["C"] = pd.Series(
df['col1'].apply(lambda a: 0 if a == 'C' else 1) &
df['col2'].apply(lambda a: 0 if a == 'C' else 1) &
df['col3'].apply(lambda a: 0 if a == 'C' else 1)
).apply(lambda x: "Pass" if x else 'Fail')
print(df)
output
col1 col2 col3 C
0 A E H Pass
1 B F I Pass
2 C G J Fail
Handling an empty data frame:
if df.empty:
    print('DataFrame is empty!')
Check the code below with np.where, checking whether 'J' is present in any column: if so, 'Fail', else 'Pass'.
import pandas as pd
import numpy as np
df = pd.DataFrame({'col1':['A','B','C'],'col2':['E','F','G'],'col3':['H','I','J']})
df['column_x'] = np.where(((df['col1']=='J')|(df['col2']=='J')|(df['col3']=='J')),'Fail','Pass')
df
Output:
  col1 col2 col3 column_x
0    A    E    H     Pass
1    B    F    I     Pass
2    C    G    J     Fail
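If the number of columns grows, spelling out each comparison gets tedious; here is a sketch of the same idea using eq/any across all scanned columns (assuming every listed column should be checked against 'J'):
import numpy as np
import pandas as pd

df = pd.DataFrame({'col1':['A','B','C'],'col2':['E','F','G'],'col3':['H','I','J']})
cols = ['col1', 'col2', 'col3']
# eq('J') compares every cell at once; any(axis=1) flags rows with at least one hit
df['column_x'] = np.where(df[cols].eq('J').any(axis=1), 'Fail', 'Pass')
print(df)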
Say I have the following DataFrame:
df = pd.DataFrame(np.arange(10).reshape(5,2),columns=list('AB'))
A B
0 0 1
1 2 3
2 4 5
3 6 7
4 8 9
And I wish to output each column header followed by the column concatenated as a string like so:
'''A
02468
B
13579'''
I can do this with a for loop:
for col in df.columns:
    print(col, df[col].astype(str).str.cat(), sep='\n')
but I have a large number of columns - is there a more efficient way to do this?
Try converting the columns to str with astype, joining them together, then taking advantage of to_csv's ability to create formatted data, setting the separator to newlines and excluding the header:
import numpy as np
import pandas as pd
df = pd.DataFrame(np.arange(10).reshape(5, 2), columns=list('AB'))
s = df.astype(str).apply(''.join).to_csv(sep='\n', header=False)
print(s)
s:
A
02468
B
13579
I was interested in the timings so I made a perfplot:
import numpy as np
import pandas as pd
import perfplot
def make_data(n):
    if n // 2 == 0:
        return pd.DataFrame(columns=list('AB'))
    df = pd.DataFrame(np.arange(n).reshape(n // 2, 2), columns=list('AB'))
    return df
def for_option(df):
    s = ''
    for k, v in df.astype(str).to_dict('list').items():
        s += f"{k}\n{''.join(v)}\n"
    return s

def apply_option_to_csv(df):
    s = df.astype(str).apply(''.join).to_csv(sep='\n', header=False)
    return s

def apply_option_for(df):
    s = ''
    for k, v in zip(df.columns, df.astype(str).apply(''.join)):
        s += f"{k}\n{v}\n"
    return s
if __name__ == '__main__':
    out = perfplot.bench(
        setup=make_data,
        kernels=[
            for_option,
            apply_option_to_csv,
            apply_option_for
        ],
        labels=['for option', 'apply option (to csv)', 'apply option (for)'],
        n_range=[2 ** k for k in range(25)],
        equality_check=None
    )
    out.save('res.png', transparent=False)
It appears to_csv has some overhead which makes it overall less efficient than the other options. Comparing apply(''.join) with to_dict('list').items() plus joining each value, they behave similarly at larger sizes, but Scott Boston's solution is significantly faster for smaller frames.
Try this:
for k, v in df.astype(str).to_dict('list').items():
    print(k)
    print(''.join(v))
It may be faster than using df.apply; you'll have to test with your dataframe.
The dataframes are like below, where I want to change a dataframe's value to 'dead' if the age is more than 100.
import pandas as pd
raw_data = {'age1': [23,45,210],'age2': [10,20,150],'name': ['a','b','c']}
df = pd.DataFrame(raw_data, columns = ['age1','age2','name'])
raw_data = {'age1': [80,90,110],'age2': [70,120,90],'name': ['a','b','c']}
df2 = pd.DataFrame(raw_data, columns = ['age1','age2','name'])
Desired outcome
df=
age1 age2 name
0 23 10 a
1 45 20 b
2 dead dead c
df2=
age1 age2 name
0 80 70 a
1 90 dead b
2 dead 90 c
I was trying something like this:
col_list=['age1','age2']
df_list=[df,df2]
def dead(df):
    for df in df_list:
        if df.columns in col_list:
            if df.columns >= 100:
                return 'dead'
            else:
                return df.columns
df.apply(dead)
Error shown:
The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
I am looking for a loop that works on all dataframes.
Please correct my function also for future learning :)
With your shown samples, please try the following, using the filter and np.where functions of pandas and numpy respectively.
import numpy as np

c = df.filter(regex=r'age\d+').columns
df[c] = np.where(df[c].ge(100),'dead',df[c])
df
Alternative approach with where:
c=df.filter(like='age').columns
df[c] = df[c].where(~df[c].ge(100), 'dead')
Explanation:
Getting columns which has same name like age in c variable.
Then using np.where to check whether the respective (all age) columns are greater than or equal to 100; if yes, set the value to 'dead', otherwise keep it as it is.
I did the following:
col_list=['age1','age2']
df_list=[df,df2]
for d in df_list:
    for c in col_list:
        d.loc[d[c] > 100, c] = 'dead'
#inspired by #jib and #ravinder
import numpy as np

col_list = ['age1', 'age2']
df_list = [df, df2]
for d in df_list:
    for c in col_list:
        d[c] = np.where(d[c] > 100, 'dead', d[c])
df #or df2
output:
age1 age2 name
0 23 10 a
1 45 20 b
2 dead dead c
One possible solution is to use Pandas' mask, which is similar to if-else, but vectorized.
def dead(df):
    col_list = ['age1', 'age2']
    df = df.copy()
    temporary = df.filter(col_list)
    temporary = temporary.mask(temporary >= 100, "dead")
    df.loc[:, col_list] = temporary
    return df
Apply function to the dataframe:
df.pipe(dead)
age1 age2 name
0 23 10 a
1 45 20 b
2 dead dead c
You can do:
def check_more_than_100(x):
    v = None
    try:
        v = int(x)
    except:
        pass
    if v is not None:
        return (v > 100)
    return (False)
df['age1'] = df['age1'].apply(lambda x : 'dead' if check_more_than_100(x) else x)
df['age2'] = df['age2'].apply(lambda x : 'dead' if check_more_than_100(x) else x)
df2['age1'] = df2['age1'].apply(lambda x : 'dead' if check_more_than_100(x) else x)
df2['age2'] = df2['age2'].apply(lambda x : 'dead' if check_more_than_100(x) else x)
This should take care of non-int values if any.
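A vectorized way to get the same tolerance for non-numeric values, sketched with pd.to_numeric (errors='coerce' turns anything non-numeric into NaN, and NaN > 100 is False, so such cells are left untouched); the helper name mark_dead is just for illustration:
import pandas as pd

def mark_dead(df, cols):
    for col in cols:
        numeric = pd.to_numeric(df[col], errors='coerce')
        df[col] = df[col].mask(numeric > 100, 'dead')
    return df

raw_data = {'age1': [23,45,210],'age2': [10,20,150],'name': ['a','b','c']}
df = pd.DataFrame(raw_data)
print(mark_dead(df, ['age1', 'age2']))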
I used this answer to a similar question. Basically you can use the .where() function from numpy to set based on the conditional.
import pandas as pd
import numpy as np
raw_data = {'age1': [23,45,210],'age2': [10,20,150],'name': ['a','b','c']}
df = pd.DataFrame(raw_data, columns = ['age1','age2','name'])
raw_data = {'age1': [80,90,110],'age2': [70,120,90],'name': ['a','b','c']}
df2 = pd.DataFrame(raw_data, columns = ['age1','age2','name'])
col_list=['age1','age2']
df_list=[df,df2]
def dead(df_list, col_list):
    for df in df_list:
        for col in col_list:
            df[col] = np.where(df[col] >= 100, "dead", df[col])
    return df_list
df
dead([df], col_list)
Extracting numeric columns and then using numpy where -
import numpy as np

df_cols = df._get_numeric_data().columns.values
df2_cols = df2._get_numeric_data().columns.values
df[df_cols] = np.where(df[df_cols].to_numpy() > 100, 'dead', df[df_cols])
df2[df2_cols] = np.where(df2[df2_cols].to_numpy() > 100, 'dead', df2[df2_cols])
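Note that _get_numeric_data is a private pandas method; here is a sketch of the same idea using the public select_dtypes instead:
import numpy as np
import pandas as pd

raw_data = {'age1': [23,45,210],'age2': [10,20,150],'name': ['a','b','c']}
df = pd.DataFrame(raw_data)
# pick the numeric columns via the public API, then apply the same np.where rule
num_cols = df.select_dtypes(include='number').columns
df[num_cols] = np.where(df[num_cols].to_numpy() > 100, 'dead', df[num_cols])
print(df)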
I have a pandas dataframe which looks like this:
A B
x 5.9027.5276
y 656.344872.0
z 78.954.23
What I want is to replace the string entries in column B with floats built from the first four digits of each entry, with the decimal point placed after the first digit.
Therefore, I wrote the following code:
for entry in df['B']:
    entry = re.search(r'((\d\.?){1,4})', entry).group().replace(".", "")
    df['B'] = entry[:1] + '.' + entry[1:]
df['B'] = df['B'].astype(float)
It almost does what I want but it replaces all the entries in B with the float value of the first row. Instead, I would like to replace the entries with the according float value of each row.
How could I do this?
Thanks a lot!
You can use the relevant pandas string functions:
df['B'] = df['B'].str.extract(r'((\d\.?){1,4})')[0].str.replace(r'\.', '', regex=True)
df['B'] = df['B'].str[:1] + '.' + df['B'].str[1:]
df['B'] = df['B'].astype(float)
print(df)
A B
0 x 5.902
1 y 6.563
2 z 7.895
You might wrap your operation in a function and then use .apply, i.e.:
import re
import pandas as pd
df = pd.DataFrame({'A':['x','y','z'],'B':['5.9027.5276','656.344872.0','78.954.23']})
def func(entry):
    entry = re.search(r'((\d\.?){1,4})', entry).group().replace(".", "")
    return entry[:1] + '.' + entry[1:]
df['B'] = df['B'].apply(func)
df['B'] = df['B'].astype(float)
print(df)
output:
A B
0 x 5.902
1 y 6.563
2 z 7.895
Suppose I have a structured dataframe as follows:
df = pd.DataFrame({"A":['a','a','a','b','b'],
"B":[1]*5})
The A column has previously been sorted. I wish to find the first row index where df.A != 'a'. The end goal is to use this index to break the data frame into groups based on A.
Now I realise that there is a groupby functionality. However, the dataframe is quite large and this is a simplified toy example. Since A has been sorted already, it would be faster if I could just find the first index where df.A != 'a'. Therefore it is important that, whatever method you use, the scanning stops once the first element is found.
idxmax and argmax will return the position of the maximal value or the first position if the maximal value occurs more than once.
use idxmax on df.A.ne('a')
df.A.ne('a').idxmax()
3
or the numpy equivalent
(df.A.values != 'a').argmax()
3
However, if A has already been sorted, then we can use searchsorted
df.A.searchsorted('a', side='right')
array([3])
Or the numpy equivalent
df.A.values.searchsorted('a', side='right')
3
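One caveat about the idxmax/argmax approach, sketched below: when no row satisfies the condition, they still return the first position, so a guard with .any() is needed if a "no match" case is possible:
import pandas as pd

df = pd.DataFrame({"A":['a','a','a'],"B":[1]*3})
mask = df.A.ne('a')
print(mask.idxmax())                           # 0, even though nothing matched
print(mask.any())                              # False
print(mask.idxmax() if mask.any() else None)   # None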
I found there is a first_valid_index function for Pandas DataFrames that will do the job; one could use it as follows:
df[df.A!='a'].first_valid_index()
3
However, this function seems to be very slow. Even taking the first index of the filtered dataframe is faster:
df.loc[df.A!='a','A'].index[0]
Below I compare the total time (in seconds) of repeating the calculations 100 times for these two options and all the code above:
total_time_sec ratio wrt fastest algo
searchsorted numpy: 0.0007 1.00
argmax numpy: 0.0009 1.29
for loop: 0.0045 6.43
searchsorted pandas: 0.0075 10.71
idxmax pandas: 0.0267 38.14
index[0]: 0.0295 42.14
first_valid_index pandas: 0.1181 168.71
Notice numpy's searchsorted is the winner and first_valid_index shows the worst performance. Generally, numpy algorithms are faster, and the for loop does not do so badly, but that is only because the dataframe has very few entries.
For a dataframe with 10,000 entries where the desired entries are closer to the end the results are different, with searchsorted delivering the best performance:
total_time_sec ratio wrt fastest algo
searchsorted numpy: 0.0007 1.00
searchsorted pandas: 0.0076 10.86
argmax numpy: 0.0117 16.71
index[0]: 0.0815 116.43
idxmax pandas: 0.0904 129.14
first_valid_index pandas: 0.1691 241.57
for loop: 9.6504 13786.29
The code to produce these results is below:
import timeit
# code snippet to be executed only once
mysetup = '''import pandas as pd
import numpy as np
df = pd.DataFrame({"A":['a','a','a','b','b'],"B":[1]*5})
'''
# code snippets whose execution time is to be measured
mycode_set = ['''
df[df.A!='a'].first_valid_index()
''']
message = ["first_valid_index pandas:"]
mycode_set.append( '''df.loc[df.A!='a','A'].index[0]''')
message.append("index[0]: ")
mycode_set.append( '''df.A.ne('a').idxmax()''')
message.append("idxmax pandas: ")
mycode_set.append( '''(df.A.values != 'a').argmax()''')
message.append("argmax numpy: ")
mycode_set.append( '''df.A.searchsorted('a', side='right')''')
message.append("searchsorted pandas: ")
mycode_set.append( '''df.A.values.searchsorted('a', side='right')''' )
message.append("searchsorted numpy: ")
mycode_set.append( '''for index in range(len(df['A'])):
    if df['A'][index] != 'a':
        ans = index
        break
''')
message.append("for loop: ")
total_time_in_sec = []
for i in range(len(mycode_set)):
    mycode = mycode_set[i]
    total_time_in_sec.append(np.round(timeit.timeit(setup = mysetup,\
        stmt = mycode, number = 100),4))
output = pd.DataFrame(total_time_in_sec, index = message, \
columns = ['total_time_sec' ])
output["ratio wrt fastest algo"] = \
np.round(output.total_time_sec/output["total_time_sec"].min(),2)
output = output.sort_values(by = "total_time_sec")
display(output)
For the larger dataframe:
mysetup = '''import pandas as pd
import numpy as np
n = 10000
lt = ['a' for _ in range(n)]
b = ['b' for _ in range(5)]
lt[-5:] = b
df = pd.DataFrame({"A":lt,"B":[1]*n})
'''
Use pandas groupby() to group by a column or list of columns, then first() to get the first value in each group.
import pandas as pd
df = pd.DataFrame({"A":['a','a','a','b','b'],
"B":[1]*5})
#Group df by column and get the first value in each group
grouped_df = df.groupby("A").first()
#Reset indices to match format
first_values = grouped_df.reset_index()
print(first_values)
>>> A B
0 a 1
1 b 1
For multiple conditions:
Let's say we have:
s = pd.Series(['a', 'a', 'c', 'c', 'b', 'd'])
And if we want to find the first item different from 'a' and 'c', we do:
n = np.logical_and(s.values != 'a', s.values != 'c').argmax()
Times:
import numpy as np
import pandas as pd
from datetime import datetime
ITERS = 1000
def pandas_multi_condition(s):
    ts = datetime.now()
    for i in range(ITERS):
        n = s[(s != 'a') & (s != 'c')].index[0]
    print(n)
    print(datetime.now() - ts)
def numpy_bitwise_and(s):
    ts = datetime.now()
    for i in range(ITERS):
        n = np.logical_and(s.values != 'a', s.values != 'c').argmax()
    print(n)
    print(datetime.now() - ts)
s = pd.Series(['a', 'a', 'c', 'c', 'b', 'd'])
print('pandas_multi_condition():')
pandas_multi_condition(s)
print()
print('numpy_bitwise_and():')
numpy_bitwise_and(s)
Output:
pandas_multi_condition():
4
0:00:01.144767
numpy_bitwise_and():
4
0:00:00.019013
If you just want to find the first instance without going through the entire dataframe, you can go the for-loop way.
df = pd.DataFrame({"A":['a','a','a','b','b'],"B":[1]*5})
for index in range(len(df['A'])):
    if df['A'][index] != 'a':
        print(index)
        break
The printed index is the row number of the first row where df.A != 'a'.
You can iterate over dataframe rows (it is slow) and create your own logic to get the values you want:
def getMaxIndex(df, col):
    max_val = -999999
    rtn_index = 0
    for index, row in df.iterrows():
        if row[col] > max_val:
            max_val = row[col]
            rtn_index = index
    return rtn_index
Generalized Form:
index = df.loc[df.column_name == 'value_you_looking_for'].index[0]
Example:
index_of_interest = df.loc[df.A == 'a'].index[0]
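If the value may be absent, index[0] raises an IndexError; here is a sketch of a variant that handles the empty case:
import pandas as pd

df = pd.DataFrame({"A":['a','a','a','b','b'],"B":[1]*5})
matches = df.index[df.A == 'b']
index_of_interest = matches[0] if len(matches) else None
print(index_of_interest)  # 3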