Compare if two Python tables are tibble-equivalent - python

I want to write a function to check whether two tables are tibble-equivalent (identical variables and observations). For example, the first two tables below are equivalent; the third one isn't.
Table 1:
a  b  c
x  1  hat
y  2  cat
z  3  bat
w  4  rat

Table 2 (equivalent to Table 1):
b  c    a
2  cat  y
3  bat  z
1  hat  x
4  rat  w

Table 3 (not equivalent):
a  b  c
2  y  cat
3  z  bat
1  x  hat
4  w  rat
I decided to solve this by comparing the max values. How do I properly refer to the first, second, etc. column and compare the max value of each one?
def equal(A, B):
    A_names = sorted(A.columns)
    X = A[A_names].copy()    # original had A[var_names]; var_names is not defined
    B_names = sorted(B.columns)
    Y = B[B_names].copy()
    # A[0] looks up a column *labeled* 0, not the first column by position
    if A[0].max() == B[0].max() and A[1].max() == B[1].max():
        return True
    else:
        return False
This raises KeyError: 0.
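As an aside, A[0] is a label lookup: pandas looks for a column literally named 0, which is why it raises KeyError. Positional access goes through iloc, for example:

# select the first column by position and compare the maxima
A.iloc[:, 0].max() == B.iloc[:, 0].max()

Comparing maxima alone won't establish equivalence, though; the answers below take a more robust approach.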

This task can be solved with DataFrame's equals method plus some preprocessing:
def compare_dataframes(df1, df2):
    df1_cols = df1.columns.tolist()
    df2_cols = df2.columns.tolist()
    # column names and shapes should be equal for both dataframes
    if set(df1_cols).symmetric_difference(set(df2_cols)) or (df1.shape != df2.shape):
        return False
    # sort rows by the same column list so row order no longer matters
    df1_sorted = df1.sort_values(by=df1_cols).reset_index(drop=True)
    df2_sorted = df2.sort_values(by=df1_cols).reset_index(drop=True)
    # align column order before the final comparison
    df2_sorted = df2_sorted[df1_sorted.columns]
    return df1_sorted.equals(df2_sorted)

Another approach: normalize the column order and row order of both frames, then compare:
def equal(A, B):
    A_var_names = sorted(A.columns)
    AA = A[A_var_names].copy()                                    # normalize column order
    AA = AA.sort_values(by=A_var_names).reset_index(drop=True)    # normalize row order
    B_var_names = sorted(B.columns)
    BB = B[B_var_names].copy()
    BB = BB.sort_values(by=B_var_names).reset_index(drop=True)
    return AA.equals(BB)    # equals also compares the index, hence the reset
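A quick usage sketch against the three tables from the question (the t1/t2/t3 names are mine; either function above works the same way):

import pandas as pd

t1 = pd.DataFrame({'a': ['x', 'y', 'z', 'w'], 'b': [1, 2, 3, 4], 'c': ['hat', 'cat', 'bat', 'rat']})
t2 = pd.DataFrame({'b': [2, 3, 1, 4], 'c': ['cat', 'bat', 'hat', 'rat'], 'a': ['y', 'z', 'x', 'w']})
t3 = pd.DataFrame({'a': [2, 3, 1, 4], 'b': ['y', 'z', 'x', 'w'], 'c': ['cat', 'bat', 'hat', 'rat']})

print(compare_dataframes(t1, t2))  # True  - same variables and observations
print(compare_dataframes(t1, t3))  # False - values live in different columns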

Related

Fuzzy match in a column in a dataframe in python

I have a column of strings. I want to do a fuzzy match and mark, in a column next to it, the rows that have an 80% match with another row. I can do this with the following code on a smaller dataset, but my original dataset is too big for it to run efficiently. Is there a better way to do this? This is what I have done:
import pandas as pd
from fuzzywuzzy import fuzz

l = [[1, 'a', 'b', 'c', 'help pls'],
     [2, 'a', 'c', 'c', 'yooo'],
     [3, 'a', 'c', 'c', 'you will not pass'],
     [4, 'a', 'b', 'b', 'You shall not pass'],
     [5, 'a', 'c', 'c', 'You shall not pass!']]
df = pd.DataFrame(l, columns=['Serial No', 'one', 'two', 'three', 'four'])
df['yes/no 2'] = ""
for i in range(0, df.shape[0]):
    for j in range(0, df.shape[0]):
        if i != j:
            # compare column 'four' of row i against row j
            if fuzz.token_sort_ratio(df.iloc[i, df.shape[1] - 2], df.iloc[j, df.shape[1] - 2]) > 80:
                df.iloc[i, df.shape[1] - 1] = "yes"
import pandas as pd
from fuzzywuzzy import fuzz

l = [[1, 'a', 'b', 'c', 'help pls'],
     [2, 'a', 'c', 'c', 'yooo'],
     [3, 'a', 'c', 'c', 'you will not pass'],
     [4, 'a', 'b', 'b', 'You shall not pass'],
     [5, 'a', 'c', 'c', 'You shall not pass!']]
df = pd.DataFrame(l, columns=['Serial No', 'one', 'two', 'three', 'four'])

def match(row):
    thresh = 80
    return fuzz.token_sort_ratio(row["two"], row["three"]) > thresh

df["Yes/No"] = df.apply(match, axis=1)
print(df)
   Serial No one two three                 four  Yes/No
0          1   a   b     c             help pls   False
1          2   a   c     c                 yooo    True
2          3   a   c     c    you will not pass    True
3          4   a   b     b   You shall not pass    True
4          5   a   c     c  You shall not pass!    True
import pandas as pd
from fuzzywuzzy import fuzz, process

l = [[1, 'a', 'b', 'c', 'help pls'],
     [2, 'a', 'c', 'c', 'yooo'],
     [3, 'a', 'c', 'c', 'you will not pass'],
     [4, 'a', 'b', 'b', 'You shall not pass'],
     [5, 'a', 'c', 'c', 'You shall not pass!']]
df = pd.DataFrame(l, columns=['Serial No', 'one', 'two', 'three', 'four']).reset_index()

def match(df, col):
    thresh = 80
    return df[col].apply(
        lambda x: "Yes" if len(process.extractBests(
            x[1],
            [xx[1] for i, xx in enumerate(df[col]) if i != x[0]],
            scorer=fuzz.token_sort_ratio,
            score_cutoff=thresh + 1,
            limit=1)) > 0 else "No")

df["five"] = df.apply(lambda x: (x["index"], x["four"]), axis=1)
df["Yes/No"] = df.pipe(match, "five")
print(df)
   index  Serial No one two three                 four                      five Yes/No
0      0          1   a   b     c             help pls             (0, help pls)     No
1      1          2   a   c     c                 yooo                 (1, yooo)     No
2      2          3   a   c     c    you will not pass    (2, you will not pass)    Yes
3      3          4   a   b     b   You shall not pass   (3, You shall not pass)    Yes
4      4          5   a   c     c  You shall not pass!  (4, You shall not pass!)    Yes

Pandas dataframes equality test

How do I write a function that checks whether two input dataframes are equal, as long as the rows in both dataframes are the same? It should disregard index positions and column order. I can't use df.equals() since it enforces equal data types, which is not what I need.
import pandas as pd
from io import StringIO

canonical_in_csv = """,c,a,b
2,hat,x,1
0,rat,y,4
3,cat,x,2
1,bat,x,2"""
with StringIO(canonical_in_csv) as fp:
    df1 = pd.read_csv(fp, index_col=0)

canonical_soln_csv = """,a,b,c
0,x,1,hat
1,x,2,bat
2,x,2,cat
3,y,4,rat"""
with StringIO(canonical_soln_csv) as fp:
    df2 = pd.read_csv(fp, index_col=0)
df1:
     c  a  b
2  hat  x  1
0  rat  y  4
3  cat  x  2
1  bat  x  2

df2:
   a  b  c
0  x  1  hat
1  x  2  bat
2  x  2  cat
3  y  4  rat
My attempt:
temp1 = (df1 == df2).all()
temp2 = temp1.all()
temp2
ValueError: Can only compare identically-labeled DataFrame objects
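For reference on the equals point from the question: equals returns False when the values match but the dtypes differ, while elementwise comparison does not care. A minimal sketch:

import pandas as pd

a = pd.DataFrame({'x': [1, 2]})      # int64
b = pd.DataFrame({'x': [1.0, 2.0]})  # float64

print(a.equals(b))            # False - equals also compares dtypes
print((a == b).all().all())   # True  - elementwise comparison coerces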
You can sort both frames by index and by column labels first, then merge and compare with eq (==) or equals:
df11 = df1.sort_index().sort_index(axis=1)
df22 = df2.sort_index().sort_index(axis=1)
print (df11.merge(df22))
   a  b    c
0  y  4  rat
1  x  2  bat
2  x  1  hat
3  x  2  cat

print (df11.merge(df22).eq(df11))
      a     b     c
0  True  True  True
1  True  True  True
2  True  True  True
3  True  True  True
a = df11.merge(df22).eq(df11).values.all()
#alternative
#a = df11.merge(df22).equals(df11)
print (a)
True
Your function should be rewritten:
def checkequality(A, B):
    df11 = A.sort_index(axis=1)
    df11 = df11.sort_values(df11.columns.tolist()).reset_index(drop=True)
    df22 = B.sort_index(axis=1)
    df22 = df22.sort_values(df22.columns.tolist()).reset_index(drop=True)
    return (df11 == df22).values.all()
a = checkequality(df1, df2)
print (a)
True
Your request to disregard the row index is difficult to satisfy, because the DataFrame type is not optimized for that kind of operation. The column-order part, fortunately, is easy:
df1.values == df2[df1.columns].values
Here df1.columns syncs the column order and .values converts to NumPy arrays for the comparison. I still recommend against row re-ordering and matching, as that can be very taxing for bigger datasets.
If matching rows by index is acceptable, this may be what you are looking for:
df1.values==df2.reindex(df1.index.values.tolist())[df1.columns].values
Update
As pointed out by @Dark, a cleaner comparison that aligns on df2's labels can be done like this:
df1.loc[df2.index,df2.columns] == df2
I figured it out:
def checkequality(A, B):
    var_names = sorted(A.columns)
    Y = A[var_names].copy()
    Y.sort_values(by=var_names, inplace=True)
    Y.set_index([list(range(0, len(Y)))], inplace=True)     # same effect as reset_index(drop=True)
    var_names2 = sorted(B.columns)
    Y2 = B[var_names2].copy()
    Y2.sort_values(by=var_names2, inplace=True)
    Y2.set_index([list(range(0, len(Y2)))], inplace=True)
    return (Y == Y2).all().all()
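Applied to df1 and df2 from the question, this returns True:

print(checkequality(df1, df2))  # True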

Split DataFrame string column into N splits

I have a df:
a  name
1  a/b/c
2  w/x/y/z
3  q/w/e/r/t
I want to split the name column on '/' to get this output:
a  name       main  sub  leaf
1  a/b/c      a     b    c
2  w/x/y/z    w     x    z
3  q/w/e/r/t  q     w    t
i.e. the first two fields become main and sub respectively, and leaf should be filled with the word after the last slash.
I tried this, but the result was incorrect:
df['name'].str.split('/', expand=True).rename(columns={0:'main',1:'sub',2:'leaf'})
Is there a way to assign the columns correctly?
Use split with assign:
s = df['name'].str.split('/')
df = df.assign(main=s.str[0], sub=s.str[1], leaf=s.str[-1])
print (df)
   a       name leaf main sub
0  1      a/b/c    c    a   b
1  2    w/x/y/z    z    w   x
2  3  q/w/e/r/t    t    q   w
To change the order of the columns:
s = df['name'].str.split('/')
df = df.assign(main=s.str[0], sub=s.str[1], leaf=s.str[-1])
df = df[df.columns[:-3].tolist() + ['main','sub','leaf']]
print (df)
   a       name main sub leaf
0  1      a/b/c    a   b    c
1  2    w/x/y/z    w   x    z
2  3  q/w/e/r/t    q   w    t
Or:
s = df['name'].str.split('/')
df = df.join(pd.DataFrame({'main': s.str[0], 'sub': s.str[1], 'leaf': s.str[-1]},
                          columns=['main', 'sub', 'leaf']))
print (df)
   a       name main sub leaf
0  1      a/b/c    a   b    c
1  2    w/x/y/z    w   x    z
2  3  q/w/e/r/t    q   w    t
Option 1
Use str.split, but don't expand the result; you end up with a column of lists. Then use df.assign to attach the new columns and return a new DataFrame object.
v = df['name'].str.split('/')
df.assign(
main=v.str[ 0],
sub=v.str[ 1],
leaf=v.str[-1]
)
        name leaf main sub
a
1      a/b/c    c    a   b
2    w/x/y/z    z    w   x
3  q/w/e/r/t    t    q   w
Details
This is what v looks like:
a
1          [a, b, c]
2       [w, x, y, z]
3    [q, w, e, r, t]
Name: name, dtype: object
This is actually a lot easier to handle, because the .str accessor gives you direct control over the elements. If you expand the result instead, you have to snap your ragged data into a tabular format, thereby introducing Nones; at that point, finding the i-th or i-th-from-last element becomes a chore.
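To see that concretely, this is roughly what the expanded form looks like for this data (a quick sketch; the padding may display as None or NaN depending on the pandas version):

print(df['name'].str.split('/', expand=True))

   0  1  2     3     4
a
1  a  b  c  None  None
2  w  x  y     z  None
3  q  w  e     r     t

The last element sits in a different column for each row, so s.str[-1] on the unexpanded lists is much more direct.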
Option 2
Using direct assignment (to maintain order) -
df['main'] = v.str[ 0]
df['sub' ] = v.str[ 1]
df['leaf'] = v.str[-1]
df
        name main sub leaf
a
1      a/b/c    a   b    c
2    w/x/y/z    w   x    z
3  q/w/e/r/t    q   w    t
Note that this modifies the original dataframe instead of returning a new one, so it is cheaper. However, it becomes unwieldy with a large number of columns. You might instead consider this alternative, which generalises to many more columns:
for c, i in [('main', 0), ('sub', 1), ('leaf', -1)]:
    df[c] = v.str[i]    # .str[i] picks the i-th element of each list
df
        name main sub leaf
a
1      a/b/c    a   b    c
2    w/x/y/z    w   x    z
3  q/w/e/r/t    q   w    t
Iterate over a list of tuples: the first element in each tuple is the column name, and the second is the index used to pick the element out of v. You still have to assign each column separately, and a loop is a clean way to do that.

Python pandas apply too slow Fuzzy Match

import pandas as pd

def fuzzy_clean(i, dfr, merge_list, key):
    # narrow dfr down to the rows matching row i on every column in merge_list
    for col in range(0, len(merge_list)):
        if col == 0:
            scaled_down = dfr[dfr[merge_list[col]] == i[merge_list[col]]]
        else:
            scaled_down = scaled_down[scaled_down[merge_list[col]] == i[merge_list[col]]]
    if len(scaled_down) > 0:
        if i[key] in scaled_down[key].values.tolist():
            return i[key]
        else:
            # take the key value from the candidate row with the smallest absolute difference
            return pd.to_datetime(scaled_down.loc[(scaled_down[key] - i[key]).abs().idxmin(), key])
    else:
        return i[key]

df[key] = df.apply(lambda i: fuzzy_clean(i, dfr, merge_list, key), axis=1)
I'm trying to merge two dataframes, dfr and df. The issue is that I need to merge on about nine columns, one of which is a timestamp that doesn't quite match up between the two dataframes: sometimes it lags slightly, sometimes it leads. I wrote a function that works in the setup above; however, in practice it is just too slow when running through hundreds of thousands of rows.
merge_list is a list of columns that both dataframes share and that match up 100%.
key is the name of a column, 'timestamp', that both share; it is what doesn't match up well.
Any suggestions for speeding this up would be greatly appreciated!
The data looks like the following:
df:
   timestamp  A  B  C
0        100  x  y  z
1        101  y  i  u
2        102  r  a  e
3        103  q  w  e

dfr:
   timestamp  A  B  C
0     100.01  x  y  z
1     100.99  y  i  u
2     101.05  y  i  u
3     102     r  a  e
4     103.01  q  w  e
5     103.20  q  w  e

I want df to look like the following:
   timestamp  A  B  C
0     100.01  x  y  z
1     100.99  y  i  u
2     102     r  a  e
3     103.01  q  w  e
Adding the final merge for reference:
def fuzzy_merge(df_left, df_right, on, key, how='outer'):
    df_right[key] = df_right.apply(lambda i: fuzzy_clean(i, df_left, on, key), axis=1)
    return pd.merge(df_left, df_right, on=on + [key], how=how, indicator=True).sort_values(key)
I've found a solution that I believe works: pandas has merge_asof. I'm still verifying possible double counting, but it seems to do a decent job:
pd.merge_asof(left_df, right_df, on='timestamp', by=merge_list, direction='nearest')
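A rough sketch of that call on the sample frames above. Note that merge_asof requires both frames to be sorted by the key and the key dtypes to match, so the integer timestamps are cast to float here; the val column is a hypothetical payload added so the merge has something to bring across (merge_asof keeps the left frame's timestamp values):

import pandas as pd

df = pd.DataFrame({'timestamp': [100.0, 101.0, 102.0, 103.0],
                   'A': list('xyrq'), 'B': list('yiaw'), 'C': list('zuee')})
dfr = pd.DataFrame({'timestamp': [100.01, 100.99, 101.05, 102.0, 103.01, 103.20],
                    'A': list('xyyrqq'), 'B': list('yiiaww'), 'C': list('zuueee'),
                    'val': range(6)})  # hypothetical payload column

merged = pd.merge_asof(df.sort_values('timestamp'),
                       dfr.sort_values('timestamp'),
                       on='timestamp', by=['A', 'B', 'C'],
                       direction='nearest')
print(merged)
#    timestamp  A  B  C  val
# 0      100.0  x  y  z    0
# 1      101.0  y  i  u    1
# 2      102.0  r  a  e    3
# 3      103.0  q  w  e    4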

Filter a pandas dataframe using values from a dict

I need to filter a data frame with a dict whose keys are column names and whose values are the values I want to filter on:
filter_v = {'A':1, 'B':0, 'C':'This is right'}
# this would be the normal approach
df[(df['A'] == 1) & (df['B'] ==0)& (df['C'] == 'This is right')]
But I want to do something along the lines of:
for column, value in filter_v.items():
    df[df[column] == value]
but this filters the data frame one condition at a time rather than applying all filters at once. Is there a way to do it programmatically?
EDIT: an example:
df1 = pd.DataFrame({'A':[1,0,1,1, np.nan], 'B':[1,1,1,0,1], 'C':['right','right','wrong','right', 'right'],'D':[1,2,2,3,4]})
filter_v = {'A':1, 'B':0, 'C':'right'}
df1.loc[df1[filter_v.keys()].isin(filter_v.values()).all(axis=1), :]
gives
   A  B      C  D
0  1  1  right  1
1  0  1  right  2
3  1  0  right  3
but the expected result was
   A  B      C  D
3  1  0  right  3
only the last one should be selected.
IIUC, you should be able to do something like this:
>>> df1.loc[(df1[list(filter_v)] == pd.Series(filter_v)).all(axis=1)]
   A  B      C  D
3  1  0  right  3
This works by making a Series to compare against:
>>> pd.Series(filter_v)
A        1
B        0
C    right
dtype: object
Selecting the corresponding part of df1:
>>> df1[list(filter_v)]
     A      C  B
0    1  right  1
1    0  right  1
2    1  wrong  1
3    1  right  0
4  NaN  right  1
Finding where they match:
>>> df1[list(filter_v)] == pd.Series(filter_v)
       A      B      C
0   True  False   True
1  False  False   True
2   True  False  False
3   True   True   True
4  False  False   True
Finding where they all match:
>>> (df1[list(filter_v)] == pd.Series(filter_v)).all(axis=1)
0    False
1    False
2    False
3     True
4    False
dtype: bool
And finally using this to index into df1:
>>> df1.loc[(df1[list(filter_v)] == pd.Series(filter_v)).all(axis=1)]
   A  B      C  D
3  1  0  right  3
An abstraction of the above for the case of passing an array of filter values rather than a single value (analogous to Series.isin()). Using the same example:
df1 = pd.DataFrame({'A': [1, 0, 1, 1, np.nan], 'B': [1, 1, 1, 0, 1],
                    'C': ['right', 'right', 'wrong', 'right', 'right'], 'D': [1, 2, 2, 3, 4]})
filter_v = {'A': [1], 'B': [1, 0], 'C': ['right']}

## Start with an array of all True
ind = [True] * len(df1)

## Loop through the filters, updating the mask
for col, vals in filter_v.items():
    ind = ind & (df1[col].isin(vals))

## Return the filtered dataframe
df1[ind]

## Returns
     A  B      C  D
0  1.0  1  right  1
3  1.0  0  right  3
Here is a way to do it:
df.loc[df[filter_v.keys()].isin(filter_v.values()).all(axis=1), :]
UPDATE:
With values being the same across columns you could then do something like this:
# Create your filtering function:
def filter_dict(df, dic):
    return df[df[dic.keys()].apply(
        lambda x: x.equals(pd.Series(dic.values(), index=x.index, name=x.name)), axis=1)]
# Use it on your DataFrame:
filter_dict(df1, filter_v)
Which yields:
   A  B      C  D
3  1  0  right  3
If it's something you do frequently, you could go as far as patching DataFrame for easy access to this filter:
pd.DataFrame.filter_dict_ = filter_dict
And then use this filter like this:
df1.filter_dict_(filter_v)
Which would yield the same result.
BUT, it is not the right way to do it, clearly.
I would use DSM's approach.
For Python 2, #primer's answer is fine. But be careful in Python 3 because of dict_keys. For instance:
>> df.loc[df[filter_v.keys()].isin(filter_v.values()).all(axis=1), :]
>> TypeError: unhashable type: 'dict_keys'
The correct way in Python 3:
df.loc[df[list(filter_v.keys())].isin(list(filter_v.values())).all(axis=1), :]
Here's another way:
import numpy as np

filterSeries = pd.Series(np.ones(df.shape[0], dtype=bool))
for column, value in filter_v.items():
    filterSeries = ((df[column] == value) & filterSeries)
This gives:
>>> df[filterSeries]
   A  B      C  D
3  1  0  right  3
To follow up on DSM's answer, you can also use any() to turn your query into an OR operation (instead of AND):
df1.loc[(df1[list(filter_v)] == pd.Series(filter_v)).any(axis=1)]
You can also build a query string:
query_string = ' and '.join(
    [f'({key} == "{val}")' if type(val) == str else f'({key} == {val})'
     for key, val in filter_v.items()]
)
df1.query(query_string)
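For the example filter_v = {'A': 1, 'B': 0, 'C': 'right'} from the question, this builds the string below; note that column names containing spaces would need backtick quoting inside query:

print(query_string)
# (A == 1) and (B == 0) and (C == "right")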
Combining previous answers, here's a function you can feed to df1.loc. It allows AND/OR (via how='all'/'any'), plus comparisons other than == via the op keyword, if desired.
import operator

def quick_mask(df, filters, how='all', op=operator.eq) -> pd.Series:
    if how == 'all':
        comb = pd.Series.all
    elif how == 'any':
        comb = pd.Series.any
    return comb(op(df[[*filters]], pd.Series(filters)), axis=1)

# Usage
df1.loc[quick_mask(df1, filter_v)]
I had an issue due to my dictionary having multiple values for the same key.
I was able to change DSM's query to:
df1.loc[df1[list(filter_v)].isin(filter_v).all(axis=1), :]
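A quick sketch with the earlier example data: isin with a dict expects list-like values per key, which is exactly what makes it suit multi-valued filters:

filter_v = {'A': [1], 'B': [1, 0], 'C': ['right']}
df1.loc[df1[list(filter_v)].isin(filter_v).all(axis=1), :]
#      A  B      C  D
# 0  1.0  1  right  1
# 3  1.0  0  right  3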
