How to merge duplicates as new columns - python

I am attempting to merge two dataframes on one column, and I would like duplicate matches to appear as extra columns instead of new rows.
What happens now:
df1 = pd.DataFrame({'A': ['A0'],
'B': ['B0']})
df2 = pd.DataFrame({'A': ['A0', 'A0'],
'C': ['C4', 'C5']})
df1.merge(df2, on = 'A', how = 'left')
Gives the output:
A B C
0 A0 B0 C4
1 A0 B0 C5
What I would like the output to be:
A B C_1 C_2
0 A0 B0 C4 C5
Thanks!

Make the values of column A in df2 unique by building a MultiIndex with DataFrame.set_index and a counter column from GroupBy.cumcount, reshape with DataFrame.unstack, and flatten the resulting MultiIndex columns with map and join:
df2 = df2.set_index(['A', df2.groupby('A').cumcount().add(1).astype(str)]).unstack()
df2.columns = df2.columns.map('_'.join)
df2 = df2.reset_index()
print (df2)
A C_1 C_2
0 A0 C4 C5
df = df1.merge(df2, on = 'A', how = 'left')
print (df)
A B C_1 C_2
0 A0 B0 C4 C5

In one line of code:
# Number the duplicates inside each 'A' group (1, 2, ...), pivot those numbers
# into columns C_1, C_2, ... and merge the result onto df1.
# values='C' keeps the pivoted columns single-level: without it the columns
# are a MultiIndex such as ('C', 0), and pandas warns about (1.x) or refuses
# (2.x, MergeError) merging frames whose columns have different level counts.
# cumcount() restarts for every value of 'A', so this also works when df2
# holds more than one key.
df1.merge(
    df2.assign(Cs=df2.groupby('A').cumcount().add(1))
    .pivot(index='A', columns='Cs', values='C')
    .add_prefix('C_')
    .reset_index(),
    on='A',
)
# A B C_1 C_2
# 0 A0 B0 C4 C5

Related

Pandas: Comparing 2 dataframes without iterating

Considering I have 2 dataframes as shown below (DF1 and DF2), I need to compare DF2 with DF1 such that I can identify all the Matching, Different, Missing values for all the columns in DF2 that match columns in DF1 (Col1, Col2 & Col3 in this case) for rows with same EID value (A, B, C & D). I do not wish to iterate on each row of a dataframe as it can be time-consuming.
Note: There can be around 70 - 100 columns. This is just a sample dataframe I am using.
DF1
EID Col1 Col2 Col3 Col4
0 A a1 b1 c1 d1
1 B a2 b2 c2 d2
2 C None b3 c3 d3
3 D a4 b4 c4 d4
4 G a5 b5 c5 d5
DF2
EID Col1 Col2 Col3
0 A a1 b1 c1
1 B a2 b2 c9
2 C a3 b3 c3
3 D a4 b4 None
Expected output dataframe
EID Col1 Col2 Col3 New_Col
0 A a1 b1 c1 Match
1 B a2 b2 c2 Different
2 C None b3 c3 Missing in DF1
3 D a4 b4 c4 Missing in DF2
Firstly, you will need to filter df1 based on df2.
new_df = df1.loc[df1['EID'].isin(df2['EID']), df2.columns]
EID Col1 Col2 Col3
0 A a1 b1 c1
1 B a2 b2 c2
2 C None b3 c3
3 D a4 b4 c4
Next, since you have a big dataframe to compare, you can change both the new_df and df2 to numpy arrays.
array1 = new_df.to_numpy()
array2 = df2.to_numpy()
Now you can compare it row-wise using np.where
new_df['New Col'] = np.where((array1 == array2).all(axis=1),'Match', 'Different')
EID Col1 Col2 Col3 New Col
0 A a1 b1 c1 Match
1 B a2 b2 c2 Different
2 C None b3 c3 Different
3 D a4 b4 c4 Different
Finally, to convert the row with None value, you can use df.loc and df.isnull
new_df.loc[new_df.isnull().any(axis=1), ['New Col']] = 'Missing in DF1'
new_df.loc[df2.isnull().any(axis=1), ['New Col']] = 'Missing in DF2'
EID Col1 Col2 Col3 New Col
0 A a1 b1 c1 Match
1 B a2 b2 c2 Different
2 C None b3 c3 Missing in DF1
3 D a4 b4 c4 Missing in DF2
One thing to note is that "Match", "Different", "Missing in DF1", and "Missing in DF2" are not mutually exclusive.
You can have some values missing in DF1, but also missing in DF2.
However, based on your post, the priority seems to be:
"Match" > "Missing in DF1" > "Missing in DF2" > "Different".
Also, it seems like you're using EID as an index, so it makes more sense to use it as the dataframe index. You can call .reset_index() if you want it as a column.
The approach is to use the equality operator / null check element-wise, then call .all and .any across columns.
import numpy as np
import pandas as pd
def compare_dfs(df1, df2):
    """Label every row of ``df2`` by how it compares with the same row of ``df1``.

    ``df1`` is reindexed to the index and columns of ``df2``, so the returned
    frame has ``df2``'s dimensions but ``df1``'s values, plus a ``New_Col``
    column holding one label per row.  The first rule that applies wins:

    1. ``"Match"``          - every cell equals the matching cell of ``df2``
    2. ``"Missing in DF1"`` - the (reindexed) ``df1`` row contains a null
    3. ``"Missing in DF2"`` - the ``df2`` row contains a null
    4. ``"Different"``      - none of the above

    Parameters
    ----------
    df1 : pandas.DataFrame
        Reference frame; may have extra rows and columns, which are ignored.
    df2 : pandas.DataFrame
        Frame whose index and columns define what is compared.

    Returns
    -------
    pandas.DataFrame
        ``df1`` restricted to ``df2``'s labels, with the extra ``New_Col``.
    """
    # Output has df2's dimensions but df1's values; labels that exist only in
    # df2 become all-null rows/columns and are therefore "Missing in DF1".
    result = df1.reindex(index=df2.index, columns=df2.columns)

    # pandas' element-wise == treats a null as unequal to everything,
    # including another null, so a row containing a null is never a "Match".
    eq_check = (result == df2).all(axis=1)

    # Null values are understood to be "missing"; change these two conditions
    # if "missing" should mean something else.
    null_check1 = result.isnull().any(axis=1)
    null_check2 = df2.isnull().any(axis=1)

    # np.select takes the first true condition per row, which encodes the
    # priority directly and avoids chained in-place fillna on a column (a
    # silent no-op under pandas Copy-on-Write).
    result["New_Col"] = np.select(
        [eq_check, null_check1, null_check2],
        ["Match", "Missing in DF1", "Missing in DF2"],
        default="Different",
    )
    return result
You can test your inputs in a jupyter notebook:
import itertools as it
df1 = pd.DataFrame(
np.array(["".join(i) for i in it.product(list("abcd"), list("12345"))])
.reshape((4, 5))
.T,
index=pd.Index(list("ABCDG"), name="EID"),
columns=[f"Col{i + 1}" for i in range(4)],
)
df1.loc["C", "Col1"] = None
df2 = df1.iloc[:4, :3].copy()
df2.loc["B", "Col3"] = "c9"
df2.loc["D", "Col3"] = None
display(df1)
display(df2)
display(compare_dfs(df1, df2))
Which should give these results:
Col1 Col2 Col3 Col4
EID
A a1 b1 c1 d1
B a2 b2 c2 d2
C None b3 c3 d3
D a4 b4 c4 d4
G a5 b5 c5 d5
Col1 Col2 Col3
EID
A a1 b1 c1
B a2 b2 c9
C None b3 c3
D a4 b4 None
Col1 Col2 Col3 New_Col
EID
A a1 b1 c1 Match
B a2 b2 c2 Different
C None b3 c3 Missing in DF1
D a4 b4 c4 Missing in DF2
On my i7 6600U local machine, the function takes ~1 sec for a dataset with 1 million rows, 80 columns.
rng = np.random.default_rng(seed=0)
test_size = (1_000_000, 100)
df1 = (
pd.DataFrame(rng.random(test_size))
.rename_axis(index="EID")
.rename(columns=lambda x: f"Col{x + 1}")
)
df2 = df1.sample(frac=0.8, axis=1)
# add difference
df2 += rng.random(df2.shape) > 0.9
# add nulls
df1[rng.random(df1.shape) > 0.99] = np.nan
df2[rng.random(df2.shape) > 0.99] = np.nan
%timeit compare_dfs(df1, df2)
953 ms ± 199 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Underneath it all, you're still going to be doing iterations. However, what you can do is merge the two dataframes on the EID, performing an outer join, and then use an apply function to generate your new_col.
# Outer-join both frames on EID so every EID from either side is kept.
# pd.merge names clashing columns via `suffixes=(left, right)`;
# `lsuffix`/`rsuffix` are DataFrame.join arguments and raise TypeError here.
df3 = pd.merge(df1, df2, on='EID', how='outer', suffixes=('_df1', '_df2'))
# comparison_function receives one joined row and returns its label; it holds
# your checks that result in missing in df1, df2, etc.
df3['comparison'] = df3.apply(comparison_function, axis=1)
You can then use
try:
#1.
DF1 = DF1.drop('Col4', axis=1)
# Rows of DF1 to compare against. The filter must use DF1's own EID column:
# `df` does not exist until the merge below has run.
df1_rows = DF1.loc[DF1['EID'].ne('G')]
# Full-row comparison: 'both' means the DF2 row exists unchanged in DF1.
df = pd.merge(DF2, df1_rows, on=['Col1', 'Col2', 'Col3', 'EID'], how='left', indicator='New Col')
df['New Col'] = np.where(df['New Col'] == 'left_only', "Missing in DF1", df['New Col'])
# For each pair of columns, add an indicator column telling whether the DF2
# row agrees with DF1 on that pair (for the same EID).
for flag, pair in [
    ('col1_col2', ['Col1', 'Col2']),
    ('col2_col3', ['Col2', 'Col3']),
    ('col1_col3', ['Col1', 'Col3']),
]:
    keys = ['EID'] + pair
    partial = pd.merge(DF2.loc[:, keys], df1_rows.loc[:, keys], on=keys, how='left', indicator=flag)
    df = df.merge(partial, on=keys, how='left')
a1 = df['New Col'].eq('both') #match
a2 = df['col1_col2'].eq('both') & df['New Col'].eq('Missing in DF1') #same by Col1 & Col2 --> Different
a3 = df['col2_col3'].eq('both') & df['New Col'].eq('Missing in DF1') #same by Col2 & Col3 --> Different
a4 = df['col1_col3'].eq('both') & df['New Col'].eq('Missing in DF1') #same by Col1 & Col3 --> Different
# First matching condition wins; rows matching none keep their current label.
df['New Col'] = np.select([a1, a2, a3, a4], ['match', 'Different/ same Col1 & Col2', 'Different/ same Col2 & Col3', 'Different/ same Col1 & Col3'], df['New Col'])
df = df.drop(columns=['col1_col2', 'col2_col3', 'col1_col3'])
EID Col1 Col2 Col3 New Col
0 A a1 b1 c1 match
1 B a2 b2 c9 Different/ same Col1 & Col2
2 C a3 b3 c3 Different/ same Col2 & Col3
3 D a4 b4 None Different/ same Col1 & Col2
or
#2.
DF1 = DF1.drop('Col4', axis=1)
# Full-row comparison: 'both' means the DF2 row exists unchanged in DF1.
# The row filter must use DF1's own EID column: `df` is only created here.
df = pd.merge(DF2, DF1.loc[DF1['EID'].ne('G')], on=['Col1', 'Col2', 'Col3', 'EID'], how='left', indicator='New Col')
df['New Col'] = np.where(df['New Col'] == 'left_only', "Missing in DF1", df['New Col'])
# Indicator column: does the DF2 row agree with DF1 on Col1 and Col2 (same EID)?
df = df.merge(
    pd.merge(
        DF2.loc[:, ['EID', 'Col1', 'Col2']],
        DF1.loc[DF1['EID'].ne('G'), ['EID', 'Col1', 'Col2']],
        on=['EID', 'Col1', 'Col2'],
        how='left',
        indicator='col1_col2',
    ),
    on=['EID', 'Col1', 'Col2'],
    how='left',
)
a1 = df['New Col'].eq('both') #match
a2 = df['col1_col2'].eq('both') & df['New Col'].eq('Missing in DF1') #Different
# Rows matching neither condition keep "Missing in DF1".
df['New Col'] = np.select([a1, a2], ['match', 'Different'], df['New Col'])
df = df.drop(columns=['col1_col2'])
EID Col1 Col2 Col3 New Col
0 A a1 b1 c1 match
1 B a2 b2 c9 Different
2 C a3 b3 c3 Missing in DF1
3 D a4 b4 None Different
Note1: no iteration
Note2: goal of this solution: compare DF2 with DF1 such that you can identify all the Matching, Different, Missing values for all the columns in DF2 that match columns in DF1 (Col1, Col2 & Col3 in this case) for rows with same EID value (A, B, C & D)
# Compare df2 with df1 column by column for rows sharing an EID and label each
# row in a new 'New_Col' column. EID is expected to be a regular column here.
temp_df1 = df1[df2.columns] # keep only the df1 columns that also exist in df2
joined_df = df2.merge(temp_df1, on='EID') # default suffixes are '_x' for the left table (df2) and '_y' for the right table (df1)
# getting the columns that need to be compared (everything except the key)
cols = list(df2.columns)
cols.remove('EID')
cols_left = [i+'_x' for i in cols]
cols_right = [i+'_y' for i in cols]
# getting back the two tables, row-aligned by the merge, under the original column names
temp_df2 = joined_df[cols_left]
temp_df2.columns=cols
temp_df1 = joined_df[cols_right]
temp_df1.columns=cols
# the output shows EID plus df1's values
output_df = joined_df[['EID']].copy()
output_df[cols] = temp_df1
# a row matches only when every compared cell is equal
filt = (temp_df2 == temp_df1).all(axis=1)
output_df.loc[filt, 'New_Col'] = 'Match'
output_df.loc[~filt, 'New_Col'] = 'Different'
# later assignments overwrite earlier ones, so a null on the df1 side takes
# precedence over a null on the df2 side
output_df.loc[temp_df2.isna().any(axis=1), 'New_Col'] = 'Missing in df2' # getting missing values in df2
output_df.loc[temp_df1.isna().any(axis=1), 'New_Col'] = 'Missing in df1' # getting missing values in df1
output_df
EID Col1 Col2 Col3 New_Col
0 A a1 b1 c1 Match
1 B a2 b2 c2 Different
2 C NaN b3 c3 Missing in df1
3 D a4 b4 c4 Missing in df2

How to compute each cell as a function of index and column?

I have a use-case where it naturally fits to compute each cell of a pd.DataFrame as a function of the corresponding index and column i.e.
import pandas as pd
import numpy as np
data = np.empty((3, 3))
data[:] = np.nan
df = pd.DataFrame(data=data, index=[1, 2, 3], columns=['a', 'b', 'c'])
print(df)
> a b c
>1 NaN NaN NaN
>2 NaN NaN NaN
>3 NaN NaN NaN
and I'd like (this is only a mock example) to get a result that is a function f(index, column):
> a b c
>1 a1 b1 c1
>2 a2 b2 c2
>3 a3 b3 c3
In order to accomplish this I need a way different to apply or applymap where the lambda gets the coordinates in terms of the index and col i.e.
def my_cell_map(ix, col):
    """Return the cell value for row label ``ix`` and column label ``col``.

    The value is the column label followed by the stringified row label,
    e.g. ``my_cell_map(1, 'a')`` gives ``'a1'``.
    """
    row_text = str(ix)
    return col + row_text
Here it is possible to use numpy - add the index values to the column names with broadcasting and pass the result to the DataFrame constructor:
a = df.columns.to_numpy() + df.index.astype(str).to_numpy()[:, None]
df = pd.DataFrame(a, index=df.index, columns=df.columns)
print (df)
a b c
1 a1 b1 c1
2 a2 b2 c2
3 a3 b3 c3
EDIT: To process by column name, it is possible to use x.name together with the index values:
def f(x):
    """Build the new values for one column passed in by ``DataFrame.apply``.

    ``x`` is a column whose ``name`` is the column label; each result is that
    label followed by the stringified row label.
    """
    row_labels = x.index.astype(str)
    return x.name + row_labels
df = df.apply(f)
print (df)
a b c
1 a1 b1 c1
2 a2 b2 c2
3 a3 b3 c3
EDIT1: To use your own function, it is necessary to add another lambda function that loops over the index values:
def my_cell_map(ix, col):
    """Compute one cell from its row label ``ix`` and column label ``col``.

    Returns ``col`` with ``str(ix)`` appended, e.g. ``('b', 2) -> 'b2'``.
    """
    suffix = str(ix)
    return col + suffix
def f(x):
    """Apply ``my_cell_map`` to every row label of the column ``x``.

    ``x`` is one column passed in by ``DataFrame.apply``; its ``name`` is the
    column label handed to ``my_cell_map`` together with each index value.
    """
    column_label = x.name

    def cell_for(row_label):
        return my_cell_map(row_label, column_label)

    return x.index.map(cell_for)
df = df.apply(f)
print (df)
a b c
1 a1 b1 c1
2 a2 b2 c2
3 a3 b3 c3
EDIT2: It is also possible to loop over the index and column values and set each cell with loc; for a large DataFrame, expect this to be slow:
for c in df.columns:
for i in df.index:
df.loc[i, c] = my_cell_map(i, c)
print (df)
a b c
1 a1 b1 c1
2 a2 b2 c2
3 a3 b3 c3

Show common values in a column once in pandas

I have a dataframe that looks like this:
df = pd.DataFrame({'key': ['K0', 'K0', 'K0', 'K1'],'cat': ['C0', 'C0', 'C1', 'C1'],'B': ['A0', 'A1', 'A2', 'A3']})
df
Out[15]:
key cat B
0 K0 C0 A0
1 K0 C0 A1
2 K0 C1 A2
3 K1 C1 A3
Is it possible to convert it to:
key cat B
0 K0 C0 A0
1 A1
2 K0 C1 A2
3 K1 C1 A3
I want to avoid showing same value of key & cat again and again and key reappears once cat changes.
It's for an excel purpose so I need it to be compatible with:
style.apply(f)
to_excel()
You can use duplicated over a subset of the columns to look for duplicate values:
cols = ['key', 'cat']
df.loc[df.duplicated(subset=cols), cols] = ''
key cat B
0 K0 C0 A0
1 A1
2 K0 C1 A2
3 K1 C1 A3

How can I find the "set difference" of rows in two dataframes on a subset of columns in Pandas?

I have two dataframes, say df1 and df2, with the same column names.
Example:
df1
C1 | C2 | C3 | C4
A 1 2 AA
B 1 3 A
A 3 2 B
df2
C1 | C2 | C3 | C4
A 1 3 E
B 1 2 C
Q 4 1 Z
I would like to filter out rows in df1 based on common values in a fixed subset of columns between df1 and df2. In the above example, if the columns are C1 and C2, I would like the first two rows to be filtered out, as their values in both df1 and df2 for these columns are identical.
What would be a clean way to do this in Pandas?
So far, based on this answer, I have been able to find the common rows.
common_df = pandas.merge(df1, df2, how='inner', on=['C1','C2'])
This gives me a new dataframe with only those rows that have common values in the specified columns, i.e., the intersection.
I have also seen this thread, but the answers all seem to assume a difference on all the columns.
The expected result for the above example (rows common on specified columns removed):
C1 | C2 | C3 | C4
A 3 2 B
Maybe not the cleanest, but you could add a key column to df1 to check against.
Setting up the datasets
import pandas as pd
df1 = pd.DataFrame({ 'C1': ['A', 'B', 'A'],
'C2': [1, 1, 3],
'C3': [2, 3, 2],
'C4': ['AA', 'A', 'B']})
df2 = pd.DataFrame({ 'C1': ['A', 'B', 'Q'],
'C2': [1, 1, 4],
'C3': [3, 2, 1],
'C4': ['E', 'C', 'Z']})
Adding a key, using your code to find the commons
df1['key'] = range(1, len(df1) + 1)
common_df = pd.merge(df1, df2, how='inner', on=['C1','C2'])
df_filter = df1[~df1['key'].isin(common_df['key'])].drop('key', axis=1)
You can use an anti-join method where you do an outer join on the specified columns while returning the method of the join with an indicator. Only downside is that you'd have to rename and drop the extra columns after the join.
>>> import pandas as pd
>>> df1 = pd.DataFrame({'C1':['A','B','A'],'C2':[1,1,3],'C3':[2,3,2],'C4':['AA','A','B']})
>>> df2 = pd.DataFrame({'C1':['A','B','Q'],'C2':[1,1,4],'C3':[3,2,1],'C4':['E','C','Z']})
>>> df_merged = df1.merge(df2, on=['C1','C2'], indicator=True, how='outer')
>>> df_merged
C1 C2 C3_x C4_x C3_y C4_y _merge
0 A 1 2.0 AA 3.0 E both
1 B 1 3.0 A 2.0 C both
2 A 3 2.0 B NaN NaN left_only
3 Q 4 NaN NaN 1.0 Z right_only
>>> df1_setdiff = df_merged[df_merged['_merge'] == 'left_only'].rename(columns={'C3_x': 'C3', 'C4_x': 'C4'}).drop(['C3_y', 'C4_y', '_merge'], axis=1)
>>> df1_setdiff
C1 C2 C3 C4
2 A 3 2.0 B
>>> df2_setdiff = df_merged[df_merged['_merge'] == 'right_only'].rename(columns={'C3_y': 'C3', 'C4_y': 'C4'}).drop(['C3_x', 'C4_x', '_merge'], axis=1)
>>> df2_setdiff
C1 C2 C3 C4
3 Q 4 1.0 Z
import pandas as pd

df1 = pd.DataFrame({'C1': ['A', 'B', 'A'], 'C2': [1, 1, 3], 'C3': [2, 3, 2], 'C4': ['AA', 'A', 'B']})
df2 = pd.DataFrame({'C1': ['A', 'B', 'Q'], 'C2': [1, 1, 4], 'C3': [3, 2, 1], 'C4': ['E', 'C', 'Z']})

# Columns on which two rows are considered "the same".
keys = ['C1', 'C2']
common = pd.merge(df1, df2, on=keys)

# Compare the keys as (C1, C2) pairs. Testing C1 and C2 membership separately
# would also drop rows such as (A, 2) when the common pairs are (A, 1) and
# (B, 2), because each value on its own appears somewhere in `common`.
common_keys = pd.MultiIndex.from_frame(common[keys])
R1 = df1[~pd.MultiIndex.from_frame(df1[keys]).isin(common_keys)]
R2 = df2[~pd.MultiIndex.from_frame(df2[keys]).isin(common_keys)]
df1:
C1 C2 C3 C4
0 A 1 2 AA
1 B 1 3 A
2 A 3 2 B
df2:
C1 C2 C3 C4
0 A 1 3 E
1 B 1 2 C
2 Q 4 1 Z
common:
C1 C2 C3_x C4_x C3_y C4_y
0 A 1 2 AA 3 E
1 B 1 3 A 2 C
R1:
C1 C2 C3 C4
2 A 3 2 B
R2:
C1 C2 C3 C4
2 Q 4 1 Z

Pandas: How to expand data frame rows containing a dictionary with varying keys in a column?

I'm a little stuck, can you please help me with this. I've simplified the problem I'm facing to the following:
Input
Desired Output
I know how to handle the case where the dictionaries in col. c have same keys.
You can create DataFrame by constructor, reshape by stack and last join to original:
# One row per dict, one column per key; keys absent from a dict become NaN.
# stack() then turns the columns into an inner index level. The explicit
# dropna() removes the NaN entries of absent keys, which older pandas dropped
# inside stack() but the pandas 3 stack implementation keeps.
df1 = (pd.DataFrame(df.c.values.tolist())
       .stack()
       .dropna()
       .reset_index(level=1)
       .rename(columns={0: 'val', 'level_1': 'key'}))
print (df1)
key val
0 c00 v00
0 c01 v01
1 c10 v10
2 c20 v20
2 c21 v21
2 c22 v22
# Replace the dict column by the key/val rows; join aligns on the original
# row index, so a/b are repeated once per key. `columns=` is used because the
# positional axis argument of drop() was removed in pandas 2.0.
df = df.drop(columns='c').join(df1).reset_index(drop=True)
print (df)
a b key val
0 a0 b0 c00 v00
1 a0 b0 c01 v01
2 a1 b1 c10 v10
3 a2 b2 c20 v20
4 a2 b2 c21 v21
5 a2 b2 c22 v22
Here is one way:
from itertools import chain

import numpy as np
import pandas as pd

df = pd.DataFrame([['a0', 'b0', {'c00': 'v00', 'c01': 'v01'}],
                   ['a1', 'b1', {'c10': 'v10'}],
                   ['a2', 'b2', {'c20': 'v20', 'c21': 'v21', 'c22': 'v22'}]],
                  columns=['a', 'b', 'c'])

# first convert 'c' to list of (key, value) tuples
df['c'] = df['c'].apply(lambda x: list(x.items()))
# number of output rows each input row expands to
lens = list(map(len, df['c']))

# flatten all (key, value) pairs in row order
pairs = list(chain.from_iterable(df['c']))

# repeat a/b once per pair and unpack the pairs straight into two columns
# (avoids the slow per-row apply(pd.Series) and the positional drop('c', 1),
# which pandas 2.0 no longer accepts)
df_out = pd.DataFrame({'a': np.repeat(df['a'].to_numpy(), lens),
                       'b': np.repeat(df['b'].to_numpy(), lens),
                       'key': [key for key, _ in pairs],
                       'val': [val for _, val in pairs]})

# a b key val
# 0 a0 b0 c00 v00
# 1 a0 b0 c01 v01
# 2 a1 b1 c10 v10
# 3 a2 b2 c20 v20
# 4 a2 b2 c21 v21
# 5 a2 b2 c22 v22
My solution is as follows:
import pandas as pd

t = pd.DataFrame([['a0', 'b0', {'c00': 'v00', 'c01': 'v01'}],
                  ['a1', 'b1', {'c10': 'v10'}],
                  ['a2', 'b2', {'c20': 'v20', 'c21': 'v21', 'c22': 'v22'}]],
                 columns=['a', 'b', 'c'])

# One output row [a, b, key, val] per dictionary entry. Walking the three
# columns in lockstep reads each row once instead of doing several scalar
# .loc lookups per entry.
l2 = [
    [a, b, key, val]
    for a, b, mapping in zip(t['a'], t['b'], t['c'])
    for key, val in mapping.items()
]
t2 = pd.DataFrame(l2, columns=['a', 'b', 'key', 'val'])
Here 't' is your DataFrame, however you obtain it.

Categories

Resources