find duplicated groups in dataframe - python

I have a dataframe as described below and I need to find the duplicate groups based on the columns value1, value2 and value3 (the rows are grouped by id). I need to fill the column 'duplicated' with True if the group appears elsewhere in the table, and with False if the group is unique.
Note: each group has a different id.
df = pd.DataFrame({'id': ['A', 'A', 'A', 'A', 'B', 'B', 'C', 'C', 'C', 'C', 'D', 'D', 'D'],
                   'value1': ['1', '2', '3', '4', '1', '2', '1', '2', '3', '4', '1', '2', '3'],
                   'value2': ['1', '2', '3', '4', '1', '2', '1', '2', '3', '4', '1', '2', '3'],
                   'value3': ['1', '2', '3', '4', '1', '2', '1', '2', '3', '4', '1', '2', '3'],
                   'duplicated': [None] * 13  # to be filled in
                   })
The expected result: 'duplicated' is True for every row of groups A and C (they contain the same value rows), and False for every row of groups B and D.
I tried this, but it compares rows; I need to compare whole groups (grouped by id):
import pandas as pd
data = pd.read_excel('C:/Users/path/Desktop/example.xlsx')
# keep=False: mark all duplicates as True
data['duplicates'] = data.duplicated(subset= ["value1","value2","value3"], keep=False)
data.to_excel('C:/Users/path/Desktop/example_result.xlsx',index=False)
and I got every row marked True, because duplicated() compares individual rows rather than whole groups (every value row here appears in more than one group).
Note: the order of the records within the groups doesn't matter.

This may not be very efficient, but it works if duplicated groups have the same row order.
import pandas as pd
df = pd.DataFrame({'id': ['A', 'A', 'A', 'A', 'B', 'B', 'C', 'C', 'C', 'C', 'D', 'D', 'D'],
                   'value1': ['1', '2', '3', '4', '1', '2', '1', '2', '3', '4', '1', '2', '3'],
                   'value2': ['1', '2', '3', '4', '1', '2', '1', '2', '3', '4', '1', '2', '3'],
                   'value3': ['1', '2', '3', '4', '1', '2', '1', '2', '3', '4', '1', '2', '3'],
                   'duplicated': [False] * 13
                   })

def check_dup(df, id1, id2):
    # Checks whether two groups are duplicates:
    # first compare sizes; if they are equal, compare the actual values.
    df1 = df[df['id'] == id1][['value1', 'value2', 'value3']]
    df2 = df[df['id'] == id2][['value1', 'value2', 'value3']]
    if df1.size != df2.size:
        return False
    return (df1.values == df2.values).all()

id_unique = set(df['id'].values)          # set of unique ids
id_dic = dict.fromkeys(id_unique, False)  # "duplicated" flag per id

for id1 in id_unique:
    for id2 in id_unique - {id1}:
        if check_dup(df, id1, id2):
            id_dic[id1] = True
            break

# Update the 'duplicated' column on df
for id_ in id_dic:
    df.loc[df['id'] == id_, 'duplicated'] = id_dic[id_]
print(df)
id value1 value2 value3 duplicated
0 A 1 1 1 True
1 A 2 2 2 True
2 A 3 3 3 True
3 A 4 4 4 True
4 B 1 1 1 False
5 B 2 2 2 False
6 C 1 1 1 True
7 C 2 2 2 True
8 C 3 3 3 True
9 C 4 4 4 True
10 D 1 1 1 False
11 D 2 2 2 False
12 D 3 3 3 False
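If the row order within a group should not matter (as the question notes), a more compact variant is to reduce each group to a sorted tuple of its rows and let duplicated do the comparison. A minimal sketch, assuming the df defined above:
import pandas as pd

# Order-insensitive key per id: a sorted tuple of that group's
# (value1, value2, value3) rows.
key = (df.groupby('id')[['value1', 'value2', 'value3']]
         .apply(lambda g: tuple(sorted(map(tuple, g.values)))))

# Mark ids whose key occurs more than once, then map back to the rows.
df['duplicated'] = df['id'].map(key.duplicated(keep=False))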

You can do it like this.
First, sort_values just in case, set_index on id, and stack to reshape your data into a single column with to_frame:
df_ = (df.sort_values(by=["value1", "value2", "value3"])
         .set_index('id')[["value1", "value2", "value3"]]
         .stack()
         .to_frame()
       )
Second, append an index level with a cumcount per id, drop the index level holding the original column names (value1, ...), unstack to get one row per id, fillna with a placeholder value, and use duplicated:
s_dup = df_.set_index([df_.groupby('id').cumcount()], append=True)\
           .reset_index(level=1, drop=True)[0]\
           .unstack()\
           .fillna(0)\
           .duplicated(keep=False)
print (s_dup)
id
A True
B False
C True
D False
dtype: bool
Now you can just map it back onto the original dataframe:
df['dup'] = df['id'].map(s_dup)
print (df)
id value1 value2 value3 dup
0 A 1 1 1 True
1 A 2 2 2 True
2 A 3 3 3 True
3 A 4 4 4 True
4 B 1 1 1 False
5 B 2 2 2 False
6 C 2 2 2 True
7 C 1 1 1 True
8 C 3 3 3 True
9 C 4 4 4 True
10 D 1 1 1 False
11 D 2 2 2 False
12 D 3 3 3 False

Related

Intercalate pandas dataframe columns when they are in pairs

I have a dataframe with paired home/away columns (constructed in the code below) and want to interleave them into single id and name columns. The desired result is this:
id name
1 A
2 B
3 C
4 D
5 E
6 F
7 G
8 H
Currently I do it this way:
import pandas as pd
df = pd.DataFrame({'home_id': ['1', '3', '5', '7'],
                   'home_name': ['A', 'C', 'E', 'G'],
                   'away_id': ['2', '4', '6', '8'],
                   'away_name': ['B', 'D', 'F', 'H']})
id_col = pd.concat([df['home_id'], df['away_id']])
name_col = pd.concat([df['home_name'], df['away_name']])
result = pd.DataFrame({'id': id_col, 'name': name_col})
result = result.sort_index().reset_index(drop=True)
print(result)
But this approach relies on the index to interleave the rows, which can produce errors when there are duplicate index values.
How can I intercalate the column values so the order is always:
the home columns of the 1st row, then the away columns of the 1st row, then the home of the 2nd row, then the away of the 2nd row, and so on?
try this:
out = pd.DataFrame(df.values.reshape(-1, 2), columns=['ID', 'Name'])
print(out)
>>>
ID Name
0 1 A
1 2 B
2 3 C
3 4 D
4 5 E
5 6 F
6 7 G
7 8 H
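One caveat: the reshape trick relies on the dataframe's columns already being ordered home_id, home_name, away_id, away_name. Selecting the columns explicitly makes that assumption visible; a minimal sketch:
cols = ['home_id', 'home_name', 'away_id', 'away_name']
out = pd.DataFrame(df[cols].values.reshape(-1, 2), columns=['id', 'name'])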
Similar to Python's zip, you can iterate through both dataframes:
home = pd.DataFrame(df[['home_id', 'home_name']].values, columns=('id', 'name'))
away = pd.DataFrame(df[['away_id', 'away_name']].values, columns=('id', 'name'))

def zip_dataframes(df1, df2):
    # Interleave the rows of two equally long dataframes.
    rows = []
    for i in range(len(df1)):
        rows.append(df1.iloc[i, :])
        rows.append(df2.iloc[i, :])
    return pd.concat(rows, axis=1).T

zip_dataframes(home, away)
id name
0 1 A
0 2 B
1 3 C
1 4 D
2 5 E
2 6 F
3 7 G
3 8 H
You can do this using pd.wide_to_long with a little column header renaming:
import pandas as pd
df = pd.DataFrame({'home_id': ['1', '3', '5', '7'],
                   'home_name': ['A', 'C', 'E', 'G'],
                   'away_id': ['2', '4', '6', '8'],
                   'away_name': ['B', 'D', 'F', 'H']})
dfr = df.rename(columns=lambda x: '_'.join(x.split('_')[::-1])).reset_index()
df_out = (pd.wide_to_long(dfr, ['id', 'name'], 'index', 'No', sep='_', suffix='.*')
            .reset_index(drop=True)
            .sort_values('id'))
df_out
Output:
id name
0 1 A
4 2 B
1 3 C
5 4 D
2 5 E
6 6 F
3 7 G
7 8 H
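Note that sort_values('id') compares strings here, so with ids of mixed width it would put '10' before '2'. A sketch of the same pipeline with a cast added, assuming the ids are numeric strings:
df_out = (pd.wide_to_long(dfr, ['id', 'name'], 'index', 'No', sep='_', suffix='.*')
            .reset_index(drop=True)
            .astype({'id': int})
            .sort_values('id'))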

Merge rows if cells are equal in pandas

I have this df:
import pandas as pd
df = pd.DataFrame({'Time': ['s_1234', 's_1234', 's_1234', 's_5678', 's_8998', 's_8998'],
                   'Control': ['A', '', '', 'B', 'C', ''],
                   'tot_1': ['1', '1', '1', '1', '1', '1'],
                   'tot_2': ['2', '2', '2', '2', '2', '2']})
--------
Time Control tot_1 tot_2
0 1234 A 1 2
1 1234 A 1 2
2 1234 1 2
3 5678 B 1 2
4 8998 C 1 2
5 8998 1 2
I would like rows with the same Time value to be merged into one row. I would also like the tot_1 and tot_2 columns to be summed, and the Control value kept where present. Like:
Time Control tot_1 tot_2
0 1234 A 3 6
1 5678 B 1 2
2 8998 C 2 4
Your data is different from the example df.
Construct the df:
import pandas as pd
df = pd.DataFrame({'Time': ['s_1234', 's_1234', 's_1234', 's_5678', 's_8998', 's_8998'],
                   'Control': ['A', '', '', 'B', 'C', ''],
                   'tot_1': ['1', '1', '1', '1', '1', '1'],
                   'tot_2': ['2', '2', '2', '2', '2', '2']})
df.Time = df.Time.str.split("_").str[1]
df = df.astype({"tot_1": int, "tot_2": int})
Group by Time and aggregate the values.
df.groupby('Time').agg({"Control": "first", "tot_1": "sum", "tot_2": "sum"}).reset_index()
Time Control tot_1 tot_2
0 1234 A 3 6
1 5678 B 1 2
2 8998 C 2 4
EDIT for comment: not sure if that's the best way to do it, but you could construct your agg mapping like this (the dict | union operator requires Python 3.9+):
n = 2
agg_ = {"Control": "first"} | {f"tot_{i+1}": "sum" for i in range(n)}
df.groupby('Time').agg(agg_).reset_index()
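If the number of tot_ columns isn't known ahead of time, the mapping can also be derived from the frame itself; a minimal sketch:
# Sum every column whose name starts with "tot_", keep the first Control.
agg_ = {"Control": "first",
        **{c: "sum" for c in df.columns if c.startswith("tot_")}}
df.groupby('Time').agg(agg_).reset_index()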

Dataframe groupby certain column and repeat the row n times

I would like to get df_output from df_input in the code below. Basically, each group of rows (grouped by the date column) is repeated 2 times, and a repeat tag column should be included.
import pandas as pd
df_input = pd.DataFrame([
    ['01/01', '1', '10'],
    ['01/01', '2', '5'],
    ['01/02', '1', '9'],
    ['01/02', '2', '7'],
], columns=['date', 'type', 'value'])
df_output = pd.DataFrame([
    ['01/01', '1', '10', '1'],
    ['01/01', '2', '5', '1'],
    ['01/01', '1', '10', '2'],
    ['01/01', '2', '5', '2'],
    ['01/02', '1', '9', '1'],
    ['01/02', '2', '7', '1'],
    ['01/02', '1', '9', '2'],
    ['01/02', '2', '7', '2'],
], columns=['date', 'type', 'value', 'repeat'])
print(df_output)
I thought about grouping by the date column and repeating the rows n times, but could not find how to write it.
You can use GroupBy.apply per date, and pandas.concat:
N = 2
out = (df_input
       .groupby(['date'], group_keys=False)
       .apply(lambda d: pd.concat([d] * N))
       )
output:
date type value
0 01/01 1 10
1 01/01 2 5
0 01/01 1 10
1 01/01 2 5
2 01/02 1 9
3 01/02 2 7
2 01/02 1 9
3 01/02 2 7
With "repeat" column:
N = 2
out = (df_input
       .groupby(['date'], group_keys=False)
       .apply(lambda d: pd.concat([d.assign(repeat=n + 1) for n in range(N)]))
       )
output:
date type value repeat
0 01/01 1 10 1
1 01/01 2 5 1
0 01/01 1 10 2
1 01/01 2 5 2
2 01/02 1 9 1
3 01/02 2 7 1
2 01/02 1 9 2
3 01/02 2 7 2
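Since each date's block is simply repeated N times here, the same result (with a fresh index) can also be produced without apply, by concatenating tagged copies of the frame and stable-sorting them back into date order. A sketch under that assumption:
N = 2
out = (pd.concat([df_input.assign(repeat=n + 1) for n in range(N)])
         .sort_values('date', kind='stable')  # stable sort keeps repeat order
         .reset_index(drop=True))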

Group 2 columns to categories based on the column values

I am new to Python and Pandas.
My DataFrame looks like this:
df = pd.DataFrame({'ID': ['a', 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd', 'd'],
                   'Position': ['0', '1', '2', '3', '4', '0', '1', '2', '3', '0', '1', '2', '0', '1', '2'],
                   'Brand': ['Mazda', 'BMW', 'Ford', 'Fiat', 'Dodge', 'Mazda', 'BMW', 'Ford', 'Fiat', 'BMW', 'Ford', 'Fiat', 'BMW', 'Ford', 'Fiat']
                   })
I want to group the position and brand together to make a category.
The output would look like this:
ID Group
a 1
b 2
c 3
d 3
Because group 1 is:
0 Mazda
1 BMW
2 Ford
3 Fiat
4 Dodge
And c = d because they both have the same car makers in the same order, so the group is the same - 3:
0 BMW
1 Ford
2 Fiat
If d had a different order (as defined by the Position column), it would be a different category:
0 Fiat
1 BMW
2 Ford
How could I achieve the output as defined in the second code block?
Thank you for your suggestions.
You can reduce each group's rows to a single tuple and then number the distinct tuples with Series.factorize:
s = (df.groupby('ID', sort=False)[['Position', 'Brand']]
       .apply(lambda x: tuple(x.values.ravel())))
df_out = pd.DataFrame({'ID': s.index, 'Group': pd.factorize(s)[0] + 1})
print (df_out)
  ID  Group
0  a      1
1  b      2
2  c      3
3  d      3
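If the rows are not guaranteed to arrive already ordered by Position, sorting first makes the key deterministic. A sketch, assuming Position stays a single-digit string so lexicographic order is fine:
s = (df.sort_values(['ID', 'Position'])
       .groupby('ID', sort=False)[['Position', 'Brand']]
       .apply(lambda x: tuple(x.values.ravel())))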

How to paste a list into a multi index dataframe?

Could you let me know how to paste a list into a multi-index dataframe?
I want to paste list1 into the Val6 column at the rows (func1, In, Name1/Name2),
and list2 into Val6 at the rows (func1, out, Name3/Name4) of the multi-index dataframe.
Below is the dataframe I used:
import numpy as np
from pandas import DataFrame

raw_data = {'Function': ['env', 'env', 'env', 'func1', 'func1', 'func1', 'func1'],
            'Type': ['In', 'In', 'In', 'In', 'In', 'out', 'out'],
            'Name': ['Volt', 'Temp', 'BD#', 'Name1', 'Name2', 'Name3', 'Name4'],
            'Val1': ['Max', 'High', '1', '4', '6', '6', '3'],
            'Val2': ['Typ', 'Mid', '2', '2', '7', '6', '3'],
            'Val3': ['Min', 'Low', '3', '3', '6', '3', '4'],
            'Val4': ['Max', 'High', '4', '4', '9', '4', '5'],
            'Val5': ['Max', 'Low', '5', '5', '4', '5', '6']}
df = DataFrame(raw_data)
df = df.set_index(["Function", "Type", "Name"])
df['Val6'] = np.nan
list1 = [1, 2]
list2 = [3, 4]
print (df)
Below is the printed dataframe:
Val1 Val2 Val3 Val4 Val5 Val6
Function Type Name
env In Volt Max Typ Min Max Max NaN
Temp High Mid Low High Low NaN
BD# 1 2 3 4 5 NaN
func1 In Name1 4 2 3 4 5 NaN
Name2 6 7 6 9 4 NaN
out Name3 6 6 3 4 5 NaN
Name4 3 3 4 5 6 NaN
Below is the expected result. I'd like to sequentially put list1 and list2 into the dataframe in place of the NaN values, like below:
Val1 Val2 Val3 Val4 Val5 Val6
Function Type Name
env In Volt Max Typ Min Max Max NaN
Temp High Mid Low High Low NaN
BD# 1 2 3 4 5 NaN
func1 In Name1 4 2 3 4 5 1
Name2 6 7 6 9 4 2
out Name3 6 6 3 4 5 3
Name4 3 3 4 5 6 4
I have tried to use the concat and replace functions to do it but failed.
For a more complex dataframe, I think it is better to use a mask on the multi-index:
list1=[1,2]
list2=[3,4]
m1 = df.index.get_level_values(0) == 'func1'
m2 = df.index.get_level_values(1) == 'In'
list1 = [float(i) for i in list1]
df_list1=pd.DataFrame(list1)
df.replace(df[m1&m2]['Val6'], df_list1)
Unfortunately, I don't have any idea how to solve the problem. T_T
Please give me some advice.
IIUC, add an extra line at the end; simply modify it as if it were a non-multi-index dataframe:
df['Val6'] = df['Val6'].tolist()[:-4] + list1 + list2
So your code would be:
import numpy as np
from pandas import DataFrame

raw_data = {'Function': ['env', 'env', 'env', 'func1', 'func1', 'func1', 'func1'],
            'Type': ['In', 'In', 'In', 'In', 'In', 'out', 'out'],
            'Name': ['Volt', 'Temp', 'BD#', 'Name1', 'Name2', 'Name3', 'Name4'],
            'Val1': ['Max', 'High', '1', '4', '6', '6', '3'],
            'Val2': ['Typ', 'Mid', '2', '2', '7', '6', '3'],
            'Val3': ['Min', 'Low', '3', '3', '6', '3', '4'],
            'Val4': ['Max', 'High', '4', '4', '9', '4', '5'],
            'Val5': ['Max', 'Low', '5', '5', '4', '5', '6']}
df = DataFrame(raw_data)
df = df.set_index(["Function", "Type", "Name"])
df['Val6'] = np.nan
list1 = [1, 2]
list2 = [3, 4]
# Keep the leading NaNs and overwrite the last four values with the lists.
df['Val6'] = df['Val6'].tolist()[:-4] + list1 + list2
print(df)
Output:
Val1 Val2 Val3 Val4 Val5 Val6
Function Type Name
env In Volt Max Typ Min Max Max NaN
Temp High Mid Low High Low NaN
BD# 1 2 3 4 5 NaN
func1 In Name1 4 2 3 4 5 1.0
Name2 6 7 6 9 4 2.0
out Name3 6 6 3 4 5 3.0
Name4 3 3 4 5 6 4.0
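For what it's worth, the mask-on-the-multi-index route the asker started can also work directly with .loc assignment. A sketch, assuming the seven-row frame above:
# Build boolean masks from the index levels, then assign the lists.
m_func = df.index.get_level_values('Function') == 'func1'
m_in = df.index.get_level_values('Type') == 'In'
m_out = df.index.get_level_values('Type') == 'out'

df.loc[m_func & m_in, 'Val6'] = list1   # rows Name1, Name2
df.loc[m_func & m_out, 'Val6'] = list2  # rows Name3, Name4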
