I have a DataFrame column whose values are lists:
categories
[0, 0, 2, 2, 2]
[0, 0, 2, 2]
[0, 0, 2, 2, 2]
[1, 1, 2, 2]
[2, 2, 0, 0]
[1, 0, 2, 3]
here is the sample list
li = [[0, 0, 2, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2, 2], [1, 1, 2, 2], [2, 2, 0, 0], [1, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2, 2], [0, 0, 2, 2], [2, 2, 0, 0], [2, 2, 0, 0], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [2, 2, 0, 0], [2, 2], [1, 1, 2], [0, 2, 2, 0], [0, 0, 2, 2], [0, 1], [0, 0], [0, 0, 2, 2], [0, 0], [0, 0, 2, 2], [0, 2, 2, 0], [2, 2, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [2, 2, 0, 0], [0, 0, 2, 2], [2, 2, 0, 1], [2, 2, 0, 0], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2, 2], [2, 1], [0, 0, 2, 2, 2], [2, 2, 0, 0], [2, 0], [2, 2, 0, 0], [0, 2], [0, 2, 2], [0, 0, 2, 2], [0, 2, 2, 0], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2, 2], [0, 0, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [3, 2, 0, 0], [0, 0], [0, 0, 2, 2], [0, 0, 2, 2, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [1, 3], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 2, 0, 2], [0, 0, 2, 2], [2, 2, 0, 0], [2, 2, 0, 0], [2, 2], [0, 0, 2, 2], [0, 2], [0, 0, 2, 2], [0, 0, 2, 2], [2, 2, 0], [2, 2, 0, 0], [0, 0, 2, 2], [0, 0, 2], [2], [0, 0, 2, 2], [2, 2, 2, 1, 1], [0, 0], [0, 3], [2, 2], [1, 2], [1,3]]
I want to create a new column (class_name) based on the following rule
The rules are listed in priority order and should be applied one after another:
if 1 and 3 are present, set class_name to class1
On the remaining rows, wherever 1 is present, set class_name to class2
On the remaining rows, wherever 3 is present, set class_name to class3
if 0 and 2 are present, set class_name to class4
On the remaining rows, wherever 0 is present, set class_name to class5
On the remaining rows, wherever 2 is present, set class_name to class6
What I have tried so far
# Start with every row unclassified.
df.loc[:, "class_name"] = None
for index, row in df.iterrows():
    # Skip rows that already carry a class_name; `is None` is the
    # idiomatic identity check for the None singleton.
    if row["class_name"] is None:
        categories = list(row["categories"])
        # Rule 1: both 1 and 3 occur in the row's list.
        if 1 in categories and 3 in categories:
            df.loc[index, "class_name"] = "class1"
Similarly, I have a separate loop for each condition, but this is too slow. Is there a way to do it without looping?
I think I understood the question correctly. I tried something like this, and it seems to work fine:
import pandas as pd
li = [[0, 0, 2, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2, 2], [1, 1, 2, 2], [2, 2, 0, 0], [1, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2, 2], [0, 0, 2, 2], [2, 2, 0, 0], [2, 2, 0, 0], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [2, 2, 0, 0], [2, 2], [1, 1, 2], [0, 2, 2, 0], [0, 0, 2, 2], [0, 1], [0, 0], [0, 0, 2, 2], [0, 0], [0, 0, 2, 2], [0, 2, 2, 0], [2, 2, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [2, 2, 0, 0], [0, 0, 2, 2], [2, 2, 0, 1], [2, 2, 0, 0], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2, 2], [2, 1], [0, 0, 2, 2, 2], [2, 2, 0, 0], [2, 0], [2, 2, 0, 0], [0, 2], [0, 2, 2], [0, 0, 2, 2], [0, 2, 2, 0], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2, 2], [0, 0, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [3, 2, 0, 0], [0, 0], [0, 0, 2, 2], [0, 0, 2, 2, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [1, 3], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 2, 0, 2], [0, 0, 2, 2], [2, 2, 0, 0], [2, 2, 0, 0], [2, 2], [0, 0, 2, 2], [0, 2], [0, 0, 2, 2], [0, 0, 2, 2], [2, 2, 0], [2, 2, 0, 0], [0, 0, 2, 2], [0, 0, 2], [2], [0, 0, 2, 2], [2, 2, 2, 1, 1], [0, 0], [0, 3], [2, 2], [1, 2], [1,3]]
df=pd.DataFrame(data={'category':li})
def check(x):
    """Return the class label for one collection of category codes.

    The rules are evaluated in priority order and the first match wins:

    1. 1 and 3 both present -> 'class1'
    2. 1 present            -> 'class2'
    3. 3 present            -> 'class3'
    4. 0 and 2 both present -> 'class4'
    5. 0 present            -> 'class5'
    6. 2 present            -> 'class6'

    Args:
        x: Container supporting ``in`` (e.g. a list of ints).

    Returns:
        The matching label, or 'no_class' when no rule applies.
    """
    # Each test is only reached when every earlier rule failed, so the
    # "not both 1 and 3" guards are implied and need not be repeated.
    if 1 in x and 3 in x:
        return 'class1'
    if 1 in x:
        return 'class2'
    if 3 in x:
        return 'class3'
    if 0 in x and 2 in x:
        return 'class4'
    if 0 in x:
        return 'class5'
    if 2 in x:
        return 'class6'
    return 'no_class'
# Pass the function itself: wrapping it in a lambda only adds an extra
# Python call per row without changing the result.
df['check'] = df['category'].apply(check)
print(df)
'''
category check
66 [1, 3] class1
92 [1, 3] class1
3 [1, 1, 2, 2] class2
5 [1, 0, 2, 2] class2
17 [1, 1, 2] class2
20 [0, 1] class2
32 [2, 2, 0, 1] class2
41 [2, 1] class2
87 [2, 2, 2, 1, 1] class2
91 [1, 2] class2
59 [3, 2, 0, 0] class3
89 [0, 3] class3
0 [0, 0, 2, 2, 2] class4
'''
Okay, so I ran your problem using a couple of different methods. The fastest of them all was calling apply on the column itself (pandas.Series.apply) rather than on the whole DataFrame row by row. Here's the code:
from __future__ import annotations
import pandas as pd
def class_name(row: list) -> str | None:
    """Classify one list of category codes.

    Rules are checked in priority order; the first one that matches
    determines the result.

    Args:
        row: The category codes of a single DataFrame row.

    Returns:
        "class1" if both 1 and 3 are present, else "class2" if 1 is
        present, else "class3" if 3 is present, else "class4" if both
        0 and 2 are present, else "class5" if 0 is present, else
        "class6" if 2 is present. None when no rule matches.
    """
    if 1 in row and 3 in row:
        return "class1"
    if 1 in row:
        return "class2"
    if 3 in row:
        return "class3"
    if 0 in row and 2 in row:
        return "class4"
    if 0 in row:
        return "class5"
    if 2 in row:
        return "class6"
    return None
# == How to use ============================
df["class_name"] = df["categories"].apply(class_name)
# Result on a DataFrame with 186000 rows:
"""
CPU times: user 80.9 ms, sys: 904 µs, total: 81.8 ms
Wall time: 82 ms
"""
Other Implementations I Tried
I've also tried some other implementations for comparison. Here they are:
from __future__ import annotations
import pandas as pd
# == Code to Generate Sample DataFrame ==============
li = [
[0, 0, 2, 2, 2],
[0, 0, 2, 2],
[0, 0, 2, 2, 2],
[1, 1, 2, 2],
[2, 2, 0, 0],
[1, 0, 2, 2],
[0, 0, 2, 2],
[0, 0, 2, 2, 2],
[0, 0, 2, 2],
[2, 2, 0, 0],
[2, 2, 0, 0],
[0, 0, 2, 2],
[0, 0, 2, 2],
[0, 0, 2, 2],
[0, 0, 2, 2],
[2, 2, 0, 0],
[2, 2],
[1, 1, 2],
[0, 2, 2, 0],
[0, 0, 2, 2],
[0, 1],
[0, 0],
[0, 0, 2, 2],
[0, 0],
[0, 0, 2, 2],
[0, 2, 2, 0],
[2, 2, 2, 2],
[0, 0, 2, 2],
[0, 0, 2, 2],
[0, 0, 2, 2],
[2, 2, 0, 0],
[0, 0, 2, 2],
[2, 2, 0, 1],
[2, 2, 0, 0],
[0, 0, 2, 2],
[0, 0, 2, 2],
[0, 0, 2, 2],
[0, 0, 2, 2],
[0, 0, 2, 2],
[0, 0, 2, 2],
[0, 0, 2, 2, 2],
[2, 1],
[0, 0, 2, 2, 2],
[2, 2, 0, 0],
[2, 0],
[2, 2, 0, 0],
[0, 2],
[0, 2, 2],
[0, 0, 2, 2],
[0, 2, 2, 0],
[0, 0, 2, 2],
[0, 0, 2, 2],
[0, 0, 2, 2],
[0, 0, 2, 2, 2],
[0, 0, 2],
[0, 0, 2, 2],
[0, 0, 2, 2],
[0, 0, 2, 2],
[0, 0, 2, 2],
[3, 2, 0, 0],
[0, 0],
[0, 0, 2, 2],
[0, 0, 2, 2, 2, 2],
[0, 0, 2, 2],
[0, 0, 2, 2],
[0, 0, 2, 2],
[1, 3],
[0, 0, 2, 2],
[0, 0, 2, 2],
[0, 0, 2, 2],
[0, 0, 2, 2],
[0, 0, 2, 2],
[0, 2, 0, 2],
[0, 0, 2, 2],
[2, 2, 0, 0],
[2, 2, 0, 0],
[2, 2],
[0, 0, 2, 2],
[0, 2],
[0, 0, 2, 2],
[0, 0, 2, 2],
[2, 2, 0],
[2, 2, 0, 0],
[0, 0, 2, 2],
[0, 0, 2],
[2],
[0, 0, 2, 2],
[2, 2, 2, 1, 1],
[0, 0],
[0, 3],
[2, 2],
[1, 2],
[1, 3],
]
def make_df(size=1):
    """Build a DataFrame by repeating the sample data.

    Args:
        size: How many times the module-level ``li`` list is repeated.

    Returns:
        A DataFrame with a single 'categories' column containing the
        entries of ``li`` repeated ``size`` times, in order.
    """
    # List repetition yields the same sequence as the nested
    # comprehension it replaces, whose loop index was never used.
    return pd.DataFrame({'categories': li * size})
# == Implementation 1 ============================
def class_name2(row: pd.Series) -> str | None:
    """Classify one DataFrame row by the codes in its "categories" entry.

    The function looks up ``row["categories"]`` itself, so it takes a
    whole row (as produced by a row-wise apply) rather than a bare list.
    Rules are checked in priority order and the first match wins.

    Args:
        row: A row-like object with a "categories" key holding the
            category codes.

    Returns:
        "class1" if both 1 and 3 are present, else "class2" if 1 is
        present, else "class3" if 3 is present, else "class4" if both
        0 and 2 are present, else "class5" if 0 is present, else
        "class6" if 2 is present. None when no rule matches.
    """
    categories = row["categories"]
    if 1 in categories and 3 in categories:
        return "class1"
    if 1 in categories:
        return "class2"
    if 3 in categories:
        return "class3"
    if 0 in categories and 2 in categories:
        return "class4"
    if 0 in categories:
        return "class5"
    if 2 in categories:
        return "class6"
    return None
df = make_df(2000)
df["class_name"] = df.apply(class_name2, axis=1)
# Result:
"""
CPU times: user 1.69 s, sys: 17 ms, total: 1.71 s
Wall time: 1.71 s
"""
# == Implementation 2 ============================
# This is your original implementation
df = make_df(2000)
df.loc[:, "class_name"] = None
# Row-by-row classification: iterate with iterrows and write each label
# back through df.loc.
for index, row in df.iterrows():
    # Only classify rows that do not have a class_name yet.
    if row["class_name"] is None:
        categories = list(row["categories"])
        # Rules are ordered by priority; the elif chain stops at the
        # first match.
        if 1 in categories and 3 in categories:
            df.loc[index, "class_name"] = "class1"
        elif 1 in categories:
            df.loc[index, "class_name"] = "class2"
        elif 3 in categories:
            df.loc[index, "class_name"] = "class3"
        elif 0 in categories and 2 in categories:
            df.loc[index, "class_name"] = "class4"
        elif 0 in categories:
            df.loc[index, "class_name"] = "class5"
        elif 2 in categories:
            df.loc[index, "class_name"] = "class6"
# Result:
"""
CPU times: user 24.2 s, sys: 65.6 ms, total: 24.3 s
Wall time: 24.5 s
"""
# == Implementation 3 ============================
# This is your original implementation without the outer check that class_name is still None
df = make_df(2000)
df.loc[:, "class_name"] = None
# Row-by-row classification: every row is classified unconditionally.
for index, row in df.iterrows():
    categories = list(row["categories"])
    # Rules are ordered by priority; the elif chain stops at the first
    # match.
    if 1 in categories and 3 in categories:
        df.loc[index, "class_name"] = "class1"
    elif 1 in categories:
        df.loc[index, "class_name"] = "class2"
    elif 3 in categories:
        df.loc[index, "class_name"] = "class3"
    elif 0 in categories and 2 in categories:
        df.loc[index, "class_name"] = "class4"
    elif 0 in categories:
        df.loc[index, "class_name"] = "class5"
    elif 2 in categories:
        df.loc[index, "class_name"] = "class6"
# Result:
"""
CPU times: user 24 s, sys: 91.2 ms, total: 24.1 s
Wall time: 24.3 s
"""
# == Implementation 4 ============================
# This is your original implementation without the outer check that class_name
# is still None, and without the list() conversion of the categories
df = make_df(2000)
df.loc[:, "class_name"] = None
# Row-by-row classification: the stored "categories" value is used
# directly, without copying it into a new list.
for index, row in df.iterrows():
    categories = row["categories"]
    # Rules are ordered by priority; the elif chain stops at the first
    # match.
    if 1 in categories and 3 in categories:
        df.loc[index, "class_name"] = "class1"
    elif 1 in categories:
        df.loc[index, "class_name"] = "class2"
    elif 3 in categories:
        df.loc[index, "class_name"] = "class3"
    elif 0 in categories and 2 in categories:
        df.loc[index, "class_name"] = "class4"
    elif 0 in categories:
        df.loc[index, "class_name"] = "class5"
    elif 2 in categories:
        df.loc[index, "class_name"] = "class6"
# Result:
"""
CPU times: user 23.4 s, sys: 80 ms, total: 23.5 s
Wall time: 24.2 s
"""
# == Implementation 5 ============================
# Using `swifter`. Install swifter before trying this one:
# pip install swifter
import swifter
def class_name(row: list) -> str | None:
    """Map one list of category codes to its class label.

    The checks run from highest to lowest priority, so a list that
    satisfies several rules gets the label of the first one.

    Args:
        row: The category codes of a single DataFrame row.

    Returns:
        "class1" (1 and 3 present), "class2" (1 present), "class3"
        (3 present), "class4" (0 and 2 present), "class5" (0 present)
        or "class6" (2 present), whichever matches first. None when
        none of them match.
    """
    if 1 in row and 3 in row:
        return "class1"
    if 1 in row:
        return "class2"
    if 3 in row:
        return "class3"
    if 0 in row and 2 in row:
        return "class4"
    if 0 in row:
        return "class5"
    if 2 in row:
        return "class6"
    return None
df = make_df(2000)
df["class_name"] = df["categories"].swifter.apply(class_name)
# Result:
"""
CPU times: user 572 ms, sys: 11 ms, total: 582 ms
Wall time: 930 ms
"""
Summary
Here's a summary of all the results:
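| Implementation    | Total Time | Times Faster |
|-------------------|------------|--------------|
| Best              | 82 ms      | 300x         |
| Implementation 1  | 1.71 s     | 14.3x        |
| Implementation 2  | 24.5 s     | 1x           |
| Implementation 3  | 24.3 s     | 1.008x       |
| Implementation 4  | 24.2 s     | 1.012x       |
| Implementation 5  | 930 ms     | 26x          |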