Count keywords in each line of dataframe - python

I want to count, for each line, the total number of occurrences of the keywords from my list in a given dataframe column.
import re
from collections import Counter

import pandas as pd

d = {
    'Column_1': ['mango pret Orange No manner', ' préts No scan '],
    'Column_2': ['read priority No', 'This is a priority noir '],
    'Column_3': ['No add', 'yep']
}
df = pd.DataFrame(data=d)
list_1 = ['Apple', 'Mango', 'Orange', 'pr[éeêè]t[s]?']
list_2 = ['weather', 'r[ea]d', 'p[wr]iority', 'noir?']
list_3 = ['n[eéè]d', 'snow[s]?', 'blanc?']
# Renamed from `dict`, which shadowed the builtin type.
# Each entry: one or two column names followed by a list of regex keywords.
my_dict = {
    "s1": ['Column_1', list_1],
    "s2": ['Column_1', list_3],
    "s3": ['Column_2', list_2],
    "s4": ['Column_3', list_3],
    "s5": ['Column_2', 'Column_3', list_1]
}
for elt in my_dict:
    spec = my_dict[elt]
    if len(spec) <= 2:
        # NOTE(review): str(Series) matches against the *printed repr* of the
        # whole column (index included), not cell by cell -- that is why the
        # question gets a single Counter instead of one count per line.
        d = Counter(re.findall('|'.join(spec[1]).lower(),
                               str(df[spec[0]].str.lower())))
        print(d)
        sum(d.values())  # NOTE(review): this result is discarded
    else:
        # Two columns share one regex list; each column is counted the same way.
        aa = Counter(re.findall('|'.join(spec[2]).lower(),
                                str(df[spec[0]].str.lower())))
        bb = Counter(re.findall('|'.join(spec[2]).lower(),
                                str(df[spec[1]].str.lower())))
        a = sum(aa.values())
        b = sum(bb.values())
        # A single scalar total is broadcast to every row of the new column.
        df[elt] = a + b
the result of my print(d) is below
Counter({'mango': 1, 'pret': 1, 'orange': 1, 'préts': 1})
How can I change this code so that it gives something like the dataframe df2 below?
d2 = {'s1': [3, 1], 's3':[2,1]}
df2 = pd.DataFrame(data=d2)

import pandas as pd
import re
import pandas as pd
import re
from functools import reduce  # BUG FIX: reduce is not a builtin in Python 3

d = {
    'Column_1': [u'mango pret Orange No manner', u' préts No scan '],
    'Column_2': [u'read priority No', u'This is a priority noir '],
    'Column_3': [u'No add', u'yep']
}
df = pd.DataFrame(data=d)
list_1 = [u'Apple', u'Mango', u'Orange', u'pr[éeêè]t[s]?']
list_2 = [u'weather', u'r[ea]d', u'p[wr]iority', u'noir?']
list_3 = [u'n[eéè]d', u'snow[s]?', u'blanc?']
my_dict = {
    "s1": ['Column_1', list_1],
    "s2": ['Column_1', list_3],
    "s3": ['Column_2', list_2],
    "s4": ['Column_3', list_3],
    "s5": ['Column_2', 'Column_3', list_1]
}
d2 = dict()
for key, lst in my_dict.items():
    # Distinguish between columns and regex (regex are stored in lists).
    # BUG FIX: materialise as lists -- in Python 3, filter() returns a lazy
    # iterator, which df[...] column selection cannot consume.
    col_names = [x for x in lst if isinstance(x, str)]
    regex_lists = [x for x in lst if isinstance(x, list)]
    # Concatenate all regex lists into a single list of patterns.
    regex_list = reduce(lambda x, y: x + y, regex_lists)
    # Build the alternation pattern once (hoisted out of the per-cell lambda).
    pattern = '|'.join(regex_list).lower()
    map_function = lambda cell: len(re.findall(pattern, str(cell).lower()))
    # Count the matches inside every individual cell of the chosen columns.
    df_regex_count = df[col_names].applymap(map_function)
    # Row-wise totals. BUG FIX: list(...) because map() is lazy in Python 3
    # and a map object cannot be used as a DataFrame column.
    d2[key] = list(map(sum, df_regex_count.values.tolist()))
df2 = pd.DataFrame(data=d2)
Output :
s1 s2 s3 s4 s5
0 3 0 1 0 0
1 1 0 2 0 0
Note that s3 gives [1, 2] and not [2, 1] because r[ea]d doesn't catch read and noir? catches noir.

Related

Transform Pandas column to get a key value pair in a column post group by

My DataFrame:
Col X Col Y ID Value
A a 'r' 3
A a 'b' 2
A a 'c' 1
B b 'd' 5
B b 's' 6
B b 'd' 7
Output required:
Col X Col Y Out
A a {'r':3, 'b':2, 'c':1}
B b {'d': 5, 's': 6, 'd':7}
Approach tried so far:
# Attempt: stack Value under a 3-level index, then build one dict per pair.
df = df.set_index(['Col X', 'Col Y', 'ID']).Value
# NOTE(review): broken as written -- df.index yields 3-tuples, but xs((k, v))
# selects on only two levels, and the same (k, v) slice is recomputed for
# every ID in the group.
dict_column = {k: df.xs((k, v)).to_dict() for k,v,v2 in df.index}
Use GroupBy.apply with lambda function:
# Remove the literal quote characters around the IDs (e.g. "'r'" -> "r").
df['ID'] = df['ID'].str.strip("'")
# Collapse each (Col X, Col Y) group's (ID, Value) rows into one dict.
# Duplicate IDs keep only the last value -- plain dict() semantics.
df1 = (df.groupby(['Col X', 'Col Y'])[['ID','Value']]
.apply(lambda x: dict(x.to_numpy()))
.reset_index(name='Out'))
print (df1)
Col X Col Y Out
0 A a {'r': 3, 'b': 2, 'c': 1}
1 B b {'d': 7, 's': 6}
Duplicated keys not exist in python dictionary. You can aggregate values, e.g. by sum:
# Remove the literal quote characters around the IDs (e.g. "'d'" -> "d").
df['ID'] = df['ID'].str.strip("'")
# Sum the values of duplicated (Col X, Col Y, ID) rows first, so the later
# dict construction no longer silently drops duplicate keys.
df = df.groupby(['Col X', 'Col Y','ID'], as_index=False)['Value'].sum()
print (df)
Col X Col Y ID Value
0 A a b 2
1 A a c 1
2 A a r 3
3 B b d 12
4 B b s 6
# Now that IDs are unique per group, collapse each group into a dict.
df1 = (df.groupby(['Col X', 'Col Y'])[['ID','Value']]
.apply(lambda x: dict(x.to_numpy()))
.reset_index(name='Out'))
print (df1)
Col X Col Y Out
0 A a {'b': 2, 'c': 1, 'r': 3}
1 B b {'d': 12, 's': 6}
You can create pd.Series inside apply and use to_dict:
# Build a Series per group -- values indexed by ID -- then serialise it
# to a dict; duplicate IDs keep the last value.
grouped = df.groupby(['Col X', 'Col Y'])[['ID', 'Value']]
output = grouped.apply(
    lambda g: pd.Series(g['Value'].values, index=g['ID']).to_dict()
)
You can use groupby.apply with dict and zip:
# For each (Col X, Col Y) group, pair IDs with Values into one dict;
# with duplicate IDs, the last pair wins (plain dict() semantics).
make_mapping = lambda grp: dict(zip(grp['ID'], grp['Value']))
(
    df.groupby(['Col X', 'Col Y'])
      .apply(make_mapping)
      .reset_index(name='Out')
)
Output:
Col X Col Y Out
0 A a {'r': 3, 'b': 2, 'c': 1}
1 B b {'d': 7, 's': 6}
If you want to aggregate the duplicated keys:
# Same shape as above, but duplicate IDs inside a group are summed
# before the group is serialised to a dict.
(
    df.groupby(['Col X', 'Col Y'])
      .apply(lambda grp: grp['Value'].groupby(grp['ID']).sum().to_dict())
      .reset_index(name='Out')
)
Output:
Col X Col Y Out
0 A a {'b': 2, 'c': 1, 'r': 3}
1 B b {'d': 12, 's': 6}

I want to grab the values from a nested dictionary and turn them into a fresh dictionary; later I want to convert that into a DataFrame. How can I do that?

Current data
Some_dict = {0: {},
1: {'A': ['apple']},
2: {},
3: {'B': ['orange', 'papaya']},
4: {'C': ['mango', 'berries', 'grape']}}
I want
my_dict = { {},
{'A': ['apple']},
{{},
{'B': ['orange','papaya']},
{'C': ['mango', 'berries', 'grape']}}
Then I want to make my_dict as a dataframe:
name
fruits
{}
A
apple
{}
B
orange
B
papaya
C
mango
C
berries
C
grape
How can I do that in pandas and in python?
Thanks
you need the list of values
# Collect just the inner dictionaries (the values of some_dict).
new_dictionary_list = list(some_dict.values())
now,
you can make a data frame using
# NOTE(review): this assumes every inner dict has at least one entry --
# list(x.items())[0] raises IndexError for the empty {} values present in
# the question's Some_dict; verify against the real data.
my_dataframe = pd.DataFrame({'name':[list(x.items())[0][0] for x in new_dictionary_list], 'fruits':[list(x.items())[0][1] for x in new_dictionary_list]})
You already got an answer on how to get to my_dict, for creating a DataFrame of your original data you could do the following to create a pair of data for each row:
# Data from the question (this answer refers to it as lowercase some_dict).
some_dict = {0: {},
             1: {'A': ['apple']},
             2: {},
             3: {'B': ['orange', 'papaya']},
             4: {'C': ['mango', 'berries', 'grape']}}

# Build one (name, fruit) pair per fruit; empty entries become a blank row.
tmp = []
for key, value in some_dict.items():
    if value:
        # {'A': ['x', 'y']} -> ('A', 'x'), ('A', 'y')
        tmp.extend([(k, v) for k, vals in value.items() for v in vals])
    else:
        tmp.append(('', ''))
print(tmp)
# BUG FIX: the list built above is `tmp`; `res` was an undefined name.
out = pd.DataFrame(tmp, columns=['name', 'fruits'])
print(out)
print(out)
Output:
# tmp
[('', ''), ('A', 'apple'), ('', ''), ('B', 'orange'), ('B', 'papaya'), ('C', 'mango'), ('C', 'berries'), ('C', 'grape')]
# out
name fruits
0
1 A apple
2
3 B orange
4 B papaya
5 C mango
6 C berries
7 C grape
Is this what you want ?
# Data from the question.
Some_dict = {0: {},
             1: {'A': ['apple']},
             2: {},
             3: {'B': ['orange', 'papaya']},
             4: {'C': ['mango', 'berries', 'grape']}}

my_fruit_list = []
# BUG FIX: iterate the question's Some_dict, not the undefined name `d`.
for v in Some_dict.values():
    if not v:
        # Empty entry: keep a placeholder pair.
        my_fruit_list.append([{}, ""])
    else:
        # Each non-empty inner dict holds a single key -> list-of-fruits pair.
        key = list(v.keys())[0]
        my_fruit_list.append([key, v[key]])
print(my_fruit_list)

Find all values that one column's value has and collect as JSON

class_id
class
code
id
8
XYZ
A
1
8
XYZ
B
2
9
ABC
C
3
I have a dataframe like above. I want to transform it so the 'codes' column below collects all the unique (code, id) pairs into a JSON format that a class contains.
class_id
class
codes
8
XYZ
[{'code: 'A', 'id': 1}, {'code': 'B', 'id': 2}]
9
ABC
[{'code: 'C', 'id': 3}]
This approach should work for you:
import pandas as pd

records = {"class": ["XYZ", "XYZ", "ABC"], "code": ["A", "B", "C"], "id": [1, 2, 3]}
df = pd.DataFrame(records)
# One row per class: pair each class label with its (code, id) rows,
# dropping the grouping column before serialising the group to records.
pd.DataFrame(
    [
        (label, group.drop(columns=["class"]).to_dict(orient="records"))
        for label, group in df.groupby("class")
    ],
    columns=["class", "codes"],
)
Output:

Add a column with key and value from a dictionary

I have this pandas dataframe
import pandas as pd
df = pd.DataFrame([{'col1': ['plane', 'chair']}, {'col1': ['computer', 'beach', 'book', 'language']}, {'col1': ['rice', 'bus', 'street']}])
and I have this dictionary
categories = {
'transport': ['car', 'truck', 'plane'],
'reading': ['book', 'library'],
'food': ['rice', 'milk', 'tea']
}
and I want this kind of final output:
index col1 col2
0: ['plane', 'chair'], transport-plane
1: ['computer', 'beach', 'book', 'language'], reading-book
2: ['rice', 'bus', 'street'], food-rice
I want that col2 have the key and value from the dictionary.
I have only added the key from the dictionary but not the key and value from the dictionary.
Try:
import numpy as np  # BUG FIX: np.nan is used below but numpy was never imported

# Invert the categories mapping: each word points at its category name.
tmp = {vv: k for k, v in categories.items() for vv in v}
# One scalar word per row so each element can be looked up individually.
x = df.explode("col1")
# BUG FIX (readability): the lambda parameters no longer shadow the frame `x`.
x["col2"] = x["col1"].apply(
    lambda w: "{}-{}".format(tmp[w], w) if w in tmp else np.nan
)
# Re-group to the original rows: col1 back into a list, matched labels joined.
x = x.groupby(level=0).agg(
    col1=("col1", list), col2=("col2", lambda s: ", ".join(s[s.notna()]))
)
print(x)
Prints:
col1 col2
0 [plane, chair] transport-plane
1 [computer, beach, book, language] reading-book
2 [rice, bus, street] food-rice
How about:
import pandas as pd

df = pd.DataFrame([{'col1': ['plane', 'chair']}, {'col1': ['computer', 'beach', 'book', 'language']}, {'col1': ['rice', 'bus', 'street']}])
categories = {
    'transport': ['car', 'truck', 'plane'],
    'reading': ['book', 'library'],
    'food': ['rice', 'milk', 'tea']
}

def match_pairs(categories, df):
    """Add a 'col2' column labelling each row's first categorised word.

    For every row of df['col1'], take the first word that appears in any
    category word list and record it as '<category>-<word>'; rows with no
    match get an empty string.

    BUG FIX: the original paired the i-th category with the i-th row, so the
    result depended on dict order lining up with row positions, and a row
    without a match made col2 shorter than df (ValueError on assignment).

    Returns df, mutated in place.
    """
    # Invert the mapping once: word -> 'category-word' label.
    labels = {word: f'{cat}-{word}'
              for cat, words in categories.items()
              for word in words}
    df['col2'] = [
        next((labels[w] for w in words if w in labels), '')
        for words in df['col1']
    ]
    return df

print(match_pairs(categories, df))

Iterate over a column list pandas with a dictionary

I have a list in a pandas dataframe:
0: [car, telephone]
1: [computer, beach, book, language]
2: [rice, bus, street]
Each row contains one such list. Also, the list has a different length in every row.
and I have a dictionary:
# NOTE(review): as pasted, this literal is missing the commas after the
# 'rice' and 'book' entries (the adjacent string literals make it a syntax
# error), and the name `dict` shadows the builtin type.
dict = {'car': 'transport',
'rice':'food'
'book':'reading'
}
After that I have flattened the dict
# NOTE(review): the dict values here are plain strings, so `for val in lst`
# iterates individual characters ({'t': 'car', 'r': 'car', ...}) -- presumably
# the values were meant to be lists; verify against the intended mapping.
d = {val:key for key, lst in dict.items() for val in lst}
I would like to iterate over the all items in the list and create a column of this kind,
this is the desired output:
index col1 col2
0: [car, telephone],transport
1: [computer, beach, book, language], reading
2: [rice, bus, street], food
I have tried:
df['col2'] = data_df['col1'].index.map(d)
but I get
col2
NaN
NaN
NaN
You can .explode then use the dictionary for translation and then group again:
Sample data:
import pandas as pd

# Sample rows: an id plus a variable-length word list per row.
data = {
    'id': {0: 1, 1: 2, 2: 3},
    'col': {0: ['car', 'telephone'], 1: ['computer', 'beach', 'book', 'language'], 2: ['rice', 'bus', 'street']},
}
df = pd.DataFrame(data)
# Word -> category translation table.
dct = {'car': 'transport', 'rice':'food', 'book':'reading'}
Code:
# One scalar word per row; the exploded frame keeps the original row index.
df2 = df.explode('col')
# Translate known words; anything not in dct passes through unchanged.
df2['col2'] = df2['col'].replace(dct)
# A word "matched" exactly when translation altered it; assigning the
# surviving rows back aligns on the duplicated original index.
df['col2'] = df2[df2['col'].ne(df2['col2'])]['col2']
Output:
id col col2
0 1 [car, telephone] transport
1 2 [computer, beach, book, language] reading
2 3 [rice, bus, street] food
You can use apply on a custom function:
import pandas as pd

df = pd.DataFrame([{'col1': ['car', 'telephone']}, {'col1': ['computer', 'beach', 'book', 'language']}, {'col1': ['rice', 'bus', 'street']}])

def get_col2(lst):
    """Return the category of the first keyword found in lst, else None."""
    keyword_to_category = {'car': 'transport', 'rice': 'food', 'book': 'reading'}
    return next(
        (category for keyword, category in keyword_to_category.items() if keyword in lst),
        None,
    )

df['col2'] = df['col1'].apply(get_col2)
Output:
col1
col2
0
['car', 'telephone']
transport
1
['computer', 'beach', 'book', 'language']
reading
2
['rice', 'bus', 'street']
food

Categories

Resources