I have this pandas dataframe
import pandas as pd
df = pd.DataFrame([{'col1': ['plane', 'chair']}, {'col1': ['computer', 'beach', 'book', 'language']}, {'col1': ['rice', 'bus', 'street']}])
and I have this dictionary
categories = {
'transport': ['car', 'truck', 'plane'],
'reading': ['book', 'library'],
'food': ['rice', 'milk', 'tea']
}
and I want this kind of final output:
index col1 col2
0: ['plane', 'chair'], transport-plane
1: ['computer', 'beach', 'book', 'language'], reading-book
2: ['rice', 'bus', 'street'], food-rice
I want col2 to contain both the key and the matching value from the dictionary, joined as key-value.
So far I have only managed to add the key, not the key together with the value.
Try:
import numpy as np

# Invert the mapping once: every word points to the category that lists it.
tmp = {vv: k for k, v in categories.items() for vv in v}

# One row per list element; the grouping below relies on the exploded rows
# keeping the index of the row they came from.
x = df.explode("col1")
x["col2"] = x["col1"].apply(
    lambda word: "{}-{}".format(tmp[word], word) if word in tmp else np.nan
)
# Rebuild the original lists and join the labels of the words that matched.
x = x.groupby(level=0).agg(
    col1=("col1", list),
    col2=("col2", lambda labels: ", ".join(labels[labels.notna()])),
)
print(x)
Prints:
col1 col2
0 [plane, chair] transport-plane
1 [computer, beach, book, language] reading-book
2 [rice, bus, street] food-rice
How about:
import pandas as pd
df = pd.DataFrame([{'col1': ['plane', 'chair']}, {'col1': ['computer', 'beach', 'book', 'language']}, {'col1': ['rice', 'bus', 'street']}])
categories = {
'transport': ['car', 'truck', 'plane'],
'reading': ['book', 'library'],
'food': ['rice', 'milk', 'tea']
}
def match_pairs(categories, df):
    """Add a ``col2`` column of ``"<category>-<word>"`` labels to ``df``.

    For every row, the list in ``df['col1']`` is scanned in order and the
    first word that appears in any category is used for the label. Rows
    without a known word get ``None``.

    Args:
        categories: dict mapping a category name to a list of words.
        df: DataFrame whose ``col1`` column holds lists of words.

    Returns:
        The same DataFrame, modified in place, with ``col2`` added.
    """
    # Invert the mapping once so each word lookup is a single dict access.
    # setdefault keeps the first category when a word is listed twice.
    word_to_category = {}
    for category, words in categories.items():
        for word in words:
            word_to_category.setdefault(word, category)

    col2 = []
    for entries in df['col1']:
        label = None
        for entry in entries:
            if entry in word_to_category:
                label = word_to_category[entry] + '-' + entry
                break
        # Always append, so col2 has exactly one value per row.
        col2.append(label)

    df['col2'] = col2
    return df
print (match_pairs(categories, df))
Related
I have a list in a pandas dataframe:
0: [car, telephone]
1: [computer, beach, book, language]
2: [rice, bus, street]
Each row holds one list, and the lists have different lengths from row to row.
and I have a dictionary:
# Maps a keyword to its category.
# NOTE: the name shadows the built-in dict type; it is kept because the
# surrounding text refers to it.
dict = {'car': 'transport',
        'rice': 'food',
        'book': 'reading'
        }
After that I have flattened the dict
d = {val:key for key, lst in dict.items() for val in lst}
I would like to iterate over the all items in the list and create a column of this kind,
this is the desired output:
index col1 col2
0: [car, telephone],transport
1: [computer, beach, book, language], reading
2: [rice, bus, street], food
I have tried:
df['col2'] = data_df['col1'].index.map(d)
but I get
col2
NaN
NaN
NaN
You can .explode then use the dictionary for translation and then group again:
Sample data:
import pandas as pd
data = {'id': {0: 1, 1: 2, 2: 3}, 'col': {0: ['car', 'telephone'], 1: ['computer', 'beach', 'book', 'language'], 2: ['rice', 'bus', 'street']}}
df = pd.DataFrame(data)
dct = {'car': 'transport', 'rice':'food', 'book':'reading'}
Code:
df2 = df.explode('col')
df2['col2'] = df2['col'].replace(dct)
df['col2'] = df2[~df2['col'].eq(df2['col2'])]['col2']
Output:
id col col2
0 1 [car, telephone] transport
1 2 [computer, beach, book, language] reading
2 3 [rice, bus, street] food
You can use apply on a custom function:
import pandas as pd
df = pd.DataFrame([{'col1': ['car', 'telephone']}, {'col1': ['computer', 'beach', 'book', 'language']}, {'col1': ['rice', 'bus', 'street']}])
def get_col2(lst):
    """Return the category of the first known keyword contained in ``lst``.

    Keywords are tried in the order they are declared in the lookup table,
    not in the order they appear in ``lst``. ``None`` is returned when no
    keyword is present.
    """
    d = {'car': 'transport', 'rice': 'food', 'book': 'reading'}
    matches = (category for keyword, category in d.items() if keyword in lst)
    return next(matches, None)
df['col2'] = df['col1'].apply(get_col2)
Output:
col1
col2
0
['car', 'telephone']
transport
1
['computer', 'beach', 'book', 'language']
reading
2
['rice', 'bus', 'street']
food
I have a data frame like the one below. The names fall into 5 groups, linked by the common value in column A.
I want to group the names. I tried:
import pandas as pd
data = {'A': ["James","James","James","Edward","Edward","Thomas","Thomas","Jason","Jason","Jason","Brian","Brian"],
'B' : ["John","Michael","William","David","Joseph","Christopher","Daniel","George","Kenneth","Steven","Ronald","Anthony"]}
df = pd.DataFrame(data)
df_1 = df.groupby('A')['B'].apply(list)
df_1 = df_1.to_frame().reset_index()
for index, row in df_1.iterrows():
print (row['A'], row['B'])
the outputs are:
('Brian', ['Ronald', 'Anthony'])
('Edward', ['David', 'Joseph'])
('James', ['John', 'Michael', 'William'])
('Jason', ['George', 'Kenneth', 'Steven'])
('Thomas', ['Christopher', 'Daniel'])
but I want one list for each group (it would be even better if there's an automatic way to assign a variable to each list), like:
['Brian', 'Ronald', 'Anthony']
['Edward', 'David', 'Joseph']
['James', 'John', 'Michael', 'William']
['Jason', 'George', 'Kenneth', 'Steven']
['Thomas', 'Christopher', 'Daniel']
I tried row['B'].append(row['A']) but it returns None.
What's the right way to group them? thank you.
You can add values of A grouping column in GroupBy.apply with .name attribute:
s = df.groupby('A')['B'].apply(lambda x: [x.name] + list(x))
print (s)
A
Brian [Brian, Ronald, Anthony]
Edward [Edward, David, Joseph]
James [James, John, Michael, William]
Jason [Jason, George, Kenneth, Steven]
Thomas [Thomas, Christopher, Daniel]
Name: B, dtype: object
You can try this. Use pd.Series.tolist()
for k,g in df.groupby('A')['B']:
print([k]+g.tolist())
['Brian', 'Ronald', 'Anthony']
['Edward', 'David', 'Joseph']
['James', 'John', 'Michael', 'William']
['Jason', 'George', 'Kenneth', 'Steven']
['Thomas', 'Christopher', 'Daniel']
The reason you got None as output is that list.append returns None; it mutates the list in place.
try the following:
import pandas as pd

data = {'A': ["James","James","James","Edward","Edward","Thomas","Thomas","Jason","Jason","Jason","Brian","Brian"],
        'B' : ["John","Michael","William","David","Joseph","Christopher","Daniel","George","Kenneth","Steven","Ronald","Anthony"]}
df = pd.DataFrame(data)

# Collect the B values of every A group into one list per group.
df_1 = df.groupby('A')['B'].apply(list)
df_1 = df_1.to_frame().reset_index()

# Column A holds a plain string, so wrap it in a one-element list before
# concatenating it with the list stored in column B.
groups = [[row['A']] + row['B'] for _, row in df_1.iterrows()]
for group in groups:
    print(group)
output:
['Brian', 'Ronald', 'Anthony']
['Edward', 'David', 'Joseph']
['James', 'John', 'Michael', 'William']
['Jason', 'George', 'Kenneth', 'Steven']
['Thomas', 'Christopher', 'Daniel']
Iterate over the rows in pandas and get the list of objects from rows
import pandas as pd;
df=pd.read_json("inputfile.txt")
data
0 {'M': {'1': 'data', '2': 'data2'}}
1 {'M': {'3': '555', '5': '3333'}}
data=[]
for row in df.iterrows():
d = [{k1+k2:v2 for k1,v1 in x.items() for k2,v2 in v1.items()} for x in row]
data.append(d)
print (data)
getting output like this
[[{'M1': 'data', 'M2': 'data2'}], [{'M3': '555', 'M5': '3333'}]]
need the output like this
[{'M1': 'data', 'M2': 'data2'}, {'M3': '555', 'M5': '3333'}]
.extend() - extends the list by adding all items of a list (passed as an argument) to the end.
Ex.
import pandas as pd

data = {'data':[{'M': {'1': 'data', '2': 'data2'}},{'M': {'3': '555', '5': '3333'}}]}
df = pd.DataFrame(data)
print(df)

result = []
for _, series in df.iterrows():
    # Each cell is a dict of dicts; flatten it by gluing the outer key to
    # every inner key.
    x = [
        {
            "{0}{1}".format(outer, inner): value
            for outer, nested in cell.items()
            for inner, value in nested.items()
        }
        for cell in series
    ]
    # extend (not append) keeps the result a flat list of dicts.
    result.extend(x)
print(result)
Or single-line list comprehension
x = [{"{0}{1}".format(k,k1) : v1 for k,v in x[1].items() for k1,v1 in v.items()} for row in df.iterrows()
for x in row[1].items() ]
print(x)
O/P:
data
0 {'M': {'1': 'data', '2': 'data2'}}
1 {'M': {'3': '555', '5': '3333'}}
[{'M1': 'data', 'M2': 'data2'}, {'M3': '555', 'M5': '3333'}]
By doing:
d = [{k1+k2:v2 for k1,v1 in x.items() for k2,v2 in v1.items()} for x in row]
You are creating a list. And appending it to data.
Modified code:
import pandas as pd
df = pd.DataFrame({'data': [{'M': {'1': 'data', '2': 'data2'}}, {'M': {'3': '555', '5': '3333'}}]})
data = []
for row in df.iterrows():
d = {k1+k2:v2 for x in row[1] for k1,v1 in x.items() for k2,v2 in v1.items()}
data.append(d)
print(data)
I have my dictionary as
{'id': '6576_926_1',
'name': 'xyz',
'm': 926,
0: {'id': '2896_926_2',
'name': 'lmn',
'm': 926},
1: {'id': '23_926_3',
'name': 'abc',
'm': 928}}
And I want to convert it into dataframe like
Id Name M
6576_926_1 Xyz 926
2896_926_2 Lmn 926
23_926_3 Abc 928
I am fine even if the first row is dropped, since it doesn't have an index. There are around 1.3 million records, so speed is very important. I tried using a for loop with append statements, and it takes forever.
As you mentioned, the first row is not mandatory for you, so here is what I tried. I hope this solves your problem.
import pandas as pd

data = {
    0: {'id': '2896_926_2', 'name': 'lmn', 'm': 926},
    1: {'id': '23_926_3', 'name': 'abc', 'm': 928}
}

# Every value is already a complete row dict, so no explicit loop is needed.
lis = list(data.values())
d = pd.DataFrame(lis)
# On Python 3.7+ the columns follow dict insertion order: id, name, m.
print(d)
Output--
id m name
0 2896_926_2 926 lmn
1 23_926_3 928 abc
And if you want to id as your index then add set_index
# Build the frame straight from the row dicts instead of appending to a
# list that may already hold them, then use the id column as the index.
d = pd.DataFrame(list(data.values()))
d = d.set_index('id')
print(d)
Output-
m name
id
2896_926_2 926 lmn
23_926_3 928 abc
You can use a loop to convert each dictionary's entries into a list, and then use pandas' .from_dict to convert them to a dataframe. Here's the example given in the documentation:
>>> data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']}
>>> pd.DataFrame.from_dict(data)
col_1 col_2
0 3 a
1 2 b
2 1 c
3 0 d
Use the following approach
import pandas as pd

# `dict` is the asker's nested dictionary (the name shadows the builtin).
# The nested records become columns 0 and 1, indexed by id/name/m, and the
# top-level scalars become the columns 'id', 'name' and 'm'.
data = pd.DataFrame(dict)
# Drop the scalar columns, keep the records, and flip them into rows.
data = data.drop(columns=['id', 'name', 'm'])
data = data.T
You can also try this
import pandas as pd
del dict['id']
del dict['name']
del dict['m']
pd.DataFrame(dict)
Try this code!! Still, complexity is O(n)
my_dict.pop('id')
my_dict.pop('name')
my_dict.pop('m')
data = [ row.values() for row in my_dict.values()]
pd.DataFrame(data=data, columns=['id','name','m'])
import pandas as pd

data = {
    'id': '6576_926_1',
    'name': 'xyz',
    'm': 926,
    0: {'id': '2896_926_2', 'name': 'lmn', 'm': 926},
    1: {'id': '23_926_3', 'name': 'abc', 'm': 928},
}

# Only the nested dict values are records; the top-level scalars are skipped.
records = [val for val in data.values() if type(val) is dict]

Id = [record['id'] for record in records]
Name = [record['name'] for record in records]
M = [record['m'] for record in records]

df = pd.DataFrame({'Name': Name, 'Id': Id, 'M': M})
print(df)
mydict = {'id': '6576_926_1',
'name': 'xyz',
'm': 926,
0: {'id': '2896_926_2',
'name': 'lmn',
'm': 926},
1: {'id': '23_926_3',
'name': 'abc',
'm': 928}}
import pandas as pd
del mydict['id']
del mydict['name']
del mydict['m']
d = pd.DataFrame(mydict).T
For each row, I want to count the total number of occurrences of the keywords from my list in a given dataframe column.
d = {
'Column_1': ['mango pret Orange No manner', ' préts No scan '],
'Column_2': ['read priority No', 'This is a priority noir '],
'Column_3': ['No add', 'yep']
}
df = pd.DataFrame(data=d)
list_1 = ['Apple', 'Mango' ,'Orange', 'pr[éeêè]t[s]?']
list_2 = ['weather', 'r[ea]d' ,'p[wr]iority', 'noir?']
list_3 = ['n[eéè]d','snow[s]?', 'blanc?']
dict = {
"s1": ['Column_1', list_1],
"s2": ['Column_1', list_3],
"s3": ['Column_2', list_2],
"s4": ['Column_3', list_3],
"s5": ['Column_2','Column_3',list_1]
}
for elt in list(dict.keys()):
#s1 s2 s3 print(elt)
if len(dict[elt])<=2:
d = Counter(re.findall(r'|'.join(dict[elt][1]).lower(), str(df[dict[elt][0]].str.lower())))
print(d)
#df[elt] = d
sum(d.values())
elif len(dict[elt])>2:
aa = Counter(re.findall(r'|'.join(dict[elt][2]).lower(), str(df[dict[elt][0]].str.lower())))
bb = Counter(re.findall(r'|'.join(dict[elt][2]).lower(), str(df[dict[elt][1]].str.lower())))
b = sum(bb.values())
a = sum(aa.values())
d = a +b
df[elt] = d
the result of my print(d) is below
Counter({'mango': 1, 'pret': 1, 'orange': 1, 'préts': 1})
How can I change this code to produce something like the dataframe df2 below?
d2 = {'s1': [3, 1], 's3':[2,1]}
df2 = pd.DataFrame(data=d2)
import re

import pandas as pd

d = {
    'Column_1': [u'mango pret Orange No manner', u' préts No scan '],
    'Column_2': [u'read priority No', u'This is a priority noir '],
    'Column_3': [u'No add', u'yep']
}
df = pd.DataFrame(data=d)

list_1 = [u'Apple', u'Mango', u'Orange', u'pr[éeêè]t[s]?']
list_2 = [u'weather', u'r[ea]d', u'p[wr]iority', u'noir?']
list_3 = [u'n[eéè]d', u'snow[s]?', u'blanc?']

# Each entry lists one or more column names followed by a list of regexes.
my_dict = {
    "s1": ['Column_1', list_1],
    "s2": ['Column_1', list_3],
    "s3": ['Column_2', list_2],
    "s4": ['Column_3', list_3],
    "s5": ['Column_2', 'Column_3', list_1]
}

d2 = {}
for key, spec in my_dict.items():
    # Separate the column names (strings) from the regexes (stored in lists).
    # Real lists are required: lazy filter objects cannot index a DataFrame.
    col_names = [item for item in spec if isinstance(item, str)]
    patterns = [pattern for item in spec if isinstance(item, list) for pattern in item]

    # Compile once per entry. IGNORECASE replaces lowercasing the pattern
    # text, which would corrupt escapes such as \S or \W.
    regex = re.compile('|'.join(patterns), re.IGNORECASE)

    # Count the matches in every cell of the selected columns ...
    counts = df[col_names].apply(
        lambda col: col.map(lambda cell: len(regex.findall(str(cell))))
    )
    # ... then add them up across the columns to get one total per row.
    d2[key] = counts.sum(axis=1).tolist()

df2 = pd.DataFrame(data=d2)
Output :
s1 s2 s3 s4 s5
0 3 0 1 0 0
1 1 0 2 0 0
Note that s3 gives [1, 2] and not [2, 1] because r[ea]d doesn't catch read and noir? catches noir.