I have a dataframe as follows:
dashboard = pd.DataFrame({
'id':[1,2,3,4],
'category': ['a', 'b', 'a', 'c'],
'price': [123, 151, 21, 24],
'description': ['IT related', 'IT related', 'Marketing','']
})
I need to add a row that shows both the sum and the count for only some of the categories, as follows:
pd.DataFrame({
'id': [3],
'category': ['a&b'],
'price': [295],
'description': ['']
})
An option using .agg:
# Sample data: one row per item, with its category and price.
dashboard = pd.DataFrame({
    'id': [1, 2, 3, 4],
    'category': ['a', 'b', 'a', 'c'],
    'price': [123, 151, 21, 24],
    'description': ['IT related', 'IT related', 'Marketing', ''],
})

# Keep only the rows of the categories being summarised.
in_selection = dashboard['category'].isin(['a', 'b'])

# Count the ids and total the prices of the selected rows. The aggregations
# are named by string: passing the builtin ``sum`` is deprecated in recent
# pandas releases, which ask for 'sum' instead.
a_b = dashboard[in_selection].agg({'id': 'count', 'price': 'sum'})

# One-column frame labelled 'a&b', indexed by 'id' and 'price'.
df = pd.DataFrame({'a&b': a_b})
yields
a&b
id 3
price 295
which you could then .transpose() and merge into your existing dataframe if desired, or compile a separate dataframe of summary results, etc.
I pre-calculate the price sum for each category; then, for each pair, I add the two sums, join the two category names, and append the new row.
try this:
import pandas as pd
# Sample data: one row per item, with its category and price.
dashboard = pd.DataFrame({
    'id': [1, 2, 3, 4],
    'category': ['a', 'b', 'a', 'c'],
    'price': [123, 151, 21, 24],
    'description': ['IT related', 'IT related', 'Marketing', ''],
})

# Each pair of categories gets one combined summary row.
pairs = [('a', 'b')]

# Price total per category, computed once and reused for every pair.
groups = dashboard.groupby('category')['price'].sum()

summary_rows = []
for c1, c2 in pairs:
    members = [c1, c2]
    summary_rows.append({
        # The 'id' column of a summary row holds the number of member rows.
        'id': int(dashboard['category'].isin(members).sum()),
        'category': '{}&{}'.format(c1, c2),
        # reindex(..., fill_value=0) lets a category that is absent from the
        # data contribute 0 instead of raising KeyError.
        'price': groups.reindex(members, fill_value=0).sum(),
        'description': '',
    })

# DataFrame.append was removed in pandas 2.0; concatenate all summary rows in
# one call. ignore_index gives the new rows fresh labels instead of reusing 0.
if summary_rows:
    dashboard = pd.concat(
        [dashboard, pd.DataFrame(summary_rows)],
        ignore_index=True,
    )

print(dashboard)
Try this:
Code
# Sample data: one row per item, with its category and price.
dashboard = pd.DataFrame({
    'id': [1, 2, 3, 4],
    'category': ['a', 'b', 'a', 'c'],
    'price': [123, 151, 21, 24],
    'description': ['IT related', 'IT related', 'Marketing', ''],
})

# Categories to summarise, and the label of the resulting summary row.
selection = ['a', 'b']
selection_row = '&'.join(selection)

# Rows that belong to any of the selected categories.
selected = dashboard[dashboard['category'].isin(selection)]

# Build the summary directly: number of ids, total price, no description, and
# the joined category label. float('nan') is used for the missing description
# so that the snippet does not depend on numpy being imported.
final_df = pd.Series(
    {
        'id': selected['id'].count(),
        'price': selected['price'].sum(),
        'description': float('nan'),
        'category': selection_row,
    },
    name='summary',
    dtype=object,
)
print(final_df)
id 3
price 295
description NaN
category a&b
Name: summary, dtype: object
Related
It's hard to explain what I'm trying to do so I'll give an example. In the example below, I am trying to get df3. I have done it with the code below but it is very "anti-pandas" and I am looking for a better (faster, cleaner, more pandas-esque) way to do it:
import pandas as pd
# df1: every cell is a set of letters; each letter names a column of df2.
df1 = pd.DataFrame({"begin": [{"a", "b"}, {"b"}, {"c"}], "end": [{"x"}, {"z", "y"}, {"z"}]})
# df2: one numeric column per letter, with the same row labels as df1.
df2 = pd.DataFrame(
    {"a": [10, 10, 15], "b": [15, 20, 30], "c": [8, 12, 10], "x": [1, 2, 3], "y": [1, 3, 4], "z": [1, 3, 1]}
)

# For every cell of df1, replace the set of letters with a list of
# {"letter", "value"} records, where "value" is read from df2 at the same row
# and at the column named by the letter. Cells are addressed by label, so no
# positional Series lookup (deprecated in pandas 2.x) is needed. The order of
# the records inside a cell follows the iteration order of the source set.
df3 = pd.DataFrame(
    {
        column: [
            [{"letter": letter, "value": df2.at[row, letter]} for letter in letters]
            for row, letters in df1[column].items()
        ]
        for column in df1.columns
    },
    index=df1.index,
)
print(df3)
Here's my goal (which this code does, just probably not in the best way):
begin end
0 [{'letter': 'b', 'value': 15}, {'letter': 'a', 'value': 10}] [{'letter': 'x', 'value': 1}]
1 [{'letter': 'b', 'value': 20}] [{'letter': 'y', 'value': 3}, {'letter': 'z', 'value': 3}]
2 [{'letter': 'c', 'value': 10}] [{'letter': 'z', 'value': 1}]
Here is one way to approach the problem using pandas
# Reshape and explode the dataframe: stack() yields one entry per
# (row, column) cell of df1, explode() then yields one entry per letter in
# each cell, and reset_index() exposes the row label as 'level_0', the
# original column name as 'level_1' and the letter itself as 'letter'.
s = df1.stack().explode().reset_index(name='letter')
# Map the values corresponding to the letters: df2.stack() is keyed by
# (row label, column name), so each (level_0, letter) pair looks up the df2
# value at that row and in the column named by the letter.
s['value'] = s.set_index(['level_0', 'letter']).index.map(df2.stack())
# Assign list of records: one {'letter': ..., 'value': ...} dict per row of s.
s['records'] = s[['letter', 'value']].to_dict('records')
# Pivot with aggfunc as list: rows are the original row labels (level_0),
# columns are the original column names (level_1), and every cell collects
# its record dicts into a list.
s = s.pivot_table('records', 'level_0', 'level_1', aggfunc=list)
print(s)
level_1 begin end
level_0
0 [{'letter': 'a', 'value': 10}, {'letter': 'b', 'value': 15}] [{'letter': 'x', 'value': 1}]
1 [{'letter': 'b', 'value': 20}] [{'letter': 'z', 'value': 3}, {'letter': 'y', 'value': 3}]
2 [{'letter': 'c', 'value': 10}] [{'letter': 'z', 'value': 1}]
Input: I have a dictionary in this form with a lot more data
d = {
'ag': pd.DataFrame({'ID': ['id1', 'id1', 'id1'], 'name': ['a', 's', 'd'], 'num': [10, 7, 2]}),
'jk': pd.DataFrame({'ID': ['id2', 'id2', 'id2'], 'name': ['w', 'r', 'y'], 'num': [15, 8, 1]}),
'rp': pd.DataFrame({'ID': ['id1', 'id1'], 'name': ['f', 'n'], 'num': [13, 11]})
}
Expected Output: I want to remove a key/value pair from the dictionary (d) if its ID (here id1) has already appeared under an earlier key, as happens for the key rp.
d = {
'ag': pd.DataFrame({'ID': ['id1', 'id1', 'id1'], 'name': ['a', 's', 'd'], 'num': [10, 7, 2]}),
'jk': pd.DataFrame({'ID': ['id2', 'id2', 'id2'], 'name': ['w', 'r', 'y'], 'num': [15, 8, 1]})
}
code I tried:
# Keep, in sorted key order, only the frames whose IDs have not been seen in
# a frame that was kept earlier.
new_d = {}
unique_ids = set()
for name in sorted(d.keys()):
    frame = d[name]
    ids_here = set(frame['ID'].tolist())
    # Reject the frame as soon as it shares any ID with the kept ones.
    if ids_here.isdisjoint(unique_ids):
        new_d[name] = frame
        unique_ids.update(ids_here)
print(new_d)
I need a different approach, this is not giving me good results for a large dataset.
Came up with a function to do the task
def remove_duplicate_key(d):
    """Remove, in place, the entries of ``d`` whose ID was already used.

    The IDs are expected to be named ``id1``, ``id2``, ... and to increase by
    one with every new ID met while walking ``d`` in iteration order. An entry
    whose most frequent ``ID`` value is not the next expected name is treated
    as a repeat of an earlier ID and is removed.

    Args:
        d: Dictionary mapping a key to a DataFrame that has an ``ID`` column.

    Returns:
        The same dictionary object ``d``, after the removals.
    """
    # Ordinal of the next new ID. It only advances when an entry is kept, so
    # a removed entry does not shift the expectation for the entries after it.
    expected = 1
    # Walk a snapshot of the items: popping from ``d`` while iterating over
    # it directly would raise RuntimeError.
    for key, temp_df in list(d.items()):
        id_counts = temp_df['ID'].value_counts()
        if id_counts.empty:
            # A frame without any ID cannot repeat one; keep it untouched.
            continue
        var = 'id' + str(expected)
        if id_counts.index[0] == var:
            expected += 1
        else:
            d.pop(key, None)
    return d
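It builds the ID expected for each key as var = 'id' + str(...), because the IDs are assumed to increase by one with every new key; a key whose ID does not match the expected one is removed. Then call the function: remove_duplicate_key(d)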
I have a dataframe df as follows:
Number PT
5 AA
64 BB
7 CC
Then I have another list of objects:
myList = [{'label': 'AA', 'value': 'AA', 'group': 'A'}, {'label': 'BB', 'value': 'BB', 'group': 'B'}]
I want for every PT to have the associated group(when available) from the list, so the result should look like
Number PT group
5 AA A
64 BB B
7 CC NOT_MATCHED
d = {'Number': [5, 64, 7], 'PT': ["AA", "BB", "CC"]}
df = pd.DataFrame(data=d)
myList = [{'label': 'AA', 'value': 'AA', 'group': 'A'}, {'label': 'BB', 'value': 'BB', 'group': 'B'}]

# Build the value -> group lookup once instead of rescanning myList for every
# row. setdefault keeps the first entry for a repeated value, matching a scan
# that stops at the first match.
group_by_value = {}
for item in myList:
    group_by_value.setdefault(item['value'], item['group'])

# Vectorised lookup: a PT without an entry maps to NaN, which is then
# replaced by the "NOT_MATCHED" marker.
df['Group'] = df['PT'].map(group_by_value).fillna("NOT_MATCHED")
TRY:
# Look the entries up by key name rather than by the position of the values
# inside each dict: positional access silently breaks when a dict is built
# with its keys in another order, and position 0 is 'label', not 'value'.
group_by_value = {item['value']: item['group'] for item in myList}
# A PT without an entry maps to NaN; replace it with the requested
# NOT_MATCHED marker.
df['group'] = df['PT'].map(group_by_value).fillna('NOT_MATCHED')
I have the following dict structure.
product1 = {'product_tmpl_id': product_id,
'qty':product_uom_qty,
'price':price_unit,
'subtotal':price_subtotal,
'total':price_total,
}
And then a list of products, each item in the list is a dict with the above structure
list_ = [product1,product2,product3,.....]
I need to sum the items in the list, grouped by the key product_tmpl_id. I'm using collections.defaultdict, but it only sums the qty key; I need to sum every key except product_tmpl_id, which is the criterion to group by.
# Running qty total per product template; a missing template starts at 0.0.
c = defaultdict(float)
for product in list_:
    template_id = product['product_tmpl_id']
    c[template_id] = c[template_id] + product['qty']
# Flatten the totals into one {'product_id', 'qty'} record per template.
c = [
    {'product_id': template_id, 'qty': total}
    for template_id, total in c.items()
]
I know how to do it with a for iteration but trying to look for a more pythonic way
thanks
EDIT:
What I need is to go from this:
lst = [
{'Name': 'A', 'qty':100,'price':10},
{'Name': 'A', 'qty':100,'price':10},
{'Name': 'A', 'qty':100,'price':10},
{'Name': 'B', 'qty':100,'price':10},
{'Name': 'C', 'qty':100,'price':10},
{'Name': 'C', 'qty':100,'price':10},
]
to this
group_lst = [
{'Name': 'A', 'qty':300,'price':30},
{'Name': 'B', 'qty':100,'price':10},
{'Name': 'C', 'qty':200,'price':20},
]
Using basic Python, this doesn't get a whole lot better. You could hack something together with itertools.groupby, but it'd be ugly and probably slower, certainly less clear.
As #9769953 suggested, though, Pandas is a good package to handle this sort of structured, tabular data.
In [1]: import pandas as pd
In [2]: df = pd.DataFrame(lst)
Out[2]:
Name price qty
0 A 10 100
1 A 10 100
2 A 10 100
3 B 10 100
4 C 10 100
5 C 10 100
In [3]: df.groupby('Name').agg(sum)
Out[3]:
price qty
Name
A 30 300
B 10 100
C 20 200
You just need a little extra mojo if you don't want to keep the data as a dataframe:
In [4]: grouped = df.groupby('Name', as_index=False).agg(sum)
In [5]: list(grouped.T.to_dict().values())
Out[5]:
[{'Name': 'A', 'price': 30, 'qty': 300},
{'Name': 'B', 'price': 10, 'qty': 100},
{'Name': 'C', 'price': 20, 'qty': 200}]
On the verbose side, but gets the job done:
# Sum 'qty' and 'price' per 'Name' in a single pass over lst.
group_lst = []
# Name -> its totals dict. The same dict object is also stored in group_lst,
# so updating it through this lookup updates the output row as well.
totals_by_name = {}
for item in lst:
    totals = totals_by_name.get(item['Name'])
    if totals is None:
        # First time this name is seen: open its row, in encounter order.
        totals = {
            'Name': item['Name'],
            'qty': 0,
            'price': 0
        }
        totals_by_name[item['Name']] = totals
        group_lst.append(totals)
    totals['qty'] += item['qty']
    totals['price'] += item['price']
pprint(group_lst)
Output:
[{'Name': 'A', 'price': 30, 'qty': 300},
{'Name': 'B', 'price': 10, 'qty': 100},
{'Name': 'C', 'price': 20, 'qty': 200}]
You can use defaultdict and Counter
>>> from collections import Counter, defaultdict
>>> cntr = defaultdict(Counter)
>>> for d in lst:
... cntr[d['Name']].update(d)
...
>>> res = [dict(v, **{'Name':k}) for k,v in cntr.items()]
>>> pprint(res)
[{'Name': 'A', 'price': 30, 'qty': 300},
{'Name': 'C', 'price': 20, 'qty': 200},
{'Name': 'B', 'price': 10, 'qty': 100}]
I want to convert the below pandas data frame
# Counts per city (rows) and age group (columns).
data = pd.DataFrame([[1, 2], [5, 6]], columns=['10+', '20+'], index=['A', 'B'])
# Name both axes; the names are reused later as dictionary keys.
data.index.name = 'City'
data.columns.name = 'Age Group'
# print is a function on Python 3; the statement form is a SyntaxError there.
print(data)
Age Group 10+ 20+
City
A 1 2
B 5 6
into an array of dictionaries, like
[
{'Age Group': '10+', 'City': 'A', 'count': 1},
{'Age Group': '20+', 'City': 'A', 'count': 2},
{'Age Group': '10+', 'City': 'B', 'count': 5},
{'Age Group': '20+', 'City': 'B', 'count': 6}
]
I am able to get the above expected result using the following loops
# The axis names double as the dictionary keys of every record.
cols_name = data.columns.name
index_names = data.index.name
# One record per (row label, column label) cell, walked row by row.
result = [
    {cols_name: column_label, index_names: row_label, 'count': data.loc[row_label, column_label]}
    for row_label in data.index
    for column_label in data.columns
]
Is there a better way of doing this? My original data will have a large number of records, so the for loops will take too long.
I think you can use stack with reset_index to reshape the data, and then to_dict as the last step:
print (data.stack().reset_index(name='count'))
City Age Group count
0 A 10+ 1
1 A 20+ 2
2 B 10+ 5
3 B 20+ 6
print (data.stack().reset_index(name='count').to_dict(orient='records'))
[
{'Age Group': '10+', 'City': 'A', 'count': 1},
{'Age Group': '20+', 'City': 'A', 'count': 2},
{'Age Group': '10+', 'City': 'B', 'count': 5},
{'Age Group': '20+', 'City': 'B', 'count': 6}
]