I have a list of dictionaries like this
data = [
{"_id": {"cohort_name": "09-01-2010", "segment_name": "LTV90-Prime", "driver_name": "ADB"}, "cohort_data": [
{"calculated": [],
"original": [{"1": 225.2699758337715}, {"2": 106.05173118059133}, {"3": 547.2908664469512},
{"4": 573.1083659247656}]}]},
{"_id": {"cohort_name": "11-01-2010", "segment_name": "LTV90-Prime", "driver_name": "Unit Loss Rate"},
"cohort_data": [{"calculated": [], "original": [{"1": 0.002687180620372531}, {"2": 0.001468127113897437}]}]},
{"_id": {"cohort_name": "11-01-2010", "segment_name": "LTV90-Prime", "driver_name": "Unit Loss Rate"},
"cohort_data": [{"calculated": [], "original": [{"10": 0.002687180620372531}, {"1": 0.002687180620372531},
{"2": 0.001468127113897437}]}]}
]
I am trying to group data based upon the driver_name and segment_name and push all cohort_name and cohort_data inside the internal dictionary.
The expected output is as follows
[{'driver_name': 'Unit Loss Rate',
'segment_name': 'LTV90-Prime',
'cohort_data': {
'5-01-2010': [{'1': 0.002687180620372531}, {'2': 0.001468127113897437}, {'10': 0.002687180620372531}],
'11-01-2010': [{'1': 0.002687180620372531}, {'2': 0.001468127113897437}]
}},
{'driver_name': 'ADB',
'segment_name': 'LTV90-Prime',
'cohort_data': {
"09-01-2010": [{'1': 225.2699758337715}, {'2': 106.05173118059133}, {'3': 547.2908664469512},
{'4': 573.1083659247656}]
}}
]
This is what I have done so far. I am stuck in pushing the cohort_name and cohort_data in the internal dictionary.
def get_data_list(d):
    """Return the data points of the last usable entry in ``d``.

    Each entry of ``d`` is a dict holding an ``'original'`` and a
    ``'calculated'`` list.  For every entry a non-empty ``'original'`` list
    wins, otherwise a non-empty ``'calculated'`` list is used; entries where
    both are empty are ignored.  Returns ``None`` when no entry yields data.
    """
    final_data = None
    for entry in d:
        # Prefer the original values; fall back to the calculated ones.
        chosen = entry['original'] or entry['calculated']
        if chosen:
            final_data = chosen
    return final_data
from collections import defaultdict

# Maps (driver_name, segment_name) -> the grouped record for that pair.
dd = defaultdict(dict)
for i in data:
    id_ = i['_id']
    cohort_name_final, segment_name_final, driver_name_final = (
        id_['cohort_name'], id_['segment_name'], id_['driver_name'])
    group = dd[(driver_name_final, segment_name_final)]
    if not group:
        # First record of this driver/segment pair: create the outer shell once.
        group['driver_name'] = driver_name_final
        group['segment_name'] = segment_name_final
        group['cohort_data'] = {}
    # Push the cohort into the inner dictionary, keyed by cohort name.
    # NOTE(review): a cohort name that repeats within one group replaces the
    # earlier entry — confirm whether the lists should be merged instead.
    group['cohort_data'][cohort_name_final] = get_data_list(i['cohort_data'])

# One dictionary per (driver_name, segment_name) pair.
result = list(dd.values())
I am using Python 3.4.3. The data shown here is a subset of the original dataset, which is queried from a MongoDB database.
Please help.
Related
I have a text file which I want to convert to a nested json structure. The text file is :
Report_for Reconciliation
Execution_of application_1673496470638_0001
Spark_version 2.4.7-amzn-0
Java_version 1.8.0_352 (Amazon.com Inc.)
Start_time 2023-01-12 09:45:13.360000
Spark Properties:
Job_ID 0
Submission_time 2023-01-12 09:47:20.148000
Run_time 73957ms
Result JobSucceeded
Number_of_stages 1
Stage_ID 0
Number_of_tasks 16907
Number_of_executed_tasks 16907
Completion_time 73207ms
Stage_executed parquet at RawDataPublisher.scala:53
Job_ID 1
Submission_time 2023-01-12 09:48:34.177000
Run_time 11525ms
Result JobSucceeded
Number_of_stages 2
Stage_ID 1
Number_of_tasks 16907
Number_of_executed_tasks 0
Completion_time 0ms
Stage_executed parquet at RawDataPublisher.scala:53
Stage_ID 2
Number_of_tasks 300
Number_of_executed_tasks 300
Completion_time 11520ms
Stage_executed parquet at RawDataPublisher.scala:53
Job_ID 2
Submission_time 2023-01-12 09:48:46.908000
Run_time 218358ms
Result JobSucceeded
Number_of_stages 1
Stage_ID 3
Number_of_tasks 1135
Number_of_executed_tasks 1135
Completion_time 218299ms
Stage_executed parquet at RawDataPublisher.scala:53
I want the output to be :
{
"Report_for": "Reconciliation",
"Execution_of": "application_1673496470638_0001",
"Spark_version": "2.4.7-amzn-0",
"Java_version": "1.8.0_352 (Amazon.com Inc.)",
"Start_time": "2023-01-12 09:45:13.360000",
"Job_ID 0": {
"Submission_time": "2023-01-12 09:47:20.148000",
"Run_time": "73957ms",
"Result": "JobSucceeded",
"Number_of_stages": "1",
"Stage_ID 0”: {
"Number_of_tasks": "16907",
"Number_of_executed_tasks": "16907",
"Completion_time": "73207ms",
"Stage_executed": "parquet at RawDataPublisher.scala:53"
"Stage": "parquet at RawDataPublisher.scala:53",
},
},
}
I tried the defaultdict approach, but it generated JSON in which every value is a list, which is not suitable for building a table on top of it. Here's what I did:
import json
from collections import defaultdict
INPUT = 'demofile.txt'
dict1 = defaultdict(list)


def convert():
    """Read INPUT and dump its ``key value`` lines to ``demo1file.json``.

    The first word of every line is the key and the rest of the line is the
    value; values of repeated keys are collected in a list under that key in
    the module-level ``dict1``.  Blank lines are skipped, and a line that
    consists of a key only is stored with an empty-string value.
    """
    with open(INPUT) as f:
        for line in f:
            parts = line.split(None, 1)
            if not parts:
                # Blank line: nothing to record.
                continue
            command = parts[0]
            description = parts[1].strip() if len(parts) > 1 else ''
            dict1[command].append(description)
    # The context manager closes (and thereby flushes) the output file.
    with open("demo1file.json", "w") as output:
        json.dump(dict1, output, indent=4, sort_keys=False)
and was getting this:
"Report_for": [ "Reconciliation" ],
"Execution_of": [ "application_1673496470638_0001" ],
"Spark_version": [ "2.4.7-amzn-0" ],
"Java_version": [ "1.8.0_352 (Amazon.com Inc.)" ],
"Start_time": [ "2023-01-12 09:45:13.360000" ],
"Job_ID": [
"0",
"1",
"2", ....
]]]
I just want to convert my text to the above json format so that I can build a table on top of it.
There's no way Python or one of its libraries can figure out your nesting requirements if flat text is given as input. How should it know that Stages are inside Jobs, for example?
You will have to programmatically tell your application how it works.
I hacked an example which should work, you can go from there (assuming input_str is what you posted as your file content):
# define your nesting structure
nesting = {'Job_ID': {'Stage_ID': {}}}
# names (upper_nestings) and values (upper_nesting_keys) of the nesting keys
# on the path from the root to the sub-dict currently being filled
upper_nestings = []
upper_nesting_keys = []
# your resulting dictionary
result_dict = {}
# your "working" dictionaries
current_nesting = nesting
working_dict = result_dict
# parse each line of the input string
for line_str in input_str.split('\n'):
    # skip blank lines (e.g. the empty string left by a trailing newline),
    # which would otherwise be stored under an empty key
    if not line_str.strip():
        continue
    # key is the first word, value are all consecutive words
    line = line_str.split(' ')
    # if key is in nesting, create new sub-dict, all consecutive entries are part of the sub-dict
    if line[0] in current_nesting:
        current_nesting = current_nesting[line[0]]
        upper_nestings.append(line[0])
        upper_nesting_keys.append(line[1])
        working_dict[line_str] = {}
        working_dict = working_dict[line_str]
    else:
        # if a new "parallel" or "upper" nesting is detected, reset your nesting structure
        if line[0] in upper_nestings:
            level = upper_nestings.index(line[0])
            nests = upper_nestings[:level]
            keys = upper_nesting_keys[:level]
            # walk down from the root to the parent of the new sub-dict
            working_dict = result_dict
            for nest, key in zip(nests, keys):
                working_dict = working_dict[' '.join([nest, key])]
            # drop everything below this level and record the new key value
            upper_nestings = upper_nestings[:level + 1]
            upper_nesting_keys = keys
            upper_nesting_keys.append(line[1])
            # re-derive which nesting keys may follow at this depth
            current_nesting = nesting
            for nest in upper_nestings:
                current_nesting = current_nesting[nest]
            working_dict[line_str] = {}
            working_dict = working_dict[line_str]
            continue
        # plain "key value" line: store it in the current sub-dict
        working_dict[line[0]] = ' '.join(line[1:])
print(result_dict)
Results in:
{
'Report_for': 'Reconciliation',
'Execution_of': 'application_1673496470638_0001',
'Spark_version': '2.4.7-amzn-0',
'Java_version': '1.8.0_352 (Amazon.com Inc.)',
'Start_time': '2023-01-12 09:45:13.360000',
'Spark': 'Properties: ',
'Job_ID 0': {
'Submission_time': '2023-01-12 09:47:20.148000',
'Run_time': '73957ms',
'Result': 'JobSucceeded',
'Number_of_stages': '1',
'Stage_ID 0': {
'Number_of_tasks': '16907',
'Number_of_executed_tasks': '16907',
'Completion_time': '73207ms',
'Stage_executed': 'parquet at RawDataPublisher.scala:53'
}
},
'Job_ID 1': {
'Submission_time': '2023-01-12 09:48:34.177000',
'Run_time': '11525ms',
'Result': 'JobSucceeded',
'Number_of_stages': '2',
'Stage_ID 1': {
'Number_of_tasks': '16907',
'Number_of_executed_tasks': '0',
'Completion_time': '0ms',
'Stage_executed': 'parquet at RawDataPublisher.scala:53'
},
'Stage_ID 2': {
'Number_of_tasks': '300',
'Number_of_executed_tasks': '300',
'Completion_time': '11520ms',
'Stage_executed': 'parquet at RawDataPublisher.scala:53'
}
},
'Job_ID 2': {
'Submission_time':
'2023-01-12 09:48:46.908000',
'Run_time': '218358ms',
'Result': 'JobSucceeded',
'Number_of_stages': '1',
'Stage_ID 3': {
'Number_of_tasks': '1135',
'Number_of_executed_tasks': '1135',
'Completion_time': '218299ms',
'Stage_executed': 'parquet at RawDataPublisher.scala:53'
}
}
}
and should pretty much be generically usable for all kinds of nesting definitions from a flat input. Let me know if it works for you!
I couldn't find any examples that match my use case. I'm still working my way through Python lists and dictionaries.
Problem:
all_cars = {'total_count': 3,'cars': [{'name': 'audi','model': 'S7'}, {'name': 'honda', 'model': 'accord'},{'name': 'jeep', 'model': 'wrangler'} ]}
owners = {'users':[{'owner': 'Nick', 'car': 'audi'},{'owner': 'Jim', 'car': 'ford'},{'owner': 'Mike', 'car': 'mercedes'} ]}
# First attempt: meant to skip cars that already have an owner and print the rest.
def duplicate():
for c in all_cars['cars']:
# NOTE(review): == compares one name (a string) with a whole list, which is
# never equal, so no car would ever be skipped.
# NOTE(review): the comprehension reads c['users'], but the entries of
# owners['users'] shown above only carry 'owner' and 'car' keys — confirm the key.
if c['name'] == [c['users']for c in owners['users']]:
pass
else:
# print() returns None, so res never holds a car name.
res = print(c['name'])
return res
output = ['honda', 'jeep', 'audi']
and
# Second attempt: hard-codes the owned car ('audi') instead of looking it up in owners.
def duplicate():
for c in all_cars['cars']:
if c['name'] == 'audi':
pass
else:
# print() returns None, so res is None; the names only appear on stdout.
res = print(c['name'])
return res
output - ['honda', 'jeep']
I am trying to find matching values in both dictionaries, using list comprehension, then return non-matching values only.
Solution: Using 'in' rather than '==' operator, I was able to compare values between both lists and skip duplicates.
def duplicate():
    """Return the names of the cars in ``all_cars`` that no owner drives.

    Reads the module-level ``all_cars`` and ``owners`` dictionaries and
    returns the unowned car names as a list, in ``all_cars`` order.
    """
    # Each owner record stores the car under the 'car' key; a set makes the
    # membership test below cheap.
    owned = {user['car'] for user in owners['users']}
    return [car['name'] for car in all_cars['cars'] if car['name'] not in owned]
To answer the question in your title, you can conditionally add elements during a list comprehension using the syntax [y for y in z if y == a], where y == a is any condition you need - if the condition evaluates to True, then the element y will be added to the list, otherwise it will not.
I would just keep a dictionary of all of the owner data together:
# Owner name -> details of the car that owner drives.
ownerData = {
    "Shaft": {
        "carMake": "Audi",
        "carModel": "A8",
        "year": "2015",
    },
    "JamesBond": {
        "carMake": "Aston",
        "carModel": "DB8",
        "year": "2012",
    },
    "JeffBezos": {
        "carMake": "Honda",
        "carModel": "Accord",
        "year": "1989",
    },
}
Now you can loop through and query it something like this:
# Iterating a dict yields only its keys, so walk items() to get each owner's
# name together with that owner's car record.
for owner, car in ownerData.items():
    if "Audi" in car["carMake"]:
        print("Owner %s drives a %s %s %s" % (owner, car["year"], car["carMake"], car["carModel"]))
Should output:
"Owner Shaft drives a 2015 Audi A8"
This way you can expand your data set for owners without creating multiple lists.
OK, based on your feedback on the solution above, here is how I would tackle your problem. Drop your common items into lists and then use "set" to print out the diff.
all_cars = {'total_count': 3, 'cars': [{'name': 'audi', 'model': 'S7'},
    {'name': 'honda', 'model': 'accord'}, {'name': 'jeep', 'model': 'wrangler'}]}
owners = {'users': [{'owner': 'Nick', 'car': 'audi'}, {'owner': 'Jim',
    'car': 'ford'}, {'owner': 'Mike', 'car': 'mercedes'}]}

# Collect each distinct car name once, in first-seen order.
allCarList = []
ownerCarList = []
for auto in all_cars['cars']:
    thisCar = auto['name']
    if thisCar not in allCarList:
        allCarList.append(thisCar)
for o in owners['users']:
    thisCar = o['car']
    if thisCar not in ownerCarList:
        ownerCarList.append(thisCar)

# Cars that exist but that nobody owns; a set difference has no defined order.
diff = list(set(allCarList) - set(ownerCarList))
print(diff)
I put this in and ran it and came up with this output:
['jeep', 'honda']
Hope that helps!
I have JSON and XML data files which return the account details of a customer. They both contain almost the same data, and I have to verify it. There may be a single account or multiple accounts. I have made a dictionary of the selected keys which I need to verify. I need help with:
1) Storing the multiple dictionaries somewhere so i can compare it.
2) And a way to compare them.
I am providing a sample data file which contain multiple accounts.
rest = {
"ReqType": "CRI",
"ReqUID": "1234567",
"ResultCode": "00",
"Message": "Success",
"Records": 2,
"OutData": [
{
"CNIC": "123456789",
"PSTS": "P",
"CUSTNO": "CHO7SM",
"Accounts": {
"NoAccounts": "1",
"AccountList": [
{
"ACC#": "17327901207",
"TITLE": "John",
"TYPE": "C",
"STYPE": "C4",
"STPDESC": "ACCOUNT CURRENT",
"REL": "N",
"LBAL": "2500000",
"ABAL": "2500000",
"DECEASED": "N",
"BLOCKED": "N",
"INACTIVE": "N",
"CLOSED": "N"
}
]
}
},
{
"CNIC": "123456789",
"PSTS": "S",
"CUSTNO": "CDG1R8",
"Accounts": {
"NoAccounts": "1",
"AccountList": [
{
"ACC#": "17327900081",
"TITLE": "John",
"TYPE": "C",
"STYPE": "C4",
"STPDESC": "ACCOUNT CURRENT",
"REL": "N",
"LBAL": "3486039",
"ABAL": "3486039",
"DECEASED": "N",
"BLOCKED": "N",
"INACTIVE": "N",
"CLOSED": "N"
}
]
}
}
]
}
xml = <?xml version="1.0" encoding="UTF-8"?>
<FCDB_RES_ENV>
<FCDB_HEADER>
<SOURCE>FCAT</SOURCE>
<SERVICE>CustomerAccountsDetails</SERVICE>
<OPERATION>CustomerAccountsDetails</OPERATION>
<SOURCE_USERID>FCAT</SOURCE_USERID>
<DESTINATION>FCDB</DESTINATION>
<COUNTRYCODE>T001</COUNTRYCODE>
<USERTYPE>ENS</USERTYPE>
<LANGID>eng</LANGID>
<CHANNELID>01</CHANNELID>
</FCDB_HEADER>
<FCDB_BODY>
<CUSTACCOUNT>
<CUSTNO>
<CUSTNO>123456789</CUSTNO>
<TYPECUST>C</TYPECUST>
<NAMCUST>John</NAMCUST>
<ADDRESS>
<ADDRESS1>ABC.</ADDRESS1>
</ADDRESS>
</CUSTNO>
<ACCOUNT>
<CUSTNO>123456789</CUSTNO>
<ACCNO>17327901207</ACCNO>
<ACCOUNTTITLE>John</ACCOUNTTITLE>
<ACCOUNTTYPEDETAIL>C4</ACCOUNTTYPEDETAIL>
<BALANCE>25000.00</BALANCE>
<ACCTTYPE>C</ACCTTYPE>
<ACCPRD>AAAA</ACCPRD>
<ACCPRDDESC>ACCOUNT CURRENT</ACCPRDDESC>
<ACCCCY>PKR</ACCCCY>
<STATUS>A</STATUS>
<RELATION>J</RELATION>
<BAL_AVAIL>25000.00</BAL_AVAIL>
<HASCHEQUE>true</HASCHEQUE>
<HASOVERDRAFT>N</HASOVERDRAFT>
<UNCLEARFUND>0.00</UNCLEARFUND>
</ACCOUNT>
<ACCOUNT>
<CUSTNO>123456789</CUSTNO>
<ACCNO>17327900081</ACCNO>
<ACCOUNTTITLE>John</ACCOUNTTITLE>
<ACCOUNTTYPEDETAIL>C4</ACCOUNTTYPEDETAIL>
<BALANCE>34860.39</BALANCE>
<ACCTTYPE>C</ACCTTYPE>
<ACCPRD>AAAA</ACCPRD>
<ACCPRDDESC>ACCOUNT CURRENT</ACCPRDDESC>
<ACCCCY>PKR</ACCCCY>
<STATUS>A</STATUS>
<RELATION>J</RELATION>
<BAL_AVAIL>34860.39</BAL_AVAIL>
<HASCHEQUE>true</HASCHEQUE>
<HASOVERDRAFT>N</HASOVERDRAFT>
<UNCLEARFUND>0.00</UNCLEARFUND>
</ACCOUNT>
<ACCOUNT>
<CUSTNO>123456789</CUSTNO>
<ACCNO>17327900940</ACCNO>
<ACCOUNTTITLE>Adam</ACCOUNTTITLE>
<ACCOUNTTYPEDETAIL>C4</ACCOUNTTYPEDETAIL>
<BALANCE>2004976.00</BALANCE>
<ACCTTYPE>C</ACCTTYPE>
<ACCPRD>AAAA</ACCPRD>
<ACCPRDDESC>ACCOUNT CURRENT</ACCPRDDESC>
<ACCCCY>PKR</ACCCCY>
<STATUS>A</STATUS>
<RELATION>J</RELATION>
<BAL_AVAIL>2004976.00</BAL_AVAIL>
<HASCHEQUE>true</HASCHEQUE>
<HASOVERDRAFT>N</HASOVERDRAFT>
<UNCLEARFUND>0.00</UNCLEARFUND>
</ACCOUNT>
</CUSTACCOUNT>
<FCDB_ERROR_RESP>
<ERROR>
<ECODE>00</ECODE>
<EDESC>Your transaction has been processed successfully.</EDESC>
</ERROR>
</FCDB_ERROR_RESP>
</FCDB_BODY>
</FCDB_RES_ENV>
I have converted the soap response into a dictionary to get the items easily.
import json
import xmltodict
from collections import OrderedDict
# NOTE(review): assumes `rest` and `xml` are open file objects for the REST
# (JSON) and SOAP (XML) responses — confirm against how they are loaded.
rest_file = json.loads(rest.read())
doc = xmltodict.parse(xml.read())
input_dict = OrderedDict(doc)
# Round-trip through JSON so the parsed XML becomes plain dicts and lists.
xml_file = json.loads(json.dumps(input_dict))

# Index the parsed document, not the raw `xml` handle.
a = xml_file['FCDB_RES_ENV']['FCDB_BODY']['CUSTACCOUNT']
accounts = a['ACCOUNT']
# xmltodict yields a single dict (not a list) when only one <ACCOUNT> exists.
if isinstance(accounts, dict):
    accounts = [accounts]
for i in accounts:
    # Take the compared fields from the account itself (`i`), not its parent.
    soap_dict = {key: i[key] for key in i if key in ['ACCNO', 'ACCOUNTTITLE', 'ACCOUNTTYPEDETAIL', 'ACCPRDDESC']}
    print(soap_dict)
print("--------------------")
for item in rest_file.get('OutData'):
    # Visit every account of the customer without mutating the parsed data.
    for json_file_account in item.get('Accounts')['AccountList']:
        rest_dict = {key: json_file_account[key] for key in json_file_account if key in
                     ['ACC#', 'TITLE', 'STYPE', 'STPDESC']}
        print(rest_dict)
The Output of above script is:
{'ACCNO': '17327901207', 'ACCOUNTTITLE': 'John', 'ACCOUNTTYPEDETAIL': 'C4', 'ACCPRDDESC': 'ACCOUNT CURRENT'}
{'ACCNO': '17327900081', 'ACCOUNTTITLE': 'John', 'ACCOUNTTYPEDETAIL': 'C4', 'ACCPRDDESC': 'ACCOUNT CURRENT'}
{'ACCNO': '17327900940', 'ACCOUNTTITLE': 'Adam', 'ACCOUNTTYPEDETAIL': 'C4', 'ACCPRDDESC': 'ACCOUNT CURRENT'}
--------------------
{'ACC#': '17327901207', 'TITLE': 'John', 'STYPE': 'C4', 'STPDESC': 'ACCOUNT CURRENT'}
{'ACC#': '17327900081', 'TITLE': 'John', 'STYPE': 'C4', 'STPDESC': 'ACCOUNT CURRENT'}
For Comparison, I need to iterate the first dictionary of xml into rest and see if AccNo matches then compare all values. Any help would be appreciated. Thanks
First, you need to append all the dictionaries to a list, then iterate over the lists and match the account values.
Below is the modified code.
import json
import xmltodict
from collections import OrderedDict
# NOTE(review): assumes `rest` and `xml` are open file objects for the REST
# (JSON) and SOAP (XML) responses — confirm against how they are loaded.
rest_file = json.loads(rest.read())
doc = xmltodict.parse(xml.read())
input_dict = OrderedDict(doc)
# Round-trip through JSON so the parsed XML becomes plain dicts and lists.
xml_file = json.loads(json.dumps(input_dict))

# Collect one trimmed-down dict per SOAP account.
soap_dict_list = []
a = xml_file['FCDB_RES_ENV']['FCDB_BODY']['CUSTACCOUNT']
accounts = a['ACCOUNT']
# xmltodict yields a single dict (not a list) when only one <ACCOUNT> exists.
if isinstance(accounts, dict):
    accounts = [accounts]
for i in accounts:
    # Take the compared fields from the account itself (`i`), not its parent.
    soap_dict = {key: i[key] for key in i if key in ['ACCNO', 'ACCOUNTTITLE', 'ACCOUNTTYPEDETAIL', 'ACCPRDDESC']}
    print(soap_dict)
    soap_dict_list.append(soap_dict)
print(soap_dict_list)
print("--------------------")

# Collect one trimmed-down dict per REST account.
rest_dict_list = []
for item in rest_file.get('OutData'):
    # Visit every account of the customer without mutating the parsed data.
    for json_file_account in item.get('Accounts')['AccountList']:
        rest_dict = {key: json_file_account[key] for key in json_file_account if key in
                     ['ACC#', 'TITLE', 'STYPE', 'STPDESC']}
        print(rest_dict)
        rest_dict_list.append(rest_dict)
print(rest_dict_list)

# Now iterate over list
for info1 in rest_dict_list:
    for info2 in soap_dict_list:
        if info1['ACC#'] == info2['ACCNO']:
            print("Account number Match found")
            if info1['TITLE'] == info2['ACCOUNTTITLE']:
                print("ACCOUNTTITLE and TITLE matched")
            # and so on you can match rest of values
Ali Khan,
1.) I would suggest grouping the records by key into lists, since the key names differ between the two sources.
2.) After grouping is completed try using pandas Dataframe package to compare.
Here is my code below to compare all those values.
import pandas as pd
# One record per SOAP account, gathered up front instead of appended one by one.
df_struct = [
    {'ACCNO': '17327901207', 'ACCOUNTTITLE': 'John', 'ACCOUNTTYPEDETAIL': 'C4', 'ACCPRDDESC': 'ACCOUNT CURRENT'},
    {'ACCNO': '17327900081', 'ACCOUNTTITLE': 'John', 'ACCOUNTTYPEDETAIL': 'C4', 'ACCPRDDESC': 'ACCOUNT CURRENT'},
    {'ACCNO': '17327900940', 'ACCOUNTTITLE': 'Adam', 'ACCOUNTTYPEDETAIL': 'C4', 'ACCPRDDESC': 'ACCOUNT CURRENT'},
]
df = pd.DataFrame(df_struct)
# Show only the rows whose account number already appeared in an earlier row.
print(df.loc[df['ACCNO'].duplicated()])
So I have a small data like this:
data = [
{"Name":"Arab","Code":"Zl"},
{"Name":"Korea","Code":"Bl"},
{"Name":"China","Code":"Bz"}
]
I want to find a graph so that the x-axis is: "Bl", "Bz", "Zl" (alphabetic order)
and the y-axis is: "Korea", "China", "Arab" (corresponding to the codenames).
I thought of:
# Map each code to its country name.
new_data = {}
for dic in data:
    country_data = dic["Name"]
    code_data = dic["Code"]
    new_data[code_data] = country_data

# x-axis values: the codes in alphabetical order.
code_data = []
for codes in new_data.keys():
    code_data.append(codes)
code_data.sort()

# y-axis values: the country names in the same order as their codes.
name_data = []
for code in code_data:
    name_data.append(new_data[code])
Is there a better way to do this?
Perhaps by not creating a new dictionary?
So here's the data:
data = [
{"Name":"Arab","Code":"Zl"},
{"Name":"Korea","Code":"Bl"},
{"Name":"China","Code":"Bz"}
]
To create a new sorted list:
new_list = sorted(data, key=lambda k: k['Code'])
If you don't want to get a new list:
data[:] = sorted(data, key=lambda k: k['Code'])
The result is:
[{'Code': 'Bl', 'Name': 'Korea'}, {'Code': 'Bz', 'Name': 'China'}, {'Code': 'Zl', 'Name': 'Arab'}]
I hope I could help you!
Better way to produce same results:
from operator import itemgetter
data = [
    {"Name": "Arab", "Code": "Zl"},
    {"Name": "Korea", "Code": "Bl"},
    {"Name": "China", "Code": "Bz"},
]

# Lazily project the code-ordered records to (code, name) pairs ...
sorted_data = map(itemgetter("Code", "Name"), sorted(data, key=itemgetter("Code")))
# ... then transpose the pairs into two parallel lists.
code_data, name_data = map(list, zip(*sorted_data))

print(code_data)  # -> ['Bl', 'Bz', 'Zl']
print(name_data)  # -> ['Korea', 'China', 'Arab']
Here's one way using operator.itemgetter and unpacking via zip:
from operator import itemgetter
# Sort the records by code directly; pairing them with their index and
# unzipping afterwards added nothing to the result.
data_sorted = tuple(sorted(data, key=itemgetter('Code')))
# Transpose the (code, name) pairs into two parallel tuples.
codes, names = zip(*map(itemgetter('Code', 'Name'), data_sorted))
print(codes)
# ('Bl', 'Bz', 'Zl')
print(names)
# ('Korea', 'China', 'Arab')
I have a list of dictionaries something like this:
users=[{"name": "David", "team": "reds", "score1": 100, "score2": 20,},
{"name": "David", "team": "reds", "score1": 20, "score2": 60,},
{"name": "David", "team": "blues", "score1": 10, "score2": 70,}]
and would really like to get a new processed list of dictionaries something like
summary=[{"team": "reds", "total1": 120, "total2": 80,},
{"team": "blues", "total1": 120, "total2": 80,}]
preferably looping through the original data just once. I can create a dictionary holding a total value for each user key with this
# Team name -> running total of score1 for that team.
summary = dict()
for user in users:
    if user['team'] not in summary:
        summary[user['team']] = float(user['score1'])
    else:
        summary[user['team']] += float(user['score1'])
to give
summary = {'reds': 120,'blues': 10}
but am struggling with producing the list of dictionaries, the nearest I can get is to create a dictionary at the first instance of a team, and then try to append to its values on subsequent occurrences...
summary = []
for user in users:
if any(d['team'] == user['team'] for d in summary):
# append to values in the relevant dictionary
# ??
else:
# Add dictionary to list with some initial values
d ={'team':user['team'],'total1':user['score1'],'total2':user['score2']}
summary.append(dict(d))
...and it has gotten messy... Am I going about this in completely the wrong way? Can you change values in a dictionary within a list?
Thanks
I think this is good case to use pandas library for python:
>>> import pandas as pd
>>> dfUsers = pd.DataFrame(users)
>>> dfUsers
name score1 score2 team
0 David 100 20 reds
1 David 20 60 reds
2 David 10 70 blues
>>> dfUsers.groupby('team').sum()
score1 score2
team
blues 10 70
reds 120 80
And if you really want to put it into dict:
>>> dfRes = dfUsers.groupby('team').sum()
>>> dfRes.columns = ['total1', 'total2'] # if you want to rename columns
>>> dfRes.reset_index().to_dict(orient='records')
[{'team': 'blues', 'total1': 10, 'total2': 70},
{'team': 'reds', 'total1': 120, 'total2': 80}]
another way to do this is with itertools.groupby:
>>> from itertools import groupby
>>> from operator import itemgetter
>>> users.sort(key=itemgetter('team'))
>>>
>>> res = [{'team': t[0], 'res': list(t[1])} for t in groupby(users, key=itemgetter('team'))]
>>> res = [{'team':t[0], 'total1': sum(x['score1'] for x in t[1]), 'total2': sum(x['score2'] for x in t[1])} for t in res]
>>> res
[{'team': 'blues', 'total1': 10, 'total2': 70},
{'team': 'reds', 'total1': 120, 'total2': 80}]
Or, if you really want simple python:
>>> res = dict()
>>> for x in users:
if x['team'] not in res:
res[x['team']] = [x['score1'], x['score2']]
else:
res[x['team']][0] += x['score1']
res[x['team']][1] += x['score2']
>>> res = [{'team': k, 'total1': v[0], 'total2': v[1]} for k, v in res.iteritems()}]
>>> res
[{'team': 'reds', 'total1': 120, 'total2': 80},
{'team': 'blues', 'total1': 10, 'total2': 70}]
You are really close, you just need a way to look up which dictionary to update. This is the simplest way I can see.
# Team name -> the summary dictionary for that team, so the dictionary to
# update can be looked up directly.
summary = dict()
for user in users:
    team = user['team']
    if team not in summary:
        summary[team] = dict(team=team,
                             score1=float(user['score1']),
                             score2=float(user['score2']))
    else:
        summary[team]['score1'] += float(user['score1'])
        summary[team]['score2'] += float(user['score2'])
then
>>> print summary.values()
[{'score1': 120.0, 'score2': 80.0, 'team': 'reds'},
{'score1': 10.0, 'score2': 70.0, 'team': 'blues'}]
Here's my solution which assumes that all scores that need to be added start with score:
users = [{"name": "David", "team": "reds", "score1": 100, "score2": 20},
         {"name": "David", "team": "reds", "score1": 20, "score2": 60},
         {"name": "David", "team": "blues", "score1": 10, "score2": 70}]

# Team name -> {score field -> running total}.
totals = {}
for item in users:
    team = item['team']
    team_totals = totals.setdefault(team, {})
    for k, v in item.items():
        # Every field whose name starts with 'score' is summed per team.
        if k.startswith('score'):
            team_totals[k] = team_totals.get(k, 0) + v
# Function-call form prints the same on Python 2 and Python 3.
print(totals)
Output:
{'reds': {'score1': 120, 'score2': 80}, 'blues': {'score1': 10, 'score2': 70}}
See comments inline for an explanation
import pprint
users = [{"name": "David", "team": "reds", "score1": 100, "score2": 20},
         {"name": "David", "team": "reds", "score1": 20, "score2": 60},
         {"name": "David", "team": "blues", "score1": 10, "score2": 70}]

scores_by_team = dict()
for user in users:
    if user['team'] not in scores_by_team:
        # Make sure you're gonna have your scores zeroed so you can add the
        # user's scores later
        scores_by_team[user['team']] = {
            'total1': 0,
            'total2': 0
        }
    # Here the user's team exists for sure in scores_by_team
    scores_by_team[user['team']]['total1'] += user['score1']
    scores_by_team[user['team']]['total2'] += user['score2']

# So now, the scores you want have been calculated in a dictionary where the
# keys are the team names and the values are another dictionary with the scores
# that you actually wanted to calculate
print("Before making it a summary: %s" % pprint.pformat(scores_by_team))

summary = list()
# The loop variable gets its own name so it does not overwrite scores_by_team.
for team_name, team_scores in scores_by_team.items():
    summary.append(
        {
            'team': team_name,
            'total1': team_scores['total1'],
            'total2': team_scores['total2'],
        }
    )
print("Summary: %s" % summary)
This outputs:
Before making it a summary: {'blues': {'total1': 10, 'total2': 70}, 'reds': {'total1': 120, 'total2': 80}}
Summary: [{'total1': 120, 'total2': 80, 'team': 'reds'}, {'total1': 10, 'total2': 70, 'team': 'blues'}]