Nested dictionary to CSV conversion optimization - python

I have a dictionary like this:
no_empty_keys = {'783': [['4gsx', 'ADTQGS', 0.3333333333333333, {'A': ['A224', 'T226'], 'B': ['A224', 'T226']}, 504, 509], ['4gt0', 'ADTQGS', 0.3333333333333333, {'A': ['A224', 'T226'], 'B': ['A224', 'T226']}, 504, 509]],'1062': [['4gsx', 'AELTGY', 0.5, {'A': ['L175', 'T176', 'Y178'], 'B': ['L175', 'T176', 'Y178']}, 453, 458], ['4gt0', 'AELTGY', 0.5, {'A': ['L175', 'T176', 'Y178'], 'B': ['L175', 'T176', 'Y178']}, 453, 458]]}
My function to transform that into a CSV is this one:
# Start from an empty frame that only carries the seven output column names.
epitope_df = pd.DataFrame(columns=['Epitope ID', 'PDB', 'Percent Identity', 'Epitope Mapped', 'Epitope Sequence', 'Starting Position', 'Ending Position'])
# x is a key of no_empty_keys; it is written to the 'Epitope ID' column.
for x in no_empty_keys:
    # y is one record list; its positions 0..5 are mapped to columns below.
    for y in no_empty_keys[x]:
        # The frame is rebound to the result of append() once per record.
        # This per-row append is the step the question reports as slow.
        epitope_df = epitope_df.append({'Epitope ID': x, 'PDB': y[0], 'Percent Identity': y[2], 'Epitope Mapped' : y[3], 'Epitope Sequence' : y[1], 'Starting Position' : y[4], 'Ending Position' : y[5]}, ignore_index=True)
# Write the accumulated rows without the positional index column.
epitope_df.to_csv('test.csv', index=False)
My output is a csv file like this:
It is working, but it isn't well optimized. The process is very slow when I run into a dictionary with more than 10,000 entries. Any ideas on how to speed this process up? Thank you for your time.

I'd start with getting rid of pandas.append. Appending rows to DataFrames is inefficient. You can create a DataFrame in one go:
# Column order of the CSV. Passing it explicitly keeps the header row even
# when no_empty_keys is empty.
columns = [
    'Epitope ID',
    'PDB',
    'Percent Identity',
    'Epitope Mapped',
    'Epitope Sequence',
    'Starting Position',
    'Ending Position',
]

# One plain dict per record; x is the outer key, y is one record list.
result = [
    {
        'Epitope ID': x,
        'PDB': y[0],
        'Percent Identity': y[2],
        'Epitope Mapped': y[3],
        'Epitope Sequence': y[1],
        'Starting Position': y[4],
        'Ending Position': y[5],
    }
    for x in no_empty_keys
    for y in no_empty_keys[x]
]

# Build the frame once from all collected rows. The constructor is called on
# pd itself so this snippet does not depend on a pre-existing `epitope_df`.
epitope_df = pd.DataFrame(result, columns=columns)
epitope_df.to_csv('new.csv', index=False)

You can either write ad hoc code by hand or use the convtools library, which generates such converters for you:
from convtools import conversion as c
from convtools.contrib.tables import Table

no_empty_keys = {
    "783": [
        ["4gsx", "ADTQGS", 0.3333333333333333, {"A": ["A224", "T226"], "B": ["A224", "T226"]}, 504, 509],
        ["4gt0", "ADTQGS", 0.3333333333333333, {"A": ["A224", "T226"], "B": ["A224", "T226"]}, 504, 509],
    ],
    "1062": [
        ["4gsx", "AELTGY", 0.5, {"A": ["L175", "T176", "Y178"], "B": ["L175", "T176", "Y178"]}, 453, 458],
        ["4gt0", "AELTGY", 0.5, {"A": ["L175", "T176", "Y178"], "B": ["L175", "T176", "Y178"]}, 453, 458],
    ],
}

columns = (
    "Epitope ID",
    "PDB",
    "Percent Identity",
    "Epitope Mapped",
    "Epitope Sequence",
    "Starting Position",
    "Ending Position",
)

# Position inside each record list of the value for every column after
# "Epitope ID". Records are laid out as
# [PDB, sequence, identity, mapping, start, end], while the header wants
# identity and mapping before the sequence, so the indexes are not 0..5.
record_indexes = (0, 2, 3, 1, 4, 5)

# this is just a function, so it can be run on startup once and stored for
# further reuse
converter = (
    c.iter(
        # pair the dict key with each of its records: (key, record)
        c.zip(
            c.repeat(c.item(0)),
            c.item(1),
        ).iter(
            # one output row: key first, then record values in header order
            (c.item(0),) + tuple(c.item(1, i) for i in record_indexes)
        )
    )
    .flatten()
    .gen_converter()
)

# here is the stuff to profile
Table.from_rows(
    converter(no_empty_keys.items()),
    header=columns,
).into_csv("out.csv")
Consider installing black and passing debug=True to gen_converter if you are curious about the code convtools generates under the hood.

Related

Create dictionary from JSON

I have a JSON file that looks like the one below. How do I create separate lists for gas, pm1, pm10, etc... using list comprehensions?
{
'e00fce6866b13164840961dcdata': {
'-MVId33W0ApTBDg1YtzV': {
'gas': 52.7,
'pm1': 0.1,
'pm10': 0.1,
'pm25': 0.1,
'pr': 441.6,
'rh': 14.6,
'ta': 25.5,
'ts': '2021-03-08T21:00:03Z'
},
'-MVIfE3Y2zWYzgrFefyr': {
'gas': 52.7,
'pm1': 0.1,
'pm10': 0.2,
'pm25': 0.2,
'pr': 451.8,
'rh': 14.1,
'ta': 25,
'ts': '2021-03-08T21:10:02Z'
},
...
}
}
Use the "json" module and its loads function to parse the content of a JSON file into a dictionary:
from json import loads

path = "my/path/to.json"

# The context manager closes the file even if reading fails.
with open(path, "r") as file:
    # The sample data uses single quotes, which JSON does not accept, so they
    # are swapped for double quotes before parsing.
    # NOTE(review): this also rewrites apostrophes inside values — confirm the
    # real data contains none.
    content = file.read().replace("'", '"')

dict1 = loads(content)
print(dict1)
Then you can walk the dictionary and use a dict of lists to store the information:
# Collect, for every innermost key, the list of values seen across all
# second-level entries of dict1.
datas = {}
for key1, dict2 in dict1.items():
    print("key1: " + str(key1))
    for key2, dict3 in dict2.items():
        print("key2: " + str(key2))
        for key3, value in dict3.items():
            print("key3: " + str(key3))
            # Create the list on first sight of key3, then append.
            datas.setdefault(key3, []).append(value)
print("datas: " + str(datas))
here is the result:
datas: {'gas': [52.7, 52.7], 'pm1': [0.1, 0.1], 'pm10': [0.1, 0.2], 'pm25': [0.1, 0.2], 'pr': [441.6, 451.8], 'rh': [14.6, 14.1], 'ta': [25.5, 25], 'ts': ['2021-03-08T21:00:03Z', '2021-03-08T21:10:02Z']}

python code to convert input data into json format variable

I have an input variable (stud_id), a list (sub_code) and an array (data) with the values below.
stud_id: 10
sub_code: ['002', '003', '007']
data: [array([['867192', '5545']], dtype=object), array([['964433', '0430']], dtype=object), array([['965686', '2099']], dtype=object)]
How to convert the above input into json format like this?
stud_id is the main key
output = '{ "10" : { "002" : [ 867192, 5545 ], '\
' "003" : [ 964433, 0430 ], '\
' "007" : [ 965686, 2099 ] } }'
I had to adjust your array type for testing.
Try this code:
stud_id = 10
sub_code = ['002', '003', '007']

# The question's numpy object arrays are replaced here by plain nested lists
# holding the same string pairs.
data = [
    ['867192', '5545'],
    ['964433', '0430'],
    ['965686', '2099'],
]

# Target text from the question, kept for comparison only; nothing below
# reads it.
output = (
    '{ "10" : { "002" : [ 867192, 5545 ], '
    ' "003" : [ 964433, 0430 ], '
    ' "007" : [ 965686, 2099 ] } }'
)

# Pair each subject code with its data row, then nest the mapping under the
# stringified student id.
dd = {str(stud_id): dict(zip(sub_code, data))}
print(dd)
Output
{'10': {'002': ['867192', '5545'], '003': ['964433', '0430'], '007': ['965686', '2099']}}
>>> import json
>>> from numpy import array
>>> stud_id = 10
>>> sub_code = ['002', '003', '007']
>>> data = [array([['867192', '5545']], dtype=object),
... array([['964433', '0430']], dtype=object),
... array([['965686', '2099']], dtype=object)]
>>> json.dumps({stud_id: dict(zip(sub_code, map(lambda arr: arr[0].tolist(), data)))})
'{"10": {"002": ["867192", "5545"], "003": ["964433", "0430"], "007": ["965686", "2099"]}}'
Zip sub_code and data, turn them into a dict with a dict comprehension, then put them in another dictionary with stud_id as a key, then dump as json:
import json
json.dumps({stud_id: {k: v.tolist()[0] for (k, v) in zip(sub_code, data)}})
# '{"10": {"002": ["867192", "5545"], "003": ["964433", "0430"], "007": ["965686", "2099"]}}'

average from a dictionary (values)

I'm trying to change the result so if there are 2 grades in values it will replace the 2 grades with the average. I tried so many techniques to do that but failed.
I need to write a solution for the average and to delete the 2 values of the grades.
I wrote this code:
def myDict(grades, teachers):
Dict={}
for i1 in grades:
for i2 in teachers:
key=i2[1]
value=[]
Dict[key]=value #{'Statistics': [], 'Philosophy': [], 'Computer': [], 'Physics': [], 'English': []}
for i1 in grades:
if key==i1[-1]:
value.append(i1[0]) #{'Statistics': [23560, 23452], 'Philosophy': [], 'Computer': [23415, 12345], 'Physics': [23452, 23459], 'English': [12345]}
for i1 in grades:
if key==i1[-1]:
value.append(i1[1])
value_size=len(value)
if value_size>2:
end=int(value_size)/2
for i in value[-1:end]:
print float(count(i)/value_size)
print Dict
grades = [[12345,75,'English'],
[23452,83,'Physics'],
[23560,81,'Statistics'],
[23415,61,'Computer'],
[23459,90,'Physics'],
[12345,75,'Computer'],
[23452,100,'Statistics']]
teachers = [['Aharoni','English'],
['Melamed','Physics'],
['Kaner','Computer'],
['Zloti','Statistics'],
['Korman','Philosophy']]
print myDict(grades, teachers)
The result is:
>>>
{'Statistics': [23560, 23452, 81, 100], 'Philosophy': [], 'Computer': [23415, 12345, 61, 75], 'Physics': [23452, 23459, 83, 90], 'English': [12345, 75]}
None
>>>
What i want to get (it is in process, i am stuck in this level):
{ 'Aharoni': [12345, 75.0], 'Kaner': [23415, 12345, 68.0], 'Melamed': [23452, 23459, 86.5], 'Korman': [], 'Zloti': [23560, 23452, 90.5] }
What about this simple loop:
# Map each teacher to the first fields of the grade records for their
# subject, followed by the average of the second fields.
myDict = {}
for teacher, subject in teachers:
    values = []
    scores = []
    for i1, i2, s in grades:
        if subject == s:
            values.append(i1)
            scores.append(i2)
    if scores:
        # float() forces true division, so Python 2 does not truncate
        # e.g. (83 + 90) / 2 to 86 instead of 86.5.
        average = sum(scores) / float(len(scores))
        values.append(average)
    # Teachers without matching grades keep an empty list.
    myDict[teacher] = values
First, iterate through the teachers, and for each matching subject in the grade list, append i1 and i2 to separate lists.
At the end of each iteration, you can easily compute the average of the i2 values (if the list is not empty) and then update your dictionary.
The output with your data would be:
{
'Korman': [],
'Melamed': [23452, 23459, 86.5],
'Zloti': [23560, 23452, 90.5],
'Aharoni': [12345, 75.0],
'Kaner': [23415, 12345, 68.0]
}
List comprehensions are a great way to deal with a data structure like that:
def myDict(grades, teachers):
    """Map each teacher to their subject's grade records and average.

    Args:
        grades: sequence of records indexed as [number, grade, subject].
        teachers: sequence of records indexed as [teacher, subject].

    Returns:
        A dict keyed by teacher. Each value lists the first field of every
        grade record for that teacher's subject, followed by the average
        grade as a float. Teachers with no matching records map to [].
    """
    d = {}
    # Iterate the teacher records directly rather than looking the teacher up
    # again by subject, so two teachers of the same subject each get an entry.
    for entry in teachers:
        teacher, subject = entry[0], entry[1]
        subject_grades_records = [x for x in grades if x[2] == subject]
        value = [x[0] for x in subject_grades_records]
        if subject_grades_records:
            total = sum(x[1] for x in subject_grades_records)
            # float() keeps a true average on Python 2 as well.
            value.append(total / float(len(subject_grades_records)))
        d[teacher] = value
    return d
grades = [[12345,75,'English'],
[23452,83,'Physics'],
[23560,81,'Statistics'],
[23415,61,'Computer'],
[23459,90,'Physics'],
[12345,75,'Computer'],
[23452,100,'Statistics']]
teachers = [['Aharoni','English'],
['Melamed','Physics'],
['Kaner','Computer'],
['Zloti','Statistics'],
['Korman','Philosophy']]
print(repr(myDict(grades, teachers)))
# {'Kaner': [23415, 12345, 68.0], 'Aharoni': [12345, 75.0], 'Zloti': [23560, 23452, 90.5], 'Melamed': [23452, 23459, 86.5], 'Korman': []}

Group data inside list of dictionary python

I have a list of dictionaries like this
data = [
{"_id": {"cohort_name": "09-01-2010", "segment_name": "LTV90-Prime", "driver_name": "ADB"}, "cohort_data": [
{"calculated": [],
"original": [{"1": 225.2699758337715}, {"2": 106.05173118059133}, {"3": 547.2908664469512},
{"4": 573.1083659247656}]}]},
{"_id": {"cohort_name": "11-01-2010", "segment_name": "LTV90-Prime", "driver_name": "Unit Loss Rate"},
"cohort_data": [{"calculated": [], "original": [{"1": 0.002687180620372531}, {"2": 0.001468127113897437}]}]},
{"_id": {"cohort_name": "11-01-2010", "segment_name": "LTV90-Prime", "driver_name": "Unit Loss Rate"},
"cohort_data": [{"calculated": [], "original": [{"10": 0.002687180620372531}, {"1": 0.002687180620372531},
{"2": 0.001468127113897437}]}]}
]
I am trying to group data based upon the driver_name and segment_name and push all cohort_name and cohort_data inside the internal dictionary.
The expected output is as follows
[{'driver_name': 'Unit Loss Rate',
'segment_name': 'LTV90-Prime',
'cohort_data': {
'5-01-2010': [{'1': 0.002687180620372531}, {'2': 0.001468127113897437}, {'10': 0.002687180620372531}],
'11-01-2010': [{'1': 0.002687180620372531}, {'2': 0.001468127113897437}]
}},
{'driver_name': 'ADB',
'segment_name': 'LTV90-Prime',
'cohort_data': {
"09-01-2010": [{'1': 225.2699758337715}, {'2': 106.05173118059133}, {'3': 547.2908664469512},
{'4': 573.1083659247656}]
}}
]
This is what I have done so far. I am stuck in pushing the cohort_name and cohort_data in the internal dictionary.
def get_data_list(d):
final_data = None
for i in d:
calculated = i['calculated']
original = i['original']
if original:
final_data = original
elif calculated:
final_data = calculated
return final_data
dd = defaultdict(dict)
for i in data:
df = {}
id_ = i['_id']
cohort_name_final, segment_name_final, driver_name_final = id_['cohort_name'], \
id_['segment_name'], \
id_['driver_name']
cohort_data_final = i['cohort_data']
if segment_name_final not in df and segment_name_final not in df:
df['segment_name'] = segment_name_final
df['driver_name'] = driver_name_final
df['cohort_data'] = get_data_list(cohort_data_final)
elif segment_name_final in df and segment_name_final in df:
df['cohort_data'].append(get_data_list(cohort_data_final))
# df['cohort_data'].append({cohort_name_final: get_data_list(cohort_data_final)})
I am using Python 3.4.3. The data shown here is a subset of an original dataset which is queried from the MongoDB database.
Please help.

Updating values in list of dictionaries

I have a list of dictionaries something like this:
users=[{"name": "David", "team": "reds", "score1": 100, "score2": 20,},
{"name": "David", "team": "reds", "score1": 20, "score2": 60,},
{"name": "David", "team": "blues", "score1": 10, "score2": 70,}]
and would really like to get a new processed list of dictionaries something like
summary=[{"team": "reds", "total1": 120, "total2": 80,},
{"team": "blues", "total1": 120, "total2": 80,}]
preferably looping through the original data just once. I can create a dictionary holding a total value for each user key with this
summary = dict()
for user in users:
if not user['team'] in summary:
summary[user['team']]=float(user['score1'])
else:
summary[user['team']]+=float(user['score1'])
to give
summary = {'reds': 120,'blues': 10}
but am struggling with producing the list of dictionaries, the nearest I can get is to create a dictionary at the first instance of a team, and then try to append to its values on subsequent occurrences...
summary = []
for user in users:
if any(d['team'] == user['team'] for d in summary):
# append to values in the relevant dictionary
# ??
else:
# Add dictionary to list with some initial values
d ={'team':user['team'],'total1':user['score1'],'total2':user['score2']}
summary.append(dict(d))
...and it has gotten messy... Am I going about this in completely the wrong way? Can you change values in a dictionary within a list?
Thanks
I think this is a good case for using the pandas library for Python:
>>> import pandas as pd
>>> dfUsers = pd.DataFrame(users)
>>> dfUsers
name score1 score2 team
0 David 100 20 reds
1 David 20 60 reds
2 David 10 70 blues
>>> dfUsers.groupby('team').sum()
score1 score2
team
blues 10 70
reds 120 80
And if you really want to put it into dict:
>>> dfRes = dfUsers.groupby('team').sum()
>>> dfRes.columns = ['total1', 'total2'] # if you want to rename columns
>>> dfRes.reset_index().to_dict(orient='records')
[{'team': 'blues', 'total1': 10, 'total2': 70},
{'team': 'reds', 'total1': 120, 'total2': 80}]
another way to do this is with itertools.groupby:
>>> from itertools import groupby
>>> from operator import itemgetter
>>> users.sort(key=itemgetter('team'))
>>>
>>> res = [{'team': t[0], 'res': list(t[1])} for t in groupby(users, key=itemgetter('team'))]
>>> res = [{'team':t[0], 'total1': sum(x['score1'] for x in t[1]), 'total2': sum(x['score2'] for x in t[1])} for t in res]
>>> res
[{'team': 'blues', 'total1': 10, 'total2': 70},
{'team': 'reds', 'total1': 120, 'total2': 80}]
Or, if you really want simple python:
>>> res = dict()
>>> for x in users:
if x['team'] not in res:
res[x['team']] = [x['score1'], x['score2']]
else:
res[x['team']][0] += x['score1']
res[x['team']][1] += x['score2']
>>> res = [{'team': k, 'total1': v[0], 'total2': v[1]} for k, v in res.iteritems()}]
>>> res
[{'team': 'reds', 'total1': 120, 'total2': 80},
{'team': 'blues', 'total1': 10, 'total2': 70}]
You are really close, you just need a way to look up which dictionary to update. This is the simplest way I can see.
# Accumulate per-team score totals as floats, keyed by team name so the entry
# to update can be looked up directly.
summary = {}
for user in users:
    team = user['team']
    score1 = float(user['score1'])
    score2 = float(user['score2'])
    entry = summary.get(team)
    if entry is None:
        # First record for this team: seed its totals.
        summary[team] = {'team': team, 'score1': score1, 'score2': score2}
        continue
    entry['score1'] += score1
    entry['score2'] += score2
then
>>> print summary.values()
[{'score1': 120.0, 'score2': 80.0, 'team': 'reds'},
{'score1': 10.0, 'score2': 70.0, 'team': 'blues'}]
Here's my solution which assumes that all scores that need to be added start with score:
users = [
    {"name": "David", "team": "reds", "score1": 100, "score2": 20},
    {"name": "David", "team": "reds", "score1": 20, "score2": 60},
    {"name": "David", "team": "blues", "score1": 10, "score2": 70},
]

# Sum every key starting with 'score' per team: {team: {score_key: total}}.
totals = {}
for item in users:
    # Fetch this team's running totals, creating them on first sight.
    team_totals = totals.setdefault(item['team'], {})
    for k, v in item.items():
        if not k.startswith('score'):
            continue
        if k in team_totals:
            team_totals[k] += v
        else:
            team_totals[k] = v

# The function-call form of print works on both Python 2 and Python 3.
print(totals)
Output:
{'reds': {'score1': 120, 'score2': 80}, 'blues': {'score1': 10, 'score2': 70}}
See comments inline for an explanation
import pprint

users = [
    {"name": "David", "team": "reds", "score1": 100, "score2": 20},
    {"name": "David", "team": "reds", "score1": 20, "score2": 60},
    {"name": "David", "team": "blues", "score1": 10, "score2": 70},
]

scores_by_team = dict()
for user in users:
    if user['team'] not in scores_by_team:
        # Make sure you're gonna have your scores zeroed so you can add the
        # user's scores later
        scores_by_team[user['team']] = {
            'total1': 0,
            'total2': 0,
        }
    # Here the user's team exists for sure in scores_by_team
    scores_by_team[user['team']]['total1'] += user['score1']
    scores_by_team[user['team']]['total2'] += user['score2']

# So now, the scores you want have been calculated in a dictionary where the
# keys are the team names and the values are another dictionary with the scores
# that you actually wanted to calculate
print("Before making it a summary: %s" % pprint.pformat(scores_by_team))

summary = list()
# The loop variable has its own name so the aggregated scores_by_team dict is
# not rebound to the last team's inner dict.
for team_name, team_scores in scores_by_team.items():
    summary.append(
        {
            'team': team_name,
            'total1': team_scores['total1'],
            'total2': team_scores['total2'],
        }
    )
print("Summary: %s" % summary)
This outputs:
Before making it a summary: {'blues': {'total1': 10, 'total2': 70}, 'reds': {'total1': 120, 'total2': 80}}
Summary: [{'total1': 120, 'total2': 80, 'team': 'reds'}, {'total1': 10, 'total2': 70, 'team': 'blues'}]

Categories

Resources