Related
I'm storing data from an API. For some countries the response omits certain subsets, so when I index into a missing subset and then call the .get() method on it, I get a KeyError.
I'm wondering how I can ignore these fields and return a null value when the data is not provided by the API response.
Below are the JSON responses for two different countries:
businessreport_ES.json
"salesAndTrafficByAsin": [{
"parentAsin": "AX0003",
"childAsin": "AXC0001",
"sku": "AXC1",
"salesByAsin": {
"unitsOrdered": 1,
"orderedProductSales": {
"amount": 31.06,
"currencyCode": "EUR"
},
"totalOrderItems": 1
},
"trafficByAsin": {
"browserSessions": 0,
"mobileAppSessions": 1,
"sessions": 1,
"browserSessionPercentage": 0.0,
"mobileAppSessionPercentage": 50.0,
"sessionPercentage": 14.29,
"browserPageViews": 0,
"mobileAppPageViews": 9,
"pageViews": 9,
"browserPageViewsPercentage": 0.0,
"mobileAppPageViewsPercentage": 90.0,
"pageViewsPercentage": 56.25,
"buyBoxPercentage": 100.0,
"unitSessionPercentage": 100.0
}
}]
businessreport_UK.json
"salesAndTrafficByAsin": [{
"parentAsin": "AX0003",
"childAsin": "AXC0001",
"sku": "AXC1",
"salesByAsin": {
"unitsOrdered": 0,
"unitsOrderedB2B": 0,
"orderedProductSales": {
"amount": 0.0,
"currencyCode": "GBP"
},
"orderedProductSalesB2B": {
"amount": 0.0,
"currencyCode": "GBP"
},
"totalOrderItems": 0,
"totalOrderItemsB2B": 0
},
"trafficByAsin": {
"browserSessions": 3,
"browserSessionsB2B": 0,
"mobileAppSessions": 12,
"mobileAppSessionsB2B": 0,
"sessions": 15,
"sessionsB2B": 0,
"browserSessionPercentage": 0.16,
"browserSessionPercentageB2B": 0.0,
"mobileAppSessionPercentage": 0.47,
"mobileAppSessionPercentageB2B": 0.0,
"sessionPercentage": 0.34,
"sessionPercentageB2B": 0.0,
"browserPageViews": 3,
"browserPageViewsB2B": 0,
"mobileAppPageViews": 15,
"mobileAppPageViewsB2B": 0,
"pageViews": 18,
"pageViewsB2B": 0,
"browserPageViewsPercentage": 0.12,
"browserPageViewsPercentageB2B": 0.0,
"mobileAppPageViewsPercentage": 0.46,
"mobileAppPageViewsPercentageB2B": 0.0,
"pageViewsPercentage": 0.31,
"pageViewsPercentageB2B": 0.0,
"buyBoxPercentage": 0.0,
"buyBoxPercentageB2B": 0.0,
"unitSessionPercentage": 0.0,
"unitSessionPercentageB2B": 0.0
}
}]
Below is my code:
# Load the saved business report for one marketplace and flatten every entry
# of 'salesAndTrafficByAsin' into one flat dict per ASIN.
# `marketplace` and `datef` are not defined in this snippet.
# NOTE(review): the file handle `f` is not closed anywhere in this snippet.
f = open(f'./responses/businessreport_{marketplace}.json')
jsondata = json.load(f)
salesAndTrafficByAsin = []
for item in jsondata['salesAndTrafficByAsin']:
salesAndTrafficByAsin.append({
"date": pd.to_datetime(datef),
"parentAsin": item.get('parentAsin'),
"childAsin": item.get('childAsin'),
'unitsOrdered': item["salesByAsin"].get('unitsOrdered'),
'unitsOrderedB2B': item["salesByAsin"].get('unitsOrderedB2B'),
'orderedProductSales': item["salesByAsin"]['orderedProductSales'].get('amount'),
'currencyCode': item["salesByAsin"]['orderedProductSales'].get('currencyCode'),
# NOTE: the two lines below index ['orderedProductSalesB2B'] directly, so a
# KeyError is raised before .get() is ever called when the report has no B2B
# section (as in the ES sample); only the final .get() tolerates a missing key.
'orderedProductSales_B2B': item["salesByAsin"]['orderedProductSalesB2B'].get('amount'),
'currencyCode_B2B': item["salesByAsin"]['orderedProductSalesB2B'].get('currencyCode'),
# Traffic metrics: .get() yields None for B2B keys absent from the response.
'browserSessions': item["trafficByAsin"].get('browserSessions'),
"browserSessionsB2B": item["trafficByAsin"].get('browserSessionsB2B'),
"mobileAppSessions": item["trafficByAsin"].get('mobileAppSessions'),
"mobileAppSessionsB2B": item["trafficByAsin"].get('mobileAppSessionsB2B'),
"sessions": item["trafficByAsin"].get('sessions'),
"sessionsB2B": item["trafficByAsin"].get('sessionsB2B'),
"browserSessionPercentage": item["trafficByAsin"].get('browserSessionPercentage'),
"browserSessionPercentageB2B": item["trafficByAsin"].get('browserSessionPercentageB2B'),
"mobileAppSessionPercentage": item["trafficByAsin"].get('mobileAppSessionPercentage'),
"mobileAppSessionPercentageB2B": item["trafficByAsin"].get('mobileAppSessionPercentageB2B'),
"sessionPercentage": item["trafficByAsin"].get('sessionPercentage'),
"sessionPercentageB2B": item["trafficByAsin"].get('sessionPercentageB2B'),
"browserPageViews": item["trafficByAsin"].get('browserPageViews'),
"browserPageViewsB2B": item["trafficByAsin"].get('browserPageViewsB2B'),
"mobileAppPageViews": item["trafficByAsin"].get('mobileAppPageViews'),
"mobileAppPageViewsB2B": item["trafficByAsin"].get('mobileAppPageViewsB2B'),
"pageViews": item["trafficByAsin"].get('pageViews'),
"pageViewsB2B": item["trafficByAsin"].get('pageViewsB2B'),
"browserPageViewsPercentage": item["trafficByAsin"].get('browserPageViewsPercentage'),
"browserPageViewsPercentageB2B": item["trafficByAsin"].get('browserPageViewsPercentageB2B'),
"mobileAppPageViewsPercentage": item["trafficByAsin"].get('mobileAppPageViewsPercentage'),
"mobileAppPageViewsPercentageB2B": item["trafficByAsin"].get('mobileAppPageViewsPercentageB2B'),
"pageViewsPercentage": item["trafficByAsin"].get('pageViewsPercentage'),
"pageViewsPercentageB2B": item["trafficByAsin"].get('pageViewsPercentageB2B'),
"buyBoxPercentage": item["trafficByAsin"].get('buyBoxPercentage'),
"buyBoxPercentageB2B": item["trafficByAsin"].get('buyBoxPercentageB2B'),
})
So the difference here is that businessreport_UK.json has the orderedProductSalesB2B subset while businessreport_ES.json doesn't. As a result, the code raises the error shown below when looping over the businessreport_ES.json data:
46 salesAndTrafficByDate = []
47 for item in jsondata['salesAndTrafficByDate']:
48 salesAndTrafficByDate.append({
49 "date": pd.to_datetime(item.get('date')),
50 "orderedProductSales": item["salesByDate"]['orderedProductSales'].get('amount'),
51 "CurrencyCode": item["salesByDate"]['orderedProductSales'].get('currencyCode'),
---> 52 "orderedProductSales_B2B": item["salesByDate"]['orderedProductSalesB2B'].get('amount'),
53 "CurrencyCode_B2B": item["salesByDate"]['orderedProductSalesB2B'].get('currencyCode'),
54 "unitsOrdered": item["salesByDate"].get("unitsOrdered"),
55 "unitsOrdered_B2B": item["salesByDate"].get("unitsOrderedB2B"),
56 "totalOrderItems": item["salesByDate"].get("totalOrderItems"),
57 "totalOrderItems_B2B": item["salesByDate"].get("totalOrderItemsB2B"),
58 "averageSalesPerOrderItem": item["salesByDate"]["averageSalesPerOrderItem"].get("amount"),
59 "averageSalesPerOrderItem_B2B": item["salesByDate"]["averageSalesPerOrderItemB2B"].get("amount"),
60 "averageUnitsPerOrderItem": item["salesByDate"].get("averageUnitsPerOrderItem"),
61 "averageUnitsPerOrderItem_B2B": item["salesByDate"].get("averageUnitsPerOrderItem"),
62 "averageSellingPrice": item["salesByDate"]["averageSellingPrice"].get("amount"),
63 "averageSellingPrice_B2B": item["salesByDate"]["averageSellingPriceB2B"].get("amount"),
64 "unitsRefunded": item["salesByDate"].get("unitsRefunded"),
65 "refundRate": item["salesByDate"].get("refundRate"),
66 "claimsGranted": item["salesByDate"].get("claimsGranted"),
67 "claimsAmount": item["salesByDate"]["claimsAmount"].get("amount"),
...
71 })
72 salesAndTrafficByAsin = []
74 for item in jsondata['salesAndTrafficByAsin']:
KeyError: 'orderedProductSalesB2B'
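IMHO you should chain multiple .get() calls with an empty-dict default, e.g. item["salesByAsin"].get('orderedProductSalesB2B', {}).get('amount'), because that is (among other things) what .get() was designed for.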
I wouldn't recommend it to be honest, but you can define a method to "safely" get values from a nested dictionary.
loop over "nested keys"
reassign the result
return the result OR default if the key is not present at some level of nesting
def nested_get(d, *keys, default=None):
    """
    Return a value from a nested container, or ``default`` if any level is missing.

    nested_get(d, "a", "b") == d["a"]["b"]

    Args:
        d: The outermost mapping or sequence to look into.
        *keys: Keys or indexes to follow, outermost first.
        default: Value returned when a key or index is absent, or when an
            intermediate value cannot be subscripted.

    Returns:
        The value found after following every key, or ``default``.
    """
    for k in keys:
        try:
            d = d[k]
        # LookupError covers KeyError (missing dict key) and IndexError
        # (list index out of range); TypeError covers subscripting a
        # non-container such as an int or None.
        except (LookupError, TypeError, ValueError):
            return default
    return d
a = {"w":{"d":5}}
b = {"x":{"D": 5}}
value = nested_get(b,"x","D")
print(value) # This prints 5
value = nested_get(a,"w","d","t")
print(value) # This prints "None"
value = nested_get(a,"w","d","t", default = 11)
print(value) # This prints 11
I have a dictionary like this:
no_empty_keys = {'783': [['4gsx', 'ADTQGS', 0.3333333333333333, {'A': ['A224', 'T226'], 'B': ['A224', 'T226']}, 504, 509], ['4gt0', 'ADTQGS', 0.3333333333333333, {'A': ['A224', 'T226'], 'B': ['A224', 'T226']}, 504, 509]],'1062': [['4gsx', 'AELTGY', 0.5, {'A': ['L175', 'T176', 'Y178'], 'B': ['L175', 'T176', 'Y178']}, 453, 458], ['4gt0', 'AELTGY', 0.5, {'A': ['L175', 'T176', 'Y178'], 'B': ['L175', 'T176', 'Y178']}, 453, 458]]}
My function to transform that into a CSV is this one:
# Start from an empty frame that only declares the output columns.
epitope_df = pd.DataFrame(columns=['Epitope ID', 'PDB', 'Percent Identity', 'Epitope Mapped', 'Epitope Sequence', 'Starting Position', 'Ending Position'])
# x is the epitope id; y is one record list [pdb, sequence, identity, mapping, start, end].
for x in no_empty_keys:
for y in no_empty_keys[x]:
# One DataFrame.append call per record: this per-row append is the part that gets slow on large inputs.
epitope_df = epitope_df.append({'Epitope ID': x, 'PDB': y[0], 'Percent Identity': y[2], 'Epitope Mapped' : y[3], 'Epitope Sequence' : y[1], 'Starting Position' : y[4], 'Ending Position' : y[5]}, ignore_index=True)
epitope_df.to_csv('test.csv', index=False)
My output is a csv file like this:
It works, but it isn't well optimized: the process is very slow when I run it on a dictionary with more than 10,000 entries. Any ideas on how to speed this process up? Thank you for your time.
I'd start with getting rid of pandas.append. Appending rows to DataFrames is inefficient. You can create a DataFrame in one go:
# Column order of the output CSV.
columns = [
    'Epitope ID',
    'PDB',
    'Percent Identity',
    'Epitope Mapped',
    'Epitope Sequence',
    'Starting Position',
    'Ending Position',
]

# Collect every record as a plain dict first; appending to a list is cheap,
# whereas growing a DataFrame row by row is not.
result = []
for epitope_id, records in no_empty_keys.items():
    for record in records:
        result.append(
            {
                'Epitope ID': epitope_id,
                'PDB': record[0],
                'Percent Identity': record[2],
                'Epitope Mapped': record[3],
                'Epitope Sequence': record[1],
                'Starting Position': record[4],
                'Ending Position': record[5],
            }
        )

# Build the frame in one go. Using the DataFrame class directly means this
# snippet does not depend on a pre-existing `epitope_df`, and passing
# `columns` keeps the header even when `result` is empty.
epitope_df = pd.DataFrame(result, columns=columns)
epitope_df.to_csv('new.csv', index=False)
You can either write an ad hoc code by hand or use convtools library, which generates such converters for you:
from convtools import conversion as c
from convtools.contrib.tables import Table
no_empty_keys = {
"783": [
[ "4gsx", "ADTQGS", 0.3333333333333333, {"A": ["A224", "T226"], "B": ["A224", "T226"]}, 504, 509, ],
[ "4gt0", "ADTQGS", 0.3333333333333333, {"A": ["A224", "T226"], "B": ["A224", "T226"]}, 504, 509, ],
],
"1062": [
[ "4gsx", "AELTGY", 0.5, {"A": ["L175", "T176", "Y178"], "B": ["L175", "T176", "Y178"]}, 453, 458,],
[ "4gt0", "AELTGY", 0.5, {"A": ["L175", "T176", "Y178"], "B": ["L175", "T176", "Y178"]}, 453, 458, ],
],
}
columns = (
"Epitope ID",
"PDB",
"Percent Identity",
"Epitope Mapped",
"Epitope Sequence",
"Starting Position",
"Ending Position",
)
# this is just a function, so it can be run on startup once and stored for
# further reuse

# Positions inside each record list, listed in the order of the header columns
# that follow "Epitope ID": PDB, Percent Identity, Epitope Mapped,
# Epitope Sequence, Starting Position, Ending Position.
# A record is laid out as [pdb, sequence, identity, mapping, start, end], so
# reading positions 0..5 in order would put the sequence under
# "Percent Identity" and shift the next two columns as well.
record_positions = (0, 2, 3, 1, 4, 5)

converter = (
    # input: iterable of (epitope_id, records) pairs
    c.iter(
        # pair the epitope id with each of its records
        c.zip(
            c.repeat(c.item(0)),
            c.item(1)
        ).iter(
            # one output row: (epitope_id, *record fields in header order)
            (c.item(0),) + tuple(c.item(1, i) for i in record_positions)
        )
    )
    # merge the per-epitope row iterables into a single stream of rows
    .flatten()
    .gen_converter()
)
# here is the stuff to profile
Table.from_rows(
converter(no_empty_keys.items()),
header=columns,
).into_csv("out.csv")
Consider installing black and passing debug=True to gen_converter if you are curious on the code convtools generates under the hood.
I received the output from my Vector AutoRegression (VAR) algorithm as a data frame and converted it into a dictionary giving me the following structure:
{
'Date': '2021-05-07',
'BMI': 40.53002073252068,
'BP': 123.00463807559225,
'BloodSugar': 126.85415609085157,
'ThyroidFunction': 3.0,
'TF': 5.0
}
and I want to restructure it in this form :
# {Vital : {date : value, risk_value : 1}}
{
'BMI' : {'2021-05-07' : 40.53002073252068, risk_value : 1},
'BP': {'2021-05-07' : 123.00463807559225, risk_value : 1},
'BloodSugar' :{'2021-05-07' : 126.85415609085157, risk_value : 1},
'ThyroidFunction' : {'2021-05-07' : 3.0, risk_value:1},
'TF' : {'2021-05-07' : 5.0, risk_value:1}
}
Here the "risk_value : 1" is static for now.
The only thing constant in the output from VAR in the 1st dictionary would be the 1st key- date.
The value of date will change everyday.
The Vitals(BMI, BP, BloodSugar, ThyroidFunction, TF) may vary with input with either new vitals such as (weight, height, BMI, BP, BloodSugar) or completely different vitals(cholestrolLevel, HeartRate, LDL). Like so:
{
'Date': '2021-05-07',
'weight': '170lbs',
'height': '175cm',
'BMI': 39.3252068004638,
'BP': 104.530020707559225,
'BloodSugar': 126.85415609085157,
}
I wanted to make a function which dynamically restructures the dictionary. so I tried creating a function which would take all the keys of the dictionary and tried to run it through while loop and which then creates a dictionary with vitals and the values but was unsuccessful.
I've been stuck on this for a long time and any help will be greatly appreciated.
your_dict = {
'Date': '2021-05-07',
'BMI': 40.53002073252068,
'BP': 123.00463807559225,
'BloodSugar': 126.85415609085157,
'ThyroidFunction': 3.0,
'TF': 5.0
}
def parse_dict(dict_in):
    """Re-key a flat reading dict by vital name.

    Every key other than 'Date' becomes an entry of the form
    ``{<date value>: <reading>, 'risk_value': 1}``.
    """
    return {
        vital: {dict_in['Date']: reading, 'risk_value': 1}
        for vital, reading in dict_in.items()
        if vital != 'Date'
    }
dict_out = parse_dict(your_dict)
Seems #peter beat me to it but since I've already bothered...
def foo(some_dict, **kwargs):
    """Restructure a flat reading dict into ``{vital: {date: value, **kwargs}}``.

    Args:
        some_dict: Mapping with a 'Date' entry plus one entry per vital.
            It is read only; the caller's dict is left untouched, so the
            function can be called repeatedly on the same input.
        **kwargs: Extra static entries (e.g. ``risk_value=1``) merged into
            every per-vital dict.

    Returns:
        A new dict keyed by vital name.

    Raises:
        KeyError: If ``some_dict`` has no 'Date' entry.
    """
    date_key = some_dict['Date']
    ret = {}
    for k, v in some_dict.items():
        if k == 'Date':
            continue
        entry = {date_key: v}
        entry.update(kwargs)
        ret[k] = entry
    return ret
d = {
'Date': '2021-05-07',
'weight': '170lbs',
'height': '175cm',
'BMI': 39.3252068004638,
'BP': 104.530020707559225,
'BloodSugar': 126.85415609085157,
}
from pprint import pprint
pprint(foo(d, risk_value=42, abc='x'))
Here you go!
def restructure(dct):
    """Rewrite ``dct`` in place so each vital maps to {date: value, 'risk_value': 1}.

    The 'Date' entry is removed from ``dct`` and its value becomes the key
    under which every remaining reading is stored. The same (mutated) dict
    object is returned.
    """
    date = dct.pop('Date')  # take the date out; it is no longer a top-level key
    # Build all replacements first, then write them back over the same keys.
    replacements = {
        vital: {date: reading, 'risk_value': 1}
        for vital, reading in dct.items()
    }
    dct.update(replacements)
    return dct
print(restructure(d))
I'm trying to change the result so that if there are 2 grades in the values, they are replaced with their average. I have tried many techniques to do that but failed.
I need a solution that computes the average and removes the 2 individual grade values.
I wrote this code:
# Python 2 snippet (print statements). For each subject found in `teachers` it
# collects the student ids and then the grades of the matching `grades` rows
# into one list stored in Dict under the subject name.
# NOTE(review): the original indentation was lost, so the exact nesting of the
# loops below cannot be confirmed from this text.
def myDict(grades, teachers):
Dict={}
for i1 in grades:
for i2 in teachers:
# i2 is a [teacher, subject] pair; the subject is used as the dict key.
key=i2[1]
value=[]
Dict[key]=value #{'Statistics': [], 'Philosophy': [], 'Computer': [], 'Physics': [], 'English': []}
for i1 in grades:
# i1 is a [student_id, grade, subject] row; the last element is the subject.
if key==i1[-1]:
value.append(i1[0]) #{'Statistics': [23560, 23452], 'Philosophy': [], 'Computer': [23415, 12345], 'Physics': [23452, 23459], 'English': [12345]}
for i1 in grades:
if key==i1[-1]:
value.append(i1[1])
value_size=len(value)
if value_size>2:
end=int(value_size)/2
# NOTE(review): `count` is not defined anywhere in this snippet.
for i in value[-1:end]:
print float(count(i)/value_size)
# NOTE(review): Dict is printed rather than returned; no return statement is visible.
print Dict
grades = [[12345,75,'English'],
[23452,83,'Physics'],
[23560,81,'Statistics'],
[23415,61,'Computer'],
[23459,90,'Physics'],
[12345,75,'Computer'],
[23452,100,'Statistics']]
teachers = [['Aharoni','English'],
['Melamed','Physics'],
['Kaner','Computer'],
['Zloti','Statistics'],
['Korman','Philosophy']]
print myDict(grades, teachers)
The result is:
>>>
{'Statistics': [23560, 23452, 81, 100], 'Philosophy': [], 'Computer': [23415, 12345, 61, 75], 'Physics': [23452, 23459, 83, 90], 'English': [12345, 75]}
None
>>>
What i want to get (it is in process, i am stuck in this level):
{ 'Aharoni': [12345, 75.0], 'Kaner': [23415, 12345, 68.0], 'Melamed': [23452, 23459, 86.5], 'Korman': [], 'Zloti': [23560, 23452, 90.5] }
What about this simple loop:
# Map each teacher to the student ids graded in their subject, followed by the
# average grade for that subject. Teachers with no matching grades get [].
myDict = {}
for teacher, subject in teachers:
    values = []
    scores = []
    for student_id, score, graded_subject in grades:
        if subject == graded_subject:
            values.append(student_id)
            scores.append(score)
    if scores:
        # float() forces true division, so the average is not truncated to an
        # int under Python 2 (e.g. 86.5 rather than 86).
        average = sum(scores) / float(len(scores))
        values.append(average)
    myDict[teacher] = values
First, iterate through the teachers, and for each matching subject in the grade list, append i1 and i2 to their respective lists.
At the end of each iteration, you can easily compute the average of the i2 values (if the list is not empty) and then update your dictionary.
The output with your data would be:
{
'Korman': [],
'Melamed': [23452, 23459, 86.5],
'Zloti': [23560, 23452, 90.5],
'Aharoni': [12345, 75.0],
'Kaner': [23415, 12345, 68.0]
}
List comprehensions are a great way to deal with a data structure like that:
def myDict(grades, teachers):
    """Map each teacher to the student ids in their subject plus the average grade.

    Args:
        grades: Rows of ``[student_id, grade, subject]``.
        teachers: Rows of ``[teacher_name, subject]``.

    Returns:
        A dict ``{teacher_name: [student_id, ..., average]}``. The average
        (a float) is appended only when the subject has at least one grade;
        otherwise the teacher maps to an empty list.
    """
    d = {}
    # Iterate the teacher rows directly instead of looking the teacher up by
    # subject afterwards, so that two teachers sharing a subject both get an
    # entry and `teachers` is not rescanned for every subject.
    for entry in teachers:
        teacher, subject = entry[0], entry[1]
        records = [g for g in grades if g[2] == subject]
        value = [g[0] for g in records]
        if records:
            # float() keeps true division on Python 2 as well.
            value.append(sum(g[1] for g in records) / float(len(records)))
        d[teacher] = value
    return d
grades = [[12345,75,'English'],
[23452,83,'Physics'],
[23560,81,'Statistics'],
[23415,61,'Computer'],
[23459,90,'Physics'],
[12345,75,'Computer'],
[23452,100,'Statistics']]
teachers = [['Aharoni','English'],
['Melamed','Physics'],
['Kaner','Computer'],
['Zloti','Statistics'],
['Korman','Philosophy']]
print(repr(myDict(grades, teachers)))
# {'Kaner': [23415, 12345, 68.0], 'Aharoni': [12345, 75.0], 'Zloti': [23560, 23452, 90.5], 'Melamed': [23452, 23459, 86.5], 'Korman': []}
I have a list of dictionaries like this
data = [
{"_id": {"cohort_name": "09-01-2010", "segment_name": "LTV90-Prime", "driver_name": "ADB"}, "cohort_data": [
{"calculated": [],
"original": [{"1": 225.2699758337715}, {"2": 106.05173118059133}, {"3": 547.2908664469512},
{"4": 573.1083659247656}]}]},
{"_id": {"cohort_name": "11-01-2010", "segment_name": "LTV90-Prime", "driver_name": "Unit Loss Rate"},
"cohort_data": [{"calculated": [], "original": [{"1": 0.002687180620372531}, {"2": 0.001468127113897437}]}]},
{"_id": {"cohort_name": "11-01-2010", "segment_name": "LTV90-Prime", "driver_name": "Unit Loss Rate"},
"cohort_data": [{"calculated": [], "original": [{"10": 0.002687180620372531}, {"1": 0.002687180620372531},
{"2": 0.001468127113897437}]}]}
]
I am trying to group data based upon the driver_name and segment_name and push all cohort_name and cohort_data inside the internal dictionary.
The expected output is as follows
[{'driver_name': 'Unit Loss Rate',
'segment_name': 'LTV90-Prime',
'cohort_data': {
'5-01-2010': [{'1': 0.002687180620372531}, {'2': 0.001468127113897437}, {'10': 0.002687180620372531}],
'11-01-2010': [{'1': 0.002687180620372531}, {'2': 0.001468127113897437}]
}},
{'driver_name': 'ADB',
'segment_name': 'LTV90-Prime',
'cohort_data': {
"09-01-2010": [{'1': 225.2699758337715}, {'2': 106.05173118059133}, {'3': 547.2908664469512},
{'4': 573.1083659247656}]
}}
]
This is what I have done so far. I am stuck in pushing the cohort_name and cohort_data in the internal dictionary.
def get_data_list(d):
    """Return the data list of the last entry in ``d`` that has any data.

    For each entry the non-empty 'original' list is preferred; the
    'calculated' list is used only when 'original' is empty. Entries where
    both are empty leave the previous choice in place. Returns None when no
    entry has data.
    """
    final_data = None
    for entry in d:
        calculated, original = entry['calculated'], entry['original']
        chosen = original or calculated
        if chosen:
            final_data = chosen
    return final_data
# Work-in-progress grouping loop from the question: for each record it pulls
# the cohort/segment/driver names out of '_id' and stores them in `df`.
# NOTE(review): `dd` is created here but never used in this snippet.
dd = defaultdict(dict)
for i in data:
# NOTE(review): `df` is re-created empty on every iteration, so the
# `not in df` test below is always true and the elif branch never runs.
df = {}
id_ = i['_id']
cohort_name_final, segment_name_final, driver_name_final = id_['cohort_name'], \
id_['segment_name'], \
id_['driver_name']
cohort_data_final = i['cohort_data']
# NOTE(review): both halves of each condition test segment_name_final
# (driver_name_final is never checked), and they test it against the keys of
# `df`, which are 'segment_name' / 'driver_name' / 'cohort_data'.
if segment_name_final not in df and segment_name_final not in df:
df['segment_name'] = segment_name_final
df['driver_name'] = driver_name_final
df['cohort_data'] = get_data_list(cohort_data_final)
elif segment_name_final in df and segment_name_final in df:
df['cohort_data'].append(get_data_list(cohort_data_final))
# NOTE(review): cohort_name_final is only referenced in the commented-out line
# below, and `df` is not stored anywhere after it is filled.
# df['cohort_data'].append({cohort_name_final: get_data_list(cohort_data_final)})
I am using Python 3.4.3. The data shown here is a subset of an original dataset that is queried from a MongoDB database.
Please help.