nested json file with dictionary into a dataframe

nested json file with dictionary into a dataframe - python

I have a json file as
{
"Date": 2017,
"count": 88,
"demographics": [
{
"key": "age",
"value": "20-30"
},
{
"key": "education",
"value": 'bachelor'
},
{
"key": "income",
"value": "70-80"
},
{
"key": "location",
"value": "USA"
}
]
}
I use the code below to convert it into csv file. It gives me a dataframe with 3 columns of Date, count, and demographics, but the output I need is a dataframe with 6 columns of Date, Count, age, education, income, and location.
with open(r'Sample.json') as f:
data = json.load(f)
dfNormalized=json_normalize(data)

By using apply with pd.Serise:
d={
"Date": 2017,
"count": 88,
"demographics": [
{
"key": "age",
"value": "20-30"
},
{
"key": "education",
"value": 'bachelor'
},
{
"key": "income",
"value": "70-80"
},
{
"key": "location",
"value": "USA"
}
]
}
df=pd.DataFrame(d)
A=(df.demographics.apply(pd.Series).T.loc['value'].to_frame().\
set_index(df.demographics.apply(pd.Series).T.loc['key']).T).\
rename_axis(None,1).reset_index(drop=True)
pd.concat([df.drop('demographics',1).loc[0].to_frame().T,A],1)
Out[273]:
Date count age education income location
0 2017 88 20-30 bachelor 70-80 USA

Related

How to map the dictionary values to another dictionary

I have dictionary which is below
{
"aggregations": {
"A": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{ "key": "ADL", "doc_count": 1 },
{ "key": "SDD", "doc_count": 1 },
{ "key": "JJD", "doc_count": 1 }
]
},
"B": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{ "key": "ABC", "doc_count": 1 },
{ "key": "CDE", "doc_count": 1 },
{ "key": "FGH", "doc_count": 1 }
]
},
"C": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{ "key": "XYX", "doc_count": 1 },
{ "key": "NXS", "doc_count": 1 }
]
}
}
}
aggregations.keys will be aggregationfilters.fieldName
aggregations.buckets.key will be aggregationfilters.values.title
aggregationfilters.values.paragraph is null everytime
aggregations.buckets.doc_count will be aggregationfilters.values.count
Basically I need to extract aggregations.keys and aggregations.bucket values and put into different dictionary.
Need to write a general code structure to do that.
I cannot do with .pop(rename) the dictioanry
My expected out
{
"aggregationfilters": [
{
"name": "ABC",
"fieldName": "A",
"values": [
{ "title": "ADL", "paragraph": null, "count": 1 },
{ "title": "SDD", "paragraph": null, "count": 1 },
{ "title": "JJD", "paragraph": null, "count": 1 }
]
}, {
"name": "CDE",
"fieldName": "B",
"values": [
{ "title": "ABC", "paragraph": null, "count": 1 },
{ "title": "CDE", "paragraph": null, "count": 1 },
{ "title": "FGH", "paragraph": null, "count": 1 }
]
}, {
"name": "FGH",
"fieldName": "C",
"values": [
{ "title": "XYX", "paragraph": null, "count": 1 },
{ "title": "NXS", "paragraph": null, "count": 1 }
]
}
]
}

Well, this works, but even with my best effort this still doesn't look that clean.
import json
source = {
"aggregations": {
"A": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{"key": "ADL", "doc_count": 1},
{"key": "SDD", "doc_count": 1},
{"key": "JJD", "doc_count": 1},
],
},
"B": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{"key": "ABC", "doc_count": 1},
{"key": "CDE", "doc_count": 1},
{"key": "FGH", "doc_count": 1},
],
},
"C": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [{"key": "XYX", "doc_count": 1}, {"key": "NXS", "doc_count": 1}],
},
}
}
convert_map = {
"buckets": "values",
"doc_count": "count",
"key": "title",
}
remove_map = {"sum_other_doc_count", "doc_count_error_upper_bound"}
add_map = {"name": "Changed VAL_", "fieldName": "VAL_"}
def converting_generator(
source_: dict, convert_map_: dict, remove_map_: set, add_map_: dict
):
working_dict = {k: v for k, v in source_.items()}
variable_identifier = "VAL_"
for key, inner_dic in working_dict.items():
inner_dic: dict
for rm_key in remove_map_:
try:
inner_dic.pop(rm_key)
except KeyError:
pass
for add_key, add_val in add_map_.items():
inner_dic[add_key] = add_val.replace(variable_identifier, key)
dumped = json.dumps(inner_dic, indent=2)
for original, target in convert_map_.items():
dumped = dumped.replace(original, target)
yield json.loads(dumped)
converted = {
"aggregation_filters": list(
converting_generator(source["aggregations"], convert_map, remove_map, add_map)
)
}
for inner_dict in converted["aggregation_filters"]:
for even_inner_dict in inner_dict["values"]:
even_inner_dict["paragraph"] = None
print(json.dumps(converted, indent=2))
Output:
{
"aggregation_filters": [
{
"values": [
{
"title": "ADL",
"count": 1,
"paragraph": null
},
{
"title": "SDD",
"count": 1,
"paragraph": null
},
{
"title": "JJD",
"count": 1,
"paragraph": null
}
],
"name": "Changed A",
"fieldName": "A"
},
{
"values": [
{
"title": "ABC",
"count": 1,
"paragraph": null
},
{
"title": "CDE",
"count": 1,
"paragraph": null
},
{
"title": "FGH",
"count": 1,
"paragraph": null
}
],
"name": "Changed B",
"fieldName": "B"
},
{
"values": [
{
"title": "XYX",
"count": 1,
"paragraph": null
},
{
"title": "NXS",
"count": 1,
"paragraph": null
}
],
"name": "Changed C",
"fieldName": "C"
}
]
}
Always show your code, would be nice if that's a working one - to show that you've put at least that worth of the effort on your problem.
I don't bother it as this feels like puzzle solving, but others may not.

Manipulating data of Pandas dataframe

I'm reading a dataframe and trying to 'insert' a list inside another list and then converting it to json file. I'm using python 3 and 0.25.3 version of pandas for it.
My dataframe:
id label id_customer label_customer part_number number_client
6 Sao Paulo CUST-99992 Brazil 7897 982
6 Sao Paulo CUST-99992 Brazil 888 12
92 Hong Kong CUST-88888 China 147 288
My code:
import pandas as pd
import json
data = pd.read_excel(path)
data["part_number"] = data["part_number"].apply(lambda x: str(x))
data["number_client"] = data["number_client"].apply(lambda x: str(x))
data = data.groupby(["id", "label", "id_customer", "label_customer"], as_index=False).agg("#".join)
data["part_number"] = data["part_number"].apply(lambda x: {"part": x})
data["number_client"] = data["number_client"].apply(lambda x: {"client": x})
data["id_customer"] = data["id_customer"].apply(lambda x: {"id": x})
data["label_customer"] = data["label_customer"].apply(lambda x: {"label": x})
data["number"] = data.apply(lambda x: [{**x["part_number"], **x["number_client"]}], axis=1)
data["Customer"] = data.apply(lambda x: [{**x["id_customer"], **x["label_customer"], **data["number"]}],axis=1)
data = data[["id", "label", "Customer"]]
data.to_json(path)
Json output I'm getting:
[{
"id": 6,
"label": "Sao Paulo",
"Customer": [{
"id": "CUST-99992",
"label": "Brazil",
"0": [{
"part": "7897",
"client": "982"
}],
"1": [{
"part": "888",
"client": "12"
}],
"2": [{
"part": "147",
"client": "288"
}]
}]
}, {
"id": 6,
"label": "Sao Paulo",
"Customer": [{
"id": "CUST-99992",
"label": "Brazil",
"0": [{
"part": "7897",
"client": "982"
}],
"1": [{
"part": "888",
"client": "12"
}],
"2": [{
"part": "147",
"client": "288"
}]
}]
}, {
"id": 92,
"label": "Hong Kong",
"Customer": [{
"id": "CUST-888888",
"label": "China",
"0": [{
"part": "7897",
"client": "982"
}],
"1": [{
"part": "888",
"client": "12"
}],
"2": [{
"part": "147",
"client": "288"
}]
}]
}]
What I need:
[{
"id": 6,
"label": "Sao Paulo",
"Customer": [{
"id": "CUST-99992",
"label": "Brazil",
"number": [{
"part": "7897",
"client": "982"
},
{
"part": "888",
"client": "12"
}]
}]
},
{
"id": 92,
"label": "Hong Kong",
"Customer": [{
"id": "CUST-888888",
"label": "China",
"number": [{
"part": "147",
"client": "288"
}]
}]
}
]
Look that id and label a is group of information even as id_customer and label_customer is another group, part_number and number_client is another. Customerand number are lists and they can have a lot objects inside them (the number of objects depends of my data in my dataframe).
What am I doing wrong and how can I fix it?
Tks so much!

First cast both columns to strings and then use lambda functions with DataFrame.to_dict and rename columns names, last convert output to json by DataFrame.to_json:
data[["part_number","number_client"]] = data[["part_number","number_client"]].astype(str)
f = lambda x: x.split('_')[0]
j =(data.groupby(["id","label","id_customer","label_customer"])['part_number','number_client']
.apply(lambda x: x.rename(columns=f).to_dict('r')).reset_index(name='number')
.groupby(["id", "label"])[ "id_customer", "label_customer", "number"]
.apply(lambda x: x.rename(columns=f).to_dict('r')).reset_index(name='customer')
.to_json(orient='records'))
print (j)
[{
"id": 6,
"label": "Sao Paulo",
"customer": [{
"id": "CUST-99992",
"label": "Brazil",
"number": [{
"part": "7897",
"number": "982"
}, {
"part": "888",
"number": "12"
}]
}]
}, {
"id": 92,
"label": "Hong Kong",
"customer": [{
"id": "CUST-88888",
"label": "China",
"number": [{
"part": "147",
"number": "288"
}]
}]
}]

Pandas dataframe and conversion to Json

Basically I´m reading a pandas dataframe and converting it to Json. I´m a beginner in coding, but I know that is preferable to use apply function instead iterrows (and I already tried to use apply function, but some difficulties in understand the syntax and find out my solution arose)!!
===============================
Data that I´m reading from excel
id label id_customer label_customer part_number number_customer product label_product key country value_product
6 Sao Paulo CUST-99992 Brazil 982 10 sho1564 shoes SH-99 Chile 1.5
6 Sao Paulo CUST-99992 Brazil 982 10 sn47282 sneakers SN-71 Germany 43.8
6 Sao Paulo CUST-43535 Argentina 435 15 sk84393 skirt SK-11 Netherlands 87.1
92 Hong Hong CUST-88888 China 785 58 ca40349 cap CA-82 Russia 3.95
===============================
CODE:
import pandas as pd
import json
df = pd.read_excel(path)
result = []
for labels, df1 in df.groupby(['id', 'label'],sort=False):
id_, label = labels
record = {'id': int(id_), 'label': label, 'Customer': []}
for inner_labels, df2 in df1.groupby(['id_customer', 'label_customer'],sort=False):
id_,label = inner_labels
record['Customer'].append({
'id': id_,
'label': label,
'Number': [{'part': str(p), 'number_customer': str(s)} for p, s in zip(df2['part_number'], df2['number_customer'])]
})
result.append(record)
===============================
Json I'm getting:
[
{
"id": 6,
"label": "Sao Paulo",
"Customer": [
{
"id": "CUST-99992",
"label": "Brazil",
"Number": [
{
"part": "982",
"number_customer": "10"
},
{
"part": "982",
"number_customer": "10"
}
]
},
{
"id": "CUST-43535",
"label": "Argentina",
"Number": [
{
"part": "435",
"number_customer": "15"
}
]
}
]
},
{
"id": 92,
"label": "Hong Kong",
"Customer": [
{
"id": "CUST-88888",
"label": "China",
"Number": [
{
"part": "785",
"number_customer": "58"
}
]
}
]
}
]
===============================
Json expected:
[
{
"id": 6,
"label": "Sao Paulo",
"Customer": [
{
"id": "CUST-99992",
"label": "Brazil",
"Number": [
{
"part": "982",
"number_customer": "10",
"Procucts": [
{
"product": "sho1564",
"label_product": "shoes",
"Order": [
{
"key": "SH-99",
"country": "Chile",
"value_product": "1.5"
}
]
},
{
"product": "sn47282",
"label_product": "sneakers",
"Order": [
{
"key": "SN-71",
"country": "Germany",
"value_product": "43.8"
}
]
}
]
}
]
},
{
"id": "CUST-43535",
"label": "Argentina",
"Number": [
{
"part": "435",
"number_customer": "15",
"Procucts": [
{
"product": "sk84393",
"label_product": "skirt",
"Order": [
{
"key": "SK-11",
"country": "Netherlands",
"value_product": "87.1"
}
]
}
]
}
]
}
]
},
{
"id": 92,
"label": "Hong Kong",
"Customer": [
{
"id": "CUST-88888",
"label": "China",
"Number": [
{
"part": "785",
"number_customer": "58",
"Procucts": [
{
"product": "ca40349",
"label_product": "cap",
"Order": [
{
"key": "CA-82",
"country": "Russia",
"value_product": "3.95"
}
]
}
]
}
]
}
]
}
]
===============================
Look that id and label is group of information even as id_customer and label customer is another group, part_number and number_customer is another, product and label_product another, key, country and value_product another.
My expected Json depends of my information inside my dataframe.
Can somebody help me in any way pls?

import pandas as pd
import json
df = pd.read_excel(path)
result = []
for labels, df1 in df.groupby(['id', 'label'], sort=False):
id_, label = labels
record = {'id': int(id_), 'label': label, 'Customer': []}
for inner_labels, df2 in df1.groupby(['id_customer', 'label_customer'], sort=False):
id_, label = inner_labels
customer = {'id': id_, 'label': label, 'Number': []}
for inner_labels, df3 in df2.groupby(['part_number', 'number_customer'], sort=False):
p, s = inner_labels
number = {'part': str(p), 'number_customer': str(s), 'Products': []}
for inner_labels, df4 in df3.groupby(['product', 'label_product'], sort=False):
p, lp = inner_labels
product = {'product': p, 'label_product': lp, 'Order': []}
for k, c, v in zip(df4['key'], df4['country'], df4['value_product']):
product['Order'].append({'key': k, 'country': c, 'value_product': v})
number['Products'].append(product)
customer['Number'].append(number)
record['Customer'].append(customer)
result.append(record)

Hope this is of use!
from io import StringIO
import pandas as pd
import json
csv = """id,label,id_customer,label_customer,part_number,number_customer,product,label_product,key,country,value_product
6,Sao Paulo,CUST-99992,Brazil,982,10,sho1564,shoes,SH-99,Chile,1.5
6,Sao Paulo,CUST-99992,Brazil,982,10,sn47282,sneakers,SN-71,Germany,43.8
6,Sao Paulo,CUST-43535,Argentina,435,15,sk84393,skirt,SK-11,Netherlands,87.1
92,Hong Hong,CUST-88888,China,785,58,ca40349,cap,CA-82,Russia,3.95"""
csv = StringIO(csv)
df = pd.read_csv(csv)
def split(df, groupby, json_func):
for x, group in df.groupby(groupby):
yield json_func(group, *x)
a = list(split(df, ['id', 'label'], lambda grp, id_, label: {"id": id_, "label": label, "Customer": list(
split(grp, ['id_customer', 'label_customer'], lambda grp_1, id_cust, label_cust: {"id": id_cust, "label": label_cust, "Number": list(
split(grp_1, ['part_number', 'number_customer'], lambda grp_2, part, num_cust: {"part": part, "number_customer": num_cust, "Products": list(
split(grp_2, ['product', 'label_product'], lambda grp_3, product, label_product: {"product": product, "label_product": label_product, "Order": list(
split(grp_3, ['key', 'country', 'value_product'], lambda _, key, country, value_product: {"key": key, "country": country, "value_product": value_product}))}
))})
)}))}))
display(a)

Parsing and creating nested dictionaries

I would like to create a dictionary containing a nested structure of dictionaries, like bellow :
{
"Jaque": {
"ES": {
"Madrid": [
{
"experience": 9
}
]
},
"FR": {
"Lyon": [
{
"experience": 11.4
}
],
"Paris": [
{
"experience": 20
}
]
}
},
"James": {
"UK": {
"London": [
{
"experience": 10.9
}
]
}
},
"Henry": {
"UK": {
"London": [
{
"experience": 15
}
]
}
},
"Joe": {
"US": {
"Boston": [
{
"experience": 100
}
]
}
}
}
}
My input is a list of dictionaries of this format:
c = [{
"country": "US",
"city": "Boston",
"name": "Joe",
"experience": 100
},
{
"country": "FR",
"city": "Paris",
"name": "Jaque",
"experience": 20
},
{
"country": "FR",
"city": "Lyon",
"name": "Jaque",
"experience": 11.4
},
{
"country": "ES",
"city": "Madrid",
"name": "Jaque",
"experience": 9
},
{
"country": "UK",
"city": "London",
"name": "Henry",
"experience": 15
},
{
"country": "UK",
"city": "London",
"name": "James",
"experience": 10.9
}
]
My first approach was to create the nested dict, step by step:
dd = dict.fromkeys([i.get("name") for i in c],defaultdict(dict))
#will create
# dd = {'Joe': defaultdict(<class 'dict'>, {}), 'Jaque': defaultdict(<class 'dict'>, {}), 'James': defaultdict(<class 'dict'>, {}), 'Henry': defaultdict(<class 'dict'>, {})}
for i in dd:
for j in c:
#verify if name from d is in dict j
if i in j.values():
dd[i]=dict(zip([a.get("country") for a in c if i in a.values() ],[b.get("city") for b in c if i in b.values() ]))
# dd will become
#{'Joe': {'US': 'Boston'}, 'Jaque': {'FR': 'Lyon', 'ES': 'Madrid'}, 'Henry': {'UK': 'London'}, 'James': {'UK': 'London'}}
Now I can't figure a way to create/update the nested structure of dict dd. Is there a more dynamic way to create dict? Thx

You could use itertools.groupby to organize the list similarly to your expected output and then loop to convert to a dict.
from itertools import groupby
from operator import itemgetter
data = [{"country": "US", "city": "Boston", "name": "Joe", "experience": 100 }, {"country": "FR", "city": "Paris", "name": "Jaque", "experience": 20 }, {"country": "FR", "city": "Lyon", "name": "Jaque", "experience": 11.4 }, {"country": "ES", "city": "Madrid", "name": "Jaque", "experience": 9 }, {"country": "UK", "city": "London", "name": "Henry", "experience": 15 }, {"country": "UK", "city": "London", "name": "James", "experience": 10.9 } ]
result = {}
for key, values in groupby(sorted(data, key=itemgetter('name')), key=itemgetter('name')):
result[key] = {
v['country']: {v['city']: [{'experience': v['experience']}]} for v in values
}
print(result)
# {'Henry': {'UK': {'London': [{'experience': 15}]}}, 'James': {'UK': {'London': [{'experience': 10.9}]}}, 'Jaque': {'FR': {'Lyon': [{'experience': 11.4}]}, 'ES': {'Madrid': [{'experience': 9}]}}, 'Joe': {'US': {'Boston': [{'experience': 100}]}}}

You can use recursion with itertools.groupby:
from itertools import groupby
def group(d, keys = None):
key, *keys = keys
new_d = {a:list(b) for a, b in groupby(sorted(d, key=lambda x:x[key]), key=lambda x:x[key])}
t = {a:[{c:d for c, d in k.items() if c != key} for k in b] for a, b in new_d.items()}
return {a:group(b, keys) if not all(len(i) == 1 for i in b) else b for a, b in t.items()}
result = group(data, keys = ['name', 'country', 'city', 'experience'])
import json
print(json.dumps(result, indent=4)))
Output:
{
"Henry": {
"UK": {
"London": [
{
"experience": 15
}
]
}
},
"James": {
"UK": {
"London": [
{
"experience": 10.9
}
]
}
},
"Jaque": {
"ES": {
"Madrid": [
{
"experience": 9
}
]
},
"FR": {
"Lyon": [
{
"experience": 11.4
}
],
"Paris": [
{
"experience": 20
}
]
}
},
"Joe": {
"US": {
"Boston": [
{
"experience": 100
}
]
}
}
}

Merge two lists of dicts in python using pandas

I have two lists of dicts: one which has monthly data, and another which has quarterly data as follows:
monthly = [
{
"name": "Boston",
"month": "2015-May",
"total_monthly": 2
},
{
"name": "Boston",
"month": "2015-June",
"total_monthly": 8
},
{
"name": "Chicago",
"month": "2015-May",
"total_monthly": 10
},
{
"name": "Chicago",
"month": "2015-June",
"total_monthly": 13
}
]
quarterly =[
{
"name": "Boston",
"quarter": "2015-Q1",
"total_quarterly": 23
},
{
"name": "Boston",
"quarter": "2015-Q2",
"total_quarterly": 24
},
{
"name": "Chicago",
"quarter": "2015-Q1",
"total_quarterly": 40
},
{
"name": "Chicago",
"quarter": "2015-Q2",
"total_quarterly": 32
}
]
Conventionally, I can iterate through the lists and merge them based on common names. However, how can I achieve the merged data as follows using Pandas?
merged = [
{
"name": "Boston",
"trend_monthly" : [
{
"month": "2015-May",
"total_monthly": 2
},
{
"month": "2015-June",
"total_monthly": 8
},
],
"trend_quarterly" : [
{
"quarter": "2015-Q1",
"total_quarterly": 23
},
{
"quarter": "2015-Q2",
"total_quarterly": 24
},
]
},
{
"name": "Chicago",
"trend_monthly" : [
{
"month": "2015-May",
"total_monthly": 10
},
{
"month": "2015-June",
"total_monthly": 13
},
],
"trend_quarterly" : [
{
"quarter": "2015-Q1",
"total_quarterly": 40
},
{
"quarter": "2015-Q2",
"total_quarterly": 32
},
]
}]

You have to do something like this:
import pandas as pd
df_monthly = pd.DataFrame(monthly)
df_quarterly = pd.DataFrame(quarterly)
df = pd.concat([df_monthly, df_quarterly])
# This part does not group correctly, please edit for your needs
result = []
dict_monthly = dict(list(df[df.month.notnull()][['name',
'month',
'total_monthly']
].groupby(by='name')))
dict_quarterly = dict(list(df[df.quarter.notnull()][['name',
'quarter',
'total_quarterly']
].groupby(by='name')))
result.append(dict_monthly)
result.append(dict_quarterly)

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

nested json file with dictionary into a dataframe - python

Related

How to map the dictionary values to another dictionary

Manipulating data of Pandas dataframe

Pandas dataframe and conversion to Json

Parsing and creating nested dictionaries

Merge two lists of dicts in python using pandas

Categories

Resources