How to convert nested json to csv with multiple different names? - python

I've been trying to convert a nested json file to csv. Here is a small example of the json file.
json_data =
{"labels":
{
"longfilename01:png": {
"events": {
"-N8V6uUR__vvB0qv1lPb": {
"t": "2022-08-02T19:54:23.608Z",
"user": "bmEhwNCZT9Wiftgvsopb7vBjO9o1"
}
},
"questions": {
"would-you": {
"-N8V6uUR__vvB0qv1lPb": {
"answer": "no",
"format": 1
}
}
}
},
"longfilename02:png": {
"events": {
"-N8ILnaH-1ylwp2LGvtP": {
"t": "2022-07-31T08:24:23.698Z",
"user": "Qf7C5cXQkXfQanxKPR0rsKW4QzE2"
}
},
"questions": {
"would-you": {
"-N8ILnaH-1ylwp2LGvtP": {
"answer": "yes",
"format": 1
}
}
}
}
I've tried multiple ways to get this output:
Labels
Event
User
Time
Answer
Long filename 01
-N8V6uUR__vvB0qv1lPb
bmEhwNCZT9Wiftgvsopb7vBjO9o1
2022-08-02T19:54:23.608Z
no
Long filename 02
-N8ILnaH-1ylwp2LGvtP
Qf7C5cXQkXfQanxKPR0rsKW4QzE2
2022-07-31T08:24:23.698Z
yes
If I normalise with:
# Parse the labels file. The context manager closes the handle even when
# json.load raises, instead of leaking it as a bare open() would.
with open('after_labels.json') as f:
    data = json.load(f)
# Flatten the nested mapping into a single-row frame (one column per leaf path).
df = pd.json_normalize(data)
Or try to flatten the file with multiple functions such as:
def flatten_json(json):
    """Flatten a nested mapping into a single-level dict.

    Each leaf value is stored under the '__'-joined path of keys that leads
    to it; list elements contribute their position (as a string) to the path.
    Empty dicts and empty lists produce no entry.
    """
    def walk(path, node):
        # Yield (joined_path, leaf) pairs in depth-first, insertion order.
        if isinstance(node, dict):
            children = ((name, node[name]) for name in node.keys())
        elif isinstance(node, list):
            children = ((str(pos), item) for pos, item in enumerate(node))
        else:
            yield '__'.join(path), node
            return
        for step, child in children:
            yield from walk(path + [step], child)

    flat = {}
    for top_key in json.keys():
        for column, leaf in walk([top_key], json[top_key]):
            flat[column] = leaf
    return flat
df = flatten_json(data)
or
from copy import deepcopy
import pandas
def cross_join(left, right):
    """Return the cartesian combination of two lists of row dicts.

    Every row of *left* is merged with every row of *right* (right-hand keys
    win on collision) and the merged row is deep-copied into the result.
    When *right* is falsy, *left* itself is returned unchanged.
    """
    joined = left if not right else []
    for base, extra in ((b, e) for b in left for e in right):
        combined = deepcopy(base)
        combined.update(extra)
        joined.append(deepcopy(combined))
    return joined
def flatten_list(data):
    """Lazily yield the non-list items of *data*, descending into nested lists.

    Only ``list`` instances are expanded; tuples and other containers are
    yielded as-is.
    """
    for item in data:
        if not isinstance(item, list):
            yield item
            continue
        for nested in flatten_list(item):
            yield nested
def json_to_dataframe(data_in):
    """Build a pandas DataFrame from arbitrarily nested JSON-like data.

    Dict keys become dotted column headings; sibling keys are combined with
    ``cross_join`` and list items are stacked as additional rows.
    """
    def expand(node, heading=''):
        # Returns a list of row dicts describing *node*.
        if isinstance(node, dict):
            combos = [{}]
            for name, child in node.items():
                combos = cross_join(combos, expand(child, heading + '.' + name))
            return combos
        if isinstance(node, list):
            stacked = []
            for element in node:
                stacked.extend(flatten_list(expand(element, heading)))
            return stacked
        # Leaf: drop the leading '.' that the first concatenation introduced.
        return [{heading[1:]: node}]

    return pandas.DataFrame(expand(data_in))
df = json_to_dataframe(data)
print(df)
It gives me 292 columns and I suspect this is because of the long unique filenames.
I can't change the json file before processing; otherwise the simple solution would be to store each name as "filename": "longfilename01:png", so that the keys are all consistent and I wouldn't have this problem.
I would be grateful for any other clever ideas on how to solve this.

Try:
json_data = {
"labels": {
"longfilename01:png": {
"events": {
"-N8V6uUR__vvB0qv1lPb": {
"t": "2022-08-02T19:54:23.608Z",
"user": "bmEhwNCZT9Wiftgvsopb7vBjO9o1",
}
},
"questions": {
"would-you": {
"-N8V6uUR__vvB0qv1lPb": {"answer": "no", "format": 1}
}
},
},
"longfilename02:png": {
"events": {
"-N8ILnaH-1ylwp2LGvtP": {
"t": "2022-07-31T08:24:23.698Z",
"user": "Qf7C5cXQkXfQanxKPR0rsKW4QzE2",
}
},
"questions": {
"would-you": {
"-N8ILnaH-1ylwp2LGvtP": {"answer": "yes", "format": 1}
}
},
},
}
}
# Build one output row per label. Each label is expected to carry a single
# event and a single question, so only the first of each is read.
rows = []
for label, payload in json_data["labels"].items():
    event_id, event = list(payload["events"].items())[0]
    first_question = list(payload["questions"].values())[0]
    rows.append(
        {
            "Labels": label,
            "Event": event_id,
            "User": event["user"],
            "Time": event["t"],
            "Answer": list(first_question.values())[0]["answer"],
        }
    )
df = pd.DataFrame(rows)
print(df)
Prints:
Labels Event User Time Answer
0 longfilename01:png -N8V6uUR__vvB0qv1lPb bmEhwNCZT9Wiftgvsopb7vBjO9o1 2022-08-02T19:54:23.608Z no
1 longfilename02:png -N8ILnaH-1ylwp2LGvtP Qf7C5cXQkXfQanxKPR0rsKW4QzE2 2022-07-31T08:24:23.698Z yes

Related

How to select multiple JSON Objects using python

I have a Json data as following. The Json has many such objects with same NameId's:
[{
"NameId": "name1",
"exp": {
"exp1": "test1"
}
}, {
"NameId": "name1",
"exp": {
"exp2": "test2"
}
}
]
Now, what I am after is to create a new Json Object that has a merged exp and create a file something like below, so that I do not have multiple NameId:
[{
"NameId": "name1",
"exp": {
"exp1": "test1",
"exp2": "test2"
}
}
]
Is there a possibility I can achieve it using Python?
You can do the manual work, merging the entries while rebuilding the structure. You can keep a dictionary with the exp to merge them.
import json
jsonData = [
    {"NameId": "name1", "exp": {"exp1": "test1"}},
    {"NameId": "name1", "exp": {"exp2": "test2"}},
    {"NameId": "name2", "exp": {"exp3": "test3"}},
]

# result holds one entry per distinct NameId, in first-seen order.
# expsDict maps NameId -> the very same "exp" dict stored inside result,
# so updating it through either container updates both.
result = []
expsDict = {}
for entry in jsonData:
    nameId = entry["NameId"]
    mergedExp = expsDict.get(nameId)
    if mergedExp is not None:
        # Name already seen: fold this entry's exp into the stored one.
        mergedExp.update(entry["exp"])
        continue
    # First occurrence: work on copies so jsonData itself is never modified.
    ownExp = dict(entry["exp"])
    result.append({**entry, "exp": ownExp})
    expsDict[nameId] = ownExp
print(result)
You can use itertools.groupby and functools.reduce
from functools import reduce
from itertools import groupby
from operator import itemgetter

d = [{
    "NameId": "name1",
    "exp": {
        "exp1": "test1"
    }
}, {
    "NameId": "name1",
    "exp": {
        "exp2": "test2"
    }
}]


def merge_exps(entries):
    """Merge the "exp" dicts of all entries that share a "NameId".

    Returns one ``{"NameId": ..., "exp": ...}`` dict per distinct name,
    ordered by name. Later entries win when the same exp key repeats.
    """
    by_name = itemgetter("NameId")
    return [
        {
            "NameId": name,
            # Start from an empty accumulator and fold in each entry's exp.
            # This keeps the accumulator a plain exp dict, so groups of any
            # size (including a single entry) merge correctly.
            "exp": reduce(lambda acc, item: {**acc, **item["exp"]}, group, {}),
        }
        # groupby only groups adjacent items, hence the sort on the same key.
        for name, group in groupby(sorted(entries, key=by_name), by_name)
    ]


merged = merge_exps(d)
print(merged)
#output
[{'NameId': 'name1', 'exp': {'exp1': 'test1', 'exp2': 'test2'}}]

Python - Problem extracting data from nested json

I have a problem extracting data from json, I tried n different ways. I was able to extract the ID itself, unfortunately I can't manage to show the details of the field.
Below is my json
{
"params": {
"cid": "15482782896",
"datemax": "20190831",
"datemin": "20190601",
"domains": [
"url.com"
],
},
"results": {
"59107": {
"url.com": {
"1946592": {
"data": {
"2019-06-01": {
"ENGINE": {
"DEVICE": {
"": {
"position": 21,
"url": "url3.com"
}
}
}
},
"2019-07-01": {
"ENGINE": {
"DEVICE": {
"": {
"position": 4,
"url": "url3.com"
}
}
}
},
"2019-08-01": {
"ENGINE": {
"DEVICE": {
"": {
"position": 2,
"url": "url3.com"
}
}
}
}
},
"keyword": {
"title": "python_1",
"volume": 10
}
},
"1946602": {
"data": {
"2019-06-01": {
"ENGINE": {
"DEVICE": {
"": {
"position": 5,
"url": "url1.com"
}
}
}
},
"2019-07-01": {
"ENGINE": {
"DEVICE": {
"": {
"position": 12,
"url": "url1.com"
}
}
}
},
"2019-08-01": {
"ENGINE": {
"DEVICE": {
"": {
"position": 10.25,
"url": "url1.com"
}
}
}
}
},
"keyword": {
"title": "python_2",
"volume": 20
}
}
}
}
}
}
I tried the following code but I got the result in the form of id itself
import json
import csv
def get_leaves(item, key=None):
    """Collect the leaf values of a nested dict/list structure into one dict.

    Each leaf is stored under the nearest enclosing dict key; list elements
    inherit the key of the list itself. Later leaves overwrite earlier ones
    that share a key. A bare scalar is returned as ``{key: item}``.
    """
    if isinstance(item, dict):
        children = item.items()
    elif isinstance(item, list):
        children = ((key, element) for element in item)
    else:
        return {key: item}

    leaves = {}
    for child_key, child in children:
        leaves.update(get_leaves(child, child_key))
    return leaves
with open('me_filename') as f_input:
json_data = json.load(f_input)
fieldnames = set()
for entry in json_data:
fieldnames.update(get_leaves(entry).keys())
with open('output.csv', 'w', newline='') as f_output:
csv_output = csv.DictWriter(f_output, fieldnames=sorted(fieldnames))
csv_output.writeheader()
csv_output.writerows(get_leaves(entry) for entry in json_data)
I also tried to use the pandas but also failed to parse properly
import io
import json
import pandas as pd
with open('me_filename', encoding='utf-8') as f_input:
df = pd.read_json(f_input , orient='None')
df.to_csv('output.csv', encoding='utf-8')
The result I'd need to get it :
ID Name page volume url 2019-06-01 2019-07-01 2019-08-01 2019-09-01
1946592 python_1 url.com 10 url3.com 21 4 2 null
1946602 python_2 url.com 20 url1.com 5 12 10,25 null
What could I do wrong?
Hmm this is a bit of a convoluted solution and it looks very messy and no-longer looks like the code provided however I believe it will resolve your issue.
First of all I had a problem with the provided Json (due to the trailing ',' on line 8) however have managed to generate:
Output (temp.csv)
ID,Name,Page,Volume,Url,2019-08-01,2019-07-01,2019-06-01,
1946592,python_1,url.com,10,url3.com,2,4,21,
1946602,python_2,url.com,20,url1.com,10.25,12,5,
using the following:
import json
# Every date key seen in the payload; populated by get_dates() and read by
# get_breakdown() and write_csv().
dates: set = set()


# Collect the data
def get_breakdown(json):
    """Build one summary record per keyword id found under ``json['results']``.

    Each record has the keys 'ID', 'Name', 'Page', 'Volume', 'URL' and
    'Dates'. 'Dates' maps every date in the module-level ``dates`` set to
    ``{'Position': value}``, using the string 'null' where the entry has no
    data for that date. 'URL' is the url of a date that has data, or 'null'
    when the entry has none (so downstream consumers never hit a missing key).

    Note: the parameter name shadows the ``json`` module inside this function.
    """
    collected_data = []
    for pages in json['results'].values():
        for page, entries in pages.items():
            for _id, entry in entries.items():
                keyword = entry['keyword']
                data_struct = {
                    'ID': _id,
                    'Name': keyword['title'],
                    'Page': page,
                    'Volume': keyword['volume'],
                    # Placeholder, replaced below as soon as a date has data.
                    'URL': 'null',
                    'Dates': {},
                }
                per_date = entry['data']
                for date in dates:
                    if date in per_date:
                        hit = per_date[date]['ENGINE']['DEVICE']['']
                        data_struct['URL'] = hit['url']
                        data_struct['Dates'][date] = {'Position': hit['position']}
                    else:
                        data_struct['Dates'][date] = {'Position': 'null'}
                collected_data.append(data_struct)
    return collected_data
# Collect all dates across the whole data
# structure and save them to a set
def get_dates(json):
    """Add every date key found under any entry's 'data' mapping to ``dates``.

    Walks results -> pages -> ids and mutates the module-level ``dates`` set
    in place; nothing is returned.
    """
    for pages in json['results'].values():
        for entries in pages.values():
            for entry in entries.values():
                dates.update(entry['data'])
# Write to .csv file
def write_csv(collected_data, file_path):
    """Write the collected records to *file_path* as comma-separated text.

    The header is 'ID,Name,Page,Volume,Url,' followed by one column per date
    in the module-level ``dates`` set. Each record becomes one line with its
    position for every date. Every date/position cell is followed by a
    comma, so lines end with a trailing comma. Values are written verbatim
    (no quoting or escaping).
    """
    # Snapshot the column order once so the header and every row line up.
    columns = list(dates)
    # The context manager flushes and closes the file even if a record is
    # missing a key and formatting raises.
    with open(file_path, "w") as f:
        # CSV Title
        date_string = ''.join('{0},'.format(date) for date in columns)
        f.write('ID,Name,Page,Volume,Url,{0}\n'.format(date_string))
        # Data
        for data in collected_data:
            position_string = ''.join(
                '{0},'.format(data['Dates'][date]['Position']) for date in columns
            )
            f.write('{0},{1},{2},{3},{4},{5}\n'.format(
                data['ID'],
                data['Name'],
                data['Page'],
                data['Volume'],
                data['URL'],
                position_string
            ))
# Code Body
with open('me_filename.json') as f_input:
json_data = json.load(f_input)
get_dates(json_data)
write_csv(get_breakdown(json_data), "output.csv")
Hopefully you can follow the code and it does what is expected. I am sure that it can be made much more reliable - however as previously mentioned I couldn't make it work with the base code you provided.
After a small modification your code works great, but I noticed that showing the date as the next line would be a better solution in the format.
I tried to modify your solution to this form, but I'm still too weak in python to easily deal with it. Can you still tell me how you can do it to achieve this csv file format?
Output(temp.csv)
ID,Name,Page,Volume,Url,data,value,
1946592,python_1,url.com,10,url3.com,2019-08-01,2
1946592,python_1,url.com,10,url3.com,2019-07-01,4
1946592,python_1,url.com,10,url3.com,2019-06-01,21
1946602,python_2,url.com,20,url1.com,2019-08-01,10.25,
1946602,python_2,url.com,20,url1.com,2019-07-01,12,
1946602,python_2,url.com,20,url1.com,2019-06-01,5,

Want to get access to inner elements of json with a loop

I want to loop over the json and collect the inner alias values of both dims and metrics, appending them to two separate Python lists, dimsList and metricsList.
json_obj =
{
"dataset":"246",
"dims":{
"Location":{
"alias":"Location",
"format":""
}
},
"metrics":{
"ToTal_Dwell":[
{
"agg":"sum",
"format":"",
"alias":"ToTal_Dwell"
}
]
},
"filters":"",
"limit":"10"
}
expecting result to be like dimsList = ['Location'] and metricsList = ['ToTal_Dwell']
You can recursively iterate using .items(). Every time you see an inner dict you make a recursive call, and an inner list causes a call per inner dict in the list.
try this:
json_obj = {
"dataset": "246",
"dims": {
"Location": {
"alias": "Location",
"format": ""
}
},
"metrics": {
"ToTal_Dwell": [
{
"agg": "sum",
"format": "",
"alias": "ToTal_Dwell"
}
]
},
"filters": "",
"limit": "10"
}
def extract_inner_values(d, key):
    """Return every value stored under *key* anywhere inside the dict *d*.

    The search descends into nested dicts and into dicts that are direct
    elements of list values. List elements that are not dicts (strings,
    numbers, None, nested lists) are skipped rather than raising.
    Matches are returned in traversal (insertion) order.
    """
    results = []
    for k, v in d.items():
        if k == key:
            results.append(v)
        if isinstance(v, dict):
            results.extend(extract_inner_values(v, key))
        elif isinstance(v, list):
            for element in v:
                # Only dicts can hold keys; skip scalars inside the list.
                if isinstance(element, dict):
                    results.extend(extract_inner_values(element, key))
    return results
dimsList = extract_inner_values(json_obj["dims"], "alias")
metricsList = extract_inner_values(json_obj["metrics"], "alias")
print(dimsList)
print(metricsList)
Output:
['Location']
['ToTal_Dwell']

Reading CSV and outputting json. Need to set data types

I have a project where I am reading CSV and outputting to json.
Here is some sample CSV:
firstName,lastName,email,age,gender
John,Doe,jdoe#emaildomain.com,50,male
Jane,Doe,jdoe#emaildomain.com,28,female
Bill,Smith,bsmith#emaildomain.com,49,male
Dick,Tracy,dtracy#emaildomain.com,18,male
Peter,Parker,pparker#emaildomain.com,26,male
Clark,Kent,ckent#emaildomain.com,17,male
Wonder,Woman,wwoman#emaildomain.com,44,female
John,James,jjames#emaildomain.com,17,male
Kat,Whoaman,kwhoamans#emaildomain.com,23,female
Everything is working as I had hoped in terms of the output, except I need certain values to be integers in the output, but they come out as strings (age for example). Is there a way to keep the code that I have mostly intact, but output certain values as integers rather than strings?
import json
import csv
import itertools
primary_field = ['email']
result = []
with open('SampleCSV.csv') as csv_file:
reader = csv.DictReader(csv_file, skipinitialspace=True)
for row in itertools.islice(reader, 5):
d = {k: v for k, v in row.items() if k in primary_field}
d['dataFields'] = [{k: v,} for k, v in row.items() if k not in primary_field]
result.append(d)
root = {}
root["users"] = result
print(json.dumps(root, indent=4))
Sample output:
{
"users": [
{
"email": "jdoe#emaildomain.com",
"dataFields": [
{
"firstName": "John"
},
{
"lastName": "Doe"
},
{
"age": "50"
},
{
"gender": "male"
}
]
}
]
}
Desired output:
{
"users": [
{
"email": "jdoe#emaildomain.com",
"dataFields": [
{
"firstName": "John"
},
{
"lastName": "Doe"
},
{
"age": 50
},
{
"gender": "male"
}
]
}
]
}
This is what I referred to earlier. The commented-out line is your original code.
import json
import csv
import itertools
# Columns copied to the top level of each user record; every other column
# goes into that record's 'dataFields' list.
primary_field = ['email']
result = []
with open('SampleCSV.csv') as csv_file:
    reader = csv.DictReader(csv_file, skipinitialspace=True)
    # Only the first five data rows are converted.
    for row in itertools.islice(reader, 5):
        d = {}
        data_fields = []
        for name, raw in row.items():
            if name in primary_field:
                d[name] = raw
                continue
            # Emit whole numbers as ints; anything int() rejects stays a string.
            try:
                converted = int(raw)
            except ValueError:
                converted = raw
            data_fields.append({name: converted})
        # d['dataFields'] = [{k: v,} for k, v in row.items() if k not in primary_field]
        d['dataFields'] = data_fields
        result.append(d)
root = {"users": result}
print(json.dumps(root, indent=4))
giving the results
{
"users": [
{
"email": "jdoe#emaildomain.com",
"dataFields": [
{
"firstName": "John"
},
{
"lastName": "Doe"
},
{
"age": 50
},
{
"gender": "male"
}
]
}, ...

Add the sub dictionary element in list in python

I am trying to add my sub dictionary element in list. It is giving me type error.
Here is dictionary and my code:
{
"key1": "value1",
"key2": {
"skey1": "svalue2",
"skey2": {
"sskey1": [{
"url": "value",
"sid": "511"
},
{
"url": "value",
"sid": "522"
},
{
"url": "value",
"sid": "533"
}]
}
}
}
I want to add the sid into the list like [511,522,533]:
here is my code:
rsId=[]
for i in op['key2']['skey2']['sskey1']:
for k,v in i.items():
if k=='sid':
rsId.append(v)
D = {
    "key1": "value1",
    "key2": {
        "skey1": "svalue2",
        "skey2": {
            "sskey1": [
                {"url": "value", "sid": "511"},
                {"url": "value", "sid": "522"},
                {"url": "value", "sid": "533"},
            ]
        }
    }
}
# Collect the "sid" of every item in the nested list, in order.
res = []
for i in D['key2']['skey2']['sskey1']:
    res.append(i['sid'])
# print is a function on Python 3; the statement form is a SyntaxError there.
print(res)
Result:
['511', '522', '533']
or a one line code:
res = [i['sid'] for i in D['key2']['skey2']['sskey1']]
You can use a list comprehension:
rsId = [v for item in op['key2']['skey2']['sskey1'] for k, v in item.items() if k == 'sid']
You can try with one line something like this:
print(list(map(lambda x:x['sid'],data['key2']['skey2']['sskey1'])))
output:
['511', '522', '533']
If you want value in int then:
print(list(map(lambda x:int(x['sid']),data['key2']['skey2']['sskey1'])))
output:
[511, 522, 533]
when data is:
data = {
"key1":"value1",
"key2":{
"skey1":"svalue2",
"skey2":{
"sskey1":[{
"url":"value",
"sid":"511"
},
{
"url":"value",
"sid":"522"
},
{
"url":"value",
"sid":"533"
} ]
}
}
}
Get the int as output
The type error is probably due to the fact that you get a string as an item of the list. Converting it to a number with int() solves your problem.
The only change to your code is in the last line of code.
op = {
"key1": "value1",
"key2": {
"skey1": "svalue2",
"skey2": {
"sskey1": [{
"url": "value",
"sid": "511"
},
{
"url": "value",
"sid": "522"
},
{
"url": "value",
"sid": "533"
}]
}
}
}
rsId = []
for i in op['key2']['skey2']['sskey1']:
for k, v in i.items():
if k == 'sid':
rsId.append(int(v)) # put the int here
output
>>> rsId
[511, 522, 533]
Another approach: checking every key that has a dictionary as value
op = {
    "key1": "value1",
    "key2": {
        "skey1": "svalue2",
        "skey2": {
            "sskey1": [
                {"url": "value", "sid": "511"},
                {"url": "value", "sid": "522"},
                {"url": "value", "sid": "533"},
            ]
        }
    }
}
# Search two dict levels below the top for lists of dicts, and collect the
# integer value of every 'sid' found in them.
l = []
for sub1 in op.values():  # values of the main dictionary
    if not isinstance(sub1, dict):
        continue
    for sub2 in sub1.values():  # values of the first-level sub dict
        if not isinstance(sub2, dict):
            continue
        for items in sub2.values():  # values of the second-level sub dict
            # Only lists are scanned; scalars at this depth are ignored
            # instead of being iterated or raising.
            if not isinstance(items, list):
                continue
            for item in items:
                if isinstance(item, dict) and 'sid' in item:
                    l.append(int(item['sid']))
print(l)
output
[511, 522, 533]

Categories

Resources