I am completely new to Python and trying to convert nested JSON files to CSV. The current code I am trying to use is:
import json

def read_json(filename: str) -> dict:
    try:
        with open(filename, "r") as f:
            data = json.loads(f.read())
    except Exception:
        raise Exception(f"Reading {filename} file encountered an error")
    return data

def normalize_json(data: dict) -> dict:
    new_data = dict()
    for key, value in data.items():
        if not isinstance(value, dict):
            new_data[key] = value
        else:
            for k, v in value.items():
                new_data[key + "_" + k] = v
    return new_data

def generate_csv_data(data: dict) -> str:
    # Defining CSV columns in a list to maintain the order
    csv_columns = data.keys()
    # Generate the first row of CSV
    csv_data = ",".join(csv_columns) + "\n"
    # Generate the single record present
    new_row = list()
    for col in csv_columns:
        new_row.append(str(data[col]))
    # Concatenate the record with the column information in CSV format
    csv_data += ",".join(new_row) + "\n"
    return csv_data

def write_to_file(data: str, filepath: str) -> bool:
    try:
        with open(filepath, "w+") as f:
            f.write(data)
    except Exception:
        raise Exception(f"Saving data to {filepath} encountered an error")

def main():
    # Read the JSON file as a Python dictionary
    data = read_json(filename="test2.json")
    # Normalize the nested Python dict
    new_data = normalize_json(data=data)
    # Pretty print the new dict object
    print("New dict:", new_data)
    # Generate the desired CSV data
    csv_data = generate_csv_data(data=new_data)
    # Save the generated CSV data to a CSV file
    write_to_file(data=csv_data, filepath="data2.csv")

if __name__ == '__main__':
    main()
It works partly: I get a CSV file that contains all values. However, for the nested key fields it only gives me the "highest" level (e.g. I get "currentEmployments" but not "currentEmployments_firmId"). Could someone help me with this? (A possible recursive approach is sketched after the sample file below.)
Sample JSON file:
{
    "basicInformation": {
        "individualId": 10000,
        "firstName": "Name",
        "middleName": "middleName.",
        "lastName": "lastName",
        "bcScope": "Active",
        "iaScope": "NotInScope",
        "daysInIndustryCalculatedDate": "1/1/2000"
    },
    "currentEmployments": [
        {
            "firmId": 1,
            "firmName": "firm1",
            "iaOnly": "N",
            "registrationBeginDate": "1/1/2005",
            "firmBCScope": "ACTIVE",
            "firmIAScope": "ACTIVE",
            "iaSECNumber": "10000",
            "iaSECNumberType": "100",
            "bdSECNumber": "1000",
            "branchOfficeLocations": [
                {
                    "locatedAtFlag": "Y",
                    "supervisedFromFlag": "N",
                    "privateResidenceFlag": "N",
                    "branchOfficeId": "10000",
                    "street1": "street1",
                    "city": "city",
                    "state": "MD",
                    "country": "United States",
                    "zipCode": "10000"
                }
            ]
        }
    ],
    "currentIAEmployments": [],
    "previousEmployments": [
        {
            "iaOnly": "N",
            "bdSECNumber": "20000",
            "firmId": 200,
            "firmName": "firm2",
            "street1": "street",
            "city": "city",
            "state": "MD",
            "country": "UNITED STATES",
            "zipCode": "10000"
        }
    ],
    "examsCount": {
        "stateExamCount": 0,
        "principalExamCount": 0,
        "productExamCount": 1
    }
}
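Not a definitive fix, but a minimal recursive sketch of the idea, assuming you want list items flattened with their index in the key (e.g. currentEmployments_0_firmId); the function name flatten and the separator choice are mine, not from the original code:

def flatten(value, prefix="", sep="_"):
    # Recursively flatten nested dicts and lists into a single flat dict
    flat = {}
    if isinstance(value, dict):
        for k, v in value.items():
            flat.update(flatten(v, f"{prefix}{sep}{k}" if prefix else k, sep))
    elif isinstance(value, list):
        for i, v in enumerate(value):
            flat.update(flatten(v, f"{prefix}{sep}{i}" if prefix else str(i), sep))
    else:
        flat[prefix] = value
    return flat

Replacing normalize_json(data=data) with flatten(data) in main() should then produce columns such as currentEmployments_0_firmId.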
I have a dictionary where each value is a list. I want to write the individual items to separate JSON files.
For example:
data_to_write = {"Names": ["name1", "name2", "name3"], "email": ["mail1", "mail2", "mail3"]}
Now I want 3 JSON files, i.e. data1.json, data2.json, data3.json, in the following (approximate) format.
data1.json
{
    Name: name1,
    email: mail1
}
data2.json
{
    Name: name2,
    email: mail2
}
and so on.
My current approach is:
for file_no in range(no_of_files):
    for count, (key, info_list) in enumerate(data_to_write.items()):
        for info in info_list:
            with open(
                os.path.join(self.path_to_output_dir, str(file_no)) + ".json",
                "a",
            ) as resume:
                json.dump({key: info}, resume)
But this is wrong. Any help is appreciated.
You could use pandas to do the work for you. Read the dictionary into a dataframe, then iterate the rows of the dataframe to produce the json for each row:
import pandas as pd

data_to_write = {"Names": ["name1", "name2", "name3"], "email": ["mail1", "mail2", "mail3"]}
df = pd.DataFrame(data_to_write).rename(columns={'Names': 'Name'})
for i in range(len(df)):
    jstr = df.iloc[i].to_json()
    with open(f"data{i+1}.json", "w") as f:
        f.write(jstr)
Output (each line is in a separate file):
{"Name":"name1","email":"mail1"}
{"Name":"name2","email":"mail2"}
{"Name":"name3","email":"mail3"}
Try:
import json

data_to_write = {
    "Names": ["name1", "name2", "name3"],
    "email": ["mail1", "mail2", "mail3"],
}

for i, val in enumerate(zip(*data_to_write.values()), 1):
    d = dict(zip(data_to_write, val))
    with open(f"data{i}.json", "w") as f_out:
        json.dump(d, f_out, indent=4)
This writes data(1..3).json with content:
# data1.json
{
    "Names": "name1",
    "email": "mail1"
}
# data2.json
{
    "Names": "name2",
    "email": "mail2"
}
...
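Note that zip stops at the shortest list, so trailing items are silently dropped if the lists are uneven. If that can happen and missing slots should become null (an assumption about your data), itertools.zip_longest pads with None; continuing from the snippet above:

from itertools import zip_longest

for i, val in enumerate(zip_longest(*data_to_write.values()), 1):
    d = dict(zip(data_to_write, val))  # missing entries appear as None
    with open(f"data{i}.json", "w") as f_out:
        json.dump(d, f_out, indent=4)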
import json

data_to_write = {
    "Names": ["name1", "name2", "name3"],
    "email": ["mail1", "mail2", "mail3"],
}

for ind, val in enumerate(zip(*data_to_write.values())):
    jsn = dict(zip(data_to_write, val))
    print(jsn)
    with open("data{}.json".format(ind), "w") as f:
        f.write(json.dumps(jsn))
I'm converting several JSON files into a CSV using the code below. It works as intended, but it converts all of the data in the JSON file. Instead, I want it to do the following:
Load JSON file [done]
Extract certain nested data in the JSON file [wip]
Convert to CSV [done]
Current Code
import json, pandas
from flatten_json import flatten

# Enter the path to the JSON and the filename without appending '.json'
file_path = r'C:\Path\To\file_name'

# Open and load the JSON file
dic = json.load(open(file_path + '.json', 'r', encoding='utf-8', errors='ignore'))

# Flatten and convert to a data frame
dic_flattened = (flatten(d, '.') for d in dic)
df = pandas.DataFrame(dic_flattened)

# Export to CSV in the same directory with the original file name
export_csv = df.to_csv(file_path + r'.csv', sep=',', encoding='utf-8', index=None, header=True)
In the example at the bottom, I only want everything under the following keys: created, emails, and identities. The rest is useless information (such as statusCode) or it's duplicated under a different key name (such as profile and userInfo).
I know it requires a for loop and an if statement to specify the key names later on, but I'm not sure of the best way to implement it. This is what I have so far for testing:
Attempted Code
import json, pandas
from flatten_json import flatten

# Enter the path to the JSON and the filename without appending '.json'
file_path = r'C:\Path\To\file_name'

# Open and load the JSON file
json_file = open(file_path + '.json', 'r', encoding='utf-8', errors='ignore')
dic = json.load(json_file)

# List keys to extract
key_list = ['created', 'emails', 'identities']
for d in dic:
    #print(d['identities']) # Print all 'identities'
    #if 'identities' in d: # Check if 'identities' exists
    if key_list in d:
        pass
        # Flatten and convert to a data frame
        #dic_flattened = (flatten(d, '.') for d in dic)
        #df = pandas.DataFrame(dic_flattened)
    else:
        pass  # Skip

# Export to CSV in the same directory with the original file name
#export_csv = df.to_csv(file_path + r'.csv', sep=',', encoding='utf-8', index=None, header=True)
Is this the right logic?
file_name.json Example
[
    {
        "callId": "abc123",
        "errorCode": 0,
        "apiVersion": 2,
        "statusCode": 200,
        "statusReason": "OK",
        "time": "2020-12-14T12:00:32.744Z",
        "registeredTimestamp": 1417731582000,
        "UID": "_guid_abc123==",
        "created": "2014-12-04T22:19:42.894Z",
        "createdTimestamp": 1417731582000,
        "data": {},
        "preferences": {},
        "emails": {
            "verified": [],
            "unverified": []
        },
        "identities": [
            {
                "provider": "facebook",
                "providerUID": "123",
                "allowsLogin": true,
                "isLoginIdentity": true,
                "isExpiredSession": true,
                "lastUpdated": "2014-12-04T22:26:37.002Z",
                "lastUpdatedTimestamp": 1417731997002,
                "oldestDataUpdated": "2014-12-04T22:26:37.002Z",
                "oldestDataUpdatedTimestamp": 1417731997002,
                "firstName": "John",
                "lastName": "Doe",
                "nickname": "John Doe",
                "profileURL": "https://www.facebook.com/John.Doe",
                "age": 30,
                "birthDay": 31,
                "birthMonth": 12,
                "birthYear": 1969,
                "city": "City, State",
                "education": [
                    {
                        "school": "High School Name",
                        "schoolType": "High School",
                        "degree": null,
                        "startYear": 0,
                        "fieldOfStudy": null,
                        "endYear": 0
                    }
                ],
                "educationLevel": "High School",
                "followersCount": 0,
                "gender": "m",
                "hometown": "City, State",
                "languages": "English",
                "locale": "en_US",
                "name": "John Doe",
                "photoURL": "https://graph.facebook.com/123/picture?type=large",
                "timezone": "-8",
                "thumbnailURL": "https://graph.facebook.com/123/picture?type=square",
                "username": "john.doe",
                "verified": "true",
                "work": [
                    {
                        "companyID": null,
                        "isCurrent": null,
                        "endDate": null,
                        "company": "Company Name",
                        "industry": null,
                        "title": "Company Title",
                        "companySize": null,
                        "startDate": "2010-12-31T00:00:00"
                    }
                ]
            }
        ],
        "isActive": true,
        "isLockedOut": false,
        "isRegistered": true,
        "isVerified": false,
        "lastLogin": "2014-12-04T22:26:33.002Z",
        "lastLoginTimestamp": 1417731993000,
        "lastUpdated": "2014-12-04T22:19:42.769Z",
        "lastUpdatedTimestamp": 1417731582769,
        "loginProvider": "facebook",
        "loginIDs": {
            "emails": [],
            "unverifiedEmails": []
        },
        "rbaPolicy": {
            "riskPolicyLocked": false
        },
        "oldestDataUpdated": "2014-12-04T22:19:42.894Z",
        "oldestDataUpdatedTimestamp": 1417731582894,
        "registered": "2014-12-04T22:19:42.956Z",
        "regSource": "",
        "socialProviders": "facebook"
    }
]
As mentioned by juanpa.arrivillaga, I simply need to add the following line after the key_list:
json_list = [{k:d[k] for k in key_list} for d in json_list]
This is the full working code:
import json, pandas
from flatten_json import flatten

# Enter the path to the JSON and the filename without appending '.json'
file_path = r'C:\Path\To\file_name'

# Open and load the JSON file
json_list = json.load(open(file_path + '.json', 'r', encoding='utf-8', errors='ignore'))

# Extract data from the defined key names
key_list = ['created', 'emails', 'identities']
json_list = [{k: d[k] for k in key_list} for d in json_list]

# Flatten and convert to a data frame
json_list_flattened = (flatten(d, '.') for d in json_list)
df = pandas.DataFrame(json_list_flattened)

# Export to CSV in the same directory with the original file name
export_csv = df.to_csv(file_path + r'.csv', sep=',', encoding='utf-8', index=None, header=True)
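One caveat: the comprehension raises a KeyError if a record lacks one of the keys. If that can happen in your data (an assumption), d.get(k) substitutes None instead:

# Tolerant variant: records missing a key get None for that column
json_list = [{k: d.get(k) for k in key_list} for d in json_list]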
I need to create a nested dict structure where the number of children can vary at each level.
Appending “size” element to last json child element for a sunburst diagram
Tree creation is covered in this question, except that I need the size to be picked up from the last column.
My labels repeat between levels, and the same label "abc" can be both a terminal node and a parent to the next level, so I modified the code here slightly (to avoid duplicates in a children branch). However, I am unable to specify the size, which is stored in the last column and should replace the 1 in each leaf here. I know that I need to pass the value from the rows into the build_leaf recursion, but can't figure out how.
import csv
from collections import defaultdict
import json

def ctree():
    return defaultdict(ctree)

def build_leaf(name, leaf):
    if len(name) == 0:
        res = {"name": "last node"}
        res['size'] = 1
    else:
        res = {"name": name}
        # add children node if the leaf actually has any children
        if len(leaf.keys()) > 0:
            res["children"] = [build_leaf(k, v) for k, v in leaf.items()]
        else:
            res['size'] = 1
    return res

def main():
    tree = ctree()
    # NOTE: you need to have test.csv file as neighbor to this file
    with open('./inpfile.csv') as csvfile:
        reader = csv.reader(csvfile)
        header = next(reader)  # read the header row
        i = 0
        for row in reader:
            # usage of python magic to construct dynamic tree structure and
            # basically grouping csv values under their parents
            leaf = tree[row[0]]
            size = row[-1]
            for value in row[1:-1]:
                leaf = leaf[value]

    # building a custom tree structure
    res = []
    for name, leaf in tree.items():
        res.append(build_leaf(name, leaf))

    # printing results into the terminal
    print(json.dumps(res, indent=2))
    with open('paths.json', 'w') as fp:
        json.dump(res, fp)

main()
The final output for the data mentioned should look something like:
[
    {
        "name": "A1",
        "children": [
            {
                "name": "A2",
                "children": [
                    {
                        "name": "A1",
                        "children": [
                            {
                                "name": "A2",
                                "children": [
                                    {
                                        "name": "A3",
                                        "size": 80
                                    }
                                ]
                            }
                        ]
                    },
                    {
                        "name": "A3",
                        "children": [
                            {
                                "name": "A2",
                                "children": [
                                    {
                                        "name": "A3",
                                        "size": 169
                                    }
                                ]
                            },
                            {
                                "name": "exit site",
                                "size": 764
                            }
                        ]
                    },
                    {
                        "name": "A6",
                        "children": [
                            {
                                "name": "A3",
                                "children": [
                                    {
                                        "name": "exit site",
                                        "size": 127
                                    }
                                ]
                            }
                        ]
                    },
                    {
                        "name": "exit site",
                        "size": 576
                    }
                ]
            }
        ]
    }
]
In case someone stumbles across the same problem: I could get it to work by creating another recursive function to retrieve the size from the nested leaf (thanks to Douglas for the help).
import csv
import json
from collections import defaultdict

def ctree():
    return defaultdict(ctree)

def get_size(leaf1):
    for k, v in leaf1.items():
        if k == "size":
            return v
        else:
            return get_size(v)

def build_leaf(name, leaf):
    if len(name) == 0:
        res = {"name": "exit site"}
        res['size'] = int(get_size(leaf))
    else:
        res = {"name": name}
        # add children node if the leaf actually has any children
        if not leaf["size"]:
            res["children"] = [build_leaf(k, v) for k, v in leaf.items() if not k == "size"]
        else:
            res['size'] = int(get_size(leaf))
    return res

def make_json(inpfile, outjson):
    tree = ctree()
    with open(inpfile) as csvfile:
        reader = csv.reader(csvfile)
        header = next(reader)  # read the header row
        for row in reader:
            # construct the dynamic tree structure, grouping csv values
            # under their parents
            leaf = tree[row[0]]
            size = row[-1]
            for value in row[1:-1]:
                leaf = leaf[value]
            if len(row) < 6:
                leaf["exit site"]["size"] = size
            else:
                leaf["size"] = size

    # building a custom tree structure
    res = []
    for name, leaf in tree.items():
        res.append(build_leaf(name, leaf))

    with open(outjson, 'w') as fp:
        json.dump(res, fp)
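Usage would then be, for example (the file names here are placeholders, not from the original post):

# Build the sunburst JSON from the CSV of paths
make_json("inpfile.csv", "paths.json")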
I have a dynamic JSON in which the keys (attributes) differ between records.
I want to write the JSON data below into a CSV file:
[{"Id": "12345",
"attributes": {"Address": [{"label": "United Kingdom",
"value": {"AddressLine": [{"value": "Baker "
"Street"}]}},
{"label": "United States",
"value": {"AddressLine": [{"value": "Florida"}]}}],
"CountryCode": [{"value": "Australia"}],
"Identifiers": [{"value": {"Type": [{"value": "Licence Id"}]}},
{"value": {"Type": [{"value": "NPI"}]}}],
"StatusReasonCode": [{"value": "XXX"}],
"UniqueId": [{"value": "71581742"}]},
"createdBy": "Rahul"}]
The data is expected in the following CSV format:
ID, createdBy, CountryCode, StatusReasonCode, Identifiers, UniqueId, AddressLine
12345,Rahul,Australia,XXX,Licence Id,71581742,Baker Street
12345,Rahul,Australia,XXX,NPI,71581742,Florida
Here is my code to extract the data from the JSON:
import json

with open('data.json') as f:
    data = json.load(f)

for key, value in data.items():
    if type(value) == str:
        print(key + ',' + value)
        # global res
        res = []
    if type(value) == list:
        for fg in value:
            crosswalk_final = fg['value']
    if type(value) == dict:
        for key1, val in value.items():
            for k in val:
                if type(k['value']) == dict:
                    for sub_key, sub_value in k.items():
                        if type(sub_value) == dict:
                            for child_key, child_value in sub_value.items():
                                if type(child_value) == list:
                                    for m in child_value:
                                        if type(m['value']) == dict:
                                            for qaq, waq in m.items():
                                                if isinstance(waq, dict):
                                                    for our, pur in waq.items():
                                                        for qq in pur:
                                                            print(our + ',' + qq['value'])
                                                else:
                                                    pass
                                        else:
                                            print(key1 + '_' + sub_key + '_' + child_key + ',' + m['value'])
                else:
                    attr1 = (key + '_' + key1 + ',' + k['value'])
                    print(attr1)
The above code gives me results in the format below:
Id,12345
createdBy,Rahul
attributes_UniqueId,71581742
attributes_CountryCode,Australia
attributes_StatusReasonCode,XXX
Address_value_AddressLine,Baker Street
Address_value_AddressLine,Florida
Identifiers_value_Type,Licence Id
Identifiers_value_Type,NPI
However, I am not sure how to write it to my CSV (in the same format shown above).
Write a function that flattens the dictionary into a list of constant length. If a key is missing, set its value to None.
Example:
data_dict = {"a": 1, "c": 12, "b": 0}
data_list = [data_dict.get("a"), data_dict.get("b"), data_dict.get("c")]
Then write the row to the CSV.
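A minimal sketch of that idea with csv.writer, using the column order from the expected output above; the records list is a stand-in for however many flattened rows your extraction code produces:

import csv

columns = ["ID", "createdBy", "CountryCode", "StatusReasonCode",
           "Identifiers", "UniqueId", "AddressLine"]
# Stand-in for the flattened records produced by your extraction code
records = [
    {"ID": "12345", "createdBy": "Rahul", "CountryCode": "Australia",
     "StatusReasonCode": "XXX", "Identifiers": "Licence Id",
     "UniqueId": "71581742", "AddressLine": "Baker Street"},
]

with open("output.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(columns)  # header row
    for rec in records:
        # .get returns None for missing keys, which csv writes as an empty field
        writer.writerow([rec.get(col) for col in columns])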
I have written code to convert a CSV file to a nested JSON format. I have multiple columns to nest, hence I assign them separately for each column. The problem is that I'm getting 2 fields for the same column in the JSON output.
import csv
import json
from collections import OrderedDict

csv_file = 'data.csv'
json_file = csv_file + '.json'

def main(input_file):
    csv_rows = []
    with open(input_file, 'r') as csvfile:
        reader = csv.DictReader(csvfile, delimiter='|')
        for row in reader:
            row['TYPE'] = 'REVIEW',  # adding new key, value
            row['RAWID'] = 1,
            row['CUSTOMER'] = {
                "ID": row['CUSTOMER_ID'],
                "NAME": row['CUSTOMER_NAME']
            }
            row['CATEGORY'] = {
                "ID": row['CATEGORY_ID'],
                "NAME": row['CATEGORY']
            }
            del (row["CUSTOMER_NAME"], row["CATEGORY_ID"],
                 row["CATEGORY"], row["CUSTOMER_ID"])  # deleting since fields occur twice
            csv_rows.append(row)

    with open(json_file, 'w') as f:
        json.dump(csv_rows, f, sort_keys=True, indent=4, ensure_ascii=False)
        f.write('\n')
The output is as below:
[
    {
        "CATEGORY": {
            "ID": "1",
            "NAME": "Consumers"
        },
        "CATEGORY_ID": "1",
        "CUSTOMER_ID": "41",
        "CUSTOMER": {
            "ID": "41",
            "NAME": "SA Port"
        },
        "CUSTOMER_NAME": "SA Port",
        "RAWID": [
            1
        ]
    }
]
I'm getting 2 entries for each field I have assigned using row['']. Is there any way to get rid of this? I want only one entry for a particular field in each record.
Also, how can I convert the keys to lower case after reading from csv.DictReader()? In my CSV file all the columns are in upper case, and hence I'm using the same names to assign, but I want to convert all of them to lower case.
In order to convert the keys to lower case, it would be simpler to generate a new dict per row. BTW, that should also be enough to get rid of the duplicate fields:
for row in reader:
    orow = collections.OrderedDict()
    orow['type'] = 'REVIEW'  # adding new key, value (no trailing comma, which would create a tuple)
    orow['rawid'] = 1
    orow['customer'] = {
        "id": row['CUSTOMER_ID'],
        "name": row['CUSTOMER_NAME']
    }
    orow['category'] = {
        "id": row['CATEGORY_ID'],
        "name": row['CATEGORY']
    }
    csv_rows.append(orow)
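If you would rather lower-case every key generically instead of naming each one (a sketch; it assumes no two column names collide once lower-cased):

for row in reader:
    # build a new dict with every key lower-cased in one pass
    orow = {k.lower(): v for k, v in row.items()}
    csv_rows.append(orow)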