Adding size to dynamic tree from last column - python

I need to create a nested dict structure where the number of children can vary at each level.
Appending “size” element to last json child element for a sunburst diagram
Tree creation is covered in this question, except that i need the size to be picked up from the last column.
Given my labels repeat between levels and each level can have the same label "abc" as a terminal one, as well as a parent to the next level - I modified the code here slightly (to avoid duplicates in a children branch). I am unable to however specify the size, which is stored in the last column and should replace the 1 here in each leaf end. I know that I need to pass the value from the rows to the recursion loop build_leaf, but can't seem to figure how.
import csv
from collections import defaultdict
import json
def ctree():
return defaultdict(ctree)
def build_leaf(name, leaf):
if len(name)==0:
res={"name":"last node"}
res['size']=1
else:
res = {"name": name}
# add children node if the leaf actually has any children
if len(leaf.keys())>0:
res["children"] = [build_leaf(k, v) for k, v in leaf.items()]
else:
res['size'] = 1
return res
def main():
tree = ctree()
# NOTE: you need to have test.csv file as neighbor to this file
with open('./inpfile.csv') as csvfile:
reader = csv.reader(csvfile)
header = next(reader) # read the header row
i=0
for row in reader:
# usage of python magic to construct dynamic tree structure and
# basically grouping csv values under their parents
leaf = tree[row[0]]
size=row[-1]
for value in row[1:-1]:
leaf = leaf[value]
# building a custom tree structure
res = []
for name, leaf in tree.items():
res.append(build_leaf(name, leaf))
# printing results into the terminal
print(json.dumps(res, indent=2))
with open('paths.json', 'w') as fp:
json.dump(res, fp)
main()
The final output for the data mentioned should look something like:
[
{
"name": "A1",
"children": [
{
"name": "A2",
"children": [
{
"name": "A1",
"children": [
{
"name": "A2",
"children": [
{
"name": "A3",
"size": 80
}
]
}
]
},
{
"name": "A3",
"children": [
{
"name": "A2",
"children": [
{
"name": "A3",
"size": 169
}
]
},
{
"name": "exit site",
"size": 764
}
]
},
{
"name": "A6",
"children": [
{
"name": "A3",
"children": [
{
"name": "exit site",
"size": 127
}
]
}
]
},
{
"name": "exit site",
"size": 576
}
]
}
]
}
]

In case someone stumbles across the same problem - I could get it to work, by creating another recursive loop to retrieve size from the nested leaf (thanks to Douglas for the help).
def ctree():
return defaultdict(ctree)
def get_size(leaf1):
for k,v in leaf1.items():
if k=="size":
return v
else:
return get_size(v)
def build_leaf(name, leaf):
if len(name)==0:
res={"name":"exit site"}
res['size']=int(get_size(leaf))
else:
res = {"name": name}
# add children node if the leaf actually has any children
if not leaf["size"]:
res["children"] = [build_leaf(k, v) for k, v in leaf.items() if not k == "size" ]
else:
res['size'] = int(get_size(leaf))
return res
def make_json(inpfile,outjson):
tree = ctree()
# NOTE: you need to have test.csv file as neighbor to this file
with open("./filepath.csv") as csvfile:
reader = csv.reader(csvfile)
header = next(reader) # read the header row
for row in reader:
# usage of python magic to construct dynamic tree structure and
# basically grouping csv values under their parents
leaf = tree[row[0]]
size=row[-1]
for value in row[1:-1]:
leaf = leaf[value]
if len(row) < 6:
leaf["exit site"]["size"]=size
else:
leaf["size"]=size
# building a custom tree structure
res = []
for name, leaf in tree.items():
res.append(build_leaf(name, leaf))
with open(outjson, 'w') as fp:
json.dump(res, fp)

Related

How to convert nested JSON files to CSV in python

I am completely new to python and trying to covert nested json files to csv. The current code I am trying to use is:
import json
def read_json(filename: str) -> dict:
try:
with open(filename, "r") as f:
data = json.loads(f.read())
except:
raise Exception(f"Reading {filename} file encountered an error")
return data
def normalize_json(data: dict) -> dict:
new_data = dict()
for key, value in data.items():
if not isinstance(value, dict):
new_data[key] = value
else:
for k, v in value.items():
new_data[key + "_" + k] = v
return new_data
def generate_csv_data(data: dict) -> str:
# Defining CSV columns in a list to maintain
# the order
csv_columns = data.keys()
# Generate the first row of CSV
csv_data = ",".join(csv_columns) + "\n"
# Generate the single record present
new_row = list()
for col in csv_columns:
new_row.append(str(data[col]))
# Concatenate the record with the column information
# in CSV format
csv_data += ",".join(new_row) + "\n"
return csv_data
def write_to_file(data: str, filepath: str) -> bool:
try:
with open(filepath, "w+") as f:
f.write(data)
except:
raise Exception(f"Saving data to {filepath} encountered an error")
def main():
# Read the JSON file as python dictionary
data = read_json(filename="test2.json")
# Normalize the nested python dict
new_data = normalize_json(data=data)
# Pretty print the new dict object
print("New dict:", new_data)
# Generate the desired CSV data
csv_data = generate_csv_data(data=new_data)
# Save the generated CSV data to a CSV file
write_to_file(data=csv_data, filepath=data2.csv")
if __name__ == '__main__':
main()
It works partly: I get a CSV file that contains all values. However, for the nested key fields it only gives me the "highest" level (e.g. I get "currentEmployments" but not "currentEmployments_firmId").
Could someone help me with this?
Sample json file:
{
"basicInformation": {
"individualId": 10000,
"firstName": "Name",
"middleName": "middleName.",
"lastName": "lastName",
"bcScope": "Active",
"iaScope": "NotInScope",
"daysInIndustryCalculatedDate": "1/1/2000"
},
"currentEmployments": [
{
"firmId": 001,
"firmName": "firm1",
"iaOnly": "N",
"registrationBeginDate": "1/1/2005",
"firmBCScope": "ACTIVE",
"firmIAScope": "ACTIVE",
"iaSECNumber": "10000",
"iaSECNumberType": "100",
"bdSECNumber": "1000",
"branchOfficeLocations": [
{
"locatedAtFlag": "Y",
"supervisedFromFlag": "N",
"privateResidenceFlag": "N",
"branchOfficeId": "10000",
"street1": "street1",
"city": "city",
"state": "MD",
"country": "United States",
"zipCode": "10000"
}
]
}
],
"currentIAEmployments": [],
"previousEmployments": [
{
"iaOnly": "N",
"bdSECNumber": "20000",
"firmId": 200,
"firmName": "firm2",
"street1": "street",
"city": "city",
"state": "MD",
"country": "UNITED STATES",
"zipCode": "10000",
}
],
"examsCount": {
"stateExamCount": 0,
"principalExamCount": 0,
"productExamCount": 1
},
}

Having an issue parsing through this json in python

I have created a var that is equal to t.json. The JSON file is a follows:
{
"groups": {
"customerduy": {
"nonprod": {
"name": "customerduynonprod",
"id": "529646781943",
"owner": "cloudops#coerce.com",
"manager_email": ""
},
"prod": {
"name": "phishing_duyaccountprod",
"id": "241683454720",
"owner": "cloudops#coerce.com",
"manager_email": ""
}
},
"customerduyprod": {
"nonprod": {
"name": "phishing_duyaccountnonprod",
"id": "638968214142",
"owner": "cloudops#coerce.com",
"manager_email": ""
}
},
"ciasuppliergenius": {
"prod": {
"name": "ciasuppliergeniusprod",
"id": "220753788760",
"owner": "cia_developers#coerce.com",
"manager_email": "jarks#coerce.com"
}
}
}
}
my goal was to pars this JSON file and get value for "owner" and output it to a new var. Example below:
t.json = group_map
group_id_aws = group(
group.upper(),
"accounts",
template,
owner = group_map['groups']['prod'],
manager_description = "Groups for teams to access their product accounts.",
The error I keep getting is: KeyError: 'prod'
Owner occurs 4 times, so here is how to get all of them.
import json
# read the json
with open("C:\\test\\test.json") as f:
data = json.load(f)
# get all 4 occurances
owner_1 = data['groups']['customerduy']['nonprod']['owner']
owner_2 = data['groups']['customerduy']['prod']['owner']
owner_3 = data['groups']['customerduyprod']['nonprod']['owner']
owner_4 = data['groups']['ciasuppliergenius']['prod']['owner']
# print results
print(owner_1)
print(owner_2)
print(owner_3)
print(owner_4)
the result:
cloudops#coerce.com
cloudops#coerce.com
cloudops#coerce.com
cia_developers#coerce.com
You get a key error since the key 'prod' is not in 'groups'
What you have is
group_map['groups']['customerduy']['prod']
group_map['groups']['ciasuppliergenius']['prod']
So you will have to extract the 'owner' from each element in the tree:
def s(d,t):
for k,v in d.items():
if t == k:
yield v
try:
for i in s(v,t):
yield i
except:
pass
print(','.join(s(j,'owner')))
If your JSON is loaded in variable data, you can use a recursive function
that deals with the two containers types (dict and list) that can occur
in a JSON file, recursively:
def find_all_values_for_key(d, key, result):
if isinstance(d, dict):
if key in d:
result.append(d[key])
return
for k, v in d.items():
find_all_values_for_key(v, key, result)
elif isinstance(d, list):
for elem in d:
find_all_values_for_key(elem, key, result)
owners = []
find_all_values_for_key(data, 'owner', owners)
print(f'{owners=}')
which gives:
owners=['cloudops#coerce.com', 'cloudops#coerce.com', 'cloudops#coerce.com', 'cia_developers#coerce.com']
This way you don't have to bother with the names of intermediate keys, or in general the structure of your JSON file.
You don't have any lists in your example, but it is trivial to recurse through
them to any dict with an owner key that might "lurk" somewhere nested
under a a list element, so it is better to deal with potential future changes
to the JSON.

unable to update JSON using python

I am trying to update transaction ID from the following json:
{
"locationId": "5115",
"transactions": [
{
"transactionId": "1603804404-5650",
"source": "WEB"
} ]
I have done following code for the same, but it does not update the transaction id, but it inserts the transaction id to the end of block:-
try:
session = requests.Session()
with open(
"sales.json",
"r") as read_file:
payload = json.load(read_file)
payload["transactionId"] = random.randint(0, 5)
with open(
"sales.json",
"w") as read_file:
json.dump(payload, read_file)
Output:-
{
"locationId": "5115",
"transactions": [
{
"transactionId": "1603804404-5650",
"source": "WEB"
} ]
}
'transactionId': 1
}
Expected Outut:-
{
"locationId": "5115",
"transactions": [
{
"transactionId": "1",
"source": "WEB"
} ]
This would do it, but only in your specific case:
payload["transactions"][0]["transactionId"] = xxx
There should be error handling for cases like "transactions" key is not int the dict, or there are no records or there are more than one
also, you will need to assign =str(your_random_number) not the int if you wish to have the record of type string as the desired output suggests
If you just want to find the transactionId key and you don't know exactly where it may exist. You can do-
from collections.abc import Mapping
def update_key(key, new_value, jsondict):
new_dict = {}
for k, v in jsondict.items():
if isinstance(v, Mapping):
# Recursive traverse if value is a dict
new_dict[k] = update_key(key, new_value, v)
elif isinstance(v, list):
# Traverse through all values of list
# Recursively traverse if an element is a dict
new_dict[k] = [update_key(key, new_value, innerv) if isinstance(innerv, Mapping) else innerv for innerv in v]
elif k == key:
# This is the key to replace with new value
new_dict[k] = new_value
else:
# Just a regular value, assign to new dict
new_dict[k] = v
return new_dict
Given a dict-
{
"locationId": "5115",
"transactions": [
{
"transactionId": "1603804404-5650",
"source": "WEB"
} ]
}
You can do-
>>> update_key('transactionId', 5, d)
{'locationId': '5115', 'transactions': [{'transactionId': 5, 'source': 'WEB'}]}
Yes because transactionId is inside transactions node. So your code should be like:
payload["transactions"][0].transactionId = random.randint(0, 5)
or
payload["transactions"][0]["transactionId"] = random.randint(0, 5)

convert csv file to multiple nested json format

I have written a code to convert csv file to nested json format. I have multiple columns to be nested hence assigning separately for each column. The problem is I'm getting 2 fields for the same column in the json output.
import csv
import json
from collections import OrderedDict
csv_file = 'data.csv'
json_file = csv_file + '.json'
def main(input_file):
csv_rows = []
with open(input_file, 'r') as csvfile:
reader = csv.DictReader(csvfile, delimiter='|')
for row in reader:
row['TYPE'] = 'REVIEW', # adding new key, value
row['RAWID'] = 1,
row['CUSTOMER'] = {
"ID": row['CUSTOMER_ID'],
"NAME": row['CUSTOMER_NAME']
}
row['CATEGORY'] = {
"ID": row['CATEGORY_ID'],
"NAME": row['CATEGORY']
}
del (row["CUSTOMER_NAME"], row["CATEGORY_ID"],
row["CATEGORY"], row["CUSTOMER_ID"]) # deleting since fields coccuring twice
csv_rows.append(row)
with open(json_file, 'w') as f:
json.dump(csv_rows, f, sort_keys=True, indent=4, ensure_ascii=False)
f.write('\n')
The output is as below:
[
{
"CATEGORY": {
"ID": "1",
"NAME": "Consumers"
},
"CATEGORY_ID": "1",
"CUSTOMER_ID": "41",
"CUSTOMER": {
"ID": "41",
"NAME": "SA Port"
},
"CUSTOMER_NAME": "SA Port",
"RAWID": [
1
]
}
]
I'm getting 2 entries for the fields I have assigned using row[''].
Is there any other way to get rid of this? I want only one entry for a particular field in each record.
Also how can I convert the keys to lower case after reading from csv.DictReader(). In my csv file all the columns are in upper case and hence I'm using the same to assign. But I want to convert all of them to lower case.
In order to convert the keys to lower case, it would be simpler to generate a new dict per row. BTW, it should be enough to get rid of the duplicate fields:
for row in reader:
orow = collection.OrderedDict()
orow['type'] = 'REVIEW', # adding new key, value
orow['rawid'] = 1,
orow['customer'] = {
"id": row['CUSTOMER_ID'],
"name": row['CUSTOMER_NAME']
}
orow['category'] = {
"id": row['CATEGORY_ID'],
"name": row['CATEGORY']
}
csv_rows.append(orow)

Convert csv to JSON tree structure and change name?

Thank you to Hatt for the explanation and the code. It works, although I am unable to change the string name for a meaningful name from column headers.
Can anyone suggest how to achieve that?
Data in csv file
conversion_month channel sub_channel campaign Id cost kpi
2017-08 DISPLAY Retargeting Summer_Campaign 200278217 2.286261 0.1
2017-08 DISPLAY Retargeting Summer_Campaign 200278218 3.627064 2.5
2017-08 DISPLAY Retargeting Summer_Campaign 200278219 2.768436 0.001
2017-08 DISPLAY Retargeting August Campaign 200278220 5.653297 0.35
2017-09 DISPLAY Prospecting Test Campaign 200278221 4.11847 1.5
2017-08 DISPLAY Prospecting August Campaign 200278222 3.393972 0.26
2017-09 DISPLAY Prospecting Test Campaign 200278223 3.975332 4.2
2017-08 DISPLAY Prospecting August Campaign 200278224 4.131035 0.3
Code used:
import csv
from collections import defaultdict
def ctree():
return defaultdict(ctree)
def build_leaf(name, leaf):
res = {"name":name}
# add children node if the leaf actually has any children
if len(leaf.keys()) > 0:
res["children"] = [build_leaf(k, v) for k, v in leaf.items()]
return res
def main():
tree = ctree()
with open('file.csv') as csvfile:
reader = csv.reader(csvfile)
for rid, row in enumerate(reader):
if rid == 0:
continue
leaf = tree[row[0]]
for cid in range(1, (len(row)-2)):
leaf = leaf[row[cid]]
for cid in range((len(row)-1), len(row)):
leaf = (leaf[row[cid-1]],leaf[row[cid]])
# building a custom tree structure
res = []
for name, leaf in tree.items():
res.append(build_leaf(name, leaf))
# printing results into the terminal
import json
print(json.dumps(res, indent=2))
main()
It gives the tree, but I would like to change the string "name" for meaningful name such as "month", "channel", ..."id", etc. The names are in the first row of the csv file.
[
{
"name": "2017-08",
"children": [
{
"name": "DISPLAY",
"children": [
{
"name": "Retargeting",
"children": [
{
"name": "Summer_Campaign",
"children": [
{
"name": "200278217",
"children": [
{
"name": "2.286261"
},
{
"name": "0.1"
}
]
Thank you for any suggestions in advance.
Use next(reader) to first extract the header row from the CSV file. A level counter can be used to indicate which column is currently being dealt with so the corresponding column header can be extracted from the header:
import csv
from collections import defaultdict
def ctree():
return defaultdict(ctree)
def build_leaf(name, leaf, level, header):
res = {header[level] : name}
# add children node if the leaf actually has any children
if len(leaf.keys()) > 0:
res["children"] = [build_leaf(k, v, level+1, header) for k, v in leaf.items()]
return res
def main():
tree = ctree()
with open('file.csv') as csvfile:
reader = csv.reader(csvfile)
header = next(reader)
for row in reader:
leaf = tree[row[0]]
for cid in range(1, (len(row)-2)):
leaf = leaf[row[cid]]
for cid in range((len(row)-1), len(row)):
leaf = (leaf[row[cid-1]],leaf[row[cid]])
# building a custom tree structure
res = []
for name, leaf in tree.items():
res.append(build_leaf(name, leaf, 0, header))
# printing results into the terminal
import json
print(json.dumps(res, indent=2))
main()
This would give you:
[
{
"conversion_month": "2017-08",
"children": [
{
"channel": "DISPLAY",
"children": [
{
"sub_channel": "Retargeting",
"children": [
{
"campaign": "Summer_Campaign",
"children": [
{
"Id": "200278217",
"children": [
{
"cost": "2.286261"
},

Categories

Resources