Create a json tree from csv list in python - python

I'm trying to build a json hierarchy from a simple table in python.
The data comes in looking like the following:
id parent name
1 10 test-name-1
2 10 test-name-2
3 5 test-name-3
4 none test-name-4
5 10 test-name-5
6 none test-name-6
7 1 test-name-7
8 1 test-name-8
9 8 test-name-9
10 4 test-name-10
and I'm looking for an output like this:
{"$4":{"name":"test-name-4","children":{
"$10":{"name":"test-name-10","children":{
"$1":{"name":"test-name-1","children":{
"$7":{"name":"test-name-7","children":{}},
"$8":{"name":"test-name-8","children":{
"$9":{"name":"test-name-9","children":{}}}}}},
"$2":{"name":"test-name-2","children":{}},
"$5":{"name":"test-name-5","children":{
"$3":{"name":"test-name-3","children":{}}}}}}}},
"$6":{"name":"test-name-6","children":"test-name-6"}}
I have no idea how many "leaves" there will be or "roots", or what order the rows from the csv will come in. My question is, is there a way that I can recursively build a dictionary/list from a child node up to the parent? How can I produce a hierarchical tree from the "leaf" pieces of the tree in python?
Thanks for the help!

I have a solution based on 2 loops too (1 to cache, 1 to build), without JSON encoder, and that gives exactly the output you required:
>>> import re
>>> from collections import defaultdict
# Cache: parent id -> [(child id, child name), ...] in the order the rows appear.
# `file_` holds the raw table text; its first line is the column header.
parents = defaultdict(list)
for line in file_.split('\n')[1:]:
    if not line.strip():
        continue  # ignore blank lines (e.g. the one after a trailing newline)
    # Split on whitespace at most twice so the name column is kept whole,
    # whatever characters it contains.
    id_, parent, name = line.strip().split(None, 2)
    parents[parent].append((id_, name))
>>> parents
defaultdict(<type 'list'>, {'10': [('1', 'test-name-1'), ('2', 'test-name-2'), ('5', 'test-name-5')], 'none': [('4', 'test-name-4'), ('6', 'test-name-6')], '1': [('7', 'test-name-7'), ('8', 'test-name-8')], '5': [('3', 'test-name-3')], '4': [('10', 'test-name-10')], '8': [('9', 'test-name-9')]})
OK, now we have our cache, the recursive function easily builds the output we'd like:
def build_tree(d, val):
    """Return the nested tree of everything below the node id *val*.

    d   -- mapping of parent id -> list of (child id, child name) pairs
    val -- id whose children are expanded ('none' selects the root nodes)

    The result maps '$' + child id to {'name': ..., 'children': {...}},
    where 'children' is the same structure one level down ({} for a leaf).
    The function recurses once per node, so a cycle in the parent links
    would recurse until RecursionError.
    """
    # .get() rather than d[val]: looking up a leaf must not insert an empty
    # list into a defaultdict cache, and a plain dict must not raise KeyError.
    return {
        '$' + id_: {'name': name, 'children': build_tree(d, id_)}
        for id_, name in d.get(val, ())
    }
We just have to call it on the dict built previously, with value 'none' which is the tree root:
>>> from pprint import pprint
>>> pprint(build_tree(parents, 'none'))
{'$4': {'children': {'$10': {'children': {'$1': {'children': {'$7': {'children': {},
'name': 'test-name-7'},
'$8': {'children': {'$9': {'children': {},
'name': 'test-name-9'}},
'name': 'test-name-8'}},
'name': 'test-name-1'},
'$2': {'children': {},
'name': 'test-name-2'},
'$5': {'children': {'$3': {'children': {},
'name': 'test-name-3'}},
'name': 'test-name-5'}},
'name': 'test-name-10'}},
'name': 'test-name-4'},
'$6': {'children': {}, 'name': 'test-name-6'}}
>>>

To assign all child nodes to their parents, you can do two passes over the list of nodes. The first pass adds each node to a UserDict. In the second pass the parent of each node is guaranteed to be in the UserDict, so the node can be added to the children of its parent.
To serialize to JSON a JSONEncoder can be used.
#!/usr/bin/env python
import sys
import json
import UserDict
class Node(object):
    """One table row: its id, the id of its parent, its name and a child list."""

    def __init__(self, nid, parent, name):
        # Every node starts with its own empty children list.
        self.children = []
        self.name = name
        self.parent = parent
        self.nid = nid
class NodeDict(UserDict.UserDict):
    """Dictionary of nodes keyed by node id that can link children to parents."""

    def addNodes(self, nodes):
        """ Add every node as a child to its parent by doing two passes.

        Pass 1 registers every node under its id; pass 2 appends each node
        to the children of its parent.  Registering everything first means a
        child row may appear before its parent row, and children always end
        up in input order.  Nodes whose parent is "none" (roots) or whose
        parent id is unknown are registered but not linked.
        """
        for node in nodes:
            self.data[node.nid] = node
        for node in nodes:
            if node.parent == "none":
                continue
            parent = self.data.get(node.parent)
            # The membership test keeps repeated calls from duplicating links.
            if parent is not None and node not in parent.children:
                parent.children.append(node)
class NodeJSONEncoder(json.JSONEncoder):
    """JSON encoder that knows how to serialize Node objects."""

    def default(self, node):
        """Return a JSON-serializable dict for *node*.

        The "children" value is the node's list of Node objects; the encoder
        calls default() again for each of them, so a whole subtree is
        serialized.  Raises TypeError for anything that is not a Node.
        """
        # isinstance instead of type(...) == so subclasses of Node work too.
        if isinstance(node, Node):
            return {"nid": node.nid, "name": node.name, "children": node.children}
        raise TypeError("{} is not an instance of Node".format(node))
if __name__ == "__main__":
    # Usage: <script> <table-file>.  The first line of the file is a header;
    # every other non-blank line is "id parent name", whitespace separated.
    nodes = []
    with open(sys.argv[1]) as f:
        next(f, None)  # skip the header row (tolerates an empty file)
        for row in f:
            fields = row.split()
            if not fields:
                continue  # blank line: nothing to unpack
            nid, parent, name = fields
            nodes.append(Node(nid, parent, name))
    nodeDict = NodeDict()
    nodeDict.addNodes(nodes)
    rootNodes = [node for node in nodeDict.values()
                 if node.parent == "none"]
    for rootNode in rootNodes:
        # A single parenthesized argument prints the same on Python 2 and 3.
        print(NodeJSONEncoder().encode(rootNode))
Result:
{"name": "test-name-4", "nid": "4", "children":[
{"name": "test-name-10", "nid": "10", "children":[
{"name": "test-name-1", "nid": "1", "children":[
{"name": "test-name-7", "nid": "7", "children": []},
{"name": "test-name-8", "nid": "8", "children":[
{"name": "test-name-9", "nid": "9", "children": []}]}]},
{"name": "test-name-2", "nid": "2", "children": []},
{"name": "test-name-5", "nid": "5", "children":[
{"name": "test-name-3", "nid": "3", "children": []}]}]}]}
{"name": "test-name-6", "nid": "6", "children": []}

The answer given did not work for me in Python 3.6 because the UserDict module (UserDict.UserDict) no longer exists in Python 3; the class now lives in collections. So I made some changes to make it work and generalized it a little by letting the user specify the columns for child_id, parent_id and child name via the command line. Please see below (I am just learning and am sure this could be improved, but it works for my purposes).
""" Converts a CSV file with Parent/Child Hierarchy to a hierarchical JSON file for front-end processing (javascript/DS)
USAGE: csv2json.py <somefile.csv> a b c (column nrs of a=child_id, b=parent-id, c=name(of child))
ROOT of hierarchy should contain child_id and parent_id = 'none' or blank. name must exist """
import sys
import json
import csv
#import UserDict
from collections import UserDict
class Node(object):
    """One CSV row: its own id, its parent's id, its name and a child list."""

    def __init__(self, child_id, parent_id, name):
        # Every node starts with its own empty children list.
        self.children = []
        self.name = name
        self.parent_id = parent_id
        self.child_id = child_id
class NodeDict(UserDict):
    """Dictionary of nodes keyed by child_id that can link children to parents."""

    def addNodes(self, nodes):
        """ Add every node as a child to its parent_id by doing two passes.

        Pass 1 registers every node under its child_id; pass 2 appends each
        node to the children of its parent.  A parent_id of "none" or ""
        marks a root, which is never linked to anything, even if some row
        happens to use "none" or "" as its own id.  Nodes whose parent_id is
        unknown are registered but not linked.
        """
        for node in nodes:
            self.data[node.child_id] = node
        for node in nodes:
            # The original test `!= "none" or != ""` was always true.
            if node.parent_id in ("none", ""):
                continue
            parent = self.data.get(node.parent_id)
            # The membership test keeps repeated calls from duplicating links.
            if parent is not None and node not in parent.children:
                parent.children.append(node)
class NodeJSONEncoder(json.JSONEncoder):
    """JSON encoder that knows how to serialize Node objects."""

    def default(self, node):
        """Return a JSON-serializable dict (name and children) for *node*.

        The "children" value is the node's list of Node objects; the encoder
        calls default() again for each of them, so a whole subtree is
        serialized.  Raises TypeError for anything that is not a Node.
        """
        # isinstance instead of type(...) == so subclasses of Node work too.
        if isinstance(node, Node):
            return {"name": node.name, "children": node.children}
        raise TypeError("{} is not an instance of Node".format(node))
if __name__ == "__main__":
    if len(sys.argv) < 5:
        sys.exit("USAGE: csv2json.py <somefile.csv> a b c "
                 "(column nrs of a=child_id, b=parent-id, c=name(of child))")
    # Parse the three column numbers once instead of once per row.
    id_col, parent_col, name_col = (int(arg) for arg in sys.argv[2:5])
    widest = max(id_col, parent_col, name_col)
    nodes = []
    # newline='' is what the csv module asks for, so that newlines embedded
    # in quoted fields are parsed correctly.
    with open(sys.argv[1], 'r', newline='') as f:
        for row in csv.reader(f):
            if len(row) <= widest:
                continue  # blank or short row: the requested columns are missing
            if not row[name_col]:  # skip if no name/label exists
                continue
            nodes.append(Node(row[id_col], row[parent_col], row[name_col]))
    nodeDict = NodeDict()
    nodeDict.addNodes(nodes)
    rootNodes = [node for node in nodeDict.values()
                 if node.parent_id in ("none", "")]
    for rootNode in rootNodes:
        print(NodeJSONEncoder().encode(rootNode))

Related

How do I convert my text file to a nested JSON in Python

I have a text file which I want to convert to a nested json structure. The text file is :
Report_for Reconciliation
Execution_of application_1673496470638_0001
Spark_version 2.4.7-amzn-0
Java_version 1.8.0_352 (Amazon.com Inc.)
Start_time 2023-01-12 09:45:13.360000
Spark Properties:
Job_ID 0
Submission_time 2023-01-12 09:47:20.148000
Run_time 73957ms
Result JobSucceeded
Number_of_stages 1
Stage_ID 0
Number_of_tasks 16907
Number_of_executed_tasks 16907
Completion_time 73207ms
Stage_executed parquet at RawDataPublisher.scala:53
Job_ID 1
Submission_time 2023-01-12 09:48:34.177000
Run_time 11525ms
Result JobSucceeded
Number_of_stages 2
Stage_ID 1
Number_of_tasks 16907
Number_of_executed_tasks 0
Completion_time 0ms
Stage_executed parquet at RawDataPublisher.scala:53
Stage_ID 2
Number_of_tasks 300
Number_of_executed_tasks 300
Completion_time 11520ms
Stage_executed parquet at RawDataPublisher.scala:53
Job_ID 2
Submission_time 2023-01-12 09:48:46.908000
Run_time 218358ms
Result JobSucceeded
Number_of_stages 1
Stage_ID 3
Number_of_tasks 1135
Number_of_executed_tasks 1135
Completion_time 218299ms
Stage_executed parquet at RawDataPublisher.scala:53
I want the output to be :
{
"Report_for": "Reconciliation",
"Execution_of": "application_1673496470638_0001",
"Spark_version": "2.4.7-amzn-0",
"Java_version": "1.8.0_352 (Amazon.com Inc.)",
"Start_time": "2023-01-12 09:45:13.360000",
"Job_ID 0": {
"Submission_time": "2023-01-12 09:47:20.148000",
"Run_time": "73957ms",
"Result": "JobSucceeded",
"Number_of_stages": "1",
"Stage_ID 0": {
"Number_of_tasks": "16907",
"Number_of_executed_tasks": "16907",
"Completion_time": "73207ms",
"Stage_executed": "parquet at RawDataPublisher.scala:53"
"Stage": "parquet at RawDataPublisher.scala:53",
},
},
}
I tried defaultdict method but it was generating a json with values as list which was not acceptable to make a table on it. Here's what I did:
import json
from collections import defaultdict
INPUT = 'demofile.txt'
dict1 = defaultdict(list)
def convert():
    """Group the lines of INPUT by their first word.

    Each line is split into its first word and the remainder; the remainder
    is appended to the module-level ``dict1`` list for that word, so a word
    that starts several lines collects several values.
    """
    with open(INPUT) as source:
        for raw in source:
            key, rest = raw.strip().split(None, 1)
            values = dict1[key]
            values.append(rest.strip())
# Write through a context manager so the file is flushed and closed even if
# json.dump raises; the original left the handle open.
with open("demo1file.json", "w") as OUTPUT:
    json.dump(dict1, OUTPUT, indent=4, sort_keys=False)
and was getting this:
"Report_for": [ "Reconciliation" ],
"Execution_of": [ "application_1673496470638_0001" ],
"Spark_version": [ "2.4.7-amzn-0" ],
"Java_version": [ "1.8.0_352 (Amazon.com Inc.)" ],
"Start_time": [ "2023-01-12 09:45:13.360000" ],
"Job_ID": [
"0",
"1",
"2", ....
]]]
I just want to convert my text to the above json format so that I can build a table on top of it.
There's no way Python or one of its libraries can figure out your nesting requirements if flat text is given as input. How should it know that Stages are inside Jobs, for example?
You will have to programmatically tell your application how it works.
I hacked an example which should work, you can go from there (assuming input_str is what you posted as your file content):
# define your nesting structure: each key opens a sub-dict, and its value
# names the keys that may open further sub-dicts inside it
nesting = {'Job_ID': {'Stage_ID': {}}}


def _parse_nested(text, nesting_spec):
    """Turn flat "key value..." lines into nested dicts following nesting_spec.

    A line whose first word is a nesting key opens a new sub-dict, stored
    under the complete line (e.g. 'Job_ID 0'); every following plain line is
    stored in the innermost open sub-dict as first word -> rest of the line.
    A nesting key that belongs to an outer level closes the inner levels
    first, which is how sibling sections are started.  Blank lines are
    skipped.
    """
    root = {}
    # One entry per open level: (nesting rules valid inside it, its dict).
    # Keeping the dict objects themselves avoids having to re-find a level
    # by rebuilding its key from the line's words.
    open_levels = [(nesting_spec, root)]
    for line_str in text.split('\n'):
        if not line_str.strip():
            continue
        # key is the first word, value are all consecutive words
        line = line_str.split(' ')
        key = line[0]
        # Innermost open level whose rules allow `key` to start a section.
        depth = next(
            (i for i in range(len(open_levels) - 1, -1, -1)
             if key in open_levels[i][0]),
            None,
        )
        if depth is None:
            # plain entry: goes into the innermost open dict
            open_levels[-1][1][key] = ' '.join(line[1:])
            continue
        # close everything nested deeper than the level that owns `key`
        del open_levels[depth + 1:]
        rules, parent = open_levels[depth]
        section = parent[line_str] = {}
        open_levels.append((rules[key], section))
    return root


# your resulting dictionary
result_dict = _parse_nested(input_str, nesting)
print(result_dict)
Results in:
{
'Report_for': 'Reconciliation',
'Execution_of': 'application_1673496470638_0001',
'Spark_version': '2.4.7-amzn-0',
'Java_version': '1.8.0_352 (Amazon.com Inc.)',
'Start_time': '2023-01-12 09:45:13.360000',
'Spark': 'Properties: ',
'Job_ID 0': {
'Submission_time': '2023-01-12 09:47:20.148000',
'Run_time': '73957ms',
'Result': 'JobSucceeded',
'Number_of_stages': '1',
'Stage_ID 0': {
'Number_of_tasks': '16907',
'Number_of_executed_tasks': '16907',
'Completion_time': '73207ms',
'Stage_executed': 'parquet at RawDataPublisher.scala:53'
}
},
'Job_ID 1': {
'Submission_time': '2023-01-12 09:48:34.177000',
'Run_time': '11525ms',
'Result': 'JobSucceeded',
'Number_of_stages': '2',
'Stage_ID 1': {
'Number_of_tasks': '16907',
'Number_of_executed_tasks': '0',
'Completion_time': '0ms',
'Stage_executed': 'parquet at RawDataPublisher.scala:53'
},
'Stage_ID 2': {
'Number_of_tasks': '300',
'Number_of_executed_tasks': '300',
'Completion_time': '11520ms',
'Stage_executed': 'parquet at RawDataPublisher.scala:53'
}
},
'Job_ID 2': {
'Submission_time':
'2023-01-12 09:48:46.908000',
'Run_time': '218358ms',
'Result': 'JobSucceeded',
'Number_of_stages': '1',
'Stage_ID 3': {
'Number_of_tasks': '1135',
'Number_of_executed_tasks': '1135',
'Completion_time': '218299ms',
'Stage_executed': 'parquet at RawDataPublisher.scala:53'
}
}
}
and should pretty much be generically usable for all kinds of nesting definitions from a flat input. Let me know if it works for you!

How can I convert/transform a JSON tree structure to a merkle tree

I'm running a web server, where I receive data in JSON format and planning to store it in a NoSQL database. Here is an example:
data_example = {
"key1": "val1",
"key2": [1, 2, 3],
"key3": {
"subkey1": "subval1",
.
.
}
}
I had thoughts about using a Merkle tree to represent my data since JSON is also a tree-like structure.
Essentially, what I want to do is to store my data in (or as) a more secure decentralized tree-like structure. Many entities will have access to create, read, update or delete (CRUD) a record from it. These CRUD operations will ideally need to be verified from other entities in the network, which will also hold a copy of the database. Just like in blockchain.
I'm having a design/concept problem and I'm trying to understand how can I turn my JSON into a Merkle tree structure. This is my Node class:
class Node:
    """ class that represents a node in a merkle tree"""

    def __init__(self, data):
        # `self` was missing from the signature, so Node(data) raised
        # TypeError before the body could run.
        self.data = data
        # NOTE(review): calculate_some_hash is not defined in this snippet;
        # it is a placeholder the class (or a subclass) still has to provide.
        self.hash = self.calculate_some_hash() # based on the data or based on its child nodes
I'm interested in the conception/design of this as I couldn't figure out how this can work. Any idea how to save/store my data_example object in a Merkle tree? (is it possible?)
You can create a Merkle Tree by first converting your dictionary to a class object form, and then recursively traverse the tree, hashing the sum of the child node hashes. Since a Merkle Tree requires a single root node, any input dictionaries that have more than one key at the topmost level should become the child dictionary of an empty root node (with a default key of None):
data_example = {
"key1": "val1",
"key2": [1, 2, 3],
"key3": {
"subkey1": "subval1",
"subkey2": "subval2",
"subkey3": "subval3",
}
}
class MTree:
    """Tree mirror of a JSON-like value where each node stores a hash of its subtree.

    A leaf keeps its scalar value in ``children``; an inner node keeps a list
    of child ``MTree`` nodes there.  Dict entries become children keyed by
    their dict key, list items become children with key ``None``.

    NOTE: hashes come from the builtin ``hash()``, which is salted per
    process for str/bytes and is not cryptographic, so the values are only
    comparable within one run.
    """

    def __init__(self, key, value):
        self.key, self.hash = key, None
        self.children = self._wrap(value)

    @classmethod
    def _wrap(cls, value):
        """Return a scalar unchanged, or a dict/list as a list of child nodes."""
        if isinstance(value, (dict, list)):
            return cls.build(value, False)
        return value

    def compute_hashes(self):
        """Recompute this node's hash from the bottom up and return it."""
        if not isinstance(self.children, list):
            self.hash = hash(self.children)
        else:
            self.hash = hash(sum(i.compute_hashes() for i in self.children))
        return self.hash

    def update_kv(self, k, v):
        """Recursively replace the value of every node whose key equals *k*."""
        if self.key == k:
            # Re-wrap containers so `children` stays either a scalar or a
            # list of MTree nodes, which compute_hashes relies on.
            self.children = self._wrap(v)
        elif isinstance(self.children, list):
            for child in self.children:
                child.update_kv(k, v)

    def update_tree(self, payload):
        """Apply every key/value pair of *payload*, then recompute the hashes."""
        for a, b in payload.items():
            self.update_kv(a, b)
        self.compute_hashes() #after update is complete, recompute the hashes

    @classmethod
    def build(cls, dval, root=True):
        """Build nodes from a dict or list.

        With root=True return a single node: a one-entry input becomes that
        entry's node, anything else is hung under a root with key None.
        With root=False return the list of child nodes.
        """
        # Pair every value with its key; list items have no key.  Branching
        # on the container type keeps list items that are themselves lists
        # from being mistaken for (key, value) pairs.
        if isinstance(dval, dict):
            vals = list(dval.items())
        else:
            vals = [(None, item) for item in dval]
        if root:
            if len(vals) != 1:
                return cls(None, dval)
            return cls(vals[0][0], vals[0][1])
        return [cls(a, b) for a, b in vals]

    def __repr__(self):
        return f'{self.__class__.__name__}({self.hash}, {repr(self.children)})'
tree = MTree.build(data_example) #create the basic tree with the input dictionary
_ = tree.compute_hashes() #get the hashes for each node (predicated on its children)
print(tree)
Output:
MTree(-1231139208667999673, [MTree(-8069796171680625903, 'val1'), MTree(6, [MTree(1, 1), MTree(2, 2), MTree(3, 3)]), MTree(-78872064628455629, [MTree(-8491910191379857244, 'subval1'), MTree(1818926376495655970, 'subval2'), MTree(1982425731828357743, 'subval3')])])
Updating the tree with the contents from a payload:
tree.update_tree({"key1": "newVal1"})
Output:
MTree(1039734050960246293, [MTree(5730292134016089818, 'newVal1'), MTree(6, [MTree(1, 1), MTree(2, 2), MTree(3, 3)]), MTree(-78872064628455629, [MTree(-8491910191379857244, 'subval1'), MTree(1818926376495655970, 'subval2'), MTree(1982425731828357743, 'subval3')])])

Get the value inside a dictionary

I'm fairly new to Python and I'm writing an RPG to practice what I'm learning. I have a dictionary of items and I want to get the name of one of them, but when I call it I get this message:
You don't have any<classes.inventory.Item object at 0x7f52e39bce48>left!
My dictionary is this one:
player_items = [{"item":potion, "quantity": 15},
{"item":hipotion, "quantity": 10},
{"item":superpotion, "quantity": 8},
{"item":elixir, "quantity": 3},
{"item":hielixir, "quantity":1},
{"item":grenade, "quantity": 12}]
# Look the chosen inventory entry up once, use one unit of it, and report
# when that was the last one.
entry = player.items[item_choice]
item = entry["item"]
entry["quantity"] -= 1
if entry["quantity"] == 0:
    # str(item) is whatever the stored object's string conversion returns
    print(bcolors.FAIL+"\n"+"You don't have any"+str(item)+"left!"+bcolors.ENDC)
class Item:
    """An inventory item described by name, type, description and prop."""

    def __init__(self, name, type, description, prop):
        self.name = name
        # The parameter shadows the builtin `type`; the name is kept so that
        # existing keyword callers keep working.
        self.type = type
        self.description = description
        self.prop = prop

    def __str__(self):
        # Without this, str(item) is the default
        # "<... Item object at 0x...>" text instead of the item's name.
        return str(self.name)

    def __repr__(self):
        return "{}(name={!r}, type={!r})".format(
            self.__class__.__name__, self.name, self.type)
class Person:
    """Holds hp/mp (current and max), attack bounds, df, magic, items and actions."""

    def __init__(self, hp, mp, atk, df, magic, items):
        # Current hp/mp start out equal to their maximum.
        self.maxhp = self.hp = hp
        self.maxmp = self.mp = mp
        # Attack bounds sit 10 either side of the base value.
        self.atklow, self.atkhigh = atk - 10, atk + 10
        self.df = df
        self.magic = magic
        self.items = items
        self.actions = ["Attack", "Magic", "Items"]
Anyone know what im doing wrong? Thanks in advance for your time.
EDIT : Found the error. I need to learn more. I just needed to add the name property when i called the item :
print(bcolors.FAIL+"\n"+"You don't have any"+str(item.name)+"left!"+bcolors.ENDC)
Sorry i wasted your time guys.
item isn't the string "potion", for instance; it's a reference to an instance of Item that represents a potion. You need to provide an appropriate __str__ method for your Item class.
@Ifsalazar2010, welcome to Python. Here are a few things I see you may be doing wrong.
First, your dict values need to be strings; otherwise Python will try to look them up as objects (variables). So write "potion" instead of potion
player_items = [{"item":"potion", "quantity": 15},
{"item":"hipotion", "quantity": 10},
{"item":"superpotion", "quantity": 8},
{"item":"elixir", "quantity": 3},
{"item":"hielixir", "quantity":1},
{"item":"grenade", "quantity": 12}]
Next, you need to use the proper object player_items and not player.items. Your object is a list of dictionaries, which you can call by index and you haven't defined item_choice. I am using the index 1 as example.
# Index 1 is used purely as an example choice.
chosen = player_items[1]
item = chosen["item"]
print(item)
chosen["quantity"] -= 1
print(player_items)
In short, it seems you didn't provide enough detail in your example to reproduce your errors. For example, what is bcolors.FAIL?

JSON dump for User defined class in python

Here is how I want my data to be : (key=name, value=[dob,[misc1, misc2,..]])
# my sample code
inputNames = [
('james', ['1990-01-19', ['james1', 'james2', 'james3'] ]),
('julie', ['1991-08-07', ['julie1', 'julie2'] ]),
('mikey', ['1989-01-23', ['mikey1'] ]),
('sarah', ['1988-02-05', ['sarah1', 'sarah2', 'sarah3', 'sarah4'] ])
]
class empData (list):
    """List of misc entries for one employee, plus a ``dob`` attribute.

    misc -- optional two-item sequence ``[dob, [misc1, misc2, ...]]``; when
            omitted the list is empty and ``dob`` is None.
    """

    def __init__ (self, misc=None):
        # Initialise this list; the original call list.__init__([])
        # initialised a throwaway list instead.
        list.__init__(self)
        if misc is None:
            # The default used to crash on misc[0]; treat it as "no data".
            self.dob = None
            return
        self.dob = misc[0]
        self.extend(misc[1])

    def edprint(self):
        """Return ``(dob, self)``; ``self`` is the list of misc entries."""
        return (self.dob, self)
class myEmp():
    """An employee name paired with an empData record built from *amisc*."""

    def __init__ (self, anm, amisc=None):
        self.nm = anm
        record = empData(amisc)
        self.details = record

    def printme(self):
        """Print the name followed by what details.edprint() returns."""
        summary = self.details.edprint()
        print(self.nm, summary)
import json  # json.dump is used below but was never imported in this snippet

emps={}
for i in inputNames:
    m = myEmp(i[0],i[1])
    emps[m] = m
print(emps)
# prints addresses of variables
# for actual data use the following lines
for ea in emps:
    emps[ea].printme()
try:
    with open('data.json','w') as wfd:
        # NOTE: this still raises the TypeError discussed below: emps is
        # keyed by myEmp objects, and JSON object keys must be strings
        # (or numbers/bool/None).
        json.dump(emps, wfd)
except IOError as ioerr:
    print('File error: ',str(ioerr))
# No wfd.close() here: the with-block already closed the file, and wfd is
# not even bound if open() itself failed.
The above gives me an error: TypeError: key <__main__.myEmp object at 0x10143d588> is not a string
I am unable to figure out how to dump my dict of myEmp data structures as JSON
Before you can dump to JSON you need to explicitly convert your data to a serializable type like dict or list. You could do this using a list comprehension:
>>> d = [{'key':ea.nm, 'value':[ea.details.dob, ea.details]} for ea in emps]
>>> json.dumps(d)
'[{"value": ["1991-08-07", ["julie1", "julie2"]], "key": "julie"}, {"value": ["1989-01-23", ["mikey1"]], "key": "mikey"}, {"value": ["1990-01-19", ["james1", "james2", "james3"]], "key": "james"}, {"value": ["1988-02-05", ["sarah1", "sarah2", "sarah3", "sarah4"]], "key": "sarah"}]'

List of parent and child into nested dictionary

I have a list that I'd like to transform into a nested dictionary. The first element of the list is the parent, the second the child. Can I do this recursively without having to continue creating helper lists for each level? I feel so dumb not understanding this.
relations = [["basket", "money"],
             ["basket", "fruits"],
             ["fruits", "orange"],
             ["fruits", "apple"],
             ["basket", "vegetables"],
             ["vegetables", "bean"],
             ["vegetables", "tomato"],
             ["tomato", "red tomato"],
             ["tomato", "green tomato"],
             ["vegetables", "pepper"],
             ["sweets", "candy"]]
result = {}
running_list = []
# First pass: every direct child of "basket" becomes a top-level key.
for parent, child in relations:
    if parent == "basket":
        result[child] = {}
        running_list.append(child)
# Second pass: attach grandchildren.  Each assignment replaces the previous
# dict, so only the last child of each parent survives and deeper levels are
# never reached -- this is the attempt the question is asking about.
for parent, child in relations:
    if parent in running_list:
        result[parent] = {child : {}}
# Parenthesized single argument: prints the same on Python 2 and 3.
print(result)
Just create a dictionary that maps a name to the corresponding dictionary:
# Map every name to its own dict; a child's dict is shared between `items`
# and its parent's dict, which is what makes the nesting appear.
items = {}
for parent, child in relations:
    for name in (parent, child):
        if name not in items:
            items[name] = {}
    parent_dict = items[parent]
    child_dict = items[child]
    parent_dict.setdefault(child, child_dict)
result = items['basket'] # basket is the top-level item
This produces:
>>> items = {}
>>> for parent, child in relations:
... parent_dict = items.setdefault(parent, {})
... child_dict = items.setdefault(child, {})
... if child not in parent_dict:
... parent_dict[child] = child_dict
...
>>> items['basket']
{'money': {}, 'vegetables': {'tomato': {'green tomato': {}, 'red tomato': {}}, 'bean': {}, 'pepper': {}}, 'fruits': {'orange': {}, 'apple': {}}}
>>> from pprint import pprint
>>> pprint(items['basket'])
{'fruits': {'apple': {}, 'orange': {}},
'money': {},
'vegetables': {'bean': {},
'pepper': {},
'tomato': {'green tomato': {}, 'red tomato': {}}}}

Categories

Resources