Convert list of paths to dictionary in Python

I'm making a program in Python where I need to interact with "hypothetical" paths (i.e. paths that don't and won't exist on the actual filesystem), and I need to be able to list their contents like normal (path['directory'] would return every item inside the directory, like os.listdir()).
The solution I came up with was to convert a list of string paths to a dictionary of dictionaries. I came up with this recursive function (it's inside a class):
def DoMagic(self, paths):
    structure = {}
    if not type(paths) == list:
        raise ValueError('Expected list value, not ' + str(type(paths)))
    for i in paths:
        print(i)
        if i[0] == '/':  # Sanity check
            print('leading slash?', i)  # Inform user that there *might* be an issue with the input.
            i = i[1:]  # strings don't support item assignment, so slice off the leading '/'
        i = i.split('/')  # Split it, so that we can test against different parts.
        if len(i[1:]) > 1:  # Hang-a-bout, there's more content!
            structure = {**structure, **self.DoMagic(['/'.join(i[1:])])}
        else:
            structure[i[1]] = i[1]
But when I go to run it with ['foo/e.txt','foo/bar/a.txt','foo/bar/b.cfg','foo/bar/c/d.txt'] as input, I get:
{'e.txt': 'e.txt', 'a.txt': 'a.txt', 'b.cfg': 'b.cfg', 'd.txt': 'd.txt'}
I want to be able to just path['foo']['bar'] to get everything in the foo/bar/ directory.
Edit:
A more desirable output would be:
{'foo':{'e.txt':'e.txt','bar':{'a.txt':'a.txt','c':{'d.txt':'d.txt'}}}}

Edit 10-14-22: My first answer matches what the OP asked for, but it isn't really the ideal approach, nor is the output the cleanest. Since this question seems to get referenced fairly often, here is a cleaner approach below that is more resilient to Unix/Windows paths and whose output dictionary makes more sense.
from pathlib import Path
import json

def get_path_dict(paths: list[str | Path]) -> dict:
    """Builds a tree like structure out of a list of paths"""
    def _recurse(dic: dict, chain: tuple[str, ...] | list[str]):
        if len(chain) == 0:
            return
        if len(chain) == 1:
            dic[chain[0]] = None
            return
        key, *new_chain = chain
        if key not in dic:
            dic[key] = {}
        _recurse(dic[key], new_chain)
        return

    new_path_dict = {}
    for path in paths:
        _recurse(new_path_dict, Path(path).parts)
    return new_path_dict

l1 = ['foo/e.txt', 'foo/bar/a.txt', 'foo/bar/b.cfg', Path('foo/bar/c/d.txt'), 'test.txt']
result = get_path_dict(l1)
print(json.dumps(result, indent=2))
Output:
{
  "foo": {
    "e.txt": null,
    "bar": {
      "a.txt": null,
      "b.cfg": null,
      "c": {
        "d.txt": null
      }
    }
  },
  "test.txt": null
}
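As a quick check against the original goal, indexing into the result walks the hypothetical tree; the lookup below just reads from the output shown above:
print(result['foo']['bar'])
# {'a.txt': None, 'b.cfg': None, 'c': {'d.txt': None}}
print(list(result['foo']['bar']))  # keys act like os.listdir() for the hypothetical foo/bar directory
# ['a.txt', 'b.cfg', 'c']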
Older approach
How about this? It gets your desired output, although a proper tree structure may be cleaner.
from collections import defaultdict
import json

def nested_dict():
    """
    Creates a default dictionary where each value is another default dictionary.
    """
    return defaultdict(nested_dict)

def default_to_regular(d):
    """
    Converts defaultdicts of defaultdicts to dicts of dicts.
    """
    if isinstance(d, defaultdict):
        d = {k: default_to_regular(v) for k, v in d.items()}
    return d

def get_path_dict(paths):
    new_path_dict = nested_dict()
    for path in paths:
        parts = path.split('/')
        if parts:
            marcher = new_path_dict
            for key in parts[:-1]:
                marcher = marcher[key]
            marcher[parts[-1]] = parts[-1]
    return default_to_regular(new_path_dict)

l1 = ['foo/e.txt', 'foo/bar/a.txt', 'foo/bar/b.cfg', 'foo/bar/c/d.txt', 'test.txt']
result = get_path_dict(l1)
print(json.dumps(result, indent=2))
Output:
{
  "foo": {
    "e.txt": "e.txt",
    "bar": {
      "a.txt": "a.txt",
      "b.cfg": "b.cfg",
      "c": {
        "d.txt": "d.txt"
      }
    }
  },
  "test.txt": "test.txt"
}

Wouldn't a simple tree, implemented via dictionaries, suffice?
Your implementation seems a bit redundant: it's hard to tell at a glance which folder a file belongs to.
https://en.wikipedia.org/wiki/Tree_(data_structure)
There are plenty of libraries on PyPI if you need something extra, for example treelib.
There are also pure paths in pathlib.
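For instance, pathlib's pure paths let you split and inspect hypothetical paths without ever touching the filesystem; a minimal sketch:
from pathlib import PurePosixPath

# Pure paths never hit the filesystem, so they work for paths that don't exist.
p = PurePosixPath('foo/bar/a.txt')
print(p.parts)   # ('foo', 'bar', 'a.txt')
print(p.parent)  # foo/bar
print(p.suffix)  # .txt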

Related

How to recreate the tree organization in nested dictionaries

I have a problem I've been struggling with for some time now. I need to check things for a large amount of data spread across many folders. To keep track of what has been done, I wanted to create a YAML file containing the tree organization of my data structure. Thus, the objective is to create nested dictionaries of the folders containing data.
The script I made works, but it duplicates each folder and I don't know how to call the function recursively to avoid this. Here is the code:
import os

def load_tree_structure_as_dictionnary(current_dict):
    for dir_name in current_dict.keys():
        lst_sub_dir = [f.path for f in os.scandir(dir_name) if f.is_dir()]
        if lst_sub_dir == []:
            current_dict[dir_name]['correct_calibration'] = None
        else:
            for sub_dir in lst_sub_dir:
                current_dict[dir_name][sub_dir] = load_tree_structure_as_dictionnary({sub_dir: {}})
    return current_dict

init_dict = {data_path: {}}
full_dict = load_tree_structure_as_dictionnary(init_dict)
I know the error is in the recursive call, but I can't create a new 'sub_dir' key if there isn't a dictionary already initialized (hence the {sub_dir: {}}).
Also, I am new to writing Stack Overflow questions, so let me know if anything about the phrasing should be improved.
After changing current_dict[dir_name][sub_dir] = load_tree_structure_as_dictionnary({sub_dir: {}}) to current_dict[dir_name].update(load_tree_structure_as_dictionnary({sub_dir: {}})), your code will no longer duplicate each sub_dir.
import os

def load_tree_structure_as_dictionnary(current_dict):
    for dir_name in current_dict.keys():
        lst_sub_dir = [f.path for f in os.scandir(dir_name) if f.is_dir()]
        if lst_sub_dir == []:
            current_dict[dir_name]['correct_calibration'] = None
        else:
            for sub_dir in lst_sub_dir:
                current_dict[dir_name].update(load_tree_structure_as_dictionnary({sub_dir: {}}))
    return current_dict

init_dict = {"venv": {}}
full_dict = load_tree_structure_as_dictionnary(init_dict)

get list of json paths in python

I'm looking to get a list of all possible JSON paths in a JSON file - can anyone recommend a way to do this?
E.g., if the input is below:
{
  "_id": {
    "$oid": ""
  },
  "aa": false,
  "bb": false,
  "source": "",
  "email": "",
  "createdAt": {
    "$date": ""
  },
  "updatedAt": {
    "$date": ""
  },
  "cc": "",
  "vv": "",
  "metadata": {
    "vv": "",
    "xx": [{}]
  }
}
Output:
obj
obj._id
obj._id.$oid
obj.aa
obj.bb
obj.source
obj.email
obj.createdAt
obj.createdAt.$date
obj.updatedAt
obj.updatedAt.$date
obj.cc
obj.vv
obj.metadata
obj.metadata.vv
obj.metadata.xx
obj.metadata.xx[0]
I'm basically looking for a Python version of this: https://www.convertjson.com/json-path-list.htm
I want to build a general solution that works for any JSON file - each file will be a single value used for schema generation (i.e. one line in a newline-delimited JSON file).
Any suggestions?
You can do this in a reasonably succinct way with a recursive generator. The string "obj" is a little awkward since it doesn't occur in the data structure. On the other hand, adding it at the end is simple:
def get_paths(d):
    if isinstance(d, dict):
        for key, value in d.items():
            yield f'.{key}'
            yield from (f'.{key}{p}' for p in get_paths(value))
    elif isinstance(d, list):
        for i, value in enumerate(d):
            yield f'[{i}]'
            yield from (f'[{i}]{p}' for p in get_paths(value))

paths = ['obj' + s for s in get_paths(d)]
Gives you paths as a list of strings:
['obj._id',
'obj._id.$oid',
'obj.aa',
'obj.bb',
'obj.source',
'obj.email',
'obj.createdAt',
'obj.createdAt.$date',
'obj.updatedAt',
'obj.updatedAt.$date',
'obj.cc',
'obj.vv',
'obj.metadata',
'obj.metadata.vv',
'obj.metadata.xx',
'obj.metadata.xx[0]']
Of course, you can wrap that last step in a function as well and accept a root object string:
def get_paths(d, root="obj"):
    def recur(d):
        if isinstance(d, dict):
            for key, value in d.items():
                yield f'.{key}'
                yield from (f'.{key}{p}' for p in recur(value))
        elif isinstance(d, list):
            for i, value in enumerate(d):
                yield f'[{i}]'
                yield from (f'[{i}]{p}' for p in recur(value))

    return (root + p for p in recur(d))

list(get_paths(d))
# same result
You can do this with this code:
mylist = []

def getKeys(obj, parent="obj"):
    global mylist
    for i in obj.keys():
        mylist.append(parent + "." + i)
        try:
            getKeys(obj[i], parent + "." + i)
        except AttributeError:
            pass
getKeys({
    "_id": {
        "$oid": ""
    },
    "aa": False,
    "bb": False,
    "source": "",
    "email": "",
    "createdAt": {
        "$date": ""
    },
    "updatedAt": {
        "$date": ""
    },
    "cc": "",
    "vv": "",
    "metadata": {
        "vv": "",
        "xx": [{}]
    }
})
print(mylist)
I changed false to False. If you are working with JSON text rather than a Python dictionary, you may want to use the json library to convert it to a dictionary first:
import json
myDict = json.loads('{"_id":{"$oid":""},"aa":false,"bb":false,"source":"","email":"","createdAt":{"$date":""},"updatedAt":{"$date":""},"cc":"","vv":"","metadata":{"vv":"","xx":[{}]}}')
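One caveat: because the try/except above only recurses into objects with .keys(), list elements are skipped, so a path like obj.metadata.xx[0] from the desired output will not appear. A small variation that also walks lists (a sketch, not part of the original answer; get_keys is a hypothetical name) could look like this:
def get_keys(obj, parent="obj", out=None):
    # Collect dotted paths for dict keys and bracketed indices for list items.
    out = [] if out is None else out
    if isinstance(obj, dict):
        for key, value in obj.items():
            out.append(f"{parent}.{key}")
            get_keys(value, f"{parent}.{key}", out)
    elif isinstance(obj, list):
        for i, value in enumerate(obj):
            out.append(f"{parent}[{i}]")
            get_keys(value, f"{parent}[{i}]", out)
    return out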

`KeyError:` when recursively parsing JSON -- error gives very little information

I am trying to recursively/flexibly parse any valid JSON object. I'm getting an error using the below code with the JSON example further down, and it doesn't make sense to me. The code currently produces the output shown right below it and then errors out. The reason for the error is hard to pinpoint because the behavior on the first pass through is as expected. The second time, not so much... and the error is cryptic, to say the least.
def get_level_keys(obj, level_count=0, level_keys={}):
    key_list = list(obj.keys())
    print(f"key_list {level_count}: {key_list}")
    level_key = f"{level_count}_level"
    rem_keys = []
    for key in key_list:
        if type(obj[key]) in [int, bool, str, float]:
            level_keys[level_key].append(key)
        else:
            rem_keys.append(key)
            level_keys[level_key] = []
    for key in rem_keys:
        if type(obj[key]) == dict:
            level_count += 1
            get_level_keys(obj[key], level_count, level_keys)
    return level_keys
Current output is:
key_list 0: ['fields', 'name', 'tags', 'timestamp']
key_list 1: ['LogEndOffset', 'LogStartOffset', 'NumLogSegments', 'Size', 'UnderReplicatedPartitions']
...and then it errors with: KeyError: '1_level' (line 9: level_keys[level_key].append(key)).
Json for reproducing:
{
  "fields": {
    "LogEndOffset": 0,
    "LogStartOffset": 0,
    "NumLogSegments": 1,
    "Size": 0,
    "UnderReplicatedPartitions": 0
  },
  "name": "partition",
  "tags": {
    "host": "CUD1-001560",
    "jolokia_agent_url": "http://localhost:7777/jolokia",
    "partition": "22",
    "topic": "qa-connect-offsets"
  },
  "timestamp": 1591124460
}
Any ideas? Thanks in advance!
To me it appears that level_keys[level_key] may be initialized to [] after it is referenced in the line
level_keys[level_key].append(key)
Also, you may run the risk of adding a key multiple times so you should add a check for this.
Try rearranging your code to
def get_level_keys(obj, level_count=0, level_keys={}):
    key_list = list(obj.keys())
    print(f"key_list {level_count}: {key_list}")
    level_key = f"{level_count}_level"
    rem_keys = []
    # CHANGE IS HERE
    if level_key not in level_keys:
        level_keys[level_key] = []
    for key in key_list:
        if type(obj[key]) in [int, bool, str, float]:
            # CHANGE IS HERE
            if key not in level_keys[level_key]:
                level_keys[level_key].append(key)
        else:
            rem_keys.append(key)
    for key in rem_keys:
        if type(obj[key]) == dict:
            level_count += 1
            get_level_keys(obj[key], level_count, level_keys)
    return level_keys
Update:
You also need to fix your recursion. Instead of performing level_count += 1, you should just use
get_level_keys(obj[key], level_count + 1, level_keys)
Your code currently assigns a new level to each sub-key.
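Putting both changes together, the whole function would look roughly like this (a sketch combining the two fixes above, with the print statement dropped; the trailing comment shows what it should produce for the sample JSON):
def get_level_keys(obj, level_count=0, level_keys={}):
    level_key = f"{level_count}_level"
    # Initialize this level's bucket before any key is appended to it.
    if level_key not in level_keys:
        level_keys[level_key] = []
    rem_keys = []
    for key in obj:
        if type(obj[key]) in [int, bool, str, float]:
            if key not in level_keys[level_key]:
                level_keys[level_key].append(key)
        else:
            rem_keys.append(key)
    for key in rem_keys:
        if type(obj[key]) == dict:
            # level_count + 1 keeps sibling sub-dicts on the same level.
            get_level_keys(obj[key], level_count + 1, level_keys)
    return level_keys

# Expected result for the sample JSON, roughly:
# {'0_level': ['name', 'timestamp'],
#  '1_level': ['LogEndOffset', 'LogStartOffset', 'NumLogSegments', 'Size',
#              'UnderReplicatedPartitions', 'host', 'jolokia_agent_url',
#              'partition', 'topic']}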
You are getting this error because the script cannot find that key in the dictionary it is building. You are recursively parsing this JSON, but at some point it expects the 1_level key to exist and it doesn't. Can you perhaps share the initial data with which you call this method? Also, what is the expected output? It's hard to tell exactly what you are trying to achieve, but try the below; perhaps it helps.
def get_level_keys(obj, level_count=0, level_keys={}, empty_dict={}):
    key_list = list(obj.keys())
    print(f"key_list {level_count}: {key_list}")
    level_key = f"{level_count}_level"
    rem_keys = []
    for key in key_list:
        if type(obj[key]) in [int, bool, str, float]:
            if level_keys.get(level_key, None):
                level_keys[level_key].append(key)
        else:
            rem_keys.append(key)
            # level_keys[level_key] = []
    for key in rem_keys:
        if type(obj[key]) == dict:
            level_count += 1
            get_level_keys(obj[key], level_count, level_keys, empty_dict)
    empty_dict[level_key] = key_list
    return empty_dict

Formatting JSON in Python

What is the simplest way to pretty-print a string of JSON as a string with indentation when the initial JSON string is formatted without extra spaces or line breaks?
Currently I'm running json.loads() and then running json.dumps() with indent=2 on the result. This works, but it feels like I'm throwing a lot of compute down the drain.
Is there a simpler or more efficient (built-in) way to pretty-print a JSON string? (while keeping it as valid JSON)
Example
import requests
import json
response = requests.get('http://spam.eggs/breakfast')
one_line_json = response.content.decode('utf-8')
pretty_json = json.dumps(json.loads(response.content), indent=2)
print(f'Original: {one_line_json}')
print(f'Pretty: {pretty_json}')
Output:
Original: {"breakfast": ["spam", "spam", "eggs"]}
Pretty: {
  "breakfast": [
    "spam",
    "spam",
    "eggs"
  ]
}
json.dumps(obj, indent=2) is better than pprint because:
It is faster with the same load methodology.
It has the same or similar simplicity.
The output will produce valid JSON, whereas pprint will not.
pprint_vs_dumps.py
import cProfile
import json
import pprint
from urllib.request import urlopen

def custom_pretty_print():
    url_to_read = "https://www.cbcmusic.ca/Component/Playlog/GetPlaylog?stationId=96&date=2018-11-05"
    with urlopen(url_to_read) as resp:
        pretty_json = json.dumps(json.load(resp), indent=2)
    print(f'Pretty: {pretty_json}')

def pprint_json():
    url_to_read = "https://www.cbcmusic.ca/Component/Playlog/GetPlaylog?stationId=96&date=2018-11-05"
    with urlopen(url_to_read) as resp:
        info = json.load(resp)
    pprint.pprint(info)

cProfile.run('custom_pretty_print()')
>>> 71027 function calls (42309 primitive calls) in 0.084 seconds
cProfile.run('pprint_json()')
>>> 164241 function calls (140121 primitive calls) in 0.208 seconds
Thanks @tobias_k for pointing out my errors along the way.
I think for a true JSON object print, it's probably as good as it gets. timeit(number=10000) for the following took about 5.659214497s:
import json
d = {
    'breakfast': [
        'spam', 'spam', 'eggs',
        {
            'another': 'level',
            'nested': [
                {'a': 'b'},
                {'c': 'd'}
            ]
        }
    ],
    'foo': True,
    'bar': None
}
s = json.dumps(d)
q = json.dumps(json.loads(s), indent=2)
print(q)
I tried pprint, but it won't actually print the pure JSON string unless it's first converted to a Python dict, which loses the valid JSON literals true, null and false, as mentioned in the other answer. It also doesn't retain the order in which the items appeared, so it's not great if order matters for readability.
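A quick illustration of both points (a small sketch; note that pprint sorts dictionary keys by default and prints Python literals rather than JSON):
import json
import pprint

d = {'foo': True, 'bar': None, 'breakfast': ['spam', 'eggs']}

pprint.pprint(d)
# {'bar': None, 'breakfast': ['spam', 'eggs'], 'foo': True}   <- keys sorted, Python syntax

print(json.dumps(d, indent=2))
# keeps insertion order and emits valid JSON literals: "foo": true, "bar": null, ...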
Just for fun I whipped up the following function:
def pretty_json_for_savages(j, indentor=' '):
    ind_lvl = 0
    temp = ''
    for i, c in enumerate(j):
        if c in '{[':
            print(indentor * ind_lvl + temp.strip() + c)
            ind_lvl += 1
            temp = ''
        elif c in '}]':
            print(indentor * ind_lvl + temp.strip() + '\n' + indentor * (ind_lvl - 1) + c, end='')
            ind_lvl -= 1
            temp = ''
        elif c in ',':
            print(indentor * (0 if j[i-1] in '{}[]' else ind_lvl) + temp.strip() + c)
            temp = ''
        else:
            temp += c
    print('')
# {
#   "breakfast":[
#     "spam",
#     "spam",
#     "eggs",
#     {
#       "another": "level",
#       "nested":[
#         {
#           "a": "b"
#         },
#         {
#           "c": "d"
#         }
#       ]
#     }
#   ],
#   "foo": true,
#   "bar": null
# }
It prints pretty alright, and unsurprisingly it took a whopping 16.701202023s to run in timeit(number=10000), which is 3 times as long as json.dumps(json.loads()) would take. It's probably not worthwhile to build your own function for this unless you spend some time optimizing it, and given the lack of a builtin for it, you're probably best off sticking with your current approach, since your efforts will most likely give diminishing returns.

Python json dumps syntax error when appending list of dict

I have two functions that each return a list of dictionaries, and I'm trying to get json to encode them. It works when I try it with my first function, but when I append the second function I get a syntax error of ": expected". I will eventually be appending a total of 7 functions that each output a list of dicts. Is there a better way of accomplishing this?
import dmidecode
import simplejson as json

def get_bios_specs():
    BIOSdict = {}
    BIOSlist = []
    for v in dmidecode.bios().values():
        if type(v) == dict and v['dmi_type'] == 0:
            BIOSdict["Name"] = str((v['data']['Vendor']))
            BIOSdict["Description"] = str((v['data']['Vendor']))
            BIOSdict["BuildNumber"] = str((v['data']['Version']))
            BIOSdict["SoftwareElementID"] = str((v['data']['BIOS Revision']))
            BIOSdict["primaryBIOS"] = "True"
            BIOSlist.append(BIOSdict)
    return BIOSlist

def get_board_specs():
    MOBOdict = {}
    MOBOlist = []
    for v in dmidecode.baseboard().values():
        if type(v) == dict and v['dmi_type'] == 2:
            MOBOdict["Manufacturer"] = str(v['data']['Manufacturer'])
            MOBOdict["Model"] = str(v['data']['Product Name'])
            MOBOlist.append(MOBOdict)
    return MOBOlist

def get_json_dumps():
    jsonOBJ = json
    # Syntax error is here; I can't use a comma to continue adding more, nor + to append.
    return jsonOBJ.dumps({'HardwareSpec':{'BIOS': get_bios_specs()},{'Motherboard': get_board_specs()}})
Use multiple items within your nested dictionary.
jsonOBJ.dumps({
    'HardwareSpec': {
        'BIOS': get_bios_specs(),
        'Motherboard': get_board_specs()
    }
})
And if you want multiple BIOS items or Motherboard items, just use a list.
...
'HardwareSpec': {
    'BIOS': [
        get_bios_specs(),
        get_uefi_specs()
    ]
    ...
}
If you want a more convenient lookup of specs, you can just embed a dict:
jsonOBJ.dumps({'HardwareSpec': {'BIOS': get_bios_specs(),
                                'Motherboard': get_board_specs()
                                }
               })
