Create a directory tree from a list of paths - Python

I have this function:
import os

def Test2(my_path):
    def create_sound_folder_from_path(current_path):
        result = {
            'folders': {},
            'sounds': []
        }
        for entry in os.listdir(current_path):
            full_path = os.path.join(current_path, entry)
            if os.path.isdir(full_path):
                result['folders'][entry] = create_sound_folder_from_path(full_path)
            elif entry.endswith('.wav'):
                result['sounds'].append(entry)
        return result

    path_to_use = my_path
    result = create_sound_folder_from_path(path_to_use)
    return result
and it returns a dictionary of folders and files like this:
{
    'folders': {
        'sounds': {'folders': {}, 'sounds': ['song1.wav', 'song2.wav']},
        'machine': {'folders': {}, 'sounds': ['song5.wav']}
    },
    'sounds': []  # no sounds at root
}
My input list:
['sounds/sound1.wav', 'sounds/sound2.wav', 'sounds/new/sound2.wav', 'sounds/old/sound2.wav', 'machine/mach.wav']
I just want the same dictionary but from a path list. Is it possible?

Here is my contribution. This code uses recursive calls to collect each sub-folder's information; it could certainly be rewritten to avoid the main loop.
import json

def get_subdirectories(path_dict, path_list):
    if len(path_list) == 1:
        path_dict['sounds'].extend(path_list)
    elif len(path_list) > 1:
        key = path_list.pop(0)
        if key not in path_dict['folders']:
            path_dict['folders'][key] = {'sounds': [], 'folders': {}}
        get_subdirectories(path_dict['folders'][key], path_list)

def main():
    directories = ['sounds/sound1.wav', 'sounds/sound2.wav',
                   'sounds/new/sound2.wav', 'sounds/old/sound2.wav',
                   'machine/mach.wav']
    output_dict = {'sounds': [], 'folders': {}}
    for d in directories:
        root = d.split('/')[0]
        if root not in output_dict['folders']:
            output_dict['folders'][root] = {'sounds': [], 'folders': {}}
        get_subdirectories(output_dict['folders'][root], d.split('/')[1:])
    print(json.dumps(output_dict, sort_keys=True, indent=4,
                     separators=(',', ': ')))

main()
This is the result:
{
    "folders": {
        "machine": {
            "folders": {},
            "sounds": [
                "mach.wav"
            ]
        },
        "sounds": {
            "folders": {
                "new": {
                    "folders": {},
                    "sounds": [
                        "sound2.wav"
                    ]
                },
                "old": {
                    "folders": {},
                    "sounds": [
                        "sound2.wav"
                    ]
                }
            },
            "sounds": [
                "sound1.wav",
                "sound2.wav"
            ]
        }
    },
    "sounds": []
}
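If you would rather avoid the recursion and the special handling of the root level, a single loop over each path's components builds the same structure. This is only a sketch of that idea, not part of the answer above:
import json

def build_tree(paths):
    # Walk each path's folder components, creating nested
    # {'sounds': [], 'folders': {}} nodes as needed.
    tree = {'sounds': [], 'folders': {}}
    for path in paths:
        *folders, filename = path.split('/')
        node = tree
        for folder in folders:
            node = node['folders'].setdefault(
                folder, {'sounds': [], 'folders': {}})
        node['sounds'].append(filename)
    return tree

directories = ['sounds/sound1.wav', 'sounds/sound2.wav',
               'sounds/new/sound2.wav', 'sounds/old/sound2.wav',
               'machine/mach.wav']
print(json.dumps(build_tree(directories), sort_keys=True, indent=4))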

Have you tried using os.path.expanduser? os.walk will work with these paths if you put them into a list. For example, to iterate through my Music and Documents folders, I did this:
from os import walk
from os.path import expanduser
from os.path import join

directories = [expanduser('~/Music'), expanduser('~/Documents')]
for item in directories:
    for subdir, dirs, files in walk(item):
        for file in files:
            print(join(subdir, file))
As you mentioned os.walk explicitly, I'm guessing you do know how to parse out the data as you need. If not, I can expand on how to do that.
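If the goal is the same folders/sounds dictionary as in the question, os.walk can feed it directly. A rough sketch (the .wav filter and the dictionary shape come from the question; the rest is illustrative):
import os

def tree_from_walk(root):
    # Build {'folders': {...}, 'sounds': [...]} by walking the tree once
    # and inserting each directory relative to the root.
    result = {'folders': {}, 'sounds': []}
    for dirpath, dirnames, filenames in os.walk(root):
        node = result
        rel = os.path.relpath(dirpath, root)
        if rel != '.':
            for part in rel.split(os.sep):
                node = node['folders'].setdefault(
                    part, {'folders': {}, 'sounds': []})
        node['sounds'].extend(f for f in filenames if f.endswith('.wav'))
    return result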

Related

How can I create a folder tree in JSON format using Python?

I am wondering if it is possible to provide a folder path and let a Python script scan the given folder and return a JSON tree with the number of files in each folder. The tree should contain every sub-folder:
E.g. result:
[{
    foldername: "folder1",
    amount_of_files: 123,
    children: [
        {
            foldername: "folder1.1",
            amount_of_files: 3,
            children: []
        },
        {
            foldername: "folder1.2",
            amount_of_files: 5,
            children: [
                {
                    foldername: "folder1.2.1",
                    amount_of_files: 20,
                    children: []
                }
            ]
        }
    ]
},
{
    foldername: "folder2",
    amount_of_files: 1,
    children: [
        {
            foldername: "folder2.1",
            amount_of_files: 3,
            children: [
                {
                    foldername: "folder2.1.1",
                    amount_of_files: 2,
                    children: [
                        {
                            foldername: "folder2.1.1.1",
                            amount_of_files: 24,
                            children: []
                        }
                    ]
                }
            ]
        },
        {
            foldername: "folder1.2",
            amount_of_files: 5,
            children: []
        }
    ]
}
]
You can use os.listdir with recursion:
import os, json

def get_tree(path=os.getcwd()):
    return {'foldername': path,
            'amount_of_files': sum(not os.path.isdir(os.path.join(path, k))
                                   for k in os.listdir(path)),
            'children': [get_tree(os.path.join(path, k))
                         for k in os.listdir(path)
                         if os.path.isdir(os.path.join(path, k))]}

with open('folder_tree.json', 'w') as f:
    json.dump(get_tree(), f)
To produce a list of dictionaries, with each dictionary containing the folder name and number of files, you can use a recursive generator function:
def get_tree(path=os.getcwd()):
    yield {'foldername': path,
           'amount_of_files': sum(not os.path.isdir(os.path.join(path, k))
                                  for k in os.listdir(path))}
    for i in os.listdir(path):
        if os.path.isdir(os.path.join(path, i)):
            yield from get_tree(os.path.join(path, i))

with open('folder_tree.json', 'w') as f:
    json.dump(list(get_tree()), f)
This seems to do the job:
from os import listdir
from os.path import isdir, isfile, basename, join
from json import dumps

def folder(d):
    result = dict(
        amount_of_files=0,
        foldername=basename(d)
    )
    for each in listdir(d):
        path = join(d, each)
        if isdir(path):
            if 'children' not in result:
                result['children'] = list()
            result['children'].append(folder(path))
        if isfile(path):
            result['amount_of_files'] += 1
    return result

print(dumps(folder('.')))
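If you prefer the output pretty-printed or saved to a file, a small variation on the last line (not part of the original answer) would be:
with open('folder_tree.json', 'w') as f:
    f.write(dumps(folder('.'), indent=4, sort_keys=True))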

How can I expand file paths into a nested dictionary?

import plazy

txt_filter = lambda x: x.endswith('.txt')
file_paths: list = plazy.list_files(root='/data/', filter_func=txt_filter,
                                    is_include_root=True)
print(file_paths)
output:
["/data/subdir1/subdir1_1/file1.txt","/data/subdir2/subdir2_1/file2.txt", "/data/subdir2/subdir2_1/file1.txt", "/data/subdir3/subdir3_1/subdir3_2/file1.txt"]
How can I expand these paths into a nested dictionary? I want it to look like this:
{
    "data": {
        "subdir1": {"subdir1_1": ["file1.txt"]},
        "subdir2": {"subdir2_1": ["file1.txt", "file2.txt"]},
        "subdir3": {"subdir3_1":
            {"subdir3_2": ["file1.txt"]}
        }
    }
}
I think one way to address this is to use plazy.list_files with a limited depth first (to get the top-level directories) and recurse manually, rather than letting it fetch the whole tree.
Some pseudo code to illustrate...
topdirs = getdirs(/root)
foreach dir:
    children = getdirs(/dir)
    leaves = gettxtfiles(/dir/)
As your program recurses into the structure, it builds the map the way you want it.
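In plain Python that might look roughly like the sketch below; getdirs and gettxtfiles are the hypothetical helpers named in the pseudo code, not plazy functions:
import os

def getdirs(path):
    return [d for d in os.listdir(path)
            if os.path.isdir(os.path.join(path, d))]

def gettxtfiles(path):
    return [f for f in os.listdir(path) if f.endswith('.txt')]

def build_map(path):
    dirs = getdirs(path)
    if not dirs:                      # leaf folder: keep its text files
        return gettxtfiles(path)
    return {d: build_map(os.path.join(path, d)) for d in dirs}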
I took care of it without using Plazy.
import os
import pprint

path = '/data/'

def f(path):
    if os.path.isdir(path):
        d, l = {}, []
        for name in os.listdir(path):
            if os.path.isdir(os.path.join(path, name)):
                d[name] = f(os.path.join(path, name))
            else:
                l.append(name)
        if not d:      # a leaf folder: return just its file names
            return l
        return d

pprint.pprint(f(path))
Output
{
    "data": {
        "subdir1": {"subdir1_1": ["file1.txt"]},
        "subdir2": {"subdir2_1": ["file1.txt", "file2.txt"]},
        "subdir3": {"subdir3_1":
            {"subdir3_2": ["file1.txt"]}
        }
    }
}
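Since plazy already returns the full paths, you could also build the same nested dictionary straight from that list without touching the filesystem again. A rough sketch, assuming the '/data/...' paths from the question:
file_paths = ["/data/subdir1/subdir1_1/file1.txt",
              "/data/subdir2/subdir2_1/file2.txt",
              "/data/subdir2/subdir2_1/file1.txt",
              "/data/subdir3/subdir3_1/subdir3_2/file1.txt"]

tree = {}
for p in file_paths:
    parts = p.strip('/').split('/')
    node = tree
    for part in parts[:-2]:                 # every directory except the last
        node = node.setdefault(part, {})
    node.setdefault(parts[-2], []).append(parts[-1])  # last directory holds files
print(tree)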

How to return all of the elements from a function?

I'm trying to create an Ansible module to use Batfish inside an Ansible playbook.
I'm comparing JSON values with variables defined in the function, but it only compares one JSON value with one variable. How do I use a loop and return all the results?
I have already tried extracting the values from each JSON entry and comparing them with the defined variables.
import json

json_list = {"batfish_result": [
    {
        "Action": {
            "0": "DENY"
        },
        "Line_Content": {
            "0": "no-match"
        }
    },
    {
        "Action": {
            "0": "PERMIT"
        },
        "Line_Content": {
            "0": "permit 10.20.0.0 255.255.255.0"
        }
    }
]
}
def main(json_list):
    PASS = 'PASS'
    FAIL = 'FAIL'
    result = {}
    result_list = []
    action_num_list = []
    condition_list = ['permit', 'permit']
    jsons = json_list["batfish_result"]
    for j in jsons:
        print(j)
        action = j['Action']
        action_num = action["0"]
        action_num_list.append(action_num)
    for con in condition_list:
        for action in action_num_list:
            if action == con.upper():
                result_list.append(PASS)
                result['msg'] = result_list
            else:
                result_list.append(FAIL)
                result['msg'] = result_list
    return result

main(json_list)
It returns
{'msg': ['PASS', 'PASS']}
It should compare each action with its corresponding condition variable and return this:
{ "msg": ['FAIL', 'PASS'] }
Finally I solved it like this:
import json
from pprint import pprint

json_list = {"batfish_result": [
    {
        "Action": {
            "0": "DENY"
        },
        "Line_Content": {
            "0": "no-match"
        }
    },
    {
        "Action": {
            "0": "PERMIT"
        },
        "Line_Content": {
            "0": "permit 10.20.0.0 255.255.255.0"
        }
    }
]
}

def main(json_list):
    PASS = "PASS"
    FAIL = "FAIL"
    result = {}
    result_list = []
    action_num_list = []
    condition_list = ["permit", "permit"]
    jsons = json_list["batfish_result"]
    for j in jsons:
        action = j['Action']
        action_num = action["0"]
        action_num_list.append(action_num)
    # action_num_list is now ['DENY', 'PERMIT']
    for x, y in zip(condition_list, action_num_list):
        if y == x.upper():
            result_list.append(PASS)
            result['msg'] = result_list
        else:
            result_list.append(FAIL)
            result['msg'] = result_list
    return result_list

main(json_list)
pprint always returns None, so I had to remove it after debugging; I was also using more loops than necessary.
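For what it's worth, the whole check can be collapsed into a single list comprehension over the zipped lists. A sketch using the json_list defined above (compare is just an illustrative name):
def compare(json_list, condition_list):
    # Pair each expected condition with the corresponding Action value
    # and record PASS or FAIL per pair.
    actions = [j['Action']['0'] for j in json_list['batfish_result']]
    return {'msg': ['PASS' if a == c.upper() else 'FAIL'
                    for c, a in zip(condition_list, actions)]}

print(compare(json_list, ['permit', 'permit']))  # {'msg': ['FAIL', 'PASS']}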

Python: Compare & Count Dictionary Structures Across Thousands of Dictionaries/XMLs/JSON

I'm parsing many thousands of XML files into dictionaries, and storing their structures in JSON.
They have much the same structure, but there is an unknown number of different tag-naming schemes. A variety of different abbreviations exist for naming tags within these thousands of files.
I need to find out how many different tags exist to describe each piece of information, to parse all of them correctly.
To do so, I want to create one master dictionary of the XMLs/dictionaries that includes all variations on tag names, and preferably their counts within the thousands of XMLs/dictionaries.
Here's a small sample of one of the dictionaries:
{
    "Header": {
        "Ts": {},
        "PeriodEndDt": {},
        "PreparedBy": {
            "PreparerID": {},
            "PreparerFirmName": {
                "BusinessNameLine1Txt": {}
            },
            "PreparerAddress": {
                "AddLn1Txt": {},
                "CityName": {},
                "StateAbbreviationCd": {},
                "ZIPCd": {}
            }
        },
        "FormTypeCd": {},
        "PeriodBeginDt": {},
        "Filer": {
            "UniqueID": {},
            "BusinessName": {
                "BusinessNameLine1Txt": {}
            },
            "BusinessNameControlTxt": {},
            "PhoneNum": {},
            "USAddress": {
                "AddressLine1Txt": {},
                "CityNm": {},
                "StateAbbreviationCd": {},
                "ZIPCd": {}
            }
        },
        "FormData": {
            "FormCodeType": {
                "BizType": {},
                "AssetsAtEOY": {},
                "AccountingMethod": {},
                "RevenueAndExpenses": {
                    "ScheduleBNotReqd": {},
                    "DivsRevAndExpenses": {},
                    "DivsNetInvstIncomeAmt": {},
                    "NetGainSaleAstRevAndExpnssAmt": {},
                    "RevsOvrExpenses": {},
                    "NetInvestmentIncomeAmt": {}
                },
                "BalanceSheetGroup": {
                    "CashInvstBOYAmt": {},
                    "CashInvstEOYAmt": {},
                    "CashInvstEOYFMVAmt": {},
                    "OtherInvestmentsBOYAmt": {},
                    "OtherInvestmentsEOYAmt": {},
                    "CapitalStockEOYAmt": {},
                    "TotalLiabilitiesNetAstEOYAmt": {}
                },
                "ChangeNetAssetsFundGroup": {
                    "NetAssettFundBalancesBOYAmt": {},
                    "ExcessRevExpensesAmt": {},
                    "OtherIncreasesAmt": {},
                    "SubtotalAmt": {},
                    "OtherDecreasesAmt": {},
                    "TotNetAstOrFundBalancesEOYAmt": {}
                },
                "CapGainsLossTxInvstIncmDetail": {
                    "CapGainsLossTxInvstIncmGrp": {
                        "PropertyDesc": {},
                        "HowAcquiredCd": {},
                        "GrossSalesPriceAmt": {},
                        "GainOrLossAmt": {},
                        "GainsMinusExcessOrLossesAmt": {}
                    },
                    "StatementsRegardingActyGrp": {
                        "LegislativePoliticalActyInd": {},
                        "MoreThan100SpentInd": {}
                    },
                    "PhoneNum": {},
                    "LocationOfBooksUSAddress": {
                        "AddressLine1Txt": {},
                        "CityNm": {},
                        "StateAbbreviationCd": {},
                        "ZIPCd": {}
                    },
                    "CorporateDirectorsGrp": {
                        "DirectorsGrp": {
                            "PersonNm": {},
                            "USAddress": {
                                "AddressLine1Txt": {},
                                "CityNm": {},
                                "StateAbbreviationCd": {},
                                "ZIPCd": {}
                            },
                            "EmpPrograms": {
                                "EmployeeBenefitGroupNum": {},
                                "GroupType": {
                                    "GroupElement": {},
                                    "GroupCharacter": {
                                        "GroupNames": {}
                                    }
                                }
                            },
                            "EmpOffice1": {},
                            "EmpOffice2": {},
                            "EmpOffice3": {},
                            "EmpOffice4": {}
                        }
                    }
                }
            }
        }
    }
}
The code I'm using to create the dictionaries/JSON in the first place is like this:
import xml.etree.ElementTree as ET

strip_ns = lambda xx: str(xx).split('}', 1)[1]

tree = ET.parse('xmlpath.xml')
root = tree.getroot()
tierdict = {}
for tier1 in root:
    tier1var = strip_ns(tier1.tag)
    tierdict[tier1var] = {}
    for tier2 in tier1:
        tier2var = strip_ns(tier2.tag)
        tierdict[tier1var][tier2var] = {}
        for tier3 in tier2:
            tier3var = strip_ns(tier3.tag)
            tierdict[tier1var][tier2var][tier3var] = {}
            for tier4 in tier3:
                tier4var = strip_ns(tier4.tag)
                tierdict[tier1var][tier2var][tier3var][tier4var] = {}
The output I'd want to see is something like:
{
    "Header": {
        "Header.Count": 5672,
        "Ts": {
            "Ts.Count": 3365
        },
        "Ss": {
            "Ss.Count": 2328
        },
I'd probably do a recursive search of the elements you want as defined below:
def get_elements(json_entry, child_elements=[]):
    if not child_elements:
        return json_entry
    el, other_children = child_elements[0], child_elements[1:]
    children = el.getchildren()
    rec = json_entry.get(el.tag)
    if not children:
        json_entry[el.tag] = {"Count": rec.get("Count", 0) + 1 if rec else 1}
    else:
        json_entry[el.tag] = {"Count": rec.get("Count", 0) if rec else 1,
                              **get_elements({}, children)}
    return get_elements(json_entry, other_children)
This way, you can just pass the root element of your xml:
from lxml import etree

with open("myxml.xml", "r") as fh:
    tree = etree.parse(fh)
root = tree.getroot()
root_children = root.getchildren()
child_recs = get_elements({}, root_children)
{'tagOne': {'Count': 1}, 'tagTwo': {'Count': 1, 'tagThree': {'Count': 1}, 'tagFour': {'Count': 1, 'tagFive': {'Count': 1}}}}
If you want to wrap your root element around it, do it like so:
master_lookup = {root.tag: {"Count": 1, **child_recs}}
This can easily be extended to a for loop over many files:
import os

master_lookup = {}
for dirpath, dirnames, filenames in os.walk(path):
    for name in filenames:
        with open(os.path.join(dirpath, name)) as fh:
            tree = etree.parse(fh)
        root = tree.getroot()
        root_entry = master_lookup.get(root.tag, {"Count": 0})
        root_children = root.getchildren()
        root_count = root_entry.pop("Count")
        master_lookup[root.tag] = {"Count": root_count + 1,  # count this file's root tag
                                   **get_elements({**root_entry}, root_children)}
Something to that effect
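If all you need in the end is a count of every tag variant, another option is to flatten each document into tag paths and tally them with collections.Counter. This is a separate sketch, not a fix to the code above; 'xml_dir' is a placeholder directory:
import os
import xml.etree.ElementTree as ET
from collections import Counter

strip_ns = lambda tag: tag.split('}', 1)[-1]   # drop any namespace prefix

def tag_paths(elem, prefix=''):
    # Yield 'Parent/Child/...' paths for every element in the tree.
    path = prefix + strip_ns(elem.tag)
    yield path
    for child in elem:
        yield from tag_paths(child, path + '/')

counts = Counter()
for dirpath, _, filenames in os.walk('xml_dir'):
    for name in filenames:
        if name.endswith('.xml'):
            root = ET.parse(os.path.join(dirpath, name)).getroot()
            counts.update(tag_paths(root))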

Pandas Create Dict within Deeply Nested JSON

I'm having trouble modifying my code to add another dictionary to separate "hostNumber" and "hostMode" in my output. Below is the code that I found here and adapted:
import json
from json import dumps

# df is an existing DataFrame with site, system, portId, hostName,
# hostNumber and hostMode columns.
top = "Top_Level"
top_dict = {}
top_dict["name"] = top
top_dict["sub_name"] = []
for site, site_data in df.groupby("site", sort=False):
    site_dict = {}
    site_dict["site"] = site
    site_dict["sub_site"] = []
    for stor, stor_data in site_data.groupby("system", sort=False):
        stor_dict = {}
        stor_dict["system"] = stor
        stor_dict["sub_system"] = []
        for port, port_data in stor_data.groupby("portId", sort=False):
            port_dict = {}
            port_dict["portId"] = port
            port_dict["sub_portId"] = []
            for host, host_data in port_data.groupby("hostName", sort=False):
                host_data = host_data.drop(["portId", "system", "site"],
                                           axis=1).set_index("hostName")
                for n in host_data.to_dict(orient="records"):
                    port_dict["sub_portId"].append({"hostName": host,
                                                    "sub_hostName": [n]})
            stor_dict["sub_system"].append(port_dict)
        site_dict["sub_site"].append(stor_dict)
    top_dict["sub_name"].append(site_dict)

top_out = dumps(top_dict)
parsed = json.loads(top_out)
resulting in:
print(json.dumps(parsed, indent=4, sort_keys=True))
{
    "name": "Top_Level",
    "sub_name": [
        {
            "site": "A",
            "sub_site": [
                {
                    "system": "system01",
                    "sub_system": [
                        {
                            "portId": "1-A",
                            "sub_portId": [
                                {
                                    "hostName": "ahost005",
                                    "sub_hostName": [
                                        {
                                            "hostNumber": "1",
                                            "hostMode": "WIN"
                                        }
                                    ]
                                }, ...
How can I modify my code to have it output in the following way:
...
"sub_hostName": [
{"hostNumber": "1"},
{"hostMode": "WIN"}
]...
Use the following line instead of "sub_hostName": [n]:
"sub_hostName": [dict([i]) for i in n.items()]
