Python: Compare & Count Dictionary Structures Across Thousands of Dictionaries/XMLs/JSON

Python: Compare & Count Dictionary Structures Across Thousands of Dictionaries/XMLs/JSON - python

I'm parsing many thousands of XML files into dictionaries, and storing their structures in JSON.
They have much the same structure, but there is an unknown number of different tag-naming schemes. A variety of different abbreviations exist for naming tags within these thousands of files.
I need to find out how many different tags exist to describe each piece of information, to parse all of them correctly.
To do so, I want to create one master dictionary of the XMLs/dictionaries that includes all variations on tag names, and preferably their counts within the thousands of XMLs/dictionaries.
Here's a small sample of one of the dictionaries:
{
"Header": {
"Ts": {},
"PeriodEndDt": {},
"PreparedBy": {
"PreparerID": {},
"PreparerFirmName": {
"BusinessNameLine1Txt": {}
},
"PreparerAddress": {
"AddLn1Txt": {},
"CityName": {},
"StateAbbreviationCd": {},
"ZIPCd": {}
}
},
"FormTypeCd": {},
"PeriodBeginDt": {},
"Filer": {
"UniqueID": {},
"BusinessName": {
"BusinessNameLine1Txt": {}
},
"BusinessNameControlTxt": {},
"PhoneNum": {},
"USAddress": {
"AddressLine1Txt": {},
"CityNm": {},
"StateAbbreviationCd": {},
"ZIPCd": {}
}
},
"FormData": {
"FormCodeType": {
"BizType": {},
"AssetsAtEOY": {},
"AccountingMethod": {},
"RevenueAndExpenses": {
"ScheduleBNotReqd": {},
"DivsRevAndExpenses": {},
"DivsNetInvstIncomeAmt": {},
"NetGainSaleAstRevAndExpnssAmt": {},
"RevsOvrExpenses": {},
"NetInvestmentIncomeAmt": {}
},
"BalanceSheetGroup": {
"CashInvstBOYAmt": {},
"CashInvstEOYAmt": {},
"CashInvstEOYFMVAmt": {},
"OtherInvestmentsBOYAmt": {},
"OtherInvestmentsEOYAmt": {},
"CapitalStockEOYAmt": {},
"TotalLiabilitiesNetAstEOYAmt": {}
},
"ChangeNetAssetsFundGroup": {
"NetAssettFundBalancesBOYAmt": {},
"ExcessRevExpensesAmt": {},
"OtherIncreasesAmt": {},
"SubtotalAmt": {},
"OtherDecreasesAmt": {},
"TotNetAstOrFundBalancesEOYAmt": {}
},
"CapGainsLossTxInvstIncmDetail": {
"CapGainsLossTxInvstIncmGrp": {
"PropertyDesc": {},
"HowAcquiredCd": {},
"GrossSalesPriceAmt": {},
"GainOrLossAmt": {},
"GainsMinusExcessOrLossesAmt": {}
},
"StatementsRegardingActyGrp": {
"LegislativePoliticalActyInd": {},
"MoreThan100SpentInd": {}
},
"PhoneNum": {},
"LocationOfBooksUSAddress": {
"AddressLine1Txt": {},
"CityNm": {},
"StateAbbreviationCd": {},
"ZIPCd": {}
},
"CorporateDirectorsGrp": {
"DirectorsGrp": {
"PersonNm": {},
"USAddress": {
"AddressLine1Txt": {},
"CityNm": {},
"StateAbbreviationCd": {},
"ZIPCd": {}
},
"EmpPrograms": {
"EmployeeBenefitGroupNum": {},
"GroupType": {
"GroupElement": {},
"GroupCharacter": {
"GroupNames": {}
}
}
},
"EmpOffice1": {},
"EmpOffice2": {},
"EmpOffice3": {},
"EmpOffice4": {}
}
}
}
}
}
}
}
The code I'm using to create the dictionaries/JSON in the first place is like this:
import xml.etree.ElementTree as ET


def strip_ns(xx):
    """Return a tag name without its leading ``{namespace}`` part.

    Tags that carry no namespace are returned unchanged instead of raising
    IndexError.
    """
    return str(xx).split('}', 1)[-1]


def _add_tiers(element, tiers):
    """Record the tag names of every descendant of *element* in *tiers*.

    *tiers* is updated in place and returned. Each tag maps to a dict of its
    own child tags, to whatever depth the document actually has.
    """
    for child in element:
        # setdefault keeps the entry of an earlier sibling with the same tag,
        # so repeated elements merge their sub-tags instead of resetting them.
        _add_tiers(child, tiers.setdefault(strip_ns(child.tag), {}))
    return tiers


tree = ET.parse('xmlpath.xml')
root = tree.getroot()
tierdict = _add_tiers(root, {})
The output I'd want to see is something like:
{
"Header": {
"Header.Count": 5672,
"Ts": {
"Ts.Count": 3365
},
"Ss": {
"Ss.Count": 2328
},

I'd probably do a recursive search of the elements you want as defined below:
def get_elements(json_entry, child_elements=None):
    """Tally the tags of *child_elements* and their descendants into *json_entry*.

    Every tag maps to a dict holding a running ``"Count"`` plus one nested
    entry per distinct child tag. Calling this repeatedly with the same
    *json_entry* accumulates counts across sibling elements and documents.

    Args:
        json_entry: Dict to update in place.
        child_elements: Iterable of elements that expose ``.tag`` and can be
            iterated for their children (ElementTree / lxml elements).
            ``None`` means there is nothing to add.

    Returns:
        The same *json_entry* object, updated.
    """
    if child_elements is None:
        return json_entry
    for el in child_elements:
        if not isinstance(el.tag, str):
            # Comment / processing-instruction nodes carry a non-string tag.
            continue
        rec = json_entry.setdefault(el.tag, {})
        rec["Count"] = rec.get("Count", 0) + 1
        # Recurse into the existing record so earlier child tallies survive;
        # list(el) replaces the deprecated el.getchildren().
        get_elements(rec, list(el))
    return json_entry
This way, you can just pass the root element of your xml:
from lxml import etree

# etree.parse accepts a file name directly, so no separate file handle is needed.
tree = etree.parse("myxml.xml")
root = tree.getroot()
# list(root) yields the direct children; getchildren() is deprecated in lxml.
root_children = list(root)
child_recs = get_elements({}, root_children)
{'tagOne': {'Count': 1}, 'tagTwo': {'Count': 1, 'tagThree': {'Count': 1}, 'tagFour': {'Count': 1, 'tagFive': {'Count': 1}}}}
If you want to wrap your root element around it, do it like so:
master_lookup = {root.tag: {"Count": 1, **child_recs}}
This can be easily extended to a for loop through many files
master_lookup = {}
# os.walk yields (dirpath, dirnames, filenames) tuples, not file paths, so
# each file path has to be rebuilt from its directory and name.
for dirpath, _dirnames, filenames in os.walk(path):
    for filename in filenames:
        tree = etree.parse(os.path.join(dirpath, filename))
        root = tree.getroot()
        # One shared record per root tag; count this document's root element.
        root_entry = master_lookup.setdefault(root.tag, {"Count": 0})
        root_entry["Count"] += 1
        # get_elements updates root_entry in place with the child tallies.
        get_elements(root_entry, list(root))
Something to that effect

Related

retrieving data from json with python

I have a nested JSON data. I want to get the value of key "name" inside the dictionary "value" based on the key "id" in "key" dictionary (let the user enter the id). I don't want to use indexing which, because places are changing on every url differently. Also data is large, so I need one row solution (without for loop).
Code
import requests, re, json
r = requests.get('https://www.trendyol.com/apple/macbook-air-13-m1-8gb-256gb-ssd-altin-p-67940132').text
json_data1 = json.loads(re.search(r"window.__PRODUCT_DETAIL_APP_INITIAL_STATE__=({.*}});window", r).group(1))
print(json_data1)
print('json_data1:',json_data1['product']['attributes'][0]['value']['name'])
Output
{'product': {'attributes': [{'key': {'name': 'İşlemci Tipi', 'id': 168}, 'value': {'name': 'Apple M1', 'id': 243383}, 'starred': True, 'description': '', 'mediaUrls': []}, {'key': {'name': 'SSD Kapasitesi', 'id': 249}..........
json_data1: Apple M1
JSON Data
{
"product": {
"attributes": [
{
"key": { "name": "İşlemci Tipi", "id": 168 },
"value": { "name": "Apple M1", "id": 243383 },
"starred": true,
"description": "",
"mediaUrls": []
},
{
"key": { "name": "SSD Kapasitesi", "id": 249 },
"value": { "name": "256 GB", "id": 3376 },
"starred": true,
"description": "",
"mediaUrls": []
},
.
.
.
]
}
}
Expected Output is getting value by key id: (type must be str)
input >> id: 168
output >> name: Apple M1

Since you originally didn't want a for loop, but now it's a matter of speed,
Here's a solution with for loop, you can test it and see if it's faster than the one you already had
import json

with open("file.json") as f:
    data = json.load(f)

search_key = int(input("Enter id: "))
# Walk the attribute records directly and print the name of every record
# whose key id matches what the user typed.
for attribute in data['product']['attributes']:
    if attribute['key']['id'] == search_key:
        print(attribute['value']['name'])
Input >> Enter id: 168
Output >> Apple M1

I found the solution with for loop. It works fast so I preferred it.
for attribute in json_data1['product']['attributes']:
    # Address the fields by name: positional indexing into .values() only
    # works while the payload keeps its keys in exactly this order.
    if attribute['key']['id'] == 168:
        print(attribute['value']['name'])

Iteration is unavoidable if the index is unknown, but the cost can be reduced substantially by using a generator expression and Python's built-in next function:
next((x["value"]["name"] for x in data["product"]["attributes"] if x["key"]["id"] == 168), None)
To verify that a generator expression is in fact faster than a for loop, here is a comparison of the running time of xFranko's solution and the above:
import time
def time_func(func):
    """Wrap *func* so that calling it returns the elapsed time in milliseconds.

    The wrapper forwards positional and keyword arguments to *func*,
    discards its return value, and returns the ``time.perf_counter``
    difference multiplied by 1000.
    """
    from functools import wraps  # local import: the snippet header only imports time

    @wraps(func)  # keep func's name and docstring on the wrapper
    def timer(*args, **kwargs):
        started = time.perf_counter()
        func(*args, **kwargs)
        return (time.perf_counter() - started) * 1000

    return timer
number_of_attributes = 100000
data = {
"product": {
"attributes": [
{
"key": { "name": "İşlemci Tipi", "id": i },
"value": { "name": "name" + str(i), "id": 243383 },
"starred": True,
"description": "",
"mediaUrls": []
} for i in range(number_of_attributes)
]
}
}
def getName_generator(id):
    """Return the name of the first attribute whose key id equals *id*, else None."""
    matching_names = (
        attribute["value"]["name"]
        for attribute in data["product"]["attributes"]
        if attribute["key"]["id"] == id
    )
    # next() pulls only the first match, so the scan stops as soon as one is found.
    return next(matching_names, None)
def getName_for_loop(id):
    """Scan every attribute and return the name of the last one whose key id equals *id*.

    Returns None when no attribute matches. The loop never exits early, so
    the whole list is always traversed.
    """
    found_name = None
    for attribute in data['product']['attributes']:
        if attribute['key']['id'] == id:
            found_name = attribute['value']['name']
    return found_name
print("Generator:", time_func(getName_generator)(0))
print("For loop:", time_func(getName_for_loop)(0))
print()
print("Generator:", time_func(getName_generator)(number_of_attributes - 1))
print("For loop:", time_func(getName_for_loop)(number_of_attributes - 1))
My results:
Generator: 0.0075999999999964984
For loop: 43.73920000000003
Generator: 23.633300000000023
For loop: 49.839699999999986
Conclusion:
For large data sets, a generator expression is indeed faster, even if it has to traverse the entire set.

Why doesn't pymongo MongoDB return an exact value in find_one()?

I want to retrieve the single value "count" from a MongoDB collection using pymongo, but it is not working. The image below shows how the data entry is set up.
Here is the call to my Database class to use the db.find_one().
CODE HERE:
filters = {"email": session.get('email')}
returns = {f'words.{today_s}.{self.length - 3}.count': 1}
count_value = Database.find_one_return_one("users", filters, returns)
print({f'words.{today_s}.{self.length - 3}.count':1})
print(count_value)
@staticmethod
def find_one_return_one(collection: str, query: Dict, data: Dict) -> Dict:
    """Return one document from *collection* that matches *query*.

    *data* is passed as the second argument of ``find_one`` and is used by
    the caller as the projection (the fields to include in the result).
    NOTE(review): pymongo's ``find_one`` returns None when nothing matches,
    so callers should be prepared for that despite the ``Dict`` annotation.
    """
    return Database.DATABASE[collection].find_one(query, data)
This returns an empty list of dictionaries from the correct data? I want the count value returned.
This is the projection query: {words.20220302.0.count : 1}
This is what is returned:
{'_id': ObjectId('621ee5065d08c44070140df0'), 'words': {'20220302': [{}, {}, {}, {}, {}, {}, {}]}}
What is wrong or is there a better quicker way to retrieve the count value?

The following query projection can be used to get the desired result. Note this worked with MongoDB v5.
A sample document; similar to the one in the question post:
{ _id: 1, words: { fld: [ { a: 1, b: 2 }, { a: 9, b: 100 } ] } }
The expected result is: { "_id" : 1, "words" : { "fld" : { "a" : 9 } } }
The query:
INDEX = 1 # this is the index of the array element
query = { }
projection = {
'words.fld': {
'$arrayElemAt': [
{ '$map': { 'input': '$words.fld', 'in': { 'a': '$$this.a' } } },
INDEX
]
}
}
result = collection.find_one(query, projection)
print(result)

Create a list of nested dictionaries from a single csv file in python

I have a csv file with the following structure:
team,tournament,player
Team 1,spring tournament,Rebbecca Cardone
Team 1,spring tournament,Salina Youngblood
Team 1,spring tournament,Catarina Corbell
Team 1,summer tournament,Cara Mejias
Team 1,summer tournament,Catarina Corbell
...
Team 10, spring tournament,Jessi Ravelo
I want to create a nested dictionary (team, tournament) with a list of player dictionary. The desired outcome would be something like:
{'data':
{Team 1:
{'spring tournament':
{'players': [
{name: Rebecca Cardone},
{name: Salina Youngblood},
{name: Catarina Corbell}]
},
{'summer tournament':
{'players': [
{name: Cara Mejias},
{name: Catarina Corbell}]
}
}
},
...
{Team 10:
{'spring tournament':
{'players': [
{name: Jessi Ravelo}]
}
}
}
}
I've been struggling to format it like this. I have been able to successfully nest the first level (team # --> tournament) but I cannot get the second level to nest. Currently, my code looks like this:
d = {}
header = True
with open("input.csv") as f:
for line in f.readlines():
if header:
header = False
continue
team, tournament, player = line.strip().split(",")
d_team = d.get(team,{})
d_tournament = d_team.get(tournament, {})
d_player = d_tournament.get('player',['name'])
d_player.append(player)
d_tournament['player'] = d_tournament
d_team[tournament] = d_tournament
d[team] = d_team
print(d)
What would be the next step in fixing my code so I can create the nested dictionary?

Some problems with your implementation:
You do d_player = d_tournament.get('player',['name']). But you actually want to get the key named players, and this should be a list of dictionaries. Each of these dictionaries must have the form {"name": "Player's Name"}. So you want
l_player = d_tournament.get('players',[]) (default to an empty list), and then do l_player.append({"name": player}) (I renamed it to l_player because it's a list, not a dict).
You do d_tournament['player'] = d_tournament. I suspect you meant d_tournament['player'] = d_player
Strip the whitespace off the elements in the rows. Do team, tournament, player = (word.strip() for word in line.split(","))
Your code works fine after you make these changes
I strongly suggest you use the csv.reader class to read your CSV file instead of manually splitting the line by commas.
Also, since python's containers (lists and dictionaries) hold references to their contents, you can just add the container once and then modify it using mydict["key"] = value or mylist.append(), and these changes will be reflected in parent containers too. Because of this behavior, you don't need to repeatedly assign these things in the loop like you do with d_team[tournament] = d_tournament
import csv

allteams = {}
hasHeader = True
# newline="" lets the csv module do its own line-ending handling.
with open("input.csv", newline="") as f:
    csvreader = csv.reader(f)
    if hasHeader:
        next(csvreader, None)  # Consume the header row; tolerate an empty file
    for row in csvreader:
        if not row:
            continue  # csv.reader yields [] for blank lines
        # Fields are kept exactly as written in the file (a leading space in
        # " spring tournament" makes it a distinct key); pass
        # skipinitialspace=True to csv.reader to normalise that.
        team_name, tournament_name, player_name = row
        # Get (or create) the dict that holds this team's tournaments
        team = allteams.setdefault(team_name, {})
        # Get (or create) this tournament's dict inside the team's dict
        tournament = team.setdefault(tournament_name, {"players": []})
        # Add this player's information to the tournament's "players" list
        tournament["players"].append({"name": player_name})
# Add all teams' data to the "data" key in our result dict
result = {"data": allteams}
print(result)
Which gives us what we want (prettified output):
{
'data': {
'Team 1': {
'spring tournament': {
'players': [
{ 'name': 'Rebbecca Cardone' },
{ 'name': 'Salina Youngblood' },
{ 'name': 'Catarina Corbell' }
]
},
'summer tournament': {
'players': [
{ 'name': 'Cara Mejias' },
{ 'name': 'Catarina Corbell' }
]
}
},
'Team 10': {
' spring tournament': {
'players': [
{ 'name': 'Jessi Ravelo' }
]
}
}
}
}

The example dictionary you describe is not possible (if you want multiple dictionaries under the key "Team 1", put them in a list), but this snippet:
if __name__ == '__main__':
    with open("yourfile.csv") as file:
        all_lines = file.readlines()

    teams = {}
    # The first line is the "team,tournament,player" header, so start after it.
    for raw_line in all_lines[1:]:
        team, tournament_type, player_name = raw_line.strip().split(",")
        tournaments_of_team = teams.setdefault(team, {})  # e.g. "Team 1"
        tournament_entry = tournaments_of_team.setdefault(
            tournament_type, {'players': []}  # e.g. "spring tournament"
        )
        tournament_entry["players"].append({'name': player_name})

    your_dict = {'data': teams}
For this example yourfile.csv:
team,tournament,player
Team 1,spring tournament,Rebbecca Cardone
Team 1,spring tournament,Salina Youngblood
Team 2,spring tournament,Catarina Corbell
Team 1,summer tournament,Cara Mejias
Team 2,summer tournament,Catarina Corbell
Gives the following:
{
"data": {
"Team 1": {
"spring tournament": {
"players": [
{
"name": "Rebbecca Cardone"
},
{
"name": "Salina Youngblood"
}
]
},
"summer tournament": {
"players": [
{
"name": "Cara Mejias"
}
]
}
},
"Team 2": {
"spring tournament": {
"players": [
{
"name": "Catarina Corbell"
}
]
},
"summer tournament": {
"players": [
{
"name": "Catarina Corbell"
}
]
}
}
}
}
Process finished with exit code 0

Maybe I overlooked something, but couldn't you use:
df.groupby(['team','tournament'])['player'].apply(list).reset_index().to_json(orient='records')

You might approach it this way:
from collections import defaultdict
import csv
from pprint import pprint

# team name -> {tournament name -> [player, ...]}
d = defaultdict(dict)
with open('f00.txt', 'r') as f:
    reader = csv.DictReader(f)
    for row in reader:
        tournaments = d[row['team']]
        players = tournaments.setdefault(row['tournament'], [])
        players.append(row['player'])
pprint(dict(d))
Prints:
{'Team 1': {'spring tournament': ['Rebbecca Cardone',
'Salina Youngblood',
'Catarina Corbell'],
'summer tournament': ['Cara Mejias', 'Catarina Corbell']},
'Team 10': {' spring tournament': ['Jessi Ravelo']}}

Using .values() with list of dictionaries?

I'm comparing json files between two different API endpoints to see which json records need an update, which need a create and what needs a delete. So, by comparing the two json files, I want to end up with three json files, one for each operation.
The json at both endpoints is structured like this (but they use different keys for same sets of values; different problem):
{
"records": [{
"id": "id-value-here",
"c": {
"d": "eee"
},
"f": {
"l": "last",
"f": "first"
},
"g": ["100", "89", "9831", "09112", "800"]
}, {
…
}]
}
So the json is represented as a list of dictionaries (with further nested lists and dictionaries).
If a given json endpoint (j1) id value ("id":) exists in the other endpoint json (j2), then that record should be added to j_update.
So far I have something like this, but I can see that .values() doesn't work because it's trying to operate on the list instead of on all the listed dictionaries(?):
j_update = {r for r in j1['records'] if r['id'] in
j2.values()}
This doesn't return an error, but it creates an empty set using test json files.
Seems like this should be simple, but tripping over the nesting I think of dictionaries in a list representing the json. Do I need to flatten j2, or is there a simpler dictionary method python has to achieve this?
====edit j1 and j2====
have same structure, use different keys; toy data
j1
{
"records": [{
"field_5": 2329309841,
"field_12": {
"email": "cmix#etest.com"
},
"field_20": {
"last": "Mixalona",
"first": "Clara"
},
"field_28": ["9002329309999", "9002329309112"],
"field_44": ["1002329309832"]
}, {
"field_5": 2329309831,
"field_12": {
"email": "mherbitz345#test.com"
},
"field_20": {
"last": "Herbitz",
"first": "Michael"
},
"field_28": ["9002329309831", "9002329309112", "8002329309999"],
"field_44": ["1002329309832"]
}, {
"field_5": 2329309855,
"field_12": {
"email": "nkatamaran#test.com"
},
"field_20": {
"first": "Noriss",
"last": "Katamaran"
},
"field_28": ["9002329309111", "8002329309112"],
"field_44": ["1002329309877"]
}]
}
j2
{
"records": [{
"id": 2329309831,
"email": {
"email": "mherbitz345#test.com"
},
"name_primary": {
"last": "Herbitz",
"first": "Michael"
},
"assign": ["8003329309831", "8007329309789"],
"hr_id": ["1002329309877"]
}, {
"id": 2329309884,
"email": {
"email": "yinleeshu#test.com"
},
"name_primary": {
"last": "Lee Shu",
"first": "Yin"
},
"assign": ["8002329309111", "9003329309831", "9002329309111", "8002329309999", "8002329309112"],
"hr_id": ["1002329309832"]
}, {
"id": 23293098338,
"email": {
"email": "amlouis#test.com"
},
"name_primary": {
"last": "Maxwell Louis",
"first": "Albert"
},
"assign": ["8002329309111", "8007329309789", "9003329309831", "8002329309999", "8002329309112"],
"hr_id": ["1002329309877"]
}]
}

If you read the json it will output a dict. You are looking for a particular key in the list of the values.
if 'records' in j2:
r = j2['records'][0].get('id', []) # defaults if id does not exist
A recursive search would be cleaner, but I don't know how your data is organized, so I can't quickly come up with a complete solution.
To give an idea for recursive search consider this example
def resursiveSearch(dictionary, target):
    """Return the value stored under key *target* anywhere in nested dicts.

    The top level is checked first, then each dict-valued entry is searched
    depth-first in iteration order. Returns None when the key is not found,
    so a stored value of None cannot be told apart from a miss.
    """
    if target in dictionary:
        return dictionary[target]
    for value in dictionary.values():
        if isinstance(value, dict):
            # Keep the result in its own name: overwriting `target` would make
            # every later branch search for the previous (None) result.
            found = resursiveSearch(value, target)
            if found is not None:  # falsy hits such as 0 or "" still count
                return found
    return None
a = {'test' : 'b', 'test1' : dict(x = dict(z = 3), y = 2)}
print(resursiveSearch(a, 'z'))

You tried:
j_update = {r for r in j1['records'] if r['id'] in j2.values()}
Aside from the r['id'] vs. r['field_5'] key problem, you have:
>>> list(j2.values())
[[{'id': 2329309831, ...}, ...]]
The ids are buried inside a list of dicts, so the test r['id'] in j2.values() always returns False.
The basic solution is fairly simple.
First, create a set of j2 ids:
>>> present_in_j2 = set(record["id"] for record in j2["records"])
Then, rebuild the json structure of j1 but without the j1 field_5 that are not present in j2:
>>> {"records":[record for record in j1["records"] if record["field_5"] in present_in_j2]}
{'records': [{'field_5': 2329309831, 'field_12': {'email': 'mherbitz345#test.com'}, 'field_20': {'last': 'Herbitz', 'first': 'Michael'}, 'field_28': ['9002329309831', '9002329309112', '8002329309999'], 'field_44': ['1002329309832']}]}
It works, but it's not totally satisfying because of the weird keys of j1. Let's try to convert j1 to a more friendly format:
def map_keys(json_value, conversion_table):
    """Return a copy of a JSON-like value with its dictionary keys renamed.

    Lists and dicts are walked depth-first. A key found in
    *conversion_table* is replaced by its mapped name and any other key is
    kept as is. Values that are neither list nor dict are returned unchanged.
    """
    if isinstance(json_value, list):
        return [map_keys(item, conversion_table) for item in json_value]
    if isinstance(json_value, dict):
        renamed = {}
        for old_key, child in json_value.items():
            new_key = conversion_table.get(old_key, old_key)
            renamed[new_key] = map_keys(child, conversion_table)
        return renamed
    return json_value
The function focuses on dictionary keys: conversion_table.get(k, k) is conversion_table[k] if the key is present in the conversion table, or the key itself otherwise.
>>> j1toj2 = {"field_5":"id", "field_12":"email", "field_20":"name_primary", "field_28":"assign", "field_44":"hr_id"}
>>> mapped_j1 = map_keys(j1, j1toj2)
Now, the code is cleaner and the output may be more useful for a PUT:
>>> d1 = {record["id"]:record for record in mapped_j1["records"]}
>>> present_in_j2 = set(record["id"] for record in j2["records"])
>>> {"records":[record for record in mapped_j1["records"] if record["id"] in present_in_j2]}
{'records': [{'id': 2329309831, 'email': {'email': 'mherbitz345#test.com'}, 'name_primary': {'last': 'Herbitz', 'first': 'Michael'}, 'assign': ['9002329309831', '9002329309112', '8002329309999'], 'hr_id': ['1002329309832']}]}

Create directories tree from list

I have this function:
def Test2(my_path):
    """Describe the directory tree under *my_path* as nested dictionaries.

    Each level is ``{'folders': {name: <sub-level>}, 'sounds': [names]}``,
    where 'sounds' lists the entries of that directory whose name ends in
    '.wav'. Other files are ignored.
    """
    def create_sound_folder_from_path(current_path):
        subfolders = {}
        wav_files = []
        for name in os.listdir(current_path):
            candidate = os.path.join(current_path, name)
            if os.path.isdir(candidate):
                # Directories are described recursively, whatever their name.
                subfolders[name] = create_sound_folder_from_path(candidate)
                continue
            if name.endswith('.wav'):
                wav_files.append(name)
        return {'folders': subfolders, 'sounds': wav_files}

    return create_sound_folder_from_path(my_path)
and it returns an dictionary with folders and files like this:
{
'folders':
{'sounds': {'folders': {}, 'sounds': ['song1.wav', 'song2.wav']},
'machine': {'folders': {}, 'sounds': ['song5.wav']}
},
'sounds': [] # no sounds at root
}
My Input list:
['sounds/sound1.wav', 'sounds/sound2.wav', 'sounds/new/sound2.wav', 'sounds/old/sound2.wav', 'machine/mach.wav']
I just want the same dictionary but from a path list. Is it possible?

Here is my contribution. This code uses recursive calls to collect each sub-folder's information; it could certainly be rewritten to avoid the main loop.
import json
def get_subdirectories(path_dict, path_list):
    """Insert one path, given as a list of components, into *path_dict*.

    All components but the last are folder names; missing folders are
    created as ``{'sounds': [], 'folders': {}}``. The last component is
    appended to the 'sounds' list of the innermost folder. An empty
    *path_list* is a no-op.

    *path_dict* is updated in place; *path_list* is left untouched.
    """
    if not path_list:
        return
    *folders, filename = path_list
    node = path_dict
    for folder in folders:
        node = node['folders'].setdefault(folder, {'sounds': [], 'folders': {}})
    node['sounds'].append(filename)
def main():
    """Build the folder/sound tree for a sample path list and print it as JSON."""
    directories = ['sounds/sound1.wav', 'sounds/sound2.wav',
                   'sounds/new/sound2.wav', 'sounds/old/sound2.wav',
                   'machine/mach.wav']
    output_dict = {'sounds': [], 'folders': {}}
    for d in directories:
        # Hand over the whole split path: a bare file name (no '/') then lands
        # in the root 'sounds' list instead of creating an empty folder.
        get_subdirectories(output_dict, d.split('/'))
    print(
        json.dumps(
            output_dict,
            sort_keys=True,
            indent=4,
            separators=(',', ': ')))
This is the result:
{
"folders": {
"machine": {
"folders": {},
"sounds": [
"mach.wav"
]
},
"sounds": {
"folders": {
"new": {
"folders": {},
"sounds": [
"sound2.wav"
]
},
"old": {
"folders": {},
"sounds": [
"sound2.wav"
]
}
},
"sounds": [
"sound1.wav",
"sound2.wav"
]
}
},
"sounds": []
}

Have you tried using expanduser from os.path.expanduser? os.walk will work with this if put into a list. For example, to iterate through my music and documents folder, I did this:
from os import walk
from os.path import expanduser
from os.path import join

directories = [expanduser('~/Music'), expanduser('~/Documents')]
# Walk each listed directory in turn and print every file path found below it.
for directory in directories:
    for subdir, dirs, files in walk(directory):
        for file in files:
            print(join(subdir, file))
As you mentioned os.walk explicitly, I'm guessing you do know how to parse out the data as you need. If not, I can expand on how to do that.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Python: Compare & Count Dictionary Structures Across Thousands of Dictionaries/XMLs/JSON - python

Related

retrieving data from json with python

Why doesn't pymongo MongoDB return an exact value in find_one()?

Create a list of nested dictionaries from a single csv file in python

Using .values() with list of dictionaries?

Create directories tree from list

Categories

Resources