import plazy
txt_filter = lambda x: x.endswith('.txt')
file_paths: list = plazy.list_files(root='/data/', filter_func=txt_filter, is_include_root=True)
print(file_paths)
Output:
["/data/subdir1/subdir1_1/file1.txt","/data/subdir2/subdir2_1/file2.txt", "/data/subdir2/subdir2_1/file1.txt", "/data/subdir3/subdir3_1/subdir3_2/file1.txt"]
How can I convert these paths into a nested dictionary? I want it to look like this:
{
    "data": {
        "subdir1": {"subdir1_1": ["file1.txt"]},
        "subdir2": {"subdir2_1": ["file1.txt", "file2.txt"]},
        "subdir3": {"subdir3_1": {"subdir3_2": ["file1.txt"]}}
    }
}
I think one way to address this is to use plazy.list_files with limited depth first (to get the top-level dirs) and recurse manually, rather than letting it fetch the whole tree.
Some pseudocode to illustrate...

topdirs = getdirs(root)
for dir in topdirs:
    children = getdirs(dir)
    leaves = gettxtfiles(dir)
As your program recurses into the structure, it builds the map the way you want it.
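Alternatively, you can skip the second walk entirely and fold the flat path list plazy already returned into the nested dict. A minimal sketch (it assumes every path ends in a file name and that leaf directories should map to file lists):

def paths_to_tree(paths):
    tree = {}
    for p in paths:
        parts = p.strip('/').split('/')
        node = tree
        for part in parts[:-2]:                 # intermediate directories become dicts
            node = node.setdefault(part, {})
        node.setdefault(parts[-2], []).append(parts[-1])  # leaf dir maps to its file list
    return tree

paths_to_tree(file_paths)
# {'data': {'subdir1': {'subdir1_1': ['file1.txt']}, ...}}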
I took care of it without using plazy.
import os
import pprint

path = '/data/'

def f(path):
    d, l = {}, []                      # subdirectory map and file list for this level
    for name in os.listdir(path):
        full = os.path.join(path, name)
        if os.path.isdir(full):
            d[name] = f(full)          # recurse into subdirectories
        else:
            l.append(name)
    return d or l                      # a directory holding only files maps to its file list

pprint.pprint({os.path.basename(os.path.normpath(path)): f(path)})
Output
{'data': {'subdir1': {'subdir1_1': ['file1.txt']},
          'subdir2': {'subdir2_1': ['file1.txt', 'file2.txt']},
          'subdir3': {'subdir3_1': {'subdir3_2': ['file1.txt']}}}}
Problem Statement:
I have around 500 ZIP files with lots of XMLs. I am able to convert them to JSON and parse them into Parquet files, as in the example below for one nested JSON file.
I am not able to process multiple files with Spark either.
I have the code below, which flattens a whole JSON document into a pandas data frame, but I now have to run it over 150,000 files. When the JSON is very big it takes around 2 minutes to flatten the whole document. Also, if I run it with Spark over an RDD of multiple files, it fails with either an OOM or a struct error.
Am I doing something wrong Spark-wise?
import io
import zipfile
import xmltodict
import pandas as pd
def parser(master_tree):
    flatten_tree_node = []

    def _process_leaves(tree: dict, prefix: str = "node", tree_node: dict = dict(), update: bool = True):
        is_nested = False
        if isinstance(tree, dict):
            for k in tree.keys():
                if type(tree[k]) == str:            # scalar value: record it as a column
                    colName = prefix + "_" + k
                    tree_node[colName] = tree[k]
                elif type(tree[k]) == dict:         # nested dict: recurse with a longer prefix
                    prefix += "_" + k
                    leave = tree[k]
                    _process_leaves(leave, prefix=prefix, tree_node=tree_node, update=False)
            for k in tree.keys():
                if type(tree[k]) == list:           # nested list: emit one row copy per element
                    is_nested = True
                    prefix += "_" + k
                    for leave in tree[k]:
                        _process_leaves(leave, prefix=prefix, tree_node=tree_node.copy())
        if not is_nested and update:
            flatten_tree_node.append(tree_node)

    _process_leaves(master_tree)
    df = pd.DataFrame(flatten_tree_node)
    df.columns = df.columns.str.replace("#", "_")
    return df
def extractor(file_name, file):
    data = xmltodict.parse(file)      # `file` is the raw bytes read from the zip entry
    flatten_data = parser(data)
    return (file_name, flatten_data)
def extract_files(x):
    in_memory_data = io.BytesIO(x[1])
    file_obj = zipfile.ZipFile(in_memory_data, "r")
    return [extractor(file_name, file_obj.open(file_name).read()) for file_name in file_obj.namelist()]
zip_rdd = spark.read.format('binaryFile').load('/home/me/sample.zip').select('path','content').rdd
Fails here at the time of collection:
collected_data = zip_rdd.map(extract_files).collect()
It fails with one of the errors below:
org.apache.spark.api.python.PythonException: 'struct.error: 'i' format requires -2147483648 <= number <= 2147483647'. Full traceback
or
java.lang.OutOfMemoryError
at java.io.ByteArrayOutputStream.hugeCapacity(ByteArrayOutputStream.java:123
Everything works fine, though, when run on a single file.
Is there a way to make it memory- and speed-efficient? An example run of the parser function on a nested JSON document is below:
tree = {
"products":
[
{
"id":"0",
"name": "First",
"emptylist":[],
"properties" :
{
"id" : "",
"name" : ""
}
},
{
"id":"1",
"name": "Second",
"emptylist":[],
"properties":
{
"id" : "23",
"name" : "a useful product",
"features" :
[
{
"name":"Features",
"id":"18",
"features":
[
{
"id":"1001",
"name":"Colour",
"value":"Black"
},
{
"id":"2093",
"name":"Material",
"value":"Plastic"
}
]
},
{
"name":"Sizes",
"id":"34",
"features":
[
{
"id":"4736",
"name":"Length",
"value":"56"
},
{
"id":"8745",
"name":"Width",
"value":"76"
}
]
}
]
}
},
{
"id":"2",
"name": "Third",
"properties" :
{
"id" : "876",
"name" : "another one",
"features" :
[
{
"name":"Box",
"id":"937",
"features":
[
{
"id":"3758",
"name":"Amount",
"value":"1"
},
{
"id":"2222",
"name":"Packaging",
"value":"Blister"
}
]
},
{
"name":"Features",
"id":"8473",
"features":
[
{
"id":"9372",
"name":"Colour",
"value":"White"
},
{
"id":"9375",
"name":"Position",
"value":"A"
},
{
"id":"2654",
"name":"Amount",
"value":"6"
}
]
}
]
}
}
]
}
print(parser(tree))
node_products_id node_products_name ... node_products_properties_features_features_name node_products_properties_features_features_value
0 1 Second ... Colour Black
1 1 Second ... Material Plastic
2 1 Second ... Length 56
3 1 Second ... Width 76
4 2 Third ... Amount 1
5 2 Third ... Packaging Blister
6 2 Third ... Colour White
7 2 Third ... Position A
8 2 Third ... Amount 6
9 2 Third ... NaN NaN
[10 rows x 9 columns]
Do not collect this data; it will likely never fit in memory, since you are trying to pull all of it into the driver.
You can just save it to files directly.
collected_data = zip_rdd.map(extract_files).toDF("column","names","go","here")
collected_data.write.parquet("/path/to/folder")
I do not have Spark 3.2, but I'm aware of the features it possesses, and in this case they will make your life easy. unionByName is a newer feature that will let you magically join schemas.
collected_data = spark.createDataFrame( data = [], schema = [] )
zip_array = spark.read.format('binaryFile').load('/home/me/sample.zip').select('path').collect() # this will likely fit in driver memory so it's OK to call. After all it's just a list of file paths.
for my_file in zip_array:
collected_data = collected_data.unionByName( spark.createDataFrame(extract_files(my_file)), allowMissingColumns=True )
collected_data.write.parquet("/path/to/folder")
For better efficiency you want to use mapPartitions. There are a couple of reasons why, but this actually goes back to the map/reduce era: you want to create an iterator, as this can work at lower levels and can be optimized and pipelined better (hence the use of yield).
mapPartitions code executes inside an executor and can only contain plain Python code. No Spark code is allowed, as you don't have access to the SparkContext in an executor. It sometimes requires imports to be done inside the function itself, as the scope is local, not global.
If you are looking to save more memory, you might want to consider an alternative to xmltodict.parse(d) and rewrite reformat accordingly. You could use a library that you initialize once per partition and re-use for the entire set of rows in the partition. That would be more efficient than the static call to xmltodict.parse(d), which spends memory creating a struct that is immediately thrown away by the garbage collector as it goes out of scope. (A quick search lists several alternatives you can review to determine which best fits your needs.)
zip_df = spark.read.format('binaryFile').load('/home/me/sample.zip').select('path', 'content')

def reformat(partitionData):
    for row in partitionData:
        in_memory_data = io.BytesIO(row[1])           # row[1] is the binary file content
        file_obj = zipfile.ZipFile(in_memory_data, "r")
        for file_name in file_obj.namelist():
            yield extractor(file_name, file_obj.open(file_name).read())

collected_data = zip_df.rdd.mapPartitions(reformat).toDF(["file_name", "flattened_data"])
collected_data.write.parquet("/path/to/folder")
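As a sketch of that once-per-partition idea, here is one possibility using lxml, named purely as an example of a library whose parser object can be built once and reused across rows (swapping it in would also mean rewriting the downstream flattening, which this sketch does not attempt):

def reformat(partitionData):
    # Imports live inside the function: executor scope is local, not global.
    import io
    import zipfile
    from lxml import etree

    parser = etree.XMLParser()                        # built once per partition...
    for row in partitionData:
        with zipfile.ZipFile(io.BytesIO(row[1]), "r") as zf:
            for file_name in zf.namelist():
                tree = etree.fromstring(zf.read(file_name), parser)   # ...reused for every file
                yield (file_name, tree)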
How to create folders named after all the dictionaries in a nested dictionary?
The code should iterate through the dictionary and create the directory structure described by the nested dictionary, keeping the full hierarchy.
dic = {
"root": {
'0_name': {
"0_name_a": {
"0_name_a_a": {
},
"0_name_a_b": {
"file": "file"
}
},
"0_name_b": {
}
},
"1_name": {
},
"2_name": {
},
"3_name": {
"3_name": {
},
}
}
}
Should make directories like:
root/0_name
root/0_name/0_name_a
root/0_name/0_name_a/0_name_a_a
root/0_name/0_name_a/0_name_a_b
root/0_name/0_name_b
root/1_name
root/2_name
root/3_name/3_name (the same name)
The script needs to determine whether a value is final, create a folder with that path, then remove that value from the dictionary and start over. It also needs somehow to recognize the "file" type and skip it. I couldn't work out a recursive way to iterate over all the values and add them to a path.
My approach (absolutely not working, just pasted it in to show something):
def rec(dic):
path = []
def nova(dic):
for k, v in dic.copy().items():
if isinstance(v, dict):
path.append(k)
if not v:
print(os.path.join(*path))
path.clear()
dic.pop(k)
nova(v)
if path == []:
nova(dic)
You're going to need a recursive program, and the value of "path" needs to be part of that recursion.
This isn't exactly what you want, but it's close.
def handle_problem(dic):
def one_directory(dic, path):
for name, info in dic.items():
next_path = path + "/" + name
if isinstance(info, dict):
print("Creating " + next_path) # actually use mkdir here!
one_directory(info, next_path)
one_directory(dic, '')
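If you want it to actually create the folders rather than print them, a minimal variant might look like this (a sketch, assuming os.makedirs is acceptable and that non-dict values such as "file" should be skipped):

import os

def make_dirs(dic, path="."):
    for name, info in dic.items():
        if isinstance(info, dict):                   # only dict values become folders
            next_path = os.path.join(path, name)
            os.makedirs(next_path, exist_ok=True)    # creates intermediate dirs, idempotent
            make_dirs(info, next_path)

make_dirs(dic)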
I would like to write a function that mimics a folder structure in Python. Given a list of strings I would like to create a tree of folders & subfolders. For example:
['beers', 'wines', 'beers/ipa/stone', 'wines/red/cabernet']
Would output a dictionary with the following:
{
'beers': {
'ipa': {
'stone': {}
}
},
'wines': {
'red': {
'cabernet': {}
}
}
}
x = ['beers', 'wines', 'beers/ipa/stone', 'wines/red/cabernet']
def add_items(d, items):
if len(items) == 1:
if items[0] in d:
return
else:
d[items[0]] = dict()
else:
if items[0] not in d:
d[items[0]] = dict()
add_items(d[items[0]], items[1:])
out = dict()
for item in x:
items = item.split("/")
add_items(out, items)
print(out)
{'wines': {'red': {'cabernet': {}}}, 'beers': {'ipa': {'stone': {}}}}
Just go through your list and add the names from each string, splitting on the slash character. You can use setdefault() to ensure that the next-level dictionary exists (i.e. entries are auto-created as you go):
strings = ['beers', 'wines', 'beers/ipa/stone', 'wines/red/cabernet']
directory = dict()
for path in strings:
d = directory
for name in path.split("/"):
d = d.setdefault(name,dict())
print(directory)
{'beers':
{ 'ipa': {'stone': {}} },
'wines':
{'red': {'cabernet': {}} }
}
With Python 3.7+, dicts preserve insertion order, so the items in each dict will correspond to their original relative order in the list of strings.
If you want the items in each dictionary to appear in alphanumerical order instead, you can change the loop like this:
directory = dict()
for path in sorted(s.split("/") for s in strings):
d = directory
for name in path:
d = d.setdefault(name,dict())
If you like recursive functions, here's a simple one that does the same thing (but less efficiently):
def makeTree(strings, separator="/", tree=None):
tree = tree or dict()
for name,*subs in (s.split(separator,1) for s in strings):
tree[name] = makeTree(subs, separator, tree.get(name))
return tree
d = makeTree(strings)
print(d)
{'beers':
{ 'ipa': {'stone': {}} },
'wines':
{'red': {'cabernet': {}} }
}
I have this function:
import os

def Test2(my_path):
    def create_sound_folder_from_path(current_path):
        result = {
            'folders': {},
            'sounds': []
        }
        for entry in os.listdir(current_path):
            full_path = os.path.join(current_path, entry)
            if os.path.isdir(full_path):
                result['folders'][entry] = create_sound_folder_from_path(full_path)
            elif entry.endswith('.wav'):
                result['sounds'].append(entry)
        return result
    return create_sound_folder_from_path(my_path)
and it returns a dictionary with folders and files like this:
{
'folders':
{'sounds': {'folders': {}, 'sounds': ['song1.wav', 'song2.wav']},
'machine': {'folders': {}, 'sounds': ['song5.wav']}
},
'sounds': [] # no sounds at root
}
My Input list:
['sounds/sound1.wav', 'sounds/sound2.wav', 'sounds/new/sound2.wav', 'sounds/old/sound2.wav', 'machine/mach.wav']
I just want the same dictionary but from a path list. Is it possible?
Here is my contribution. This code uses recursive calls to build the sub-folder information; it could surely be rewritten to avoid the main loop.
import json
def get_subdirectories(path_dict, path_list):
    if len(path_list) == 1:
        path_dict['sounds'].append(path_list[0])     # a single remaining part is a file name
    elif len(path_list) > 1:
        key = path_list.pop(0)
        if key not in path_dict['folders']:
            path_dict['folders'][key] = {'sounds': [], 'folders': {}}
        get_subdirectories(path_dict['folders'][key], path_list)
def main():
directories = ['sounds/sound1.wav', 'sounds/sound2.wav',
'sounds/new/sound2.wav', 'sounds/old/sound2.wav',
'machine/mach.wav']
output_dict = {'sounds': [], 'folders': {}}
for d in directories:
root = d.split('/')[0]
if root not in output_dict['folders'].keys():
output_dict['folders'][root] = {'sounds': [], 'folders': {}}
get_subdirectories(output_dict['folders'][root], d.split('/')[1:])
print(
json.dumps(
output_dict,
sort_keys=True,
indent=4,
separators=(',', ': ')))
This is the result:
{
"folders": {
"machine": {
"folders": {},
"sounds": [
"mach.wav"
]
},
"sounds": {
"folders": {
"new": {
"folders": {},
"sounds": [
"sound2.wav"
]
},
"old": {
"folders": {},
"sounds": [
"sound2.wav"
]
}
},
"sounds": [
"sound1.wav",
"sound2.wav"
]
}
},
"sounds": []
}
Have you tried using expanduser from os.path? os.walk will work with this if put into a list. For example, to iterate through my Music and Documents folders, I did this:
from os import walk
from os.path import expanduser, join

directories = [expanduser('~/Music'), expanduser('~/Documents')]
for item in directories:
    for subdir, dirs, files in walk(item):
        for file in files:
            print(join(subdir, file))
As you mentioned os.walk explicitly, I'm guessing you do know how to parse out the data as you need. If not, I can expand on how to do that.
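If it helps, here is a minimal sketch of folding the walked tree into the folders/sounds shape from the question (it assumes the same '.wav' filtering as your Test2 function):

import os

def tree_from_walk(root):
    result = {'folders': {}, 'sounds': []}
    for subdir, dirs, files in os.walk(root):
        node = result
        rel = os.path.relpath(subdir, root)
        if rel != '.':                               # descend to (or create) this subdir's node
            for part in rel.split(os.sep):
                node = node['folders'].setdefault(part, {'folders': {}, 'sounds': []})
        node['sounds'].extend(f for f in files if f.endswith('.wav'))
    return result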
I had a problem converting a recursively nested dictionary into flat path strings.
I have a routing map such as the following:
urls = {
'/' : 'BaseController.hello',
'/api' : {
'/auth' : {
'/me' : 'ApiController.hello',
'/login' : {
'/guest' : 'ApiController.guest_login',
'/member': 'ApiController.member_login'
}
}
}
}
What I need to do is generate a dictionary from that, like the following:
url_map = {
'/' : 'BaseController.hello',
'/api/auth/me' : 'ApiController.hello',
'/api/auth/login/guest' : 'ApiController.guest_login',
'/api/auth/login/member': 'ApiController.member_login',
}
This feature is called route grouping, but I haven't been able to write a function to generate it. Any ideas?
You can do it recursively like this:
def flatten(current_dict, current_key, result_dict):
    # For every key in the dictionary
    for key in current_dict:
        # If the value is of type `dict`, then recurse with the value
        if isinstance(current_dict[key], dict):
            flatten(current_dict[key], current_key + key, result_dict)
        # Otherwise, add the element to the result
        else:
            result_dict[current_key + key] = current_dict[key]
    return result_dict

print(flatten(urls, "", {}))
Output
{
'/api/auth/me': 'ApiController.hello',
'/api/auth/login/guest': 'ApiController.guest_login',
'/': 'BaseController.hello',
'/api/auth/login/member': 'ApiController.member_login'
}