Problem Statement:
I have around 500 ZIP files, each containing many XML files. I can convert the XML to JSON and write it out as Parquet, as in the example below for one nested JSON file, but I am not able to process multiple files with Spark.
The code below flattens a whole JSON document into a pandas DataFrame, but now I have to run it over 150,000 files. When a JSON document is very big, it takes around 2 minutes to flatten. And if I run it with Spark over an RDD of multiple files, it fails with either an OOM or a struct error.
Am I doing something wrong Spark-wise?
import io
import zipfile

import xmltodict
import pandas as pd

def parser(master_tree):
    flatten_tree_node = []

    def _process_leaves(tree: dict, prefix: str = "node", tree_node: dict = dict(), update: bool = True):
        is_nested = False
        if isinstance(tree, dict):
            for k in tree.keys():
                if type(tree[k]) == str:
                    colName = prefix + "_" + k
                    tree_node[colName] = tree[k]
                elif type(tree[k]) == dict:
                    prefix += "_" + k
                    leave = tree[k]
                    _process_leaves(leave, prefix=prefix, tree_node=tree_node, update=False)
            for k in tree.keys():
                if type(tree[k]) == list:
                    is_nested = True
                    prefix += "_" + k
                    for leave in tree[k]:
                        _process_leaves(leave, prefix=prefix, tree_node=tree_node.copy())
        if not is_nested and update:
            flatten_tree_node.append(tree_node)

    _process_leaves(master_tree)
    df = pd.DataFrame(flatten_tree_node)
    df.columns = df.columns.str.replace("#", "_")
    return df

def extractor(file_name, file):
    # xmltodict.parse accepts the raw bytes directly; no need to decode and re-encode
    data = xmltodict.parse(file)
    flatten_data = parser(data)
    return (file_name, flatten_data)

def extract_files(x):
    in_memory_data = io.BytesIO(x[1])
    file_obj = zipfile.ZipFile(in_memory_data, "r")
    return [extractor(file_name, file_obj.open(file_name).read()) for file_name in file_obj.namelist()]
zip_rdd = spark.read.format('binaryFile').load('/home/me/sample.zip').select('path','content').rdd
It fails here, at the time of collection:
collected_data = zip_rdd.map(extract_files).collect()
with one of the errors below:
org.apache.spark.api.python.PythonException: 'struct.error: 'i' format requires -2147483648 <= number <= 2147483647' (full traceback omitted)
or
java.lang.OutOfMemoryError
at java.io.ByteArrayOutputStream.hugeCapacity(ByteArrayOutputStream.java:123)
Everything works fine, though, when run on a single file.
Is there a way to make this memory- and speed-efficient?
An example run of the parser function on nested JSON:
tree = {
    "products": [
        {
            "id": "0",
            "name": "First",
            "emptylist": [],
            "properties": {"id": "", "name": ""}
        },
        {
            "id": "1",
            "name": "Second",
            "emptylist": [],
            "properties": {
                "id": "23",
                "name": "a useful product",
                "features": [
                    {
                        "name": "Features",
                        "id": "18",
                        "features": [
                            {"id": "1001", "name": "Colour", "value": "Black"},
                            {"id": "2093", "name": "Material", "value": "Plastic"}
                        ]
                    },
                    {
                        "name": "Sizes",
                        "id": "34",
                        "features": [
                            {"id": "4736", "name": "Length", "value": "56"},
                            {"id": "8745", "name": "Width", "value": "76"}
                        ]
                    }
                ]
            }
        },
        {
            "id": "2",
            "name": "Third",
            "properties": {
                "id": "876",
                "name": "another one",
                "features": [
                    {
                        "name": "Box",
                        "id": "937",
                        "features": [
                            {"id": "3758", "name": "Amount", "value": "1"},
                            {"id": "2222", "name": "Packaging", "value": "Blister"}
                        ]
                    },
                    {
                        "name": "Features",
                        "id": "8473",
                        "features": [
                            {"id": "9372", "name": "Colour", "value": "White"},
                            {"id": "9375", "name": "Position", "value": "A"},
                            {"id": "2654", "name": "Amount", "value": "6"}
                        ]
                    }
                ]
            }
        }
    ]
}
# parser() as defined above
print(parser(tree))
node_products_id node_products_name ... node_products_properties_features_features_name node_products_properties_features_features_value
0 1 Second ... Colour Black
1 1 Second ... Material Plastic
2 1 Second ... Length 56
3 1 Second ... Width 76
4 2 Third ... Amount 1
5 2 Third ... Packaging Blister
6 2 Third ... Colour White
7 2 Third ... Position A
8 2 Third ... Amount 6
9 2 Third ... NaN NaN
[10 rows x 9 columns]
Do not collect this data. It will likely never fit in memory, because you are trying to pull all the data into the driver. You can just save it to a file directly.
collected_data = zip_rdd.map(extract_files).toDF("column","names","go","here")
collected_data.write.parquet("/path/to/folder")
I do not have Spark 3.2, but I'm aware of the features it possesses, and in this case they will make your life easy: unionByName is a new feature that will let you magically merge schemas.
collected_data = spark.createDataFrame(data=[], schema=[])
zip_array = spark.read.format('binaryFile').load('/home/me/sample.zip').select('path').collect()  # this will likely fit in driver memory, so it's OK to call; after all, it's just a list of file paths
for my_file in zip_array:
    collected_data = collected_data.unionByName(spark.createDataFrame(extract_files(my_file)), allowMissingColumns=True)
collected_data.write.parquet("/path/to/folder")
For better efficiency you want to use mapPartitions. There are a couple of reasons why, but this actually goes back to the map/reduce era: you want to create an iterator, as this can work at a lower level and can be optimized and pipelined better (hence the use of yield).
mapPartitions code executes inside an executor and can only contain plain Python code; no Spark code is allowed, as you don't have access to the SparkContext in an executor. It sometimes requires imports to be done in the function itself, as the scope is local, not global.
If you are looking to save more memory, you might want to reconsider xmltodict.parse(d) and rewrite reformat to use a library that you initialize once per partition and reuse for the entire set of rows in the partition. That would be more efficient than the per-row call to xmltodict.parse(d), which uses memory to create a structure that is immediately thrown away by the garbage collector as it goes out of scope. (A search will turn up several alternatives you can review to determine which one best fits your needs.)
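A minimal sketch of that once-per-partition pattern, assuming a hypothetical make_xml_parser() standing in for whatever parser library you settle on:
import io
import zipfile

def reformat(partition_rows):
    parser = make_xml_parser()  # hypothetical: built once per partition, reused for every row
    for row in partition_rows:
        with zipfile.ZipFile(io.BytesIO(row.content)) as zf:
            for name in zf.namelist():
                # parser.parse is a stand-in for your chosen library's parse call
                yield name, parser.parse(zf.read(name))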
Applied to your zip files:
zip_df = spark.read.format('binaryFile').load('/home/me/sample.zip').select('path', 'content')

def reformat(partitionData):
    for row in partitionData:
        in_memory_data = io.BytesIO(row[1])
        file_obj = zipfile.ZipFile(in_memory_data, "r")
        for file_name in file_obj.namelist():
            yield extractor(file_name, file_obj.open(file_name).read())

collected_data = zip_df.rdd.mapPartitions(reformat).toDF(["file_name", "flattened_data"])
collected_data.write.parquet("/path/to/folder")
The list I have:
[
"Mathematics-2 (21SMT-125)",
"Mid-Semester Test-1",
"40",
"23.5",
"Mid-Semester Test-2",
"40",
"34",
"Disruptive Technologies - 2 (21ECH-103)",
"Experiment-1",
"20",
"19",
"Experiment-2",
"20",
"17",
"Experiment-3",
"20",
"18.5",
]
This list of strings is parsed from HTML using bs4.
The format to convert it into:
{
    "Subject": {
        "Mathematics-2 (21SMT-125)": {
            "Mid-Semester Test-1": [40, 23.5],
            "Mid-Semester Test-2": [40, 34]
        },
        "Disruptive Technologies - 2 (21ECH-103)": {
            "Experiment-1": [20, 19],
            "Experiment-2": [20, 17],
            "Experiment-3": [20, 18.5]
        }
    }
}
The problem is that the list you provided is a flat list of items with no indicator of each item's hierarchical position in the desired structure.
One approach you could consider: if the entries that represent a parent object (Mathematics, etc.) are the only entries that contain parentheses, you could iterate over your list and use string matching or a regex to identify each parent, create a top-level object for it, and then add the next two entries as a list in the value of its key/value pair.
This assumes that you'll always have two subsequent values at the child level. If the number of attributes isn't fixed but they're always numeric, you could use a regex to determine whether an entry is numeric, and keep adding items to the value list until you hit another non-numeric entry, which would be treated as the next sibling in the hierarchy. A rough sketch of this idea follows.
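A sketch of that approach, assuming subject names are the only entries containing parentheses and every score parses as a number (scores come out as floats; coerce as needed):
import re

def build_structure(items):
    result = {}
    current_tests = None   # dict of tests for the current subject
    current_scores = None  # list of scores for the current test
    for item in items:
        if "(" in item:                            # subject line
            current_tests = result.setdefault(item, {})
        elif re.fullmatch(r"\d+(\.\d+)?", item):   # numeric score
            current_scores.append(float(item))
        else:                                      # test/experiment name
            current_scores = current_tests.setdefault(item, [])
    return {"Subject": result}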
I would review the approach and check whether the information from bs4 can be parsed in some smarter way: try to do more scraping steps, first to reach the subject, second the test/experiment, third the grades.
If that's not possible and the data returned from bs4 cannot be changed, the only thing you can do is try to determine whether a string is the name of a subject, a test, or a grade/score, and use some while loops. The name of a subject seems to have a special code at the end, which can be distinguished from the name of a test/experiment using a regexp, and a grade/score can always be parsed to a number.
For data exactly like yours (where a string with a ( denotes a top-level entry, and there are always two numbers per entry), you could come up with a state-machine sort of thing like the code below. But as I commented, you really should improve your parsing code instead, since the HTML you're scraping the data from is likely already structured.
def is_float(s):
    try:
        float(s)
        return True
    except ValueError:
        return False

def parse_inp(inp):
    flat_map = {}
    stack = []
    x = 0
    while x < len(inp):
        if "(" in inp[x]:
            stack.clear()
        if is_float(inp[x]) and is_float(inp[x + 1]):
            flat_map[tuple(stack)] = (float(inp[x]), float(inp[x + 1]))
            x += 2
            stack.pop(-1)
            continue
        stack.append(inp[x])
        x += 1
    return flat_map

def nest_flat_map(flat_map):
    root = {}
    for key_path, values_list in flat_map.items():
        dst = root
        for key in key_path[:-1]:
            dst = dst.setdefault(key, {})
        dst[key_path[-1]] = values_list
    return root
inp = [
# ... data from original post
]
nested_map = nest_flat_map(parse_inp(inp))
print(nested_map)
This outputs the expected
{
    "Mathematics-2 (21SMT-125)": {
        "Mid-Semester Test-1": (40.0, 23.5),
        "Mid-Semester Test-2": (40.0, 34.0),
    },
    "Disruptive Technologies - 2 (21ECH-103)": {
        "Experiment-1": (20.0, 19.0),
        "Experiment-2": (20.0, 17.0),
        "Experiment-3": (20.0, 18.5),
    },
}
You can use a fuzzy form of itertools.groupby to find the groups in this list of strings. This assumes that every class ends with the pattern "(classref-section)", and that it is followed by test or homework names each followed by one or more numeric scores.
source_data = [
"Mathematics-2 (21SMT-125)",
"Mid-Semester Test-1",
"40",
"23.5",
"Mid-Semester Test-2",
"40",
"34",
"Disruptive Technologies - 2 (21ECH-103)",
"Experiment-1",
"20",
"19",
"Experiment-2",
"20",
"17",
"Experiment-3",
"20",
"18.5",
]
from collections import defaultdict
import itertools
import json
import re

class_id_pattern = re.compile(r"\([A-Z0-9]+-\d+\)")

def is_class_reference(s):
    return bool(class_id_pattern.match(s.rsplit(" ", 1)[-1]))

def group_by_class(s):
    if is_class_reference(s):
        group_by_class.current_class = s
    return group_by_class.current_class

group_by_class.current_class = ""

def convert_numeric(s):
    try:
        return int(s)
    except ValueError:
        try:
            return float(s)
        except ValueError:
            return None

def is_score(s):
    return convert_numeric(s) is not None

def is_test(s):
    return not is_score(s)

def group_by_test(s):
    if is_test(s):
        group_by_test.current_test = s
    return group_by_test.current_test

group_by_test.current_test = ""

accum = defaultdict(lambda: defaultdict(list))
for class_name, class_name_and_tests in itertools.groupby(source_data, key=group_by_class):
    class_name, *tests = class_name_and_tests
    for test_name, test_name_and_scores in itertools.groupby(tests, key=group_by_test):
        test_name, *scores = test_name_and_scores
        accum[class_name][test_name].extend(convert_numeric(s) for s in scores)

print(json.dumps(accum, indent=4))
Prints:
{
    "Mathematics-2 (21SMT-125)": {
        "Mid-Semester Test-1": [
            40,
            23.5
        ],
        "Mid-Semester Test-2": [
            40,
            34
        ]
    },
    "Disruptive Technologies - 2 (21ECH-103)": {
        "Experiment-1": [
            20,
            19
        ],
        "Experiment-2": [
            20,
            17
        ],
        "Experiment-3": [
            20,
            18.5
        ]
    }
}
Read more about fuzzy groupby in my blog post: https://thingspython.wordpress.com/2020/11/11/fuzzy-groupby-unusual-restaurant-part-ii/
I am trying to create a complex object based on metadata I have: an array of attribute paths which I iterate over to build up a dict. For example, below is the array:
[
"itemUniqueId",
"itemDescription",
"manufacturerInfo[0].manufacturer.value",
"manufacturerInfo[0].manufacturerPartNumber",
"attributes.noun.value",
"attributes.modifier.value",
"attributes.entityAttributes[0].attributeName",
"attributes.entityAttributes[0].attributeValue",
"attributes.entityAttributes[0].attributeUOM",
"attributes.entityAttributes[1].attributeName",
"attributes.entityAttributes[1].attributeValue",
"attributes.entityAttributes[1].attributeUOM",
]
This array should give an output as below:
{
    "itemUniqueId": "",
    "itemDescription": "",
    "manufacturerInfo": [
        {
            "manufacturer": {"value": ""},
            "manufacturerPartNumber": ""
        }
    ],
    "attributes": {
        "noun": {"value": ""},
        "modifier": {"value": ""},
        "entityAttributes": [
            {
                "attributeName": "",
                "attributeValue": "",
                "attributeUOM": ""
            },
            {
                "attributeName": "",
                "attributeValue": "",
                "attributeUOM": ""
            }
        ]
    }
}
I have written this logic but am unable to get the desired output. It should work for both objects and arrays, given the metadata.
source_json = [
"itemUniqueId",
"itemDescription",
"manufacturerInfo[0].manufacturer.value",
"manufacturerInfo[0].manufacturerPartNumber",
"attributes.noun.value",
"attributes.modifier.value",
"attributes.entityAttributes[0].attributeName",
"attributes.entityAttributes[0].attributeValue",
"attributes.entityAttributes[0].attributeUOM",
"attributes.entityAttributes[1].attributeName",
"attributes.entityAttributes[1].attributeValue",
"attributes.entityAttributes[1].attributeUOM",
]
for row in source_json:
    propertyNames = row.split('.')
    temp = ''
    parent = {}
    parentArr = []
    parentObj = {}
    # if len(propertyNames) > 1:
    arrLength = len(propertyNames)
    for i, (current) in enumerate(zip(propertyNames)):
        if i == 0:
            if '[' in current:
                parent[current] = parentArr
            else:
                parent[current] = parentObj
            temp = current
        if i > 0 and i < arrLength - 1:
            if '[' in current:
                parent[current] = parentArr
            else:
                parent[current] = parentObj
            temp = current
        if i == arrLength - 1:
            if '[' in current:
                parent[current] = parentArr
            else:
                parent[current] = parentObj
            temp = current
        # temp[prev][current] = ""
        # finalMapping[target] = target
print(parent)
There's a similar question at Convert Dot notation string into nested Python object with Dictionaries and arrays where the accepted answer works for this question, but has unused code paths (e.g. isInArray) and caters to unconventional conversions expected by that question:
❓ "arrOne[0]": "1,2,3" → "arrOne": ["1", "2", "3"] instead of
✅ "arrOne[0]": "1,2,3" → "arrOne": ["1,2,3"] or
✅ "arrOne[0]": "1", "arrOne[1]": "2", "arrOne[2]": "3" → "arrOne": ["1", "2", "3"]
Here's a refined implementation of the branch function:
import re

def branch(tree, path, value):
    key = path[0]
    array_index_match = re.search(r'\[([0-9]+)\]', key)
    if array_index_match:
        # Get the array index, and remove the match from the key
        array_index = int(array_index_match[1])
        key = key.replace(array_index_match[0], '')
        # Prepare the array at the key
        if key not in tree:
            tree[key] = []
        # Prepare the object at the array index
        if array_index == len(tree[key]):
            tree[key].append({})
        # Replace the object at the array index
        tree[key][array_index] = value if len(path) == 1 else branch(tree[key][array_index], path[1:], value)
    else:
        # Prepare the object at the key
        if key not in tree:
            tree[key] = {}
        # Replace the object at the key
        tree[key] = value if len(path) == 1 else branch(tree[key], path[1:], value)
    return tree
Usage:
VALUE = ''

def create_dict(attributes):
    d = {}
    for path_str in attributes:
        branch(d, path_str.split('.'), VALUE)
    return d
source_json = [
"itemUniqueId",
"itemDescription",
"manufacturerInfo[0].manufacturer.value",
"manufacturerInfo[0].manufacturerPartNumber",
"attributes.noun.value",
"attributes.modifier.value",
"attributes.entityAttributes[0].attributeName",
"attributes.entityAttributes[0].attributeValue",
"attributes.entityAttributes[0].attributeUOM",
"attributes.entityAttributes[1].attributeName",
"attributes.entityAttributes[1].attributeValue",
"attributes.entityAttributes[1].attributeUOM",
]
assert create_dict(source_json) == {
    "itemUniqueId": "",
    "itemDescription": "",
    "manufacturerInfo": [
        {
            "manufacturer": {"value": ""},
            "manufacturerPartNumber": ""
        }
    ],
    "attributes": {
        "noun": {"value": ""},
        "modifier": {"value": ""},
        "entityAttributes": [
            {
                "attributeName": "",
                "attributeValue": "",
                "attributeUOM": ""
            },
            {
                "attributeName": "",
                "attributeValue": "",
                "attributeUOM": ""
            }
        ]
    }
}
First we iterate over the whole list and store each third-level attribute; after that we convert this structure to the desired output:
from typing import Dict, List
source_json = [
"attributes.entityAttributes[0].attributeName",
"attributes.entityAttributes[0].attributeValue",
"attributes.entityAttributes[0].attributeUOM",
"attributes.entityAttributes[1].attributeName",
"attributes.entityAttributes[1].attributeValue",
"attributes.entityAttributes[1].attributeUOM",
"attributes.entityAttributes[2].attributeName"
]
def accumulate(source: List) -> Dict:
    accumulator = {}
    for v in source:
        vs = v.split(".")
        root_attribute = vs[0]
        if root_attribute not in accumulator:
            accumulator[root_attribute] = {}
        i = vs[1].rfind('[')
        k = (vs[1][:i], vs[1][i+1:-1])
        if k not in accumulator[root_attribute]:
            accumulator[root_attribute][k] = {}
        accumulator[root_attribute][k][vs[2]] = ""
    return accumulator

def get_result(accumulated: Dict) -> Dict:
    result = {}
    for k, v in accumulated.items():
        result[k] = {}
        for (entity, idx), v1 in v.items():
            if entity not in result[k]:
                result[k][entity] = []
            if len(v1) == 3:
                result[k][entity].append(v1)
    return result

print(get_result(accumulate(source_json)))
print(get_result(accumulate(source_json)))
The output will be:
{
    'attributes': {
        'entityAttributes': [
            {
                'attributeName': '',
                'attributeValue': '',
                'attributeUOM': ''
            },
            {
                'attributeName': '',
                'attributeValue': '',
                'attributeUOM': ''
            }
        ]
    }
}
In the accumulate function we store the third-level attributes in a dict keyed by (entityAttributes, 0) ... (entityAttributes, 2).
In the get_result function we convert that dict into a dict from string to list.
How about something like this:
import re
import json
source_json = [
"attributes.entityAttributes[0].attributeName",
"attributes.entityAttributes[0].attributeValue",
"attributes.entityAttributes[0].attributeUOM",
"attributes.entityAttributes[1].attributeName",
"attributes.entityAttributes[1].attributeValue",
"attributes.entityAttributes[1].attributeUOM",
"attributes.entityAttributes[2].attributeName"
]
def to_object(source_json):
    def add_attribute(target, attribute_list):
        head, tail = attribute_list[0], attribute_list[1:]
        if tail:
            add_attribute(target.setdefault(head, {}), tail)
        else:
            target[head] = ''
    target = {}
    for row in source_json:
        add_attribute(target, re.split(r'[\.\[\]]+', row))
    return target

print(json.dumps(to_object(source_json), indent=4))
Note that this does not do exactly what you requested: it stores each array as an object with keys '0' ... '2'. This makes it easier to implement and also more stable. What would you expect if the input list were missing the entityAttributes[0] entries: should the list include an empty element, or something else? Anyway, you save space by not including that element, which only works if you store the array in an object. If you do want real lists, they can be produced in a post-processing pass, sketched below.
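A small sketch of such a pass (not part of the answer above), converting the digit-keyed dicts afterwards:
def listify(node):
    # Recursively turn {'0': ..., '1': ...} dicts into lists, leaving other dicts alone.
    if isinstance(node, dict):
        node = {k: listify(v) for k, v in node.items()}
        if node and all(k.isdigit() for k in node):
            return [node[k] for k in sorted(node, key=int)]
    return node

print(json.dumps(listify(to_object(source_json)), indent=4))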
None of the answers provided so far strike me as very intuitive. Here's one way to tackle the problem with three easy-to-understand functions.
Normalize inputs. First we need a function to normalize the input strings. Instead of rules-bearing strings like 'foo[0].bar', where one must understand that integers in square brackets imply a list, we want a simple tuple of keys like ('foo', 0, 'bar').
def attribute_to_keys(a):
    return tuple(
        int(k) if k.isdigit() else k
        for k in a.replace('[', '.').replace(']', '').split('.')
    )
Build a uniform data structure. Second, we need a function to assemble a data structure consisting of dicts
of dicts of dicts ... all the way down.
def assemble_data(attributes):
    data = {}
    for a in attributes:
        d = data
        for k in attribute_to_keys(a):
            d = d.setdefault(k, {})
    return convert(data)

def convert(d):
    # Just a placeholder for now.
    return d
Convert the uniform data. Third, we need to implement a real version of the placeholder. Specifically, we need it to recursively convert the uniform data structure into our ultimate goal: (a) empty strings at leaf nodes, and (b) lists rather than dicts whenever the dict keys are all integers. Note that this even fills in empty list positions with an empty string (a contingency not covered in your problem description; adjust as needed if you want a different behavior).
def convert(d):
    if not d:
        return ''
    elif all(isinstance(k, int) for k in d):
        return [convert(d.get(i)) for i in range(max(d) + 1)]
    else:
        return {k: convert(v) for k, v in d.items()}
You can use a custom builder class which implements __getattr__ and __getitem__ to gradually build the underlying object. This building can then be triggered by using eval on each of the attribute strings (note: eval is not safe for input from untrusted sources).
The following is an example implementation:
class Builder:
    def __init__(self):
        self.obj = None

    def __getattr__(self, key):
        if self.obj is None:
            self.obj = {}
        return self.obj.setdefault(key, Builder())

    def __getitem__(self, index):
        if self.obj is None:
            self.obj = []
        self.obj.extend(Builder() for _ in range(index + 1 - len(self.obj)))
        return self.obj[index]

    def convert(self):
        if self.obj is None:
            return ''
        elif isinstance(self.obj, list):
            return [v.convert() for v in self.obj]
        elif isinstance(self.obj, dict):
            return {k: v.convert() for k, v in self.obj.items()}
        else:
            assert False
attributes = [
'itemUniqueId',
'itemDescription',
'manufacturerInfo[0].manufacturer.value',
'manufacturerInfo[0].manufacturerPartNumber',
'attributes.noun.value',
'attributes.modifier.value',
'attributes.entityAttributes[0].attributeName',
'attributes.entityAttributes[0].attributeValue',
'attributes.entityAttributes[0].attributeUOM',
'attributes.entityAttributes[1].attributeName',
'attributes.entityAttributes[1].attributeValue',
'attributes.entityAttributes[1].attributeUOM',
]
builder = Builder()
for attr in attributes:
    eval(f'builder.{attr}')
result = builder.convert()

import json
print(json.dumps(result, indent=4))
which gives the following output:
{
    "itemUniqueId": "",
    "itemDescription": "",
    "manufacturerInfo": [
        {
            "manufacturer": {
                "value": ""
            },
            "manufacturerPartNumber": ""
        }
    ],
    "attributes": {
        "noun": {
            "value": ""
        },
        "modifier": {
            "value": ""
        },
        "entityAttributes": [
            {
                "attributeName": "",
                "attributeValue": "",
                "attributeUOM": ""
            },
            {
                "attributeName": "",
                "attributeValue": "",
                "attributeUOM": ""
            }
        ]
    }
}
I have the following invalid JSON string which I'd like to convert into valid JSON (so each "template" will have a vuln-x key before it):
{"template":"network/vsftpd-detection.yaml","matcher-status":true}{"template":"cves/2018/CVE-2018-15473.yaml","matcher-status":true}{"template":"cves/2016/CVE-2016-6210.yaml","matcher-status":true}
I'm currently doing the following in order to format it:
import json

s1 = '{"template":"network/vsftpd-detection.yaml","matcher-status":true}{"template":"cves/2018/CVE-2018-15473.yaml","matcher-status":true}{"template":"cves/2016/CVE-2016-6210.yaml","matcher-status":true}'
s2 = s1.split('{"template":')
num = s1.count('{"template":')
out_json = "{"
for x in range(num):
    out_json += '"vuln-' + str(x) + '":{"template":' + s2[x+1]
new_json = out_json.replace("true}", "true},")
cleaned_json = new_json[:-1] + "}"
print(cleaned_json)
I feel this is incredibly messy and am sure there's a cleaner way to do it - any ideas?
Here's the desired output which I'm getting with my current script:
{
    "vuln-0": {
        "template": "network/vsftpd-detection.yaml",
        "matcher-status": true
    },
    "vuln-1": {
        "template": "cves/2018/CVE-2018-15473.yaml",
        "matcher-status": true
    },
    "vuln-2": {
        "template": "cves/2016/CVE-2016-6210.yaml",
        "matcher-status": true
    }
}
Add a delimiter between the dictionaries to enable easier splitting, then process as dictionaries:
import json
s = '{"template":"network/vsftpd-detection.yaml","matcher-status":true}{"template":"cves/2018/CVE-2018-15473.yaml","matcher-status":true}{"template":"cves/2016/CVE-2016-6210.yaml","matcher-status":true}'
# add a delimiter not used in the string (nul) and split on it.
strings = s.replace('}{', '}\0{').split('\0')
# dict comprehension
data = {f'vuln-{i}': json.loads(v) for i, v in enumerate(strings)}
print(json.dumps(data, indent=2))
Output:
{
  "vuln-0": {
    "template": "network/vsftpd-detection.yaml",
    "matcher-status": true
  },
  "vuln-1": {
    "template": "cves/2018/CVE-2018-15473.yaml",
    "matcher-status": true
  },
  "vuln-2": {
    "template": "cves/2016/CVE-2016-6210.yaml",
    "matcher-status": true
  }
}
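If you'd rather not assume that '}{' never appears inside a string value, json.JSONDecoder.raw_decode parses one object at a time and reports where it stopped. A sketch (assumes the objects are back-to-back, as in the input above):
import json

def split_concatenated_json(s):
    decoder = json.JSONDecoder()
    objs, pos = [], 0
    while pos < len(s):
        obj, pos = decoder.raw_decode(s, pos)  # parse one object, get the index just past it
        objs.append(obj)
    return objs

data = {f'vuln-{i}': obj for i, obj in enumerate(split_concatenated_json(s))}
print(json.dumps(data, indent=2))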
import plazy

txt_filter = lambda x: x.endswith('.txt')
file_paths: list = plazy.list_files(root='/data/', filter_func=txt_filter, is_include_root=True)
print(file_paths)
output:
["/data/subdir1/subdir1_1/file1.txt","/data/subdir2/subdir2_1/file2.txt", "/data/subdir2/subdir2_1/file1.txt", "/data/subdir3/subdir3_1/subdir3_2/file1.txt"]
How can I convert these paths into a nested dictionary? I want it to look like this:
{
    "data": {
        "subdir1": {"subdir1_1": ["file1.txt"]},
        "subdir2": {"subdir2_1": ["file1.txt", "file2.txt"]},
        "subdir3": {"subdir3_1": {"subdir3_2": ["file1.txt"]}}
    }
}
I think one way to address this is to use plazy.list_files in a depth-limited way first (to get the top-level dirs) and recurse manually, rather than letting it fetch the whole tree.
Some pseudocode to illustrate (a Python sketch follows below):
topdirs = getdirs(/root)
foreach dir:
    children = getdirs(/dir)
    leaves = gettxtfiles(/dir/)
As your program recurses into the structure, it builds the map the way you want it.
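A rough Python sketch of that recursion using plain os calls (getdirs/gettxtfiles above are pseudocode, not a real API):
import os

def build_tree(path):
    # Directories become nested dicts; a directory with only files becomes a list.
    entries = list(os.scandir(path))
    subdirs = {e.name: build_tree(e.path) for e in entries if e.is_dir()}
    files = [e.name for e in entries if e.is_file() and e.name.endswith('.txt')]
    return subdirs if subdirs else files

print({'data': build_tree('/data/')})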
I took care of it without using plazy.

import os
import pprint

path = '/data/'

def f(path):
    if os.path.isdir(path):
        d, l = {}, []
        for name in os.listdir(path):
            if os.path.isdir(os.path.join(path, name)):
                d[name] = f(os.path.join(path, name))
            else:
                l.append(name)
        if not d:  # a directory with only files becomes a plain list
            d = l
        return d

pprint.pprint(f(path))
Output
{
    "data": {
        "subdir1": {"subdir1_1": ["file1.txt"]},
        "subdir2": {"subdir2_1": ["file1.txt", "file2.txt"]},
        "subdir3": {"subdir3_1": {"subdir3_2": ["file1.txt"]}}
    }
}
I would like to create a tree-style structure from a DataFrame in Python.
The desired output would look something like this:
{
    "Cat1": {
        "Band1": {
            "PE1": {
                "1995-01": {
                    "GroupCalcs": {
                        "Count": 382.0,
                        "EqWghtRtn": -0.015839621267015727,
                        "Idx_EqWghtRtn": 0.9841603787329842,
                        "Idx_WghtRtn": 0.9759766819565102,
                        "WghtRtn": -0.02402331804348973
                    }
                },
                "1995-02": {
                    "GroupCalcs": {
                        "Count": 382.0,
                        "EqWghtRtn": -0.015839621267015727,
                        "Idx_EqWghtRtn": 0.9841603787329842,
                        "Idx_WghtRtn": 0.9759766819565102,
                        "WghtRtn": -0.02402331804348973
                    }
                }
            }
        }
    }
}
I am looking for the most efficient way to build this from a DataFrame. I have 20k+ rows to parse, which look like the snippet below.
(screenshot of the source data omitted)
I was thinking something like this, but I know it is very inefficient:
dctcat = {}
for cat in alldf.index.levels[0]:
    dctMktBand = {}
    for mktband in alldf.index.levels[1]:
        dctpe = {}
        for pe in alldf.index.levels[2]:
            dctMonth = {}
            for month in alldf.index.levels[3]:
                dctMonth[month] = alldf.loc[[cat, mktband, pe, month]].filter(items=['Count', 'EqWghtRtn', 'Idx_EqWghtRtn', 'Idx_WghtRtn', 'WghtRtn']).to_dict()
            dctpe[str(pe)] = dctMonth
        dctMktBand[str(mktband)] = dctpe
    dctcat[str(cat)] = dctMktBand
Possible Solution
I stumbled across https://gist.github.com/hrldcpr/2012250, and using defaultdict(tree) I was able to accomplish what I needed in under 2 seconds. While that sounds long, it is considerably faster than what I had. Sharing the code below; if someone has updates or improvements, I would appreciate it.
from collections import defaultdict

def tree():
    return defaultdict(tree)

def dicts(t):
    if isinstance(t, defaultdict):
        outval = {k: dicts(t[k]) for k in t}
    else:
        outval = t
    return outval

alldf.set_index(['category', 'marketCapBand', 'peGreaterThanMarket', 'Month'], inplace=True)
tmp = alldf.filter(items=['Count', 'EqWghtRtn', 'Idx_EqWghtRtn', 'Idx_WghtRtn', 'WghtRtn']).to_dict('index')

outjson = tree()
for k, v in tmp.items():
    (cat, mkb, pe, mon) = k
    outjson[str(cat)][str(mkb)][str(pe)][str(mon)] = v

# convert back to a plain dictionary
outjson = dicts(outjson)
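For reference, a self-contained miniature run of the same pipeline on made-up data (column names assumed to match the real frame):
import pandas as pd

alldf = pd.DataFrame({
    'category': ['Cat1', 'Cat1'],
    'marketCapBand': ['Band1', 'Band1'],
    'peGreaterThanMarket': ['PE1', 'PE1'],
    'Month': ['1995-01', '1995-02'],
    'Count': [382.0, 382.0],
    'EqWghtRtn': [-0.0158, -0.0158],
    'Idx_EqWghtRtn': [0.9842, 0.9842],
    'Idx_WghtRtn': [0.9760, 0.9760],
    'WghtRtn': [-0.0240, -0.0240],
})
# Running the set_index / filter / to_dict('index') / tree-filling steps above
# yields {'Cat1': {'Band1': {'PE1': {'1995-01': {...}, '1995-02': {...}}}}}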