I would like to create a tree style structure from a dataframe in Python.
The desired output would look something like this:
{
    "Cat1": {
        "Band1": {
            "PE1": {
                "1995-01": {
                    "GroupCalcs": {
                        "Count": 382.0,
                        "EqWghtRtn": -0.015839621267015727,
                        "Idx_EqWghtRtn": 0.9841603787329842,
                        "Idx_WghtRtn": 0.9759766819565102,
                        "WghtRtn": -0.02402331804348973
                    }
                },
                "1995-02": {
                    "GroupCalcs": {
                        "Count": 382.0,
                        "EqWghtRtn": -0.015839621267015727,
                        "Idx_EqWghtRtn": 0.9841603787329842,
                        "Idx_WghtRtn": 0.9759766819565102,
                        "WghtRtn": -0.02402331804348973
                    }
                }
            }
        }
    }
}
I am looking for the most efficient way of building this from a DataFrame. I have 20k+ rows to parse, which currently look like the snippet below.
Data snippet
I was thinking of something like the following, but I know it is very inefficient.
dctcat = {}
for cat in alldf.index.levels[0]:
    dctMktBand = {}
    for mktband in alldf.index.levels[1]:
        dctpe = {}
        for pe in alldf.index.levels[2]:
            dctMonth = {}
            for month in alldf.index.levels[3]:
                dctMonth[month] = alldf.loc[[cat, mktband, pe, month]].filter(
                    items=['Count', 'EqWghtRtn', 'Idx_EqWghtRtn', 'Idx_WghtRtn', 'WghtRtn']).to_dict()
            dctpe[str(pe)] = dctMonth
        dctMktBand[str(mktband)] = dctpe
    dctcat[str(cat)] = dctMktBand
Possible Solution
I stumbled across this gist https://gist.github.com/hrldcpr/2012250 and, using defaultdict(tree), I was able to accomplish what I needed in under 2 seconds. While that still sounds long, it is considerably faster than what I had. Sharing the code below; if someone has updates or improvements on this, I would appreciate it.
from collections import defaultdict

def tree():
    return defaultdict(tree)

def dicts(t):
    # Recursively convert nested defaultdicts back into plain dicts.
    if isinstance(t, defaultdict):
        outval = {k: dicts(t[k]) for k in t}
    else:
        outval = t
    return outval

alldf.set_index(['category', 'marketCapBand', 'peGreaterThanMarket', 'Month'], inplace=True)
tmp = alldf.filter(items=['Count', 'EqWghtRtn', 'Idx_EqWghtRtn', 'Idx_WghtRtn', 'WghtRtn']).to_dict('index')

outjson = tree()
for k, v in tmp.items():
    (cat, mkb, pe, mon) = k
    outjson[str(cat)][str(mkb)][str(pe)][str(mon)] = v

# convert back to plain dictionaries
outjson = dicts(outjson)
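If the end result needs to be JSON text rather than a nested dict, the standard json module can serialize it directly; a minimal sketch using the outjson built above:

import json

# Serialize the nested dicts to JSON text (e.g. to write to a file or return from an API).
print(json.dumps(outjson, indent=4, sort_keys=True))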
Related
Problem Statement:
I have around 500 ZIP files with lots of XMLs. I am able to convert them to JSON and parse them to parquet files, as in the example below for one nested JSON file.
I am also not able to process multiple files with Spark.
I have the code below, which flattens a whole JSON document into a pandas DataFrame, but now I have to run it over 150,000 files. When a JSON document is very big, it takes around 2 minutes to flatten. Also, if I run it with Spark over an RDD of multiple files, it fails with either an OOM or a struct error.
Am I doing something wrong Spark-wise?
import io
import zipfile

import xmltodict
import pandas as pd

def parser(master_tree):
    flatten_tree_node = []

    def _process_leaves(tree: dict, prefix: str = "node", tree_node: dict = dict(), update: bool = True):
        is_nested = False
        if isinstance(tree, dict):
            for k in tree.keys():
                if type(tree[k]) == str:
                    colName = prefix + "_" + k
                    tree_node[colName] = tree[k]
                elif type(tree[k]) == dict:
                    prefix += "_" + k
                    leave = tree[k]
                    _process_leaves(leave, prefix=prefix, tree_node=tree_node, update=False)
            for k in tree.keys():
                if type(tree[k]) == list:
                    is_nested = True
                    prefix += "_" + k
                    for leave in tree[k]:
                        _process_leaves(leave, prefix=prefix, tree_node=tree_node.copy())
        if not is_nested and update:
            flatten_tree_node.append(tree_node)

    _process_leaves(master_tree)
    df = pd.DataFrame(flatten_tree_node)
    df.columns = df.columns.str.replace("#", "_")
    return df

def extractor(file_name, file):
    data = file.decode('utf-8')
    d = bytes(bytearray(data, encoding='utf-8'))
    data = xmltodict.parse(d)
    flatten_data = parser(data)
    return (file_name, flatten_data)

def extract_files(x):
    in_memory_data = io.BytesIO(x[1])
    file_obj = zipfile.ZipFile(in_memory_data, "r")
    files = file_obj.namelist()
    return [extractor(file_name, file_obj.open(file_name).read()) for file_name in files]
zip_rdd = spark.read.format('binaryFile').load('/home/me/sample.zip').select('path','content').rdd
It fails here, at the time of collection:
collected_data = zip_rdd.map(extract_files).collect()
The errors are:
org.apache.spark.api.python.PythonException: 'struct.error: 'i' format requires -2147483648 <= number <= 2147483647'. Full traceback
or
java.lang.OutOfMemoryError
at java.io.ByteArrayOutputStream.hugeCapacity(ByteArrayOutputStream.java:123
Everything works fine, though, when run on only a single file.
An example run of parsing nested JSON with the parser function is shown below.
Is there a way to make this memory- and speed-efficient?
import pandas as pd
tree = {
    "products": [
        {
            "id": "0",
            "name": "First",
            "emptylist": [],
            "properties": {
                "id": "",
                "name": ""
            }
        },
        {
            "id": "1",
            "name": "Second",
            "emptylist": [],
            "properties": {
                "id": "23",
                "name": "a useful product",
                "features": [
                    {
                        "name": "Features",
                        "id": "18",
                        "features": [
                            {"id": "1001", "name": "Colour", "value": "Black"},
                            {"id": "2093", "name": "Material", "value": "Plastic"}
                        ]
                    },
                    {
                        "name": "Sizes",
                        "id": "34",
                        "features": [
                            {"id": "4736", "name": "Length", "value": "56"},
                            {"id": "8745", "name": "Width", "value": "76"}
                        ]
                    }
                ]
            }
        },
        {
            "id": "2",
            "name": "Third",
            "properties": {
                "id": "876",
                "name": "another one",
                "features": [
                    {
                        "name": "Box",
                        "id": "937",
                        "features": [
                            {"id": "3758", "name": "Amount", "value": "1"},
                            {"id": "2222", "name": "Packaging", "value": "Blister"}
                        ]
                    },
                    {
                        "name": "Features",
                        "id": "8473",
                        "features": [
                            {"id": "9372", "name": "Colour", "value": "White"},
                            {"id": "9375", "name": "Position", "value": "A"},
                            {"id": "2654", "name": "Amount", "value": "6"}
                        ]
                    }
                ]
            }
        }
    ]
}
# parser() as defined above is reused here
print(parser(tree))
node_products_id node_products_name ... node_products_properties_features_features_name node_products_properties_features_features_value
0 1 Second ... Colour Black
1 1 Second ... Material Plastic
2 1 Second ... Length 56
3 1 Second ... Width 76
4 2 Third ... Amount 1
5 2 Third ... Packaging Blister
6 2 Third ... Colour White
7 2 Third ... Position A
8 2 Third ... Amount 6
9 2 Third ... NaN NaN
[10 rows x 9 columns]
Do not collect this data; it will likely never fit in memory, because you are pulling all of the data into the driver. You can save it to files directly instead.
collected_data = zip_rdd.map(extract_files).toDF("column","names","go","here")
collected_data.write.parquet("/path/to/folder")
I do not have Spark 3.2, but I'm aware of the features it possesses, and in this case they will make your life easier. unionByName with allowMissingColumns=True (added in Spark 3.1) lets you union DataFrames whose schemas do not match exactly.
collected_data = spark.createDataFrame(data=[], schema=[])
# This will likely fit in driver memory, so it's OK to call collect(); after all it's just a list of file paths.
zip_array = spark.read.format('binaryFile').load('/home/me/sample.zip').select('path').collect()
for my_file in zip_array:
    collected_data = collected_data.unionByName(spark.createDataFrame(extract_files(my_file)), allowMissingColumns=True)
collected_data.write.parquet("/path/to/folder")
For better efficiency you want to use mapPartitions. There are a couple of reasons why, but this goes back to the map/reduce era: you work with an iterator, which can be optimized and pipelined at a lower level (hence the use of yield).
mapPartitions code executes inside an executor and can only contain plain Python code; no Spark code is allowed, since you don't have access to the SparkContext in an executor. It sometimes requires imports to be done inside the function itself, because the scope is local, not global.
If you are looking to save more memory, you might want to reconsider the call to xmltodict.parse(d) and rewrite reformat. You could use a parser that you initialize once per partition and reuse for the entire set of rows in that partition. That would be more efficient than a fresh xmltodict.parse(d) per row, which builds a structure only for it to be thrown away by the garbage collector as soon as it goes out of scope. (A search will turn up several alternatives you can review to find the one that best fits your needs.)
# Keep this as a DataFrame (no collect); mapPartitions runs on the executors.
zip_df = spark.read.format('binaryFile').load('/home/me/sample.zip').select('path', 'content')

def reformat(partitionData):
    # Runs once per partition; only plain Python here, no SparkContext.
    import io
    import zipfile
    for row in partitionData:
        in_memory_data = io.BytesIO(row[1])            # row[1] is the binary 'content' column
        file_obj = zipfile.ZipFile(in_memory_data, "r")
        for file_name in file_obj.namelist():
            yield extractor(file_name, file_obj.open(file_name).read())

collected_data = zip_df.rdd.mapPartitions(reformat).toDF(["file_name", "flattened_data"])
collected_data.write.parquet("/path/to/folder")
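To illustrate the once-per-partition initialization mentioned above, here is a minimal pattern sketch. It assumes lxml is available on the executors and leaves the actual flattening step to whatever logic you prefer; it is not a drop-in replacement for reformat.

def reformat_with_shared_parser(partitionData):
    # Everything here runs inside the executor, so imports are local to the function.
    import io
    import zipfile
    from lxml import etree   # assumption: lxml is installed on the executors

    xml_parser = etree.XMLParser(recover=True)   # built once per partition, reused for every file

    for row in partitionData:
        with zipfile.ZipFile(io.BytesIO(row[1]), "r") as zf:
            for file_name in zf.namelist():
                root = etree.fromstring(zf.read(file_name), parser=xml_parser)
                # hand the parsed tree to whatever flattening logic you prefer
                yield (file_name, etree.tostring(root))

You would wire it in the same way as above, via zip_df.rdd.mapPartitions(reformat_with_shared_parser).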
I want to know whether importing is faster using UpdateOne or UpdateMany with bulk_write. My code for importing the data into the collection with pymongo looks like this:
from pymongo import UpdateOne

bulk_request = []
for file in sorted_files:
    df = process_file(file)
    for row, item in df.iterrows():
        data_dict = item.to_dict()
        bulk_request.append(UpdateOne(
            {"nsamples": {"$lt": 12}},
            {
                "$push": {"samples": data_dict},
                "$inc": {"nsamples": 1}
            },
            upsert=True
        ))
result = mycol1.bulk_write(bulk_request)
When I tried UpdateMany, the only thing I changed is this:
...
...
bulk_request.append(UpdateMany(..
..
..
I didn't see any major difference in insertion time. Shouldn't UpdateMany be way faster?
Maybe I am doing something wrong. Any advice would be helpful!
Thanks in advance!
Note: my data consists of 1.2M rows, and I need each document to contain 12 subdocuments.
@Wernfried Domscheit's answer is correct.
This answer is specific to your scenario.
If you don't mind skipping updates to existing documents and instead inserting new documents altogether, use the code below, which is the most optimized for your use case.
sorted_files = []      # placeholder: your list of files
process_file = None    # placeholder: your file-processing function

for file in sorted_files:
    df = process_file(file)
    sample_data = []
    for row, item in df.iterrows():
        sample_data.append(item.to_dict())
        if len(sample_data) == 12:
            mycol1.insert_one({
                "samples": sample_data,
                "nsamples": len(sample_data),
            })
            sample_data = []
    if sample_data:  # flush any remainder smaller than 12
        mycol1.insert_one({
            "samples": sample_data,
            "nsamples": len(sample_data),
        })
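If the one-document-at-a-time round trips become a bottleneck, the same idea can be batched with bulk_write; a minimal sketch, assuming the same mycol1 collection and a small chunks() helper of my own:

from pymongo import InsertOne

def chunks(seq, size=12):
    # Helper: yield successive batches of `size` items.
    for i in range(0, len(seq), size):
        yield seq[i:i + size]

for file in sorted_files:
    df = process_file(file)
    docs = [item.to_dict() for _, item in df.iterrows()]
    requests = [
        InsertOne({"samples": batch, "nsamples": len(batch)})
        for batch in chunks(docs, 12)
    ]
    if requests:
        mycol1.bulk_write(requests, ordered=False)  # one round trip per file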
If you want to fill up your existing records to 12 objects and then create new records, use the code logic below.
Note: I have not tested this code locally; it is just to illustrate the flow for you to adapt.
for file in sorted_files:
    df = process_file(file)
    sample_data = []
    continuity_flag = False
    for row, item in df.iterrows():
        sample_data.append(item.to_dict())
        if not continuity_flag:
            sample_rec = mycol1.find_one({"nsamples": {"$lt": 12}}, {"nsamples": 1})
            if sample_rec is None:
                continuity_flag = True
            elif sample_rec["nsamples"] + len(sample_data) == 12:
                # top up the partially filled document
                mycol1.update_one({
                    "_id": sample_rec["_id"]
                }, {
                    "$push": {"samples": {"$each": sample_data}},
                    "$inc": {"nsamples": len(sample_data)}
                })
                sample_data = []  # reset the batch so these samples aren't inserted again
        if len(sample_data) == 12:
            mycol1.insert_one({
                "samples": sample_data,
                "nsamples": len(sample_data),
            })
            sample_data = []
    if sample_data:
        mycol1.insert_one({
            "samples": sample_data,
            "nsamples": len(sample_data),
        })
updateOne updates only the first document that matches nsamples: {$lt: 12}, so updateOne should be faster.
However, why do you insert the rows one by one? Put them all into one document and make a single update, similar to this:
sample_data = []
for row, item in df.iterrows():
    data_dict = item.to_dict()
    sample_data.append(data_dict)

mycol1.update_one(
    {"nsamples": {"$lt": 12}},
    {
        "$push": {"samples": {"$each": sample_data}},
        "$inc": {"nsamples": len(sample_data)}
    }
)
Is there a way to match these JSON objects based on the "matchID" and merge them into one object? I basically want to build a table that contains the stats of each player participating in the same match ID.
Input:
data_list = [
    {
        "kloweritotv#3560772": [
            {
                "kd": 2,
                "kills": 6,
                "teamPlacement": 3,
                "damageDone": 2388,
                "matchID": "12887455297423544724"
            },
            {
                "kd": 1,
                "kills": 4,
                "teamPlacement": 14,
                "damageDone": 1828,
                "matchID": "11929202821836542057"
            },
        ]
    },
    {
        "Stylnox07": [
            {
                "kd": 4.5,
                "kills": 9,
                "teamPlacement": 3,
                "damageDone": 2549,
                "matchID": "12887455297423544724"
            },
            {
                "kd": 1.5,
                "kills": 3,
                "teamPlacement": 14,
                "damageDone": 1008,
                "matchID": "11929202821836542057"
            }
        ]
    }
]
Desired output (Pseudo code):
[
    {
        "matchid": 12887455297423544724,
        "kloweritotv": {
            "kd": 2,
            "kills": 6,
            "teamPlacement": 3,
            "damageDone": 2388,
            "matchID": "12887455297423544724"
        },
        "Stylnox07": {
            "kd": 4.5,
            "kills": 9,
            "teamPlacement": 3,
            "damageDone": 2549,
            "matchID": "12887455297423544724"
        }
    },
    {
        "matchid": 11929202821836542057,
        "kloweritotv": {
            "kd": 1,
            "kills": 4,
            "teamPlacement": 14,
            "damageDone": 1828,
            "matchID": "11929202821836542057"
        },
        "Stylnox07": {
            "kd": 1.5,
            "kills": 3,
            "teamPlacement": 14,
            "damageDone": 1008,
            "matchID": "11929202821836542057"
        }
    }
]
Any information would definitely be very helpful! TIA!
Update: I took out the extra match for easier understanding. In other words, if user 1 has match_id 123 and user 2 also has match_id 123, they should both be merged under one match_id 123 with their respective data.
def collect_data(key_id, match_id):
    # Gather all entries for a given player (key_id) and match, with the matchID field stripped.
    d = []
    for k1 in data_list:
        for k2 in k1:
            if k2 != key_id:
                continue
            for e in k1[k2]:
                if e.get('matchID', None) == match_id:
                    e1 = e.copy()
                    e1.pop('matchID')
                    d.append(e1)
    return d

# Group every player's entries under their matchID.
dics = {}
for k1 in data_list:
    for k2 in k1:
        for e in k1[k2]:
            m = e.get('matchID', None)
            if m is None:
                continue
            if m not in dics:
                dics[m] = {}
            dics[m].update({k2: collect_data(k2, m)})
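For a quick check you can pretty-print the result; note that with this approach each player maps to a list of their entries for that match (with matchID removed), rather than to a single flat dict:

import json

# Inspect the grouped structure built above.
print(json.dumps(dics, indent=2))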
import plazy

txt_filter = lambda x: x.endswith('.txt')  # assuming .txt files, as in the output below
file_paths: list = plazy.list_files(root='/data/', filter_func=txt_filter, is_include_root=True)
print(file_paths)
output:
["/data/subdir1/subdir1_1/file1.txt","/data/subdir2/subdir2_1/file2.txt", "/data/subdir2/subdir2_1/file1.txt", "/data/subdir3/subdir3_1/subdir3_2/file1.txt"]
How can I turn these paths into a nested dictionary? I want it to look like this:
{
    "data": {
        "subdir1": {"subdir1_1": ["file1.txt"]},
        "subdir2": {"subdir2_1": ["file1.txt", "file2.txt"]},
        "subdir3": {
            "subdir3_1": {"subdir3_2": ["file1.txt"]}
        }
    }
}
I think one way to address this is to use plazy.list_files with a limited depth first (to get the top-level dirs) and recurse manually, rather than letting it fetch the whole tree.
Some pseudo code to illustrate:
topdirs = getdirs(/root)
foreach dir:
    children = getdirs(/dir)
    leaves = gettxtfiles(/dir/)
As your program recurses into the structure, it builds the map the way you want it.
I took care of it without using Plazy.
import os
import pprint

path = '/data/'

def f(path):
    if os.path.isdir(path):
        d, l = {}, []
        for name in os.listdir(path):
            if os.path.isdir(os.path.join(path, name)):
                d[name] = f(os.path.join(path, name))
            else:
                l.append(name)
        if not d:
            d = l  # leaf directory: just list its files
        return d

pprint.pprint(f(path))
Output
{
    "data": {
        "subdir1": {"subdir1_1": ["file1.txt"]},
        "subdir2": {"subdir2_1": ["file1.txt", "file2.txt"]},
        "subdir3": {
            "subdir3_1": {"subdir3_2": ["file1.txt"]}
        }
    }
}
I am trying to convert a list of dot-separated strings, e.g.
['one.two.three.four', 'one.six.seven.eight', 'five.nine.ten', 'twelve.zero']
into a tree (nested lists or dicts - anything that is easy to walk through).
The real data happens to have 1 to 4 dot-separated parts of different lengths and 2200 records in total.
My actual goal is to fill a set of 4 QComboBoxes with this data, so that the 1st QComboBox is filled with the first-level items ['one', 'five', 'twelve'] (no duplicates). Then, depending on the chosen item, the 2nd QComboBox is filled with its related items: for 'one' that would be ['two', 'six'], and so on if there is another nested level.
So far I've got a working list -> nested dicts solution, but it's horribly slow, since I use a regular dict(). And I seem to have trouble redesigning it around a defaultdict in a way that makes filling the ComboBoxes easy.
My current code:
def list2tree(m):
    tmp = {}
    for i in range(len(m)):
        if m.count('.') == 0:
            return m
        a = m.split('.', 1)
        try:
            tmp[a[0]].append(list2tree(a[1]))
        except (KeyError, AttributeError):
            tmp[a[0]] = list2tree(a[1])
    return tmp

main_dict = {}
i = 0
for m in methods:
    main_dict = list2tree(m)
    i += 1
    if (i % 100) == 0:
        print(i, len(methods))
print(main_dict, i, len(methods))
ls = ['one.two.three.four', 'one.six.seven.eight', 'five.nine.ten', 'twelve.zero']

tree = {}
for item in ls:
    t = tree
    for part in item.split('.'):
        t = t.setdefault(part, {})
Result:
{
    "twelve": {
        "zero": {}
    },
    "five": {
        "nine": {
            "ten": {}
        }
    },
    "one": {
        "six": {
            "seven": {
                "eight": {}
            }
        },
        "two": {
            "three": {
                "four": {}
            }
        }
    }
}
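Since the original goal was to fill QComboBoxes level by level, note that every level of this nested dict is just a plain dict, so the choices for the next box are simply the keys under the current selection. A small usage sketch (pure Python, no Qt, using the tree built above; key order may differ on older Python versions):

# First-level choices (for the 1st combo box):
print(list(tree))                 # e.g. ['one', 'five', 'twelve']

# Choices for the 2nd combo box once 'one' is selected:
print(list(tree['one']))          # ['two', 'six']

# And one level deeper:
print(list(tree['one']['two']))   # ['three']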
While this goes beyond the original question, some comments mentioned a variant of this algorithm that also stores values. I came up with this to that end:
def dictionaryafy(in_dict):
    # Build a nested dict from dot-separated keys, storing each value at the leaf.
    tree = {}
    for key, value in in_dict.items():
        t = tree
        parts = key.split(".")
        for part in parts[:-1]:
            t = t.setdefault(part, {})
        t[parts[-1]] = value
    return tree
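For example (a quick illustration with made-up values):

print(dictionaryafy({
    "one.two.three": 3,
    "one.six": "leaf",
    "five.nine.ten": [10],
}))
# {'one': {'two': {'three': 3}, 'six': 'leaf'}, 'five': {'nine': {'ten': [10]}}}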