lxml memory usage when parsing huge xml in python

lxml memory usage when parsing huge xml in python - python

I am a Python newbie. I am trying to parse a huge XML file in my Python module using lxml. In spite of clearing the elements at the end of each loop, my memory usage shoots up and crashes the application. I am sure I am missing something here. Please help me figure out what that is.
Following are main functions I am using -
from lxml import etree
def parseXml(context, attribList):
    """Yield the rows collected by readAllChildren() for each parsed element.

    context is an iterable of (event, element) pairs, e.g. the object
    returned by etree.iterparse(); attribList names the attributes to read.
    """
    for _, element in context:
        fieldMap = {}
        rowList = []
        readAttribs(element, fieldMap, attribList)
        # readAllChildren() declares rowList as its fourth parameter; the
        # call previously omitted it, which raises TypeError.
        readAllChildren(element, fieldMap, attribList, rowList)
        for row in rowList:
            yield row
        # The generator is suspended at the yield above, so this runs only
        # when the consumer asks for the next row after the last one.
        element.clear()
def readAttribs(element, fieldMap, attribList):
    """Copy the attributes named in attribList from element into fieldMap.

    Attributes missing on the element are stored as ''. fieldMap is updated
    in place; nothing is returned.
    """
    # The loop variable was misspelled 'atrrib', so 'attrib' was undefined.
    for attrib in attribList:
        fieldMap[attrib] = element.get(attrib, '')
def readAllChildren(element, fieldMap, attribList, rowList):
    """Walk every descendant of element and append one row per descendant.

    fieldMap is shared and overwritten for each descendant; a copy of it is
    appended to rowList after the descendant's own children were visited.
    """
    for childElem in element:
        # Was 'childEleme' (NameError).
        readAttribs(childElem, fieldMap, attribList)
        if len(childElem) > 0:
            # rowList has to be handed down so nested rows land in the same list.
            readAllChildren(childElem, fieldMap, attribList, rowList)
        # Was 'rowlist' (NameError).
        rowList.append(fieldMap.copy())
        childElem.clear()
def main():
    """Print every row produced by parseXml() for the file at fullFilePath."""
    attribList = ['name', 'age', 'id']
    # fullFilePath is not defined in this snippet; it must be set by the caller's module.
    context = etree.iterparse(fullFilePath, events=("start",))
    # The for statement was missing its trailing colon.
    for row in parseXml(context, attribList):
        print(row)
Thanks!!
Example xml and the nested dictionary -
<root xmlns='NS'>
<Employee Name="Mr.ZZ" Age="30">
<Experience TotalYears="10" StartDate="2000-01-01" EndDate="2010-12-12">
<Employment id = "1" EndTime="ABC" StartDate="2000-01-01" EndDate="2002-12-12">
<Project Name="ABC_1" Team="4">
</Project>
</Employment>
<Employment id = "2" EndTime="XYZ" StartDate="2003-01-01" EndDate="2010-12-12">
<PromotionStatus>Manager</PromotionStatus>
<Project Name="XYZ_1" Team="7">
<Award>Star Team Member</Award>
</Project>
</Employment>
</Experience>
</Employee>
</root>
# Keys used in every node of the nested element definition below.
ELEMENT_NAME = 'element_name'
ELEMENTS = 'elements'
ATTRIBUTES = 'attributes'
TEXT = 'text'

# Nested description of the example XML: each node names an element, its
# child element definitions, the attributes to read and the text fields.
xmlDef = {
    'namespace': 'NS',
    'content': {
        ELEMENT_NAME: 'Employee',
        ELEMENTS: [{
            ELEMENT_NAME: 'Experience',
            ELEMENTS: [{
                ELEMENT_NAME: 'Employment',
                ELEMENTS: [
                    {
                        ELEMENT_NAME: 'PromotionStatus',
                        ELEMENTS: [],
                        ATTRIBUTES: [],
                        TEXT: ['PromotionStatus'],
                    },
                    {
                        ELEMENT_NAME: 'Project',
                        ELEMENTS: [{
                            ELEMENT_NAME: 'Award',
                            # Was {} -- every other node uses a list here.
                            ELEMENTS: [],
                            ATTRIBUTES: [],
                            TEXT: ['Award'],
                        }],
                        ATTRIBUTES: ['Name', 'Team'],
                        TEXT: [],
                    },
                ],
                # NOTE(review): identical to the Experience attributes below,
                # while the example XML gives Employment id/EndTime/StartDate/
                # EndDate -- confirm which list is intended.
                ATTRIBUTES: ['TotalYears', 'StartDate', 'EndDate'],
                TEXT: [],
            }],
            ATTRIBUTES: ['TotalYears', 'StartDate', 'EndDate'],
            TEXT: [],
        }],
        ATTRIBUTES: ['Name', 'Age'],
        TEXT: [],
    },
}

Welcome to Python and Stack Overflow!
It looks like you've followed some good advice looking at lxml and especially etree.iterparse(..), but I think your implementation is approaching the problem from the wrong angle. The idea of iterparse(..) is to get away from collecting and storing data, and instead processing tags as they get read in. Your readAllChildren(..) function is saving everything to rowList, which grows and grows to cover the whole document tree. I made a few changes to show what's going on:
from lxml import etree
def parseXml(context, attribList):
    """Yield the rows collected for each (event, element) pair in context.

    Instrumented version of the question's function: it prints every event
    so the traversal order is visible.
    """
    for event, element in context:
        # Format matches the transcript ("start element: <Element ...>") and
        # the message printed by process_xml_iterative().
        print("%s element: %s" % (event, element))
        fieldMap = {}
        rowList = []
        readAttribs(element, fieldMap, attribList)
        readAllChildren(element, fieldMap, attribList, rowList)
        for row in rowList:
            yield row
        # Not reached until the consumer resumes the generator after the
        # last yielded row.
        element.clear()
def readAttribs(element, fieldMap, attribList):
    """Copy the attributes named in attribList into fieldMap, then print it.

    Missing attributes are stored as ''. fieldMap is modified in place.
    """
    for attrib in attribList:
        fieldMap[attrib] = element.get(attrib, '')
    # Single-argument print() emits the same text on Python 2 and 3.
    print("fieldMap: %s" % (fieldMap,))
def readAllChildren(element, fieldMap, attribList, rowList):
    """Recursively visit all descendants of element, appending a row for each.

    Prints each child found and the running length of rowList so the growth
    of the list can be observed.
    """
    for childElem in element:
        print("Found child: %s" % (childElem,))
        readAttribs(childElem, fieldMap, attribList)
        if len(childElem) > 0:
            readAllChildren(childElem, fieldMap, attribList, rowList)
        # A copy is stored because fieldMap is reused for the next element.
        rowList.append(fieldMap.copy())
        print("len(rowList) = %s" % (len(rowList),))
        childElem.clear()
def process_xml_original(xml_file):
    """Run the question's parseXml() over xml_file and print every row."""
    attribList = ['name', 'age', 'id']
    context = etree.iterparse(xml_file, events=("start",))
    for row in parseXml(context, attribList):
        print("Row: %s" % (row,))
Running with some dummy data:
>>> from cStringIO import StringIO
>>> test_xml = """\
... <family>
... <person name="somebody" id="5" />
... <person age="45" />
... <person name="Grandma" age="62">
... <child age="35" id="10" name="Mom">
... <grandchild age="7 and 3/4" />
... <grandchild id="12345" />
... </child>
... </person>
... <something-completely-different />
... </family>
... """
>>> process_xml_original(StringIO(test_xml))
start element: <Element family at 0x105ca58>
fieldMap: {'age': '', 'name': '', 'id': ''}
Found child: <Element person at 0x105ca80>
fieldMap: {'age': '', 'name': 'somebody', 'id': '5'}
len(rowList) = 1
Found child: <Element person at 0x105c468>
fieldMap: {'age': '45', 'name': '', 'id': ''}
len(rowList) = 2
Found child: <Element person at 0x105c7b0>
fieldMap: {'age': '62', 'name': 'Grandma', 'id': ''}
Found child: <Element child at 0x106e468>
fieldMap: {'age': '35', 'name': 'Mom', 'id': '10'}
Found child: <Element grandchild at 0x106e148>
fieldMap: {'age': '7 and 3/4', 'name': '', 'id': ''}
len(rowList) = 3
Found child: <Element grandchild at 0x106e490>
fieldMap: {'age': '', 'name': '', 'id': '12345'}
len(rowList) = 4
len(rowList) = 5
len(rowList) = 6
Found child: <Element something-completely-different at 0x106e4b8>
fieldMap: {'age': '', 'name': '', 'id': ''}
len(rowList) = 7
Row: {'age': '', 'name': 'somebody', 'id': '5'}
Row: {'age': '45', 'name': '', 'id': ''}
Row: {'age': '7 and 3/4', 'name': '', 'id': ''}
Row: {'age': '', 'name': '', 'id': '12345'}
Row: {'age': '', 'name': '', 'id': '12345'}
Row: {'age': '', 'name': '', 'id': '12345'}
Row: {'age': '', 'name': '', 'id': ''}
start element: <Element person at 0x105ca80>
fieldMap: {'age': '', 'name': '', 'id': ''}
start element: <Element person at 0x105c468>
fieldMap: {'age': '', 'name': '', 'id': ''}
start element: <Element person at 0x105c7b0>
fieldMap: {'age': '', 'name': '', 'id': ''}
start element: <Element child at 0x106e468>
fieldMap: {'age': '', 'name': '', 'id': ''}
start element: <Element grandchild at 0x106e148>
fieldMap: {'age': '', 'name': '', 'id': ''}
start element: <Element grandchild at 0x106e490>
fieldMap: {'age': '', 'name': '', 'id': ''}
start element: <Element something-completely-different at 0x106e4b8>
fieldMap: {'age': '', 'name': '', 'id': ''}
It's a little hard to read, but you can see it's walking the whole tree down from the root tag on the first pass, building up rowList for every element in the entire document. You'll also notice it doesn't even stop there: since the element.clear() call comes after the yield statement in parseXml(..), it doesn't get executed until the second iteration (i.e. the next element in the tree).
Incremental processing FTW
A simple fix is to let iterparse(..) do its job: parse iteratively! The following will pull the same information and process it incrementally instead:
def do_something_with_data(data):
    """This just prints it out. Yours will probably be more interesting."""
    # One separator space, as in the "Got data: {...}" transcript lines.
    print("Got data: %s" % (data,))
def process_xml_iterative(xml_file):
    """Process xml_file element by element, releasing each one afterwards.

    For every element reported by etree.iterparse() the 'name', 'age' and
    'id' attributes are collected (u"" when absent) and handed to
    do_something_with_data().
    """
    # by using the default 'end' event, you start at the _bottom_ of the tree
    ATTRS = ('name', 'age', 'id')
    for event, element in etree.iterparse(xml_file):
        print("%s element: %s" % (event, element))
        data = {}
        for attr in ATTRS:
            data[attr] = element.get(attr, u"")
        do_something_with_data(data)
        element.clear()
        # clear() empties the element, but its parent still holds a reference
        # to the emptied node. Drop already-processed preceding siblings so a
        # huge document does not accumulate them under the root.
        parent = element.getparent()
        if parent is not None:
            while element.getprevious() is not None:
                del parent[0]
        del element # for extra insurance
Running on the same dummy XML:
>>> print test_xml
<family>
<person name="somebody" id="5" />
<person age="45" />
<person name="Grandma" age="62">
<child age="35" id="10" name="Mom">
<grandchild age="7 and 3/4" />
<grandchild id="12345" />
</child>
</person>
<something-completely-different />
</family>
>>> process_xml_iterative(StringIO(test_xml))
end element: <Element person at 0x105cc10>
Got data: {'age': u'', 'name': 'somebody', 'id': '5'}
end element: <Element person at 0x106e468>
Got data: {'age': '45', 'name': u'', 'id': u''}
end element: <Element grandchild at 0x106e148>
Got data: {'age': '7 and 3/4', 'name': u'', 'id': u''}
end element: <Element grandchild at 0x106e490>
Got data: {'age': u'', 'name': u'', 'id': '12345'}
end element: <Element child at 0x106e508>
Got data: {'age': '35', 'name': 'Mom', 'id': '10'}
end element: <Element person at 0x106e530>
Got data: {'age': '62', 'name': 'Grandma', 'id': u''}
end element: <Element something-completely-different at 0x106e558>
Got data: {'age': u'', 'name': u'', 'id': u''}
end element: <Element family at 0x105c6e8>
Got data: {'age': u'', 'name': u'', 'id': u''}
This should greatly improve both the speed and memory performance of your script. Also, by hooking the 'end' event, you're free to clear and delete elements as you go, rather than waiting until all children have been processed.
Depending on your dataset, it might be a good idea to only process certain types of elements. The root element, for one, probably isn't very meaningful, and other nested elements may also fill your dataset with a lot of {'age': u'', 'id': u'', 'name': u''}.
Or, with SAX
As an aside, when I read "XML" and "low-memory" my mind always jumps straight to SAX, which is another way you could attack this problem. Using the builtin xml.sax module:
import xml.sax
class AttributeGrabber(xml.sax.handler.ContentHandler):
    """SAX handler that forwards selected attribute values of each element.

    For every start tag, the attributes named in target_attrs are collected
    (u"" when absent) and passed to do_something_with_data().
    """

    def __init__(self, target_attrs=()):
        # Initialise the base handler explicitly; ContentHandler is an
        # old-style class on Python 2, so super() is not usable there.
        xml.sax.handler.ContentHandler.__init__(self)
        self.target_attrs = target_attrs

    def startElement(self, name, attrs):
        print("Found element: %s" % (name,))
        data = {}
        for target_attr in self.target_attrs:
            data[target_attr] = attrs.get(target_attr, u"")
        # (no xml trees or elements created at all)
        do_something_with_data(data)
def process_xml_sax(xml_file):
    """Parse xml_file with xml.sax, grabbing 'name', 'age' and 'id' attributes."""
    grabber = AttributeGrabber(target_attrs=('name', 'age', 'id'))
    xml.sax.parse(xml_file, grabber)
You'll have to evaluate both options based on what works best in your situation (and maybe run a couple benchmarks, if this is something you'll be doing often).
Be sure to follow up with how things work out!
Edit based on follow-up comments
Implementing either of the above solutions may require some changes to the overall structure of your code, but anything you have should still be doable. For instance, processing "rows" in batches, you could have:
def process_xml_batch(xml_file, batch_size=10):
    """Collect attribute dicts from xml_file and hand them over in batches.

    Every element contributes one dict with the 'name', 'age' and 'id'
    attributes (u"" when absent). do_something_with_batch() is called each
    time batch_size dicts are collected, and once more for any remainder.
    """
    ATTRS = ('name', 'age', 'id')
    batch = []
    for event, element in etree.iterparse(xml_file):
        data = {}
        for attr in ATTRS:
            data[attr] = element.get(attr, u"")
        batch.append(data)
        element.clear()
        # Also detach already-processed preceding siblings; otherwise the
        # emptied elements stay referenced by their parent.
        parent = element.getparent()
        if parent is not None:
            while element.getprevious() is not None:
                del parent[0]
        del element
        if len(batch) == batch_size:
            do_something_with_batch(batch)
            # Or, if you want this to be a generator:
            # yield batch
            batch = []
    if batch:
        # there are leftover items
        do_something_with_batch(batch) # Or, yield batch

Related

convert xml which has 'children' into dictionary

I have an XML file which has nested child elements, and I want to convert it into a dict.
<people>
<type>
<name>lo_123</name>
<country>AUS</country>
<note>
<name>joe</name>
<gender>m</gender>
<age>26</age>
<spouse>
<name>lisa</name>
<gender>f</gender>
</spouse>
</note>
</type>
</people>
This is my code to convert it
import xml.etree.cElementTree as ET
xml = 'xmltest.xml'
crif_tree = ET.parse(xml)
crif_root = crif_tree.getroot()
data = []
# Each loop builds one {tag: text} dict from the direct children of the
# matched element. The comprehensions referenced an undefined name 'note1';
# they now use the loop variable of their own loop.
for one in crif_root.findall('.//type'):
    reg = {e.tag: e.text for e in list(one)}
    data.append(reg)
for two in crif_root.findall('.//type/note'):
    reg = {e.tag: e.text for e in list(two)}
    data.append(reg)
for three in crif_root.findall('.//type/note/spouse'):
    reg = {e.tag: e.text for e in list(three)}
    data.append(reg)
print(data)
Here is the output of data
[{'name': 'lo_123', 'country': 'AUS', 'note': '\n '}, {'name': 'joe', 'gender': 'm', 'age': '26', 'spouse': '\n '}, {'name': 'lisa', 'gender': 'f'}]
My desired output would be
[{'name': 'lo_123', 'country': 'AUS', 'note': '\n ', 'name': 'joe', 'gender': 'm', 'age': '26', 'spouse': '\n ', 'name': 'lisa', 'gender': 'f'}]

Accessing keys/values in a paginated/nested dictionary

I know that somewhat related questions have been asked here: Accessing key, value in a nested dictionary and here: python accessing elements in a dictionary inside dictionary among other places but I can't quite seem to apply the answers' methodology to my issue.
I'm getting a KeyError trying to access the keys within response_dict, which I know is due to it being nested/paginated and me going about this the wrong way. Can anybody help and/or point me in the right direction?
import requests
import json
URL = "https://api.constantcontact.com/v2/contacts?status=ALL&limit=1&api_key=<redacted>&access_token=<redacted>"
# Make the request and keep the response object in 'r'.
r = requests.get(url = URL)
# Status code, to prove things are working.
print (r.status_code)
# Print the raw body that was retrieved from the API.
print (r.text)
# Visual separator.
print ('---------------------------')
# Decode the JSON body into a dict.
response_dict = json.loads(r.text)
# Show how the API response looks once decoded.
print(response_dict)
# Just for confirmation of the decoded type.
print (type(response_dict))
print('-------------------------')
# HERE LIES THE ISSUE: this lookup raises KeyError (see the traceback in the
# output), because 'first_name' is not a top-level key of response_dict.
print(response_dict['first_name'])
And my output:
200
{"meta":{"pagination":{}},"results":[{"id":"1329683950","status":"ACTIVE","fax":"","addresses":[{"id":"4e19e250-b5d9-11e8-9849-d4ae5275509e","line1":"222 Fake St.","line2":"","line3":"","city":"Kansas City","address_type":"BUSINESS","state_code":"","state":"OK","country_code":"ve","postal_code":"19512","sub_postal_code":""}],"notes":[],"confirmed":false,"lists":[{"id":"1733488365","status":"ACTIVE"}],"source":"Site Owner","email_addresses":[{"id":"1fe198a0-b5d5-11e8-92c1-d4ae526edd6c","status":"ACTIVE","confirm_status":"NO_CONFIRMATION_REQUIRED","opt_in_source":"ACTION_BY_OWNER","opt_in_date":"2018-09-11T18:18:20.000Z","email_address":"rsmith#fake.com"}],"prefix_name":"","first_name":"Robert","middle_name":"","last_name":"Smith","job_title":"I.T.","company_name":"FBI","home_phone":"","work_phone":"5555555555","cell_phone":"","custom_fields":[],"created_date":"2018-09-11T15:12:40.000Z","modified_date":"2018-09-11T18:18:20.000Z","source_details":""}]}
---------------------------
{'meta': {'pagination': {}}, 'results': [{'id': '1329683950', 'status': 'ACTIVE', 'fax': '', 'addresses': [{'id': '4e19e250-b5d9-11e8-9849-d4ae5275509e', 'line1': '222 Fake St.', 'line2': '', 'line3': '', 'city': 'Kansas City', 'address_type': 'BUSINESS', 'state_code': '', 'state': 'OK', 'country_code': 've', 'postal_code': '19512', 'sub_postal_code': ''}], 'notes': [], 'confirmed': False, 'lists': [{'id': '1733488365', 'status': 'ACTIVE'}], 'source': 'Site Owner', 'email_addresses': [{'id': '1fe198a0-b5d5-11e8-92c1-d4ae526edd6c', 'status': 'ACTIVE', 'confirm_status': 'NO_CONFIRMATION_REQUIRED', 'opt_in_source': 'ACTION_BY_OWNER', 'opt_in_date': '2018-09-11T18:18:20.000Z', 'email_address': 'rsmith#fake.com'}], 'prefix_name': '', 'first_name': 'Robert', 'middle_name': '', 'last_name': 'Smith', 'job_title': 'I.T.', 'company_name': 'FBI', 'home_phone': '', 'work_phone': '5555555555', 'cell_phone': '', 'custom_fields': [], 'created_date': '2018-09-11T15:12:40.000Z', 'modified_date': '2018-09-11T18:18:20.000Z', 'source_details': ''}]}
<class 'dict'>
-------------------------
Traceback (most recent call last):
File "C:\Users\rkiek\Desktop\Python WIP\Chris2.py", line 20, in <module>
print(response_dict['first_name'])
KeyError: 'first_name'

first_name = response_dict["results"][0]["first_name"]
Even though I think this question would be better answered by yourself by reading some documentation, I will explain what is going on here. You see the dict-object of the man named "Robert" is within a list which is a value under the key "results". So, at first you need to access the value within results which is a python-list.
Then you can use a loop to iterate through each of the elements within the list, and treat each individual element as a regular dictionary object.
# Use either one of the two lookups below: indexing raises a KeyError when
# there is no "results" key, while .get() returns None instead.
results = response_dict["results"]
results = response_dict.get("results", None)
# results is now a list, according to the data you mentioned.
for item in results:
    # Each item of the list can be treated as a normal dictionary.
    print(item.get("first_name", None))

Traversing from one node in xml to another using Python

I am very new to XML with Python and I have the following XML string that I get as a response from a network device:
'<Response MajorVersion="1" MinorVersion="0"><Get><Configuration><OSPF MajorVersion="19" MinorVersion="2"><ProcessTable><Process><Naming><ProcessName>1</ProcessName></Naming><DefaultVRF><AreaTable><Area><Naming><AreaID>0</AreaID></Naming><Running>true</Running><NameScopeTable><NameScope><Naming><InterfaceName>Loopback0</InterfaceName></Naming><Running>true</Running><Cost>1000</Cost></NameScope><NameScope><Naming><InterfaceName>Loopback1</InterfaceName></Naming><Running>true</Running><Cost>1</Cost></NameScope><NameScope><Naming><InterfaceName>GigabitEthernet0/0/0/0</InterfaceName></Naming><Running>true</Running><Cost>1</Cost></NameScope></NameScopeTable></Area></AreaTable></DefaultVRF><Start>true</Start></Process></ProcessTable></OSPF></Configuration></Get><ResultSummary ErrorCount="0" /></Response>'
I have the following code to retrieve the interface information along with the interface cost associated with it. However I would also like to get the 'AreaID' tag associated with each interface as part of my dictionary. Unable to navigate the tree correctly to retrieve the AreaID tag value:
for node in x.iter('NameScope'):
int_name = str(node.find('Naming/InterfaceName').text)
d[int_name] = {}
d[int_name]['cost'] = str(node.find('Cost').text)
This code gives the following output when 'd' is printed:
{'GigabitEthernet0/0/0/0': {'cost': '1'},
'Loopback0': {'cost': '1000'},
'Loopback1': {'cost': '1'}}
I want something like this in the output:
{'GigabitEthernet0/0/0/0': {'cost': '1', 'area': 0},
'Loopback0': {'cost': '1000', 'area': 0},
'Loopback1': {'cost': '1', 'area': 0}}
Any suggestions or modifications to my code will be really appreciated!

I would use the preceding notation:
# preceding:: is a reverse axis, so [1] selects the nearest AreaID before the node
node.xpath("preceding::AreaID[1]")[0].text
Complete code I am executing:
from lxml import etree as ET
x = ET.parse("input.xml")
d = {}
for node in x.iter('NameScope'):
    int_name = str(node.find('Naming/InterfaceName').text)
    d[int_name] = {
        'cost': str(node.find('Cost').text),
        # preceding:: is a reverse axis, so the [1] predicate selects the
        # nearest AreaID before this NameScope. Indexing the unfiltered
        # result with [0] would always return the first AreaID of the
        # document, which is wrong once there is more than one Area.
        'area': node.xpath("preceding::AreaID[1]")[0].text
    }
print(d)
Prints:
{
'Loopback0': {'cost': '1000', 'area': '0'},
'Loopback1': {'cost': '1', 'area': '0'},
'GigabitEthernet0/0/0/0': {'cost': '1', 'area': '0'}
}

Providing an id for each recursed item in nested dictionary of lists of dictionaries

extension of: recursing a dictionary of lists of dictionaries, etc et al (python)
I'm working with a nested dictionary structure of 4 levels. I'm trying to iterate over the entire nested dictionary and give each individual dictionary an identification number (as a precursor to building a tree of the items and being able to tell which node is the parent, which children a node has, etc.)
I have this function:
def r(y):
cnt = 1
def recurse(y, count):
for i in y.iteritems():
count+=1
i['id'] = count
for k,v in y.iteritems():
if isinstance(v, list):
[recurse(i, count) for i in v]
else:
pass
recurse(y, cnt)
return y
I put in my nested dictionary of lists of dictionaries,
and I get a mess, i.e. doesn't work like I thought it would.
{'sections': [{'id': 11, 'info': 'This is section ONE', 'tag': 's1'},
{'fields': [{'id': 15,
'info': 'This is field ONE',
'tag': 'f1'},
{'elements': [{'id': 20,
'info': 'This is element',
'tag': 'e1',
'type_of': 'text_field'},
{'id': 20,
'info': 'This is element',
'tag': 'e2',
'type_of': 'text_field'},
{'id': 20,
'info': 'This is element',
'tag': 'e3',
'type_of': 'text_field'},
{'id': 20,
'info': 'This is element',
'tag': 'e4',
'type_of': 'text_field'}],
'id': 16,
'info': 'This is field TWO',
'tag': 'f2'},
{'elements': [{'id': 20,
'info': 'This is element',
'tag': 'e5',
'type_of': 'text_field'},
{'id': 20,
'info': 'This is element',
'tag': 'e6',
'type_of': 'text_field'},
{'id': 20,
'info': 'This is element',
'tag': 'e7',
'type_of': 'text_field'},
{'id': 20,
'info': 'This is element ONE',
'tag': 'e8',
'type_of': 'text_field'}],
'id': 16,
'info': 'This is field THREE',
'tag': 'f3'}],
'id': 12,
'info': 'This is section TWO',
'tag': 's2'},
{'fields': [{'id': 15,
'info': 'This is field FOUR',
'tag': 'f4'},
{'id': 15,
'info': 'This is field FIVE',
'tag': 'f5'},
{'id': 15,
'info': 'This is field SIX',
'tag': 'f6'}],
'id': 12,
'info': 'This is section THREE',
'tag': 's3'}],
'tag': 'test'}
What I want to happen is that all items in level one are numbered, then all items in level two are numbered, then the third level, then the fourth. In this case the main item should be given an id of 1, then the sections be identified as 2,3,4 then fields as 5 on, then elements, etc. Looking back on this after sleeping on it I can see it as a start, but quite wrong.
EDIT: What I really need to do is create a tree of parent/child nodes from a nested dictionary structure so that I can iterate/insert/get/work with as needed the items from this tree. Is there a quick way to do that? I seem to be doing more work than I anticipated.
EDIT2: I found a solution to my original question. I just decided to use the built-in id() function instead of the extra step of adding an id, and was able to create the minimal tree I needed, but this is still a useful exercise.

You're getting duplicate ids because your count variable is local, and once the recurse function exits, any changes to it are lost. You could get around it by declaring a global variable, but since you're not otherwise using the return value of recurse, you can use that instead:
def r(y):
    """Give y and every dict nested in its list values a unique integer 'id'.

    Ids are assigned depth-first starting at 1. y is modified in place and
    returned.
    """
    def recurse(y, count):
        # Number this dict, then thread the running count through the
        # children and back to the caller so no id is handed out twice.
        y['id'] = count
        count += 1
        # values() works on both Python 2 and 3 (iteritems() is 2-only).
        for v in y.values():
            if isinstance(v, list):
                for i in v:
                    count = recurse(i, count)
        return count

    recurse(y, 1)
    return y
Edit: Just realized you're looking for a breadth-first assignment of ids... this won't accomplish that, but I'll leave the answer as it may be helpful to get you started.

Well, I have a solution that uses depth and parent to set the ID:
def decorate_tree(tree, parent=None, index=None):
    """Assign dotted hierarchical ids ('1', '1.2', '1.2.3', ...) in place.

    The root dict gets id '1'; each dict found in a list value gets
    '<parent id>.<position>', positions counting from 1 across all list
    values of the parent. Dicts with an 'info' key are printed as
    "info => id". Non-dict arguments are ignored.
    """
    if not isinstance(tree, dict):
        return
    if parent is None:
        parent = '1'
        tree['id'] = parent
    else:
        tree['id'] = '{0}.{1}'.format(parent, index)
    if 'info' in tree:
        print('%s => %s' % (tree['info'], tree['id']))
    child_index = 1
    for key in tree:
        if isinstance(tree[key], list):
            for item in tree[key]:
                decorate_tree(item, tree['id'], child_index)
                child_index += 1
>>> decorate_tree(d)
This is section ONE => 1.1
This is section TWO => 1.2
This is field ONE => 1.2.1
This is field TWO => 1.2.2
This is element => 1.2.2.1
This is element => 1.2.2.2
This is element => 1.2.2.3
This is element => 1.2.2.4
This is field THREE => 1.2.3
This is element => 1.2.3.1
This is element => 1.2.3.2
This is element => 1.2.3.3
This is element ONE => 1.2.3.4
This is section THREE => 1.3
This is field FOUR => 1.3.1
This is field FIVE => 1.3.2
This is field SIX => 1.3.3
>>> from pprint import pprint
>>> pprint(d)
{'id': '1',
'sections': [{'id': '1.1', 'info': 'This is section ONE', 'tag': 's1'},
{'fields': [{'id': '1.2.1',
'info': 'This is field ONE',
'tag': 'f1'},
{'elements': [{'id': '1.2.2.1',
'info': 'This is element',
'tag': 'e1',
'type_of': 'text_field'},
{'id': '1.2.2.2',
'info': 'This is element',
'tag': 'e2',
'type_of': 'text_field'},
{'id': '1.2.2.3',
'info': 'This is element',
'tag': 'e3',
'type_of': 'text_field'},
{'id': '1.2.2.4',
'info': 'This is element',
'tag': 'e4',
'type_of': 'text_field'}],
'id': '1.2.2',
'info': 'This is field TWO',
'tag': 'f2'},
{'elements': [{'id': '1.2.3.1',
'info': 'This is element',
'tag': 'e5',
'type_of': 'text_field'},
{'id': '1.2.3.2',
'info': 'This is element',
'tag': 'e6',
'type_of': 'text_field'},
{'id': '1.2.3.3',
'info': 'This is element',
'tag': 'e7',
'type_of': 'text_field'},
{'id': '1.2.3.4',
'info': 'This is element ONE',
'tag': 'e8',
'type_of': 'text_field'}],
'id': '1.2.3',
'info': 'This is field THREE',
'tag': 'f3'}],
'id': '1.2',
'info': 'This is section TWO',
'tag': 's2'},
{'fields': [{'id': '1.3.1',
'info': 'This is field FOUR',
'tag': 'f4'},
{'id': '1.3.2',
'info': 'This is field FIVE',
'tag': 'f5'},
{'id': '1.3.3',
'info': 'This is field SIX',
'tag': 'f6'}],
'id': '1.3',
'info': 'This is section THREE',
'tag': 's3'}],
'tag': 'test',
'type_of': 'custom'}
>>>
So parent of ID 1.3.4 is ID 1.3, siblings are IDs 1.3.x, children are 1.3.4.x... that way retrieval and insertion shouldn't be too hard (shift indexes).

Here is a solution that replaces your count variable with the itertools.count iterator:
from itertools import count
def r(y):
    """Give y and every dict nested in its list values a unique integer 'id'.

    Ids come from a single itertools.count iterator shared by all recursion
    levels, starting at 1 for y itself. y is modified in place and returned.
    """
    counter = count(1)

    def recurse(y, counter):
        # Tag the dict itself. The previous loop assigned i['id'] on the
        # (key, value) tuples produced by iteritems(), which raises TypeError.
        y['id'] = next(counter)
        for v in y.values():
            if isinstance(v, list):
                for i in v:
                    recurse(i, counter)

    recurse(y, counter)
    return y
itertools.count() will create a generator that will return the next integer every time next() is called on it. You can pass it to the recursive function and be sure that duplicate ids will not be created.

An alternative to consider is a doubly-linked list. For example:
Index Tag Parent Children Info
0 test -1 [s1,s2,s3] ""
1 s1 0 [] "This is section ONE"
2 s2 0 [f1,f2,f3] "This is section TWO"
3 f1 2 [] "This is field ONE"
4 f2 2 [e1,e2,e3,e4] "This is field TWO"
5 e1 4 [] "This is element"
6 e2 4 [] "This is element"
.
.
.
That is a conceptual representation, an actual implementation would use the numerical row index for the children column instead of tags because your input data could be dirty, with duplicate or missing tags, and you don't want to build a structure that depends on tags being unique. Additional columns can easily be added.
You would build the table by walking the tree recursively, but it may be easier to work with items in the tree by using rows in a flat table (a 2D list of lists) to refer to them.
Edit: This is an extension of your solution to the original question (an undecorated list of nodes) that adds structured info (tag, parent, children, etc) to each node. That may be useful if you need to navigate up and down the tree.
Edit: This code:
def recurse(y, n=None, p=-1):
    """Flatten the nested dict y into a table of nodes and return the table.

    Each row is [tag, parent index, child indices, type, info], where type is
    the key of the list value holding the children. n is the table built so
    far (a new list when omitted) and p is the row index of the parent
    (-1 for the root).
    """
    # A mutable default (n=[]) is shared between calls, so a second call
    # would keep appending to the rows of the first one.
    if n is None:
        n = []
    node = ["", p, [], "", ""]  # tag, parent, children, type, info
    vv = []
    for k, v in y.items():
        if k == "tag":
            node[0] = v
        elif k == "info":
            node[4] = v
        elif isinstance(v, list):
            node[3] = k
            vv = v
    n.append(node)
    p = len(n) - 1
    for i in vv:
        # The child will be appended next, so its index is the current length.
        n[p][2].append(len(n))
        n = recurse(i, n, p)
    return n
nodes = recurse(a)
# enumerate() yields the row index together with the row.
for i, node in enumerate(nodes):
    print(i, node)
produces (spaced manually into columns for readability):
0 ['test', -1, [1, 2, 14], 'sections', '']
1 [ 's1', 0, [], '', 'This is section ONE']
2 [ 's2', 0, [3, 4, 9], 'fields', 'This is section TWO']
3 [ 'f1', 2, [], '', 'This is field ONE']
4 [ 'f2', 2, [5, 6, 7, 8], 'elements', 'This is field TWO']
5 [ 'e1', 4, [], '', 'This is element']
6 [ 'e2', 4, [], '', 'This is element']
7 [ 'e3', 4, [], '', 'This is element']
8 [ 'e4', 4, [], '', 'This is element']
9 [ 'f3', 2, [10, 11, 12, 13], 'elements', 'This is field THREE']
10 [ 'e5', 9, [], '', 'This is element']
11 [ 'e6', 9, [], '', 'This is element']
12 [ 'e7', 9, [], '', 'This is element']
13 [ 'e8', 9, [], '', 'This is element ONE']
14 [ 's3', 0, [15, 16, 17], 'fields', 'This is section THREE']
15 [ 'f4', 14, [], '', 'This is field FOUR']
16 [ 'f5', 14, [], '', 'This is field FIVE']
17 [ 'f6', 14, [], '', 'This is field SIX']

Accessing python variables in a list

In the code below, how do I retrieve the value of id? There are multiple id values in the XML. How do I access each id value and add it to result1?
def parse_results ():
try:
xml = minidom.parseString(new_results)
for xmlchild in xmldoc.childNodes[0].childNodes :
result1 = {}
result1.update ({'firstname': xmlchild.getElementsByTagName("firstname")[0].childNodes[0].nodeValue})
result1.update ({'lastname': xmlchild.getElementsByTagName("lastname")[0].childNodes[0].nodeValue})
result1.update ({'address': address})
if xmlchild.getElementsByTagName("id")[0].childNodes[0].nodeValue:
logging.debug(xmlchild.getElementsByTagName("id")[0].childNodes[0].nodeValue.lower())
result1.update ({'id': id})
Edit:
xmlchild.getElementsByTagName("id")[0].childNodes[0].nodeValue -this statement gives an exception
Adding XML:
<info><firstname>firstname</firstname><lastname>lastname</lastname><id>2</id></info>
<info><firstname>firstname</firstname><lastname>lastname</lastname><id>3</id></info>
<info><firstname>firstname</firstname><lastname>lastname</lastname><id>4</id></info>

Why are you using minidom? It is really boring to use.
I suggest you move to element tree:
import xml.etree.ElementTree as et
d = et.fromstring('''
<doc>
<info><firstname>firstname</firstname><lastname>lastname</lastname><id>2</id></info>
<info><firstname>firstname</firstname><lastname>lastname</lastname><id>3</id></info>
<info><firstname>firstname</firstname><lastname>lastname</lastname><id>4</id></info>
</doc>
''')
# One {tag: text} dict per <info> element, built from its direct children.
result = [{el.tag: el.text for el in info} for info in d.findall('info')]
print(result)
That prints:
[{'firstname': 'firstname', 'id': '2', 'lastname': 'lastname'},
{'firstname': 'firstname', 'id': '3', 'lastname': 'lastname'},
{'firstname': 'firstname', 'id': '4', 'lastname': 'lastname'}]

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

lxml memory usage when parsing huge xml in python - python

Related

convert xml which has 'children' into dictionary

Accessing keys/values in a paginated/nested dictionary

Traversing from one node in xml to another using Python

Providing an id for each recursed item in nested dictionary of lists of dictionaries

Accessing python variables in a list

Categories

Resources