Parse hierarchical XML tags - python

Need to parse hierarchical tags from XML and get the tag's value in desired output
Input
<doc>
<pid id="231">
<label key="">Electronics</label>
<desc/>
<cid id="122">
<label key="">TV</label>
</cid>
<desc/>
<cid id="123">
<label key="">Computers</label>
<cid id="12433">
<label key="">Lenovo</label>
</cid>
<desc/>
<cid id="12434">
<label key="">IBM</label>
<desc/>
</cid>
<cid id="12435">
<label key="">Mac</label>
</cid>
<desc/>
</cid>
</pid>
<pid id="7764">
<label key="">Music</label>
<desc/>
<cid id="1224">
<label key="">Play</label>
<desc/>
<cid id="341">
<label key="">PQR</label>
</cid>
<desc/>
</cid>
<cid id="221">
<label key="">iTunes</label>
<cid id="341">
<label key="">XYZ</label>
</cid>
<desc/>
<cid id="515">
<label key="">ABC</label>
</cid>
<desc/>
</cid>
</pid>
</doc>
Output
Electronics/
Electronics/TV
Electronics/Computers/Lenovo
Electronics/Computers/IBM
Electronics/Computers/Mac
Music/
Music/Play/PQR
Music/iTunes/XYZ
Music/iTunes/ABC
What I have tried (in Python)
import xml.etree.ElementTree as ET
import os
import sys
import string
def perf_func(elem, func, level=0):
    """Apply ``func`` to ``elem`` and to every descendant, depth-first.

    ``func`` is called as ``func(element, level)``; ``level`` starts at the
    value passed in (0 by default) and grows by one per nesting step.
    """
    func(elem, level)
    # Iterate the element itself: Element.getchildren() was removed in
    # Python 3.9, while plain iteration works on every version.
    for child in elem:
        perf_func(child, func, level + 1)
def print_level(elem, level):
    """Print the element's tag prefixed with one '-' per nesting level."""
    # print() with a single argument behaves the same on Python 2 and 3;
    # the bare print statement is a syntax error on Python 3.
    print('-' * level + elem.tag)
# Keep the parsed tree under its own name so the root lookup below works
# (the original referred to an undefined ``tree``).
tree = ET.parse('Products.xml')
perf_func(tree.getroot(), print_level)
# Added find logic
root = tree.getroot()
# <doc> is the root element itself, so findall('doc') can never match;
# its <pid> children are the ones carrying the first-level labels.
for n in root.findall('pid'):
    l = n.find('label').text
    print(l)
With the above code, I am able to get the nodes and its levels (just the tag not their value) . And also the 1st level of all labels.
I need some suggestions (Perl or Python) on how to proceed to get the hierarchical structure in the format shown under Output.

We are going to use 3 pieces: find all of the elements in the order in which they occur, get the depth of each one, build a bread crumb based on the depth and order.
from lxml import etree
xml = etree.fromstring(xml_str)
elems = xml.xpath(r'//label') #xpath expression to find all '<label ...> elements
# counts the number of parents to the root element
def get_depth(element):
depth = 0
parent = element.getparent()
while parent is not None:
depth += 1
parent = parent.getparent()
return depth
# build up the bread crumbs by tracking the depth
# when a new element is entered, it replaces the value in the list
# at that level and drops all values to the right
def reduce_by_depth(element_list):
crumbs = []
depth = 0
elem_crumb = ['']*10
for elem in element_list:
depth = get_depth(elem)
elem_crumb[depth] = elem.text
elem_crumb[depth+1:] = ['']*(10-depth-1)
# join all the non-empty string to get the breadcrumb
crumbs.append('/'.join([e for e in elem_crumb if e]))
return crumbs
reduce_by_depth(elems)
# output:
['Electronics',
'Electronics/TV',
'Electronics/Computers',
'Electronics/Computers/Lenovo',
'Electronics/Computers/IBM',
'Electronics/Computers/Mac',
'Music',
'Music/Play',
'Music/Play/PQR',
'Music/iTunes',
'Music/iTunes/XYZ',
'Music/iTunes/ABC']

The problem can also be solved by using a custom iterator, similar to this answer
The code has to keep track of when a new path segment gets added (not every hierarchy level might have a label), so that it can remove them at the right time when we go back up. I do this by storing a tuple of depth and label.
I also assume that you are interested in the parent of every label tag, which is why I check for a label element and then yield back its parent.
The code will work if the label is not the first element. If a tag has multiple labels as direct children, it will return the same parent twice.
from lxml import etree
def label_iter(element):
    """Yield ``(parent, path)`` for every ``label`` element at or below ``element``.

    ``parent`` is the label's parent element and ``path`` is the
    '/'-joined text of the labels collected on the way down to it.
    The walk is iterative: ``stack`` holds one child iterator per open
    element, and ``path`` holds ``(stack depth, label text)`` pairs so a
    segment can be dropped again once its element has been fully visited.
    """
    path = []
    stack = [iter([element])]
    while stack:
        e = next(stack[-1], None)
        if e is None:
            # Current level is exhausted: go back up one level and drop
            # the path segment that was added at the level just left.
            stack.pop()
            if path and len(stack) < path[-1][0]:
                path.pop()
        else:
            stack.append(iter(e))
            label_tag = e.find('label')
            if label_tag is not None:
                # Remember the depth at which this segment was added.
                path.append((len(stack), label_tag.text))
            if e.tag == 'label':
                yield (e.getparent(), "/".join(label for _depth, label in path))
# etree.fromstring() already returns the root element (not an ElementTree),
# so there is no getroot() to call on its result.
root = etree.fromstring(xml_str)
for tag, path in label_iter(root):
    print(path)
The code works, but I think it can be done with less repetition and in a cleaner way. I do not like the double bookkeeping with the path and stack array and looking for labels twice also seems inelegant. So feel free to improve it!

Another possible solution is to use recursion. It feels a bit like a tool from a past, darker age, but it allows using the call stack to keep track of the depth instead of having to do that manually.
from lxml import etree
MAX_DEPTH = 20


def recursive_parse(element, path=None, depth=0):
    """Print the '/'-joined label path of ``element`` and of its descendants.

    ``path`` is the list of label texts collected from the ancestors
    (``None`` means "start with an empty path").  Elements nested deeper
    than ``MAX_DEPTH`` are not visited.
    """
    if depth > MAX_DEPTH:
        return
    if path is None:
        # A fresh list per top-level call; a ``[]`` default would be shared
        # between calls and keep growing.
        path = []
    label_tag = element.find('label')
    if label_tag is not None:  # found new path segment
        # Build a new list instead of appending in place, so the segment is
        # visible only to this element's subtree and not to its siblings.
        path = path + [label_tag.text or '']
        print('/'.join(path))
    # Iterate the element directly; getchildren() no longer exists in
    # xml.etree.ElementTree (removed in Python 3.9).
    for child in element:
        recursive_parse(child, path, depth + 1)
# etree.fromstring() returns the root element directly; it has no getroot().
root = etree.fromstring(xml_str)
recursive_parse(root)
Instead of printing the path, it could for example be stored in a dict to make it possible to retrieve an element by its path:
# Python has no typed declarations: a plain assignment creates the dict.
paths = {}
...
if label_tag is not None:  # found new path segment
    path.append(label_tag.text)
    # Key each element by its full label path for later lookup.
    paths['/'.join(path)] = element
To me, the solution seems less elegant, but it is shorter and might be easier to understand.

Related

How to change the structure of an an XML

From this string:
label_config={
"label1": [
"modality1",
"modality2",
"modality3"],
"choice":"single",
"required": "true",
"name" : "sentiment"},{
"label2": [
"modality1",
"modality2"],
"name" : "price"
}
I created this XML which is printed:
Does anyone know how, using this library (from lxml import etree),
I can move the slashes of the yellow elements from the end to the beginning?
Here is the code of the generation:
from lxml import etree
import sys
def topXML(dictAttrib=None):
    """Create a <View> root holding a single <Text> child and return the root.

    ``dictAttrib`` maps attribute names to values for the <Text> element;
    when omitted, ``name="text"`` and ``value="$text"`` are used.
    """
    # (The stray bare annotation that preceded this assignment carried no
    # information and was a syntax error on Python 2, so it is gone.)
    root = etree.Element("View")
    textEl = etree.SubElement(root, "Text")
    if dictAttrib is None:
        dictAttrib = {
            "name": "text",
            "value": "$text",
        }
    for k_, v_ in dictAttrib.items():
        textEl.set(k_, v_)
    return root
def choiceXML(root, locChoice):
    """Append the elements describing one config dict to ``root``; return ``root``.

    A key whose value is a list becomes the <Header> value and its items
    become <Choice> elements; every other key/value pair is set as an
    attribute on the first <Choices> element.

    NOTE: the shape of the generated tree is intentionally left exactly as
    posted, because it is what the question is about: each <Choice> is
    attached to ``root`` (next to <Choices>, not inside it) and an extra
    empty <Choices> is appended at the end.
    """
    headerEl = etree.SubElement(root, "Header")
    choisesEl = etree.SubElement(root, "Choices")
    for k_, v_ in locChoice.items():
        # ``and`` is the logical operator; ``&`` is bitwise and only worked
        # here because both operands happen to be bools.
        if isinstance(k_, str) and isinstance(v_, list):
            choices = v_
            headerEl.set("value", k_)
            if locChoice.get("toName") is None:
                choisesEl.set("toName", "text")
            for op_ in choices:
                opEl = etree.SubElement(root, "Choice")
                opEl.set("value", op_)
        else:
            choisesEl.set(k_, v_)
    choisesEl = etree.SubElement(root, "Choices")
    return root
def checkConfig(locChoice):
    """Terminate the program unless ``locChoice`` has a non-None "name" entry.

    Exits through ``sys.exit`` with an explanatory message; returns ``None``
    when the entry is present.
    """
    # Identity test is the idiomatic (and override-proof) check for None.
    if locChoice.get("name") is None:
        sys.exit("Warning : label_config needs a parameter called 'name' assigned")
def xmlConstructor(label_config):
    """Build the complete tree from an iterable of config dicts.

    Starts from the root produced by ``topXML()``; each config is first
    validated with ``checkConfig`` and then added via ``choiceXML``.
    Returns the root element.
    """
    view = topXML()
    for config in label_config:
        checkConfig(config)
        view = choiceXML(view, config)
    return view
The generated code will be used on this site: https://labelstud.io/playground/. They use some type of XML to create the code. Unfortunately, the output I get from etree is not accepted there, and I found out that if I make the changes described above it works.
In the meantime, I am contacting their team to get more info, but if someone here has any idea on how to make it work, please come forward.
To correctly encapsulate the <Choice> nodes under its parent, <Choices>, simply make the following very simple two changes to your choiceXML method. Namely, add opEl sub elements under the choisesEl element (not root) and remove the redundant second choisesEl line at the end.
def choiceXML(root, locChoice):
    """Append one <Header> and one <Choices> block for ``locChoice`` to ``root``.

    A key whose value is a list becomes the <Header> value and its items
    become <Choice> children of <Choices>; every other key/value pair is
    set as an attribute on <Choices>.  Returns ``root``.
    """
    headerEl = etree.SubElement(root, "Header")
    choisesEl = etree.SubElement(root, "Choices")
    for k_, v_ in locChoice.items():
        # Logical ``and`` rather than bitwise ``&``.
        if isinstance(k_, str) and isinstance(v_, list):
            choices = v_
            headerEl.set("value", k_)
            if locChoice.get("toName") is None:
                choisesEl.set("toName", "text")
            for op_ in choices:
                opEl = etree.SubElement(choisesEl, "Choice")  # CHANGE root to choisesEl
                opEl.set("value", op_)
        else:
            choisesEl.set(k_, v_)
    # The second SubElement(root, "Choices") call that used to sit here has
    # been REMOVED: it only appended a stray, empty <Choices/>.
    return root
Full Process
label_config = {
"label1": [
"modality1",
"modality2",
"modality3"],
"choice":"single",
"required": "true",
"name" : "sentiment"},{
"label2": [
"modality1",
"modality2"],
"name" : "price"
}
def topXML(dictAttrib = None):
# ...NO CHANGE...
def choiceXML(root,locChoice):
# ...ABOVE CHANGE...
def checkConfig(locChoice):
# ...NO CHANGE...
def xmlConstructor(label_config):
# ...NO CHANGE...
output = xmlConstructor(label_config)
Output
print(etree.tostring(output, pretty_print=True).decode("utf-8"))
# <View>
# <Text name="text" value="$text"/>
# <Header value="label1"/>
# <Choices toName="text" choice="single" required="true" name="sentiment">
# <Choice value="modality1"/>
# <Choice value="modality2"/>
# <Choice value="modality3"/>
# </Choices>
# <Header value="label2"/>
# <Choices toName="text" name="price">
# <Choice value="modality1"/>
# <Choice value="modality2"/>
# </Choices>
# </View>
The <Choices/> is short for <Choices></Choices> (XML spec). If you just make it a closing element, you probably don't have an opening one, and the result will be invalid xml. Any program trying to read / parse that will error out.
Notice that you have trailing slashes on all your <Choices> elements, also the non-empty ones.
If you don't want the empty <Choices/> elements, you may need to look into how you generate the XML from the dict. Since you don't provide a MCVE we can't answer that part.
This is more a comment than an answer, but it's a bit too long for a comment. Looking at what you provide, it seems like the problem is not that your xml is too well formed (there's no such thing) or that the playground has some sort of weird xml structure. I believe the xml you generated is not what they are looking for.
If you look at your 2nd <Choices> element, it reads
<Choices toName="text" name="price"/>
Try dropping the closing / so it reads:
<Choices toName="text" name="price">
It will then be closed with the following <Choices/> and maybe it will work.

python xml remove grandchildren or grandgrandchildren

I've been googling for removing grandchildren from an xml file. However, I've found no perfect solution.
Here's my case:
<tree>
<category title="Item 1">item 1 text
<subitem title="subitem1">subitem1 text</subitem>
<subitem title="subitem2">subitem2 text</subitem>
</category>
<category title="Item 2">item 2 text
<subitem title="subitem21">subitem21 text</subitem>
<subitem title="subitem22">subitem22 text</subitem>
<subsubitem title="subsubitem211">subsubitem211 text</subsubitem>
</category>
</tree>
In some cases, I want to remove subitems. In other cases, I want to remove subsubitem. I know I can do like this in current given content:
import xml.etree.ElementTree as ET
root = ET.fromstring(given_content)
# case 1
for item in root.getiterator():
for subitem in item:
item.remove(subitem)
# case 2
for item in root.getiterator():
for subitem in item:
for subsubitem in subitem:
subitem.remove(subsubitem)
I can write in this style only when I know the depth of the target node. If I only know the tag name of node I want to remove, how should I implement it?
pseudo-code:
import xml.etree.ElementTree as ET
for item in root.getiterator():
if item.tag == 'subsubitem' or item.tag == 'subitem':
# remove item
If I do root.remove(item), it will certainly return an error because item is not a direct child of root.
Edited:
I cannot install any 3rd-party-lib, so I have to solve this with xml.
I finally got this work for me only on xml lib by writing a recursive function.
def recursive_xml(root):
if root.getchildren() is not None:
for child in root.getchildren():
if child.tag == 'subitem' or child.tag == 'subsubitem':
root.remove(child)
else:
recursive_xml(child)
By doing so, the function will iterate every node in ET and remove my target nodes.
test_xml = r'''
<test>
<test1>
<test2>
<test3>
</test3>
<subsubitem>
</subsubitem>
</test2>
<subitem>
</subitem>
<nothing_matters>
</nothing_matters>
</test1>
</test>
'''
root = ET.fromstring(test_xml)
recursive_xml(root)
Hope this helps someone has restricted requirements like me....
To remove instances of subsubitem or subitem, no matter what their depth, consider the following example (with the caveat that it uses lxml.etree rather than upstream ElementTree):
import lxml.etree as etree
el = etree.fromstring('<root><item><subitem><subsubitem/></subitem></item></root>')
for child in el.xpath('.//subsubitem | .//subitem'):
child.getparent().remove(child)

Removing parent element and all subelements from XML

Given an XML file with the following structure:
<Root>
<Stuff></Stuff>
<MoreStuff></MoreStuff>
<Targets>
<Target>
<ID>12345</ID>
<Type>Ground</Type>
<Size>Large</Size>
</Target>
<Target>
...
</Target>
</Targets>
</Root>
I'm trying to loop through each child under the <Targets> element, check each <ID> for a specific value, and if the value is found, then I want to delete the entire <Target> entry. I've been using the ElementTree Python library with little success. Here's what I have so far:
import xml.etree.ElementTree as ET
tree = ET.parse('file.xml')
root = tree.getroot()
# Element.iter() replaces getiterator(), which was removed in Python 3.9.
iterator = root.iter('Target')
for item in iterator:
    old = item.find('ID')
    text = old.text
    if '12345' in text:
        # Only the <ID> child is detached here; the enclosing <Target>
        # element itself stays in the tree.
        item.remove(old)
tree.write('out.xml')
The problem I'm having with this approach is that only the <ID> sub element is removed, however I need the entire <Target> element and all of its child elements removed. Can anyone help! Thanks.
Unfortunately, element tree elements don't know who their parents are. There is a workaround -- You can build the mapping yourself:
tree = ET.parse('file.xml')
root = tree.getroot()
# ElementTree elements carry no link to their parent, so build a
# child -> parent lookup once.  (iter() replaces the removed getiterator().)
parent_map = {c: p for p in tree.iter() for c in p}
# list so that we don't mess up the order of iteration when removing items.
iterator = list(root.iter('Target'))
for item in iterator:
    # findtext() gives None when <ID> is absent and '' when it is empty,
    # so such targets are skipped instead of raising.
    text = item.findtext('ID')
    if text and '12345' in text:
        parent_map[item].remove(item)
tree.write('out.xml')
Untested
You need to keep a reference to the Targets element so that you can remove its children, so start your iteration from there. Grab each Target, check your condition and remove what you don't like.
#!/usr/bin/env python
import xml.etree.ElementTree as ET
xmlstr="""<Root>
<Stuff></Stuff>
<MoreStuff></MoreStuff>
<Targets>
<Target>
<ID>12345</ID>
<Type>Ground</Type>
<Size>Large</Size>
</Target>
<Target>
...
</Target>
</Targets>
</Root>"""
root = ET.fromstring(xmlstr)
targets = root.find('Targets')
# findall() returns a list, so removing entries while looping is safe.
for target in targets.findall('Target'):
    _id = target.find('ID')
    # An empty <ID/> has text None, so check it before the substring test.
    if _id is not None and _id.text and '12345' in _id.text:
        targets.remove(target)
# tostring() returns bytes on Python 3; decode so the markup itself is shown.
print(ET.tostring(root).decode())

Python ElementTree

Having trouble with XML config files using ElementTree. I want to have an easy way to find the text of an element regardless of where it is in the XML Tree. From what the documentation says, I should be able to do this with findtext(), but no matter what, I get a return of None. Where am I going wrong here? Everyone was telling me XML is so simple to handle in Python, yet I have had nothing but troubles.
configFileName = 'file.xml'
def configSet (x):
if os.path.exists(configFileName):
tree = ET.parse(configFileName)
root = tree.getroot()
return root.findtext(x)
hiTemp = configSet('hiTemp')
print hiTemp
and the XML
<configData>
<units>
<temp>F</temp>
</units>
<pins>
<lights>1</lights>
<fan>2</fan>
<co2>3</co2>
</pins>
<events>
<airTemps>
<hiTemp>80</hiTemp>
<lowTemp>72</lowTemp>
<hiTempAlarm>84</hiTempAlarm>
</airTemps>
<CO2>
<co2Hi>1500</co2Hi>
<co2Low>1400</co2Low>
<co2Alarm>600</co2Alarm>
</CO2>
</events>
<settings>
<apikeys>
<prowl>
<apikey>None</apikey>
</prowl>
</apikeys>
</settings>
expected result
80
actual result
None
findtext() with a bare tag name only searches the direct children of the element it is called on. hiTemp is nested several levels below the root, so root.findtext('hiTemp') finds nothing and returns None.
You can either provide a good xpath or modify your code
def configSet(x):
    """Return the text of the first child tagged ``x`` found anywhere in the file.

    Every element of the parsed document is visited in turn and asked for
    a direct child named ``x``.  Returns ``None`` when the config file does
    not exist or no such child is found.
    """
    if os.path.exists(configFileName):
        tree = ET.parse(configFileName)
        root = tree.getroot()
        # iter() replaces getiterator(), which was removed in Python 3.9.
        for e in root.iter():
            t = e.findtext(x)
            if t is not None:
                return t
    return None
Update 1:
If you want to have all matched text as a list, the code is a bit different.
def configSet(x):
    """Return the text of every element tagged ``x`` that is a child of some element.

    Results are grouped by parent in the order the parents are visited;
    an element without text contributes ``''``.  Returns an empty list when
    the config file does not exist or nothing matches.
    """
    matches = []
    if os.path.exists(configFileName):
        root = ET.parse(configFileName).getroot()
        # iter() replaces getiterator(), which was removed in Python 3.9.
        for e in root.iter():
            # findall() instead of findtext(): findtext() reports only the
            # first matching child, so repeated siblings would be lost.
            for match in e.findall(x):
                matches.append(match.text or '')
    return matches
You can use xpath to get to your desired element.
return root.find('./events/airTemps/hiTemp').text
There's easy to follow documentation here.

How to copy multiple XML nodes to another file in Python

Bear in mind I am very new to Python. I'm trying to copy a few XML nodes from sample1.xml to out.xml if they don't exist in sample2.xml.
this is how far I got before I'm stuck
import xml.etree.ElementTree as ET
tree = ET.ElementTree(file='sample1.xml')
addtree = ET.ElementTree(file='sample2.xml')
root = tree.getroot()
addroot = addtree.getroot()
for adel in addroot.findall('.//cars/car'):
for el in root.findall('cars/car'):
with open('out.xml', 'w+') as f:
f.write("BEFORE\n")
f.write(el.tag)
f.write("\n")
f.write(adel.tag)
f.write("\n")
f.write("\n")
f.write("AFTER\n")
el = adel
f.write(el.tag)
f.write("\n")
f.write(adel.tag)
I have no idea what I'm missing, but it's only copying the actual "tag" itself.
outputs this:
BEFORE
car
car
AFTER
car
car
So I'm missing the children nodes, and also the <, >, </, > tags. Expected result is below.
sample1.xml:
<cars>
<car>
<use-car>0</use-car>
<use-gas>0</use-gas>
<car-name />
<car-key />
<car-location>hawaii</car-location>
<car-port>5</car-port>
</car>
</cars>
sample2.xml:
<cars>
<old>
1
</old>
<new>
8
</new>
<car />
</cars>
expected result in out.xml (final product)
<cars>
<old>
1
</old>
<new>
8
</old>
<car>
<use-car>0</use-car>
<use-gas>0</use-gas>
<car-name />
<car-key />
<car-location>hawaii</car-location>
<car-port>5</car-port>
</car>
</cars>
All the other nodes old and new must remain untouched. I'm just trying to replace <car /> with all its children and grandchildren (if existed) nodes.
First, a couple of trivial issues with your XML:
sample1: The closing cars tag is missing a /
sample2: The closing new tag incorrectly reads old, should read new
Second, a disclaimer: my solution below has its limitations - in particular, it wouldn't handle repeatedly substituting the car node from sample1 into multiple spots in sample2. But it works fine for the sample files you've supplied.
Third: thanks to the top couple of answers on access ElementTree node parent node - they informed the implementation of get_node_parent_info below.
Finally, the code:
import xml.etree.ElementTree as ET
def find_child(node, with_name):
    """Return the first descendant of ``node`` whose tag equals ``with_name``.

    Descendants are examined in document order; ``node`` itself is never
    returned.  Returns ``None`` when nothing matches.
    """
    # Element.iter() walks the subtree without Python-level recursion, so
    # very deeply nested documents cannot hit the recursion limit.
    for element in node.iter():
        if element is not node and element.tag == with_name:
            return element
    return None
def replace_node(from_tree, to_tree, node_name):
    """
    Swap the ``node_name`` node of ``to_tree`` for the one in ``from_tree``.

    The first node with that name is looked up in each tree (via
    ``find_child``); the one from ``from_tree`` is then inserted into
    ``to_tree`` at the position the old node occupied.
    """
    # Locate the node to copy and the node to be replaced.
    source = find_child(from_tree.getroot(), node_name)
    target = find_child(to_tree.getroot(), node_name)
    # Work out where in to_tree the replacement has to go.
    parent, position = get_node_parent_info(to_tree, target)
    # Take the old node out and put the new one in the same slot.
    parent.remove(target)
    parent.insert(position, source)
def get_node_parent_info(tree, node):
    """
    Locate ``node`` inside ``tree``.

    Returns a ``(parent, index)`` tuple: the element that contains
    ``node`` and the position of ``node`` among that element's children.
    A node that is nobody's child in ``tree`` results in ``KeyError``.
    """
    # ElementTree has no parent pointers, so record each child's parent.
    parents = {}
    for candidate in tree.iter():
        for child in candidate:
            parents[child] = candidate
    owner = parents[node]
    position = list(owner).index(node)
    return owner, position
from_tree = ET.ElementTree(file='sample1.xml')
to_tree = ET.ElementTree(file='sample2.xml')
replace_node(from_tree, to_tree, 'car')
# ET.dump(to_tree)
to_tree.write('output.xml')
UPDATE: It was recently brought to my attention that the implementation of find_child() in the solution I originally supplied would fail if the "child" in question was not in the first branch of the XML tree that was traversed. I've updated the implementation above to rectify this.

Categories

Resources