updating XML attribute value in python - python

In the XML below, I want to parse it and update the value of "alcohol" to "yes" for all the elements where age > 21. I'm having a problem because the element is buried inside other nodes. Could someone help me understand how to handle this?
Here's the XML again..
<root xmlns="XYZ" usingPalette="">
<grandParent hostName="XYZ">
<parent>
<child name="JohnsDad">
<grandChildren name="John" sex="male" age="22" alcohol="no"/>
</child>
<child name="PaulasDad">
<grandChildren name="Paula" sex="female" age="15" alcoho="no"/>
</child>
</parent>
</grandParent>
</root>
I tried find all and find method using this document here (http://pymotw.com/2/xml/etree/ElementTree/parse.html) but it didn't find it. For example, following code returns no results
for node in tree.findall('.//grandParent'):
print node

import xml.etree.ElementTree as ET
# Parse the document, then visit every element in the tree (at any depth)
# and flag those whose "age" attribute is greater than 21.
tree = ET.parse('data')
# iter() replaces getiterator(), which was removed in Python 3.9.
for node in tree.iter():
    # Elements without an "age" attribute default to 0 and are left untouched.
    if int(node.get('age', 0)) > 21:
        node.set('alcohol', 'yes')
root = tree.getroot()
# Register "XYZ" as the default namespace before serializing so the output
# uses xmlns="XYZ" instead of an auto-generated prefix.
ET.register_namespace("", "XYZ")
# encoding="unicode" makes tostring() return str rather than bytes.
print(ET.tostring(root, encoding="unicode"))
yields
<root xmlns="XYZ" usingPalette="">
<grandParent hostName="XYZ">
<parent>
<child name="JohnsDad">
<grandChildren age="22" alcohol="yes" name="John" sex="male" />
</child>
<child name="PaulasDad">
<grandChildren age="15" alcoho="no" name="Paula" sex="female" />
</child>
</parent>
</grandParent>
</root>
By the way, since the XML uses the namespace "XYZ", you must specify the namespace in your XPath:
# The namespace URI is written in curly braces in front of the tag name.
for node in tree.findall('.//{XYZ}grandParent'):
    print(node)
That will return the grandParent element, but since you want to inspect every element in the tree, I think iterating over all nodes with getiterator() (spelled iter() in current Python versions) is easier here.
To preserve comments while using xml.etree.ElementTree you could use the custom parser Fredrik Lundh shows here:
import xml.etree.ElementTree as ET
class PIParser(ET.XMLTreeBuilder):
    """Parser that keeps comments and processing instructions in the tree.

    Based on http://effbot.org/zone/element-pi.htm

    The parsed content is wrapped in a synthetic ``document`` element that is
    opened in ``__init__`` and closed in ``close``; comments and processing
    instructions are emitted to the tree builder as ``ET.Comment`` and
    ``ET.PI`` nodes.

    NOTE(review): this relies on ``ET.XMLTreeBuilder`` and on the private
    ``_parser`` / ``_target`` attributes of ElementTree 1.2.X; confirm they
    exist in the Python version in use. On Python 3.8+,
    ``ET.TreeBuilder(insert_comments=True, insert_pis=True)`` passed as the
    ``target`` of ``ET.XMLParser`` is the supported way to keep these nodes.
    """

    def __init__(self):
        ET.XMLTreeBuilder.__init__(self)
        # assumes ElementTree 1.2.X
        self._parser.CommentHandler = self.handle_comment
        self._parser.ProcessingInstructionHandler = self.handle_pi
        # Open the synthetic wrapper element before any input is fed.
        self._target.start("document", {})

    def close(self):
        """Close the synthetic wrapper element and return the parsed tree."""
        self._target.end("document")
        return ET.XMLTreeBuilder.close(self)

    def handle_comment(self, data):
        """Emit a comment node whose text is ``data``."""
        self._target.start(ET.Comment, {})
        self._target.data(data)
        self._target.end(ET.Comment)

    def handle_pi(self, target, data):
        """Emit a processing-instruction node with text ``target + " " + data``."""
        self._target.start(ET.PI, {})
        self._target.data(target + " " + data)
        self._target.end(ET.PI)
tree = ET.parse('data', PIParser())
Note that if you install lxml, you could instead use:
import lxml.etree as ET
# remove_comments=False keeps comment nodes in the parsed tree.
parser = ET.XMLParser(remove_comments=False)
# lxml.etree is imported under the alias ET, so parse() must be called
# through that alias; the bare name "etree" is not defined here.
tree = ET.parse('data', parser=parser)

Related

Using BeautifulSoup to count xml elements in a function

I often use len(find_all("some_element")) to count the number of elements in an XML file. I tried to build a function for this, but it doesn't work: it always gives me "None".
The XML file:
<parent>
<some>
<child>text</child>
<child>text</child>
<child>text</child>
</some>
</parent>
my python code:
def return_len(para1,para2): # doesn't work
if bool(suppe.para1): # the element isn't always present in the xml
return len(suppe.para1.find_all(para2))
def return_len1(): # does work
if bool(suppe.some):
return len(suppe.some.find_all("child"))
print(return_len("some","child")) # doesnt work
print(return_len1()) # does work
How must I modify my function return_len to get it working, and what did I do wrong?
You can do it like this.
from bs4 import BeautifulSoup
s = """<parent>
<some>
<child>text</child>
<child>text</child>
<child>text</child>
</some>
</parent>
"""
soup = BeautifulSoup(s, 'xml')
def return_len(para1, para2, soup):
    """Count the <para2> tags found inside the first <para1> tag of ``soup``.

    Args:
        para1: Name of the enclosing tag to look up with ``soup.find``.
        para2: Name of the tags to count inside that enclosing tag.
        soup: Parsed document offering ``find``; the tag it returns must
            offer ``find_all`` (as BeautifulSoup objects do).

    Returns:
        The number of <para2> tags inside the first <para1> tag, or 0 when
        the document contains no <para1> tag at all.
    """
    print(f'No. of <{para2}> tags inside <{para1}> tag.')
    temp = soup.find(para1)
    # find() returns None when the tag is absent; report an empty count
    # instead of falling off the end of the function.
    if temp is None:
        return 0
    return len(temp.find_all(para2))
print(return_len('some', 'child', soup))
print(return_len('parent', 'some', soup))
No. of <child> tags inside <some> tag.
3
No. of <some> tags inside <parent> tag.
1
Without any external library - see the below
import xml.etree.ElementTree as ET
xml = '''<parent>
<some>
<child>text</child>
<child>text</child>
<child>text</child>
</some>
</parent>'''
root = ET.fromstring(xml)
print(f'Number of child elements is {len(root.findall(".//child"))}')
output
Number of child elements is 3

How to create a subset of document using lxml?

Suppose you have an lxml.etree element with contents like:
<root>
<element1>
<subelement1>blabla</subelement1>
</element1>
<element2>
<subelement2>blibli</subelement2>
</element2>
</root>
I can use the find or xpath methods to get an element that renders as something like:
<element1>
<subelement1>blabla</subelement1>
</element1>
Is there a simple way to get:
<root>
<element1>
<subelement1>blabla</subelement1>
</element1>
</root>
i.e. the element of interest plus all its ancestors up to the document root?
I am not sure there is something built-in for it, but here is a terrible, "don't ever use it in real life" type of a workaround using the iterancestors() parent iterator:
from lxml import etree as ET
data = """<root>
<element1>
<subelement1>blabla</subelement1>
</element1>
<element2>
<subelement2>blibli</subelement2>
</element2>
</root>"""
root = ET.fromstring(data)
element = root.find(".//subelement1")
# Serialize to str (encoding="unicode"): the default bytes result would be
# embedded as "b'...'" by str.format() on Python 3.
result = ET.tostring(element, encoding="unicode")
# Walk from the parent up to the root, wrapping the markup in each ancestor's
# tag. Only the tag name is reproduced; ancestor attributes are not copied.
for node in element.iterancestors():
    result = "<{name}>{text}</{name}>".format(name=node.tag, text=result)
print(ET.tostring(ET.fromstring(result), pretty_print=True, encoding="unicode"))
Prints:
<root>
<element1>
<subelement1>blabla</subelement1>
</element1>
</root>
The following code removes elements that don't have any subelement1 descendants and are not named subelement1.
from lxml import etree
tree = etree.parse("input.xml")  # First XML document in question
# Take a snapshot of the elements first: removing nodes while tree.iter() is
# still walking the live tree is not safe.
for elem in list(tree.iter()):
    # Drop elements that neither are <subelement1> nor contain one.
    if elem.xpath("not(.//subelement1)") and elem.tag != "subelement1":
        parent = elem.getparent()
        # An element without a parent (the root) cannot be removed this way.
        if parent is not None:
            parent.remove(elem)
print(etree.tostring(tree, encoding="unicode"))
Output:
<root>
<element1>
<subelement1>blabla</subelement1>
</element1>
</root>

Get attribute of first element using lxml

Trying to parse an XML file using lxml in Python, how do I simply get the value of an element's attribute? Example:
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<item id="123">
<sub>ABC</sub>
</item>
I'd like to get the result 123, and store it as a variable.
When using etree.parse(), simply call .getroot() to get the root element; the .attrib attribute is a dictionary of all attributes, use that to get the value:
>>> from lxml import etree
>>> tree = etree.parse('test.xml')
>>> tree.getroot().attrib['id']
'123'
If you used etree.fromstring() the object returned is the root object already, so no .getroot() call is needed:
>>> tree = etree.fromstring('''\
... <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
... <item id="123">
... <sub>ABC</sub>
... </item>
... ''')
>>> tree.attrib['id']
'123'
Alternatively, you could use an XPath selector:
>>> from lxml import etree
>>> tree = etree.fromstring(b'''<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<item id="123">
<sub>ABC</sub>
</item>''')
>>> tree.xpath('/item/#id')
['123']
I think Martijn has answered your question. Building on his answer, you can also use the items() method to get a list of tuples with the attributes and values. This may be useful if you need the values of multiple attributes. Like so:
>>> from lxml import etree
>>> tree = etree.parse('test.xml')
>>> item = tree.xpath('/item')
>>> item.items()
[('id', '123')]
Or in case of string:
>>> tree = etree.fromstring("""\
... <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
... <item id="123">
... <sub>ABC</sub>
... </item>
... """)
>>> tree.items()
[('id', '123')]

Python XML check next item

Here is a little xml example:
<?xml version="1.0" encoding="UTF-8"?>
<list>
<person id="1">
<name>Smith</name>
<city>New York</city>
</person>
<person id="2">
<name>Pitt</name>
</person>
...
...
</list>
Now I need all Persons with a name and city.
I tried:
#!/usr/bin/python
# coding: utf8
import xml.dom.minidom as dom
tree = dom.parse("test.xml")
for listItems in tree.firstChild.childNodes:
for personItems in listItems.childNodes:
if personItems.nodeName == "name" and personItems.nextSibling == "city":
print personItems.firstChild.data.strip()
But the output is empty. Without the "and" condition I get all names. How can I check that the next tag after "name" is "city"?
You can do this in minidom:
import xml.dom.minidom as minidom
def getChild(n, v):
    """Yield each direct child of node ``n`` whose ``localName`` equals ``v``.

    Only ``n.childNodes`` is inspected, so deeper descendants are not
    returned.
    """
    for child in n.childNodes:
        if child.localName == v:
            yield child
xmldoc = minidom.parse('test.xml')
# Find the top-level <list> element(s), then each <person> directly inside.
person = getChild(xmldoc, 'list')
for p in person:
    for v in getChild(p, 'person'):
        attr = v.getAttributeNode('id')
        # getAttributeNode() returns None when the attribute is missing.
        if attr is not None:
            print(attr.nodeValue.strip())
This prints id of person nodes:
1
2
Use ElementTree; see the ElementTree documentation:
import xml.etree.ElementTree as ET
tree = ET.parse('a.xml')
root = tree.getroot()
for person in root.findall('person'):
    name = person.find('name')
    city = person.find('city')
    # find() returns None for a missing child element; only persons that
    # have both a <name> and a <city> are printed.
    if name is None or city is None:
        continue
    print(name.text, city.text)
For the id, you can get it with id = person.get('id')
output:Smith New York
Using lxml, you can use xpath to get in one step what you need:
from lxml import etree
xmlstr = """
<list>
<person id="1">
<name>Smith</name>
<city>New York</city>
</person>
<person id="2">
<name>Pitt</name>
</person>
</list>
"""
xml = etree.fromstring(xmlstr)
# The predicate [city] keeps only <person> elements that have a <city> child.
xp = "//person[city]"
for person in xml.xpath(xp):
    # encoding="unicode" returns str, so print() shows plain markup.
    print(etree.tostring(person, encoding="unicode"))
lxml is an external Python package, but it is so useful that, to me, it is always worth installing.
The XPath expression searches for any (//) person element that has (as declared by the content of []) a city subelement.

Python minidom: list childnode attributes per parent tag

I have,
<parent id="1">
<child id="white"></child>
<child id="red"></child>
</parent>
<parent id="2">
<child id="green"></child>
<child id="gray"></child>
</parent>
I need this output,
1
white
red
2
green
gray
This is how I am doing it,
parent = xmldoc.getElementsByTagName('parent')
for item in parent:
child = xmldoc.getElementsByTagName('child')
child_id = child.getAttribute('id')
for child_id in child:
print child_id
Of course, I'm getting it wrong, but do not know how to loop through these parent ids and collect each list individually. I would appreciate some help!
Try this
import xml.dom.minidom as minidom
a = '<parent id="1"><child id="white"></child><child id="red"></child></parent>'
dom = minidom.parseString(a)
# The document's child nodes are its top-level nodes; here that is the
# single <parent> element.
for parent in dom.childNodes:
    print(parent.getAttribute('id'))
    for child in parent.childNodes:
        print(' %s' % child.getAttribute('id'))
Because you are guessing code and syntax:
import xml.dom.minidom
s = """<?xml version="1.0"?>
<container>
<parent id="1">
<child id="white"></child>
<child id="red"></child>
</parent>
<parent id="2">
<child id="green"></child>
<child id="gray"></child>
</parent>
</container>
"""
xmldoc = xml.dom.minidom.parseString(s)
parent = xmldoc.getElementsByTagName('parent')
for item in parent:
    print(item.getAttribute('id'))
    # Search from the current <parent> element, not from the whole document,
    # so only that parent's own <child> elements are listed.
    for child in item.getElementsByTagName('child'):
        print(child.getAttribute('id'))

Categories

Resources