parsing an xml file for unknown elements using python ElementTree - python

I wish to extract all the tag names and their corresponding data from a multi-purpose xml file, then save that information into a Python dictionary (e.g., tag = key, data = value). The catch is that the tag names and values are unknown and of unknown quantity.
<some_root_name>
<tag_x>bubbles</tag_x>
<tag_y>car</tag_y>
<tag...>42</tag...>
</some_root_name>
I'm using ElementTree and can successfully extract the root tag and can extract values by referencing the tag names, but haven't been able to find a way to simply iterate over the tags and data without referencing a tag name.
Any help would be great.
Thank you.

from lxml import etree as ET
# Sample document: the child tag names are not known in advance.
xmlString = """
<some_root_name>
<tag_x>bubbles</tag_x>
<tag_y>car</tag_y>
<tag...>42</tag...>
</some_root_name> """
document = ET.fromstring(xmlString)
# iter() visits the root and every descendant without naming any tag;
# it replaces the deprecated getiterator().
for elementtag in document.iter():
    # Single-argument print() behaves the same on Python 2 and Python 3.
    print("elementtag name: %s" % elementtag.tag)
EDIT:
To read from file instead of from string
document = ET.parse("myxmlfile.xml")

>>> import xml.etree.cElementTree as et
>>> xml = """
... <some_root_name>
... <tag_x>bubbles</tag_x>
... <tag_y>car</tag_y>
... <tag...>42</tag...>
... </some_root_name>
... """
>>> doc = et.fromstring(xml)
>>> print dict((el.tag, el.text) for el in doc)
{'tag_x': 'bubbles', 'tag_y': 'car', 'tag...': '42'}
If you really want 42 instead of '42', you'll need to work a little harder and less elegantly.

You could use xml.sax.handler to parse the XML:
import xml.sax as sax
import xml.sax.handler as saxhandler
import pprint
class TagParser(saxhandler.ContentHandler):
    """SAX handler that collects ``{tag name: text}`` for leaf elements.

    After parsing, ``tags`` maps each leaf element's name to its character
    data, or to ``None`` when the element is empty. Elements that contain
    child elements are not recorded.
    """
    # http://docs.python.org/library/xml.sax.handler.html#contenthandler-objects

    def __init__(self):
        saxhandler.ContentHandler.__init__(self)
        self.tags = {}
        # Name of the element currently open; None once it has been closed.
        self.tag = None
        # Text collected so far for the open element.
        self.data = None

    def startElement(self, name, attrs):
        """Remember the element being opened and start with empty text."""
        self.tag = name
        # Drop any text (typically whitespace) seen before this element so it
        # cannot leak into this element's value.
        self.data = None

    def endElement(self, name):
        """Store the collected text when the closing element is a leaf."""
        # self.tag is already None when a child was just closed, so parent
        # elements are skipped here.
        if self.tag:
            self.tags[self.tag] = self.data
            self.tag = None
            self.data = None

    def characters(self, content):
        """Append a chunk of character data to the open element's text."""
        if self.tag is None:
            # Text that follows a closed child belongs to no leaf element.
            return
        # The parser may deliver one text node in several chunks, so the
        # chunks are accumulated instead of overwritten.
        if self.data is None:
            self.data = content
        else:
            self.data += content
# Sample document with tag names that are not known in advance.
src = (
    '<some_root_name>\n'
    '<tag_x>bubbles</tag_x>\n'
    '<tag_y>car</tag_y>\n'
    '<tag...>42</tag...>\n'
    '</some_root_name>'
)
# Run the document through the handler, then show what it collected.
parser = TagParser()
sax.parseString(src, parser)
pprint.pprint(parser.tags)
yields
{u'tag...': u'42', u'tag_x': u'bubbles', u'tag_y': u'car'}

This could be done using lxml in python
from lxml import etree
# Minimal document: a single root element holding some text.
myxml = """
<root>
value
</root> """
doc = etree.XML(myxml)
# Map every element's tag (root included) to its text content.
d = dict((node.tag, node.text) for node in doc.iter())
print(d)

Related

How can I parse the below XML data using Python?

Source XML
<?xml version='1.0' encoding='UTF-8'?>
<ProcessType xmlns:xmi="http://www.omg.org/XMI" xmi:version="2.0" defaultContext="Default">
<node componentName="tRedshiftRow" componentVersion="0.102" offsetLabelX="0" offsetLabelY="0" posX="-32" posY="96">
<elementParameter field="TECHNICAL" name="QUERYSTORE:QUERYSTORE_TYPE" value="BUILT_IN"/>
<elementParameter field="TEXT" name="DBNAME" value="&quot;&quot;"/>
<elementParameter field="TEXT" name="SCHEMA_DB" value="&quot;&quot;"/>
<elementParameter field="MEMO_SQL" name="QUERY" value="&quot;DELETE FROM schema.tablename;&quot;"/>
</node>
</ProcessType>
I want to get the DELETE statement only where tag is "QUERY", and write it in a text file.
Expected output : DELETE FROM schema.tablename;
I was trying the following way, which obviously didn't work out !
from lxml import etree, objectify
import xml.etree.ElementTree as ET
def convert_xml_to_comp():
    """Strip namespaces from ``source.xml`` and save every QUERY statement.

    The document is parsed with lxml, the ``{uri}`` namespace part is removed
    from every tag, and the cleaned tree is written to ``done.xml``. That file
    is then re-read with ``xml.etree.ElementTree`` and the ``value`` attribute
    of each ``elementParameter`` whose ``name`` is ``QUERY`` is written to
    ``newdelete.txt``, one statement per line, without the double quotes that
    surround it in the attribute.
    """
    metadata = 'source.xml'
    parser = etree.XMLParser(remove_blank_text=True)
    tree = etree.parse(metadata, parser)
    root = tree.getroot()
    # iter() replaces the deprecated getiterator().
    for elem in root.iter():
        # Namespaced tags look like '{uri}local'; keep only the local part.
        i = elem.tag.find('}')
        if i >= 0:
            elem.tag = elem.tag[i + 1:]
    objectify.deannotate(root, cleanup_namespaces=True)
    tree.write('done.xml', pretty_print=True, xml_declaration=True, encoding='UTF-8')
    tree = ET.parse('done.xml')

    def get_sql_text():
        """Write the QUERY statements found in ``done.xml`` to ``newdelete.txt``."""
        # The context manager closes the file even if a write fails.
        with open("newdelete.txt", "w") as out:
            for node in tree.getroot():
                # '@name' tests the name attribute of each elementParameter.
                for param in node.iterfind('elementParameter[@name="QUERY"]'):
                    val = param.get('value') or ''
                    # The attribute value carries literal double quotes
                    # around the statement; drop them in the output.
                    out.write(val.strip('"') + '\n')

    get_sql_text()
if __name__ == '__main__':
convert_xml_to_comp()
You can do all of this in just a couple of statements using an XPath query. Something like:
>>> from lxml import etree
>>> doc = etree.parse(open('data.xml'))
>>> query = doc.xpath('//elementParameter[#name="QUERY"]')[0].get('value')
>>> print(query)
"DELETE FROM schema.tablename;"
This says: find all the elementParameter elements whose name attribute is "QUERY", then return the value of the value attribute of the first one.
To select just those elements that contain "DELETE" in their value attribute, use the contains() function:
>>> doc.xpath('//elementParameter[#name="QUERY" and contains(#value, "DELETE")]')

python CDATA in subelement

I'm trying to add CDATA to a subElement of XML. But the XML keeps converting the < character to &lt;.
I've read some posts here that do something with CDATA, but I can't believe it's this hard to make it work. Besides that, I can't get those examples working.
Below a part of my code simplified, using python 3.4.
import xml.etree.cElementTree as ET
from xml.dom import minidom
class MyClass():
    """Build a small AVXML document and save it pretty-printed."""

    def __init__(self):
        # Root element of the document; populated by generateXML().
        self.xml = None

    def generateXML(self):
        """Create AVXML > ROW > DATA with the CDATA markup as plain text."""
        self.xml = ET.Element("AVXML")
        row = ET.SubElement(self.xml, "ROW")
        # Assigned as ordinary element text, so the markup characters are
        # escaped when the tree is serialized.
        ET.SubElement(row, "DATA").text = "<![CDATA[ <ART_HDR.COMMENT1>.PDF ]]>"

    def saveXML(self):
        """Pretty-print the tree, echo it, and write it to filetosave.xml."""
        rough_string = ET.tostring(self.xml, 'Windows-1252')
        reparsed = minidom.parseString(rough_string)
        prettyxml = reparsed.toprettyxml(indent=" ", encoding="Windows-1252")
        print(prettyxml)
        # The context manager closes the file even if write() raises.
        with open("filetosave.xml", "wb") as f:
            f.write(prettyxml)
m = MyClass()
m.generateXML()
m.saveXML()
This however generates:
<DATA>&lt;![CDATA[ &lt;ART_HDR.COMMENT1&gt;.PDF ]]&gt;</DATA>
Instead of
<DATA><![CDATA[ <ART_HDR.COMMENT1>.PDF ]]></DATA>
Oke, I used this comment and got it working.
import xml.etree.cElementTree as ET
from xml.dom import minidom
class MyClass():
    """Build a small AVXML document whose DATA element carries a CDATA section."""

    def __init__(self):
        # Root element of the document; populated by generateXML().
        self.xml = None

    def generateXML(self):
        """Create AVXML > ROW > DATA and attach the CDATA payload."""
        self.xml = ET.Element("AVXML")
        row = ET.SubElement(self.xml, "ROW")
        data = " <ART_HDR.COMMENT1>.PDF "
        cdata = ET.SubElement(row, "DATA")
        # The payload is wrapped in a comment node whose text closes the
        # comment, emits the CDATA section, and opens a new comment. Any
        # ']]>' inside the payload is split across two CDATA sections.
        cdata.append(ET.Comment(' --><![CDATA[' + data.replace(']]>', ']]]]><![CDATA[>') + ']]><!-- '))

    def saveXML(self):
        """Pretty-print the tree and write it to filetosave.xml."""
        rough_string = ET.tostring(self.xml, 'Windows-1252')
        reparsed = minidom.parseString(rough_string)
        prettyxml = reparsed.toprettyxml(indent=" ", encoding="Windows-1252")
        # The context manager closes the file even if write() raises.
        with open("filetosave.xml", "wb") as f:
            f.write(prettyxml)
m = MyClass()
m.generateXML()
m.saveXML()

variable in XML subelement

I'm thinking of Python code to create a dynamic xml ETREE subElement.
I have a hierarchical header to describe a piece of a book as the following:
<Books>
<Booktype List= "Story > Fiction > Young">
#here the rest of book text
</Booktype>
<Booktype List= "Science > Math > Young">
#here the rest of book text
</Booktype>
</Books>
How to get a hierarchical xml tag like this :
<Books>
<Booktype>
<Story>
<Fiction>
<Young>
#here the rest of book text
</Young>
</Fiction>
</Story>
</Booktype>
</Books>
This is my code:
import re
import xml.etree.ElementTree as ET
from xml.etree import ElementTree
# Split the '>'-separated header into its individual category names.
List = "Story>Fiction>Young".split('>')
root = ET.Element('Books')
Booktype = ET.SubElement(root, 'Booktype')
for category in List:
    # SubElement() already attaches the new node to Booktype; the extra
    # append() attaches the same node a second time.
    Booktype.append(ET.SubElement(Booktype, str(category)))
tree = ET.ElementTree(root)
ElementTree.tostring(root, 'utf-8')
I got this bad result:
'<Books><Booktype><Story /><Story /><Story /><Fiction /><Fiction /><Young /><Young /><Story /><Story /><Fiction /><Fiction /><Young /><Young /></Booktype></Books>'
If you want to nest the list elements you have to keep the reference to the previous one so you can add the child element to it, and not to the Booktype element. See the variable current in the examples.
from xml.etree import ElementTree as ET
# Input: each Booktype carries its category path in the List attribute.
xml_string = '''<Books>
<Booktype List= "Story > Fiction > Young">
#here the rest of book text
</Booktype>
<Booktype List= "Science > Math > Young">
#here the rest of book text 2
</Booktype>
</Books>
'''
xml = ET.fromstring(xml_string)
for booktype in xml.findall('Booktype'):
    # 'Story > Fiction > Young' -> ['Story', 'Fiction', 'Young']
    types = [name.strip() for name in booktype.get('List').split('>')]
    # Each new element is created under the previous one, producing a chain.
    current = booktype
    for t in types:
        current = ET.SubElement(current, t)
    # Move the book text into the innermost element.
    current.text = booktype.text
    booktype.text = ''
    del booktype.attrib['List']
# tostring() with an encoding returns bytes; decode so the markup itself is
# printed. The parenthesised single-argument form runs on Python 2 and 3.
print(ET.tostring(xml, 'utf-8').decode('utf-8'))
Gives me the result:
<Books>
<Booktype><Story><Fiction><Young>
#here the rest of book text
</Young></Fiction></Story></Booktype>
<Booktype><Science><Math><Young>
#here the rest of book text 2
</Young></Math></Science></Booktype>
</Books>
And if you want to create a completely new structure you can do:
xml = ET.fromstring(xml_string)
# Build a separate output tree instead of modifying the parsed one.
root = ET.Element('Books')
for booktype in xml.findall('Booktype'):
    current = ET.SubElement(root, 'Booktype')
    # Create one nested element per category name in the List attribute.
    for t in [name.strip() for name in booktype.get('List').split('>')]:
        current = ET.SubElement(current, t)
    # The innermost element receives the original book text.
    current.text = booktype.text
# tostring() with an encoding returns bytes; decode so the markup itself is
# printed. The parenthesised single-argument form runs on Python 2 and 3.
print(ET.tostring(root, 'utf-8').decode('utf-8'))

Python: How to add a prefix to tags in an xml.etree.ElementTree

I use the python asciimathml library to parse some asciimathml and convert it to MathML
>>> from xml.etree.ElementTree import tostring
>>> tostring(asciimathml.parse('sqrt 2'))
'<math><mstyle><msqrt><mn>2</mn></msqrt></mstyle></math>'
The only trouble is I need my tags with a m: prefix. How do I change above code so I get:
'<m:math><m:mstyle><m:msqrt><m:mn>2</m:mn></m:msqrt></m:mstyle></m:math>'
You can rename the tag, adding the 'm:' prefix:
import asciimathml
from xml.etree.ElementTree import tostring
tree = asciimathml.parse('sqrt 2')
# Element.iter() visits the root and all descendants; it replaces
# getiterator(), which was removed from ElementTree in Python 3.9.
for elem in tree.iter():
    elem.tag = 'm:' + elem.tag
# tostring() returns bytes on Python 3; decode so the markup itself is printed.
print(tostring(tree).decode('ascii'))
Result:
<m:math><m:mstyle><m:msqrt><m:mn>2</m:mn></m:msqrt></m:mstyle></m:math>

ElementTree element index look up

I'm using the xml.etree.ElementTree module to create an XML document with Python 3.1 from another structured document.
What ElementTree function can I use that returns the index of an existing sub element?
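The getchildren method returns a list of sub-elements of an Element object. You could then use the built-in index method of a list. Note that getchildren() was deprecated and then removed in Python 3.9; on current versions use list(root).index(...) instead.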
>>> import xml.etree.ElementTree as ET
>>> root = ET.Element("html")
>>> head = ET.SubElement(root, "head")
>>> body = ET.SubElement(root, "body")
>>> root.getchildren().index(body)
1
import xml.etree.ElementTree as ET
# Raw string: in a normal literal '\U' starts a Unicode escape, which is a
# SyntaxError on Python 3. The resulting tag text is unchanged.
root=ET.Element(r'C:\Users\Administrator\Desktop\ValidationToolKit_15.9\ValidationToolKit_15.9\NE3S_VTK\webservice\history\ofas.2017-1-3.10-55-21-608.xml')
childnew=ET.SubElement(root,"354")
root.getchildren().index(childnew)
0
list(root).index(childnew)
0
def alarms_validation(self, path, alarm_no, alarm_text):
    """Check the text that follows each occurrence of an alarm number.

    Parses the XML file at ``path`` and looks at the children of every
    element directly under the root. For each child whose text equals
    ``alarm_no``, prints "found" and then compares the text of the next
    sibling with ``alarm_text``, printing whether it matches.

    Args:
        path: Path of the XML file to read.
        alarm_no: Element text that identifies the alarm.
        alarm_text: Text expected in the element after the alarm number.

    Returns:
        None. The outcome is reported on standard output.
    """
    with open(path) as f:
        tree = et.parse(f)
    root = tree.getroot()
    # Iterate over the elements that actually exist instead of probing fixed
    # index ranges, so a short record no longer ends the whole scan.
    for record in root:
        fields = list(record)
        for pos, field in enumerate(fields):
            if field.text != alarm_no:
                continue
            print("found")
            if pos + 1 >= len(fields):
                # The alarm number is the last child: nothing to compare.
                continue
            if fields[pos + 1].text != alarm_text:
                print("Alarm text is not proper")
            else:
                print("Alarm Text is proper")

Categories

Resources