Python Parsing xml files - python

<goal>
<value>
<comment>n</comment>
<stats>
<goals>1</goals>
<shoton>1</shoton>
</stats>
<event_incident_typefk>406</event_incident_typefk>
<elapsed>22</elapsed>
<player2>38807</player2>
<subtype>header</subtype>
<player1>37799</player1>
<sortorder>5</sortorder>
<team>10261</team>
<id>378998</id><n>295</n>
<type>goal</type>
<goal_type>n</goal_type>
</value>
<value>
<comment>n</comment>
<stats>
<goals>1</goals>
<shoton>1</shoton>
</stats>
<event_incident_typefk>393</event_incident_typefk>
<elapsed>24</elapsed>
<player2>24154</player2>
<subtype>shot</subtype>
<player1>24148</player1>
<sortorder>4</sortorder>
<team>10260</team>
<id>379019</id><n>298</n>
<type>goal</type>
<goal_type>n</goal_type>
</value>
def extract_goal_type(data):
#print(data)
if data :
root = ET.fromstring(data)
tree = ET.ElementTree(root)
root = tree.getroot()
for c in root.getchildren():
return c.findtext('subtype')
The problem is that the function only returns the first subset and I can't seem to get the others. I also have XML files which have up to 6 children. Please help me parse this to get everything that is in the XML. Thank you.
Expected output:
header
shot

You are returning from inside the for loop, so only the first element is ever processed. You have to traverse all the elements, collect the values, and then return the data.
def extract_goal_type(data):
    """Return the ``subtype`` text of every direct child of the XML root.

    Args:
        data: An XML document as a string (or bytes). Falsy input such as
            ``None`` or ``""`` is treated as "nothing to parse".

    Returns:
        A list with one entry per direct child of the root element, in
        document order. An entry is ``None`` when that child has no
        ``<subtype>`` element. The list is empty when ``data`` is falsy.

    Raises:
        xml.etree.ElementTree.ParseError: If ``data`` is not well-formed XML
            (for example, a missing closing tag on the root element).
    """
    if not data:
        return []
    # fromstring() already returns the root element, so there is no need to
    # wrap it in an ElementTree and call getroot() again.
    root = ET.fromstring(data)
    # Iterate the element directly: Element.getchildren() was removed in
    # Python 3.9.
    return [child.findtext('subtype') for child in root]
Note: This will only work if you have valid XML

Related

Python XML: Getting unnecessary new lines when adding new element

Basically I'm trying to add a new element and for it to be properly indented, but with this code I get unnecessary new lines between elements. What is causing it and how do I fix it? Thanks
Example:
from xml.dom import minidom
import xml.etree.ElementTree as ET
def example(name, category):
tree = ET.parse("example1.xml")
root = tree.getroot()
for i in root:
if i.tag == category:
ET.SubElement(i, name).text = name
xmlStr = minidom.parseString(ET.tostring(root)).toprettyxml(indent=" ")
with open("example1.xml", "w") as f:
f.write(xmlStr)
example("test", 'FRUITS')
XML File:
<?xml version="1.0" ?>
<root>
<FRUITS>
<APPLE>apple</APPLE>
<PEAR>pear</PEAR>
<PLUM>plum</PLUM>
</FRUITS>
<VEGETABLES>
<CARROT>carrot</CARROT>
<POTATO>potato</POTATO>
</VEGETABLES>
</root>

How can I parse the below XML data using Python?

Source XML
<?xml version='1.0' encoding='UTF-8'?>
<ProcessType xmlns:xmi="http://www.omg.org/XMI" xmi:version="2.0" defaultContext="Default">
<node componentName="tRedshiftRow" componentVersion="0.102" offsetLabelX="0" offsetLabelY="0" posX="-32" posY="96">
<elementParameter field="TECHNICAL" name="QUERYSTORE:QUERYSTORE_TYPE" value="BUILT_IN"/>
<elementParameter field="TEXT" name="DBNAME" value="&quot;&quot;"/>
<elementParameter field="TEXT" name="SCHEMA_DB" value="&quot;&quot;"/>
<elementParameter field="MEMO_SQL" name="QUERY" value="&quot;DELETE FROM schema.tablename;&quot;"/>
</node>
</ProcessType>
I want to get the DELETE statement only where tag is "QUERY", and write it in a text file.
Expected output : DELETE FROM schema.tablename;
I was trying the following way, which obviously didn't work out !
from lxml import etree, objectify
import xml.etree.ElementTree as ET
def convert_xml_to_comp():
metadata = 'source.xml'
parser = etree.XMLParser(remove_blank_text=True)
tree = etree.parse(metadata, parser)
root = tree.getroot()
for elem in root.getiterator():
# print(elem)
i = elem.tag.find('}')
if i >= 0:
elem.tag = elem.tag[i+1 :]
objectify.deannotate(root, cleanup_namespaces=True)
tree.write('done.xml', pretty_print=True, xml_declaration=True, encoding='UTF-8')
tree = ET.parse('done.xml')
root = tree.getroot()
def get_sql_text():
file = open( "newdelete.txt", "w")
for root in tree.getroot():
### Get the elements' names ###
for elementParameter in root.iterfind('elementParameter[@name="UNIQUE_NAME"]') :
name=elementParameter.get('value')
### Get the elements' name and SQL ###
for elementParameter in root.iterfind('elementParameter[@name="QUERY"]') :
#print (root.attrib)
val=elementParameter.get('value')
print(root.find('val[@value="DELETE FROM schema.tablename;"]'))
file.close()
get_sql_text()
if __name__ == '__main__':
convert_xml_to_comp()
You can do all of this in just a couple of statements using an XPath query. Something like:
>>> from lxml import etree
>>> doc = etree.parse(open('data.xml'))
>>> query = doc.xpath('//elementParameter[@name="QUERY"]')[0].get('value')
>>> print(query)
"DELETE FROM schema.tablename;"
This says: find all the elementParameter elements with name="QUERY", and then return the value of the value attribute of the first one.
To select just those elements that contain "DELETE" in their value attribute, use the contains() function:
>>> doc.xpath('//elementParameter[@name="QUERY" and contains(@value, "DELETE")]')

Print lesser than/greater than sign inside a XML node as a value in python

Trying to print lesser than/greater than sign inside an XML node as a value using element tree:
code:
messageValue = f"<![CDATA[{generate_MSH(i)}"
ET.SubElement(root, "Message", MsgID="1").text = messageValue
tree = ET.ElementTree(root)
tree.write("filename.xml", xml_declaration=True, encoding="UTF-8")
output: <Message MsgID="1">&lt;![CDATA[MSH|^~\&amp;|PATHL7
required output: <Message MsgID="1"><![CDATA[MSH|^~\&|PATHL7
The output written to the XML file does not contain < or &; it contains &lt; and &amp; instead.

How to add a subelement in xml

<data>
<items>
<item name="item1">....text here</item>
<item name="item2">....another text here</item>
</items>
</data>
I'm stuck.
I have this scheme of xml. I need to add an additional block with (key,value) like this (name="something") and the text value.
So, I need to make something like this
<item name="item3">text here</item>
I know, that I can do something like this:
import xml.etree.ElementTree as ET

# Build the <data><items>...</items></data> tree in memory.
data = ET.Element('data')
items = ET.SubElement(data, 'items')
item1 = ET.SubElement(items, 'item')
item2 = ET.SubElement(items, 'item')
item1.set('name', 'item1')
item2.set('name', 'item2')
item1.text = 'another text here...'
item2.text = '...text here'

# The additional <item> with its own name attribute and text.
myExample = ET.SubElement(items, 'item')
myExample.set('name', 'For example')
myExample.text = 'for instance, its a string'

# tostring() returns bytes by default, hence the binary write mode.
mydata = ET.tostring(data)
# The context manager guarantees the file is flushed and closed.
with open("items2.xml", "wb") as myfile:
    myfile.write(mydata)
But I need to find another approach in order to do this.
What is more, I want to figure out how to add it to my existing XML file. Do I need to rewrite the whole file, or is there another way?
myfile = open("items2.xml", "a+b")
myfile.write(mydata)
appends the result as a second string after the existing document, and that causes an error in the XML.
I came up with a way that a bit solves my problem.
import xml.etree.ElementTree as ET

data = ET.Element('data')
items = ET.SubElement(data, 'items')
result = ['Price: 2250', 'Price: 1', 'Price: 2', 'Price: 3']

# One <item name="order"> per price. A plain loop replaces exec() on a
# string-built snippet, which would break (or run arbitrary code) as soon as
# a value contained a quote character.
for price in result:
    item = ET.SubElement(items, 'item')
    item.set('name', 'order')
    item.text = price

# tostring() returns bytes by default, hence the binary write mode.
mydata = ET.tostring(data)
# The context manager guarantees the file is flushed and closed.
with open("items2.xml", "wb") as myfile:
    myfile.write(mydata)
I've never seen the a+b mode. You want to create a new file with the changes added, which means w or wb in your case. You don't want to append since that just adds new data to the end of your previous file.
Also, why do you need to find another approach?

variable in XML subelement

I'm thinking of Python code to create a dynamic xml ETREE subElement.
I have a hierarchical header to describe a piece of a book, as follows:
<Books>
<Booktype List= "Story > Fiction > Young">
#here the rest of book text
</Booktype>
<Booktype List= "Science > Math > Young">
#here the rest of book text
</Booktype>
</Books>
How to get a hierarchical xml tag like this :
<Books>
<Booktype>
<Story>
<Fiction>
<Young>
#here the rest of book text
</Young>
</Fiction>
</Story>
</Booktype>
</Books>
This is my code:
import re
import xml.etree.ElementTree as ET
from xml.etree import ElementTree
List= "Story>Fiction>Young"
List = List.split('>')
root = ET.Element('Books')
Booktype =ET.SubElement(root,'Booktype')
for l in List:
ND = ET.SubElement(Booktype,str(l))
Booktype.append(ND)
tree = ET.ElementTree(root)
ElementTree.tostring(root,'utf-8')
I got this bad result:
'<Books><Booktype><Story /><Story /><Story /><Fiction /><Fiction /><Young /><Young /><Story /><Story /><Fiction /><Fiction /><Young /><Young /></Booktype></Books>'
If you want to nest the list elements, you have to keep a reference to the previous one so you can add the child element to it, and not to the Booktype element. See the variable current in the examples.
from xml.etree import ElementTree as ET

xml_string = '''<Books>
<Booktype List= "Story > Fiction > Young">
#here the rest of book text
</Booktype>
<Booktype List= "Science > Math > Young">
#here the rest of book text 2
</Booktype>
</Books>
'''

xml = ET.fromstring(xml_string)
for booktype in xml.findall('Booktype'):
    # "Story > Fiction > Young" -> ['Story', 'Fiction', 'Young']
    types = [part.strip() for part in booktype.get('List').split('>')]
    # Each new element becomes the parent of the next one, which is what
    # produces the nesting (instead of siblings under <Booktype>).
    current = booktype
    for t in types:
        current = ET.SubElement(current, t)
    # Move the book text down to the innermost element and drop the
    # now-redundant List attribute.
    current.text = booktype.text
    booktype.text = ''
    del booktype.attrib['List']

# encoding='unicode' makes tostring() return str, so print() shows the XML
# itself rather than a bytes repr.
print(ET.tostring(xml, encoding='unicode'))
Gives me the result:
<Books>
<Booktype><Story><Fiction><Young>
#here the rest of book text
</Young></Fiction></Story></Booktype>
<Booktype><Science><Math><Young>
#here the rest of book text 2
</Young></Math></Science></Booktype>
</Books>
And if you want to create a completely new structure you can do:
# Build a brand-new <Books> tree, leaving the parsed source tree untouched.
xml = ET.fromstring(xml_string)
root = ET.Element('Books')
for booktype in xml.findall('Booktype'):
    # Start a fresh <Booktype> in the new tree, then nest one element per
    # ">"-separated name from the source List attribute.
    current = ET.SubElement(root, 'Booktype')
    for t in (part.strip() for part in booktype.get('List').split('>')):
        current = ET.SubElement(current, t)
    # The innermost element carries the original book text.
    current.text = booktype.text

# encoding='unicode' makes tostring() return str, so print() shows the XML
# itself rather than a bytes repr.
print(ET.tostring(root, encoding='unicode'))

Categories

Resources