Write 'xsi:' in front of attribute with lxml for python 3 - python

I'm adding elements to an xml file.
The document's root is as follows
<Root xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema">
And elements to add look like
<Element xsi:type="some type">
<Sub1>Some text</Sub1>
<Sub2>More text</Sub2>
...
</Element>
I'm trying to find a way for lxml to write 'xsi:' in front of my Element's attribute. This xml file is used by a program whose source code I do not have access to. I read in a few other questions how to do it by declaring the nsmap of the xml's root, and then again in the child's attribute, which I tried but it didn't work. So far I have (that's what didn't work, the output file did not contain the xsi prefix):
# Create the child with a namespace-qualified ``type`` attribute.
# XSI is the namespace address; ``parent`` is the element to append to.
element = SubElement(_parent=parent,
                     _tag='some tag',
                     attrib={'{%s}type' % XSI: 'some type'},  # comma was missing here
                     nsmap={'xsi': XSI})  # Where XSI = namespace address
The namespace is declared properly in the xml file I parse, so I don't know why this isn't working.
The output I get is the element as shown above without the 'xsi:' prefix and all on one line:
<Element type="some type"><Sub1>Some text</Sub1><Sub2>More text</Sub2>...</Element>
If anyone can also point out why in this line
self.tree.write(self.filename, pretty_print=True, encoding='utf-8')
the 'pretty_print' option doesn't work (all printed out in one line), it would be greatly appreciated.
Here is a code example of my script:
from math import floor
from lxml import etree
from lxml.etree import SubElement
def Element(root, sub1: str):
    """Append an ``Element`` node with a ``Sub1`` child to *root*.

    :param root: parent element that receives the new ``Element`` node.
    :param sub1: text stored in the ``Sub1`` child.
    :raises TypeError: if *sub1* is not a ``str``.
    :return: the newly created ``Element`` node.
    """
    if not isinstance(sub1, str):
        raise TypeError
    node = SubElement(root, 'Element')
    SubElement(node, 'Sub1').text = sub1
    # ...
    # Omitted additional SubElements
    # ...
    return node
def Sub(root, sub5_sub: str):
    """Append a ``Sub5_Sub`` child carrying an ``xsi:type`` attribute to *root*.

    :param root: parent element that receives the new child.
    :param sub5_sub: only type-checked in the code shown here.
    :raises TypeError: if *sub5_sub* is not a ``str``.
    :return: the newly created ``Sub5_Sub`` element.
    """
    if not isinstance(sub5_sub, str):
        raise TypeError
    xsi_uri = "http://www.w3.org/2001/XMLSchema-instance"
    # Clark notation ({uri}name) qualifies the attribute with the namespace.
    qualified_type = '{%s}type' % xsi_uri
    sub = SubElement(root, 'Sub5_Sub', {qualified_type: 'SomeType'},
                     nsmap={'xsi': xsi_uri})
    # ...
    # Omitted additional SubElements
    # ...
    return sub
class Generator:
    """Interactive tool that appends ``Sub5_Sub`` entries to an XML file."""

    def __init__(self) -> None:
        # Placeholders; none of them is read by the methods shown here.
        self.filename = None
        self.csv_filename = None
        self.csv_content = []
        self.tree = None
        self.root = None
        self.panel = None
        self.panels = None

    def mainloop(self) -> None:
        """App's mainloop.

        Repeatedly asks for an XML path, adds one ``Sub5_Sub`` per CSV row to
        the ``Sub5`` node of every matching element, and writes the file back
        in place until the user declines to continue.
        """
        while True:
            # Getting files from user
            xml_filename = input('Enter path to xml file : ')
            # Parsing files
            csv_content = [{'field1': 'ElementSub1', 'field2': 'something'},
                           {'field1': 'ElementSub1', 'field2': 'something'},
                           {'field1': 'ElementSub2', 'field2': 'something'}]  # Replaces csv file that I use
            # NOTE(review): per the lxml FAQ, pretty_print only re-indents when
            # the parser dropped whitespace-only text nodes, i.e. when parsing
            # with etree.XMLParser(remove_blank_text=True). Not changed here.
            tree = etree.parse(xml_filename)
            root = tree.getroot()
            elements = root.find('Elements')
            for element in elements:
                # findtext() yields None for a missing Sub1 instead of raising.
                if element.findtext('Sub1') not in ('ElementSub1', 'ElementSub2'):
                    continue
                # The Sub5 lookup does not depend on the CSV row: do it once.
                sub5 = element.find('Sub5')
                if sub5 is None:
                    continue
                for line in csv_content:
                    Sub(root=sub5, sub5_sub=line['field2'])
            tree.write(xml_filename, pretty_print=True, encoding='utf-8')
            if not input('Continue? (Y) Quit (n)').upper().startswith('Y'):
                break
            elements.clear()

    @staticmethod
    def get_x(x: int) -> str:
        """Return ``floor(9999 / 9 * x)`` as a string; *x* is coerced to int."""
        if not isinstance(x, int):
            x = int(x)
        return str(int(floor(9999 / 9 * x)))

    @staticmethod
    def get_y(y: int) -> str:
        """Return ``floor(999 / 9 * y)`` as a string; *y* is coerced to int."""
        if not isinstance(y, int):
            y = int(y)
        return str(int(floor(999 / 9 * y)))

    def quit(self) -> None:
        """Terminate the program."""
        # The quit() builtin is injected by the site module and may be absent;
        # raising SystemExit is what it does underneath.
        raise SystemExit
# Script entry point: run the interactive loop, then terminate.
if __name__ == "__main__":
    app = Generator()
    app.mainloop()
    app.quit()
Here is what it outputs:
<Root xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema">
<Elements>
<Element>
<Sub1>ElementSub1</Sub1>
<Sub5>
<Sub5_Sub xsi:type="SomeType"/>
<Sub5_Sub xsi:type="SomeType"/><Sub5_Sub xsi:type="SomeType"/><Sub5_Sub xsi:type="SomeType"/></Sub5>
</Element>
<Element>
<Sub1>ElementSub1</Sub1>
<Sub5>
<Sub5_Sub xsi:type="SomeType"/>
<Sub5_Sub xsi:type="SomeType"/>
<Sub5_Sub xsi:type="SomeType"/><Sub5_Sub xsi:type="SomeType"/><Sub5_Sub xsi:type="SomeType"/></Sub5>
</Element>
<Element>
<Sub1>ElementSub1</Sub1>
</Element>
</Elements>
</Root>
For some reason, this piece of code does what I want but my real code doesn't. I've come to realize that it does put a prefix on some sub elements with the type attribute, but not all and on those it puts the prefix, it isn't always just 'xsi:'. I found a quick and dirty way to fix this problem which is less than ideal (find and replace through the file for xsi-type -> accepted by lxml's api to xsi:type). What still isn't working though is that it's all printed out in one line despite the pretty_print parameter being true.

I just recently encountered this scenario and was able to successfully create an attribute with the xsi:
# A QName pairs the namespace URI with the local attribute name.
qname = etree.QName("http://www.w3.org/2001/XMLSchema-instance", "type")
# The attribute dict was missing its closing brace.
element = etree.Element('Element', {qname: "some type"})
root.append(element)
this outputs something like
<Element xsi:type="some type">

Related

How to find a XML child element which has a default namespace in Python?

My goal is to find the XML child element which has a default name.
XML:
<?xml version='1.0' encoding='UTF-8'?>
<all:config xmlns:all="urn:base:1.0">
<interfaces xmlns="urn:ietf-interfaces">
<interface>
<name>eth0</name>
<enabled>true</enabled>
<ipv4 xmlns="urn:b-ip">
<enabled>true</enabled>
</ipv4>
<tagging xmlns="urn:b:interfaces:1.0">true</tagging>
<mac xmlns="urn:b:interfaces:1.0">00:00:10:00:00:11</mac>
</interface>
</interfaces>
</all:config>
I want to find the following element:
<mac xmlns="urn:b:interfaces:1.0">00:00:10:00:00:11</mac>
and change mac's text.
I have the following questions:
What is the xpath of mac?
How can I find "mac" using xpath since it has the default namespace?
My code does not work:
def set_element_value(file_name, element, new_value, order):
    """Set the text of the ``mac`` element at position *order* in *file_name*.

    :param file_name: path of the XML file to parse.
    :param element: unused by this implementation.
    :param new_value: value written (as ``str``) into the selected element.
    :param order: zero-based index among the matched ``mac`` elements.

    NOTE(review): the parsed tree is modified in memory only; nothing is
    written back or returned.
    """
    tree = etree.parse(file_name)
    root = tree.getroot()
    # Any prefix works as long as it maps to the element's namespace URI.
    my_own_namespace_mapping = {'prefix': 'urn:b:interfaces:1.0'}
    matches = root.xpath('.//prefix:mac', namespaces=my_own_namespace_mapping)
    matches[0].text = "aaa"
    # enumerate() supplies the counter that was previously read before
    # being assigned.
    for count_order, ele in enumerate(matches):
        if count_order == order:
            ele.text = str(new_value)
def main():
filename ="./template/b.xml"
element = ".//interfaces/interface/mac"
new_value = "10"
order = 0
set_element_value(filename, element, new_value, order)
if __name__ == '__main__':
main()
I tried to dig out in the stackoverflow, but no similar answer.
Could you please give me some tips?
Thank you!
Thanks to Jack's methods, I fixed this issue:
The new code:
def set_element_value(file_name, element, new_value, order):
    """Print the first ``mac`` element of *file_name*, set it to "10", dump the tree.

    The element is matched by local name, so its namespace is ignored.
    *element*, *new_value* and *order* are accepted but not used.
    """
    tree = etree.parse(file_name)
    tag_list = tree.xpath('.//*[local-name()="mac"]')
    first_match = tag_list[0]
    print("tag:", tag_list, " and tag value:", first_match.text)
    first_match.text = "10"
    serialized = etree.tostring(tree)
    xml_string = serialized.decode('utf-8')
    print(xml_string)
def main():
filename ="./template/b.xml"
element = "mac"
new_value = "10"
order = 1
set_element_value(filename, element, new_value, order)
if __name__ == '__main__':
main()
output:
tag: [<Element {urn:b:interfaces:1.0}mac at 0x298fc4b69c0>] and tag value: 10
<all:config xmlns:all="urn:base:1.0">
<interfaces xmlns="urn:ietf-interfaces">
<interface>
<name>eth0</name>
<enabled>true</enabled>
<ipv4 xmlns="urn:b-ip">
<enabled>true</enabled>
</ipv4>
<tagging xmlns="urn:b:interfaces:1.0">true</tagging>
<mac xmlns="urn:b:interfaces:1.0">10</mac>
</interface>
</interfaces>
</all:config>
Your code seems to be a little more complicated than necessary. Try the following to get to the mac address:
ns = {"x":"urn:b:interfaces:1.0"}
root.xpath('//x:mac/text()',namespaces=ns)[0]
or if you don't want to deal with namespaces:
root.xpath('//*[local-name()="mac"]/text()')[0]
Output in either case is
00:00:10:00:00:11

How to parse xml file with <pair key = "..."> </pair> format

I hope to parse a '.xml' file using python. The format of the file is as follows:
<root><dm_log_packet>
<pair key ="type_id">LTE_PHY_Serv_Cell_Measurement</pair>
</dm_log_packet>
</root>
I tried to parse it using ElementTree but failed.
Here is my code:
from xml.etree import ElementTree
class Log:
def __init__(self,type_id=None):
self.type_id=type_id
def __str__(self):
return self.type_id
roota=ElementTree.parse("file.xml")
log_file = roota.findall("dm_log_packet")
lo = []
for aa in log_file:
log = Log()
log.type_id = aa.find("type_id").text
lo.append(log)
I expect to parse each pair, but it can't do it like I have a <type_id>...</type_id> pair.
You can use BeautifulSoup
# Sample document from the question, embedded as a string.
xml = """
<root>
<dm_log_packet>
<pair key ="type_id">LTE_PHY_Serv_Cell_Measurement</pair>
</dm_log_packet>
</root>
"""
# NOTE(review): BeautifulSoup is not imported in this snippet.
soup_obj = BeautifulSoup(xml)
# NOTE(review): the sample above contains no <foo>, <bar> or <type> elements
# and no 'foobar' attribute, so this lookup does not match the document shown;
# it looks copied from a different example -- confirm the intended path.
soup_obj.html.body.foo.bar.findAll('type')[0]['foobar']
Output will be
'1'
More Descriptive Answer
.find() and .findall() expect XPath as arguments, plain strings like "dm_log_packet" will not find anything.
from xml.etree import ElementTree
class Log:
    """Record holding the ``type_id`` text of one log packet."""

    def __init__(self, type_id=None):
        self.type_id = type_id

    def __str__(self):
        # The string form is the stored type_id itself.
        return self.type_id
tree = ElementTree.parse("file.xml")
lo = []
for dm_log_packet in tree.findall(".//dm_log_packet"):
    # XPath attribute predicates use '@'; select the <pair> whose key
    # attribute is "type_id".
    pair = dm_log_packet.find("./pair/[@key='type_id']")
    # find() returns None when the packet has no such <pair>.
    if pair is not None:
        lo.append(Log(pair.text))
Note that dm_log_packet.find("./pair/[@key='type_id']") will return None when there is no <pair key="type_id">, hence the extra check.

Recursive XML parsing python using ElementTree

I'm trying to parse the XML below using Python ElementTree to produce the output shown further down. I'm trying to write modules for the top elements to print them. However, it is slightly tricky, as a category element may or may not have a property, and a category element may have another category element inside.
I've referred to previous question in this topic, but they did not consist of nested elements with same name
My Code:
http://pastebin.com/Fsv2Xzqf
work.xml:
<suite id="1" name="MainApplication">
<displayNameKey>my Application</displayNameKey>
<displayName>my Application</displayName>
<application id="2" name="Sub Application1">
<displayNameKey>sub Application1</displayNameKey>
<displayName>sub Application1</displayName>
<category id="2423" name="about">
<displayNameKey>subApp.about</displayNameKey>
<displayName>subApp.about</displayName>
<category id="2423" name="comms">
<displayNameKey>subApp.comms</displayNameKey>
<displayName>subApp.comms</displayName>
<property id="5909" name="copyright" type="string_property" width="40">
<value>2014</value>
</property>
<property id="5910" name="os" type="string_property" width="40">
<value>Linux 2.6.32-431.29.2.el6.x86_64</value>
</property>
</category>
<property id="5908" name="releaseNumber" type="string_property" width="40">
<value>9.1.0.3.0.54</value>
</property>
</category>
</application>
</suite>
Output should be as below:
Suite: MainApplication
Application: Sub Application1
Category: about
property: releaseNumber | 9.1.0.3.0.54
category: comms
property: copyright | 2014
property: os | Linux 2.6.32-431.29.2.el6.x86_64
Any pointers in right direction would be of help.
import xml.etree.ElementTree as ET
tree = ET.ElementTree(file='work.xml')
# Current indentation, in spaces; printRecur adjusts it while recursing.
indent = 0
# Tags that are skipped entirely (including their children).
ignoreElems = ['displayNameKey', 'displayName']


def printRecur(root):
    """Recursively prints the tree.

    Each element is printed as ``Tag: name`` (falling back to the element
    text when it has no ``name`` attribute), indented by four more spaces
    per nesting level.
    """
    # ``global`` must come before any use of the name in the function.
    global indent
    if root.tag in ignoreElems:
        return
    print(' ' * indent + '%s: %s' % (root.tag.title(), root.attrib.get('name', root.text)))
    indent += 4
    # Iterating an element yields its children (getchildren() was removed
    # in Python 3.9).
    for elem in root:
        printRecur(elem)
    indent -= 4
root = tree.getroot()
printRecur(root)
OUTPUT:
Suite: MainApplication
Application: Sub Application1
Category: about
Category: comms
Property: copyright
Value: 2014
Property: os
Value: Linux 2.6.32-431.29.2.el6.x86_64
Property: releaseNumber
Value: 9.1.0.3.0.54
This is closest I could get in 5 minutes. You should just recursively call a processor function and that would take care. You can improve on from this point :)
You can also define handler function for each tag and put all of them in a dictionary for easy lookup. Then you can check if you have an appropriate handler function for that tag, then call that else just continue with blindly printing. For example:
def handle_property(root):
    """Takes property root element and prints the values.

    Prints ``Property: <name> | <value>, <value>...`` at the current
    indentation, joining the text of every ``value`` child.
    """
    data = ' ' * indent + '%s: %s ' % (root.tag.title(), root.attrib['name'])
    # A <value/> without text contributes an empty string.
    values = [elem.text or '' for elem in root if elem.tag == 'value']
    print(data + '| %s' % (', '.join(values)))


# Maps a tag name to the function that prints elements with that tag.
# Defined after the handlers so it can reference them directly.
HANDLERS = {
    'property': handle_property,
    # <tag_name>: <handler_function>,
}


# printRecur would get modified accordingly.
def printRecur(root):
    """Recursively prints the tree, delegating to HANDLERS where available."""
    global indent
    if root.tag in ignoreElems:
        return
    handler = HANDLERS.get(root.tag)
    if handler is not None:
        # The handler prints the element together with its children, so
        # they are not visited again (matches the output shown below).
        handler(root)
        return
    print(' ' * indent + '%s: %s' % (root.tag.title(), root.attrib.get('name', root.text)))
    indent += 4
    for elem in root:
        printRecur(elem)
    indent -= 4
Output with above:
Suite: MainApplication
Application: Sub Application1
Category: about
Category: comms
Property: copyright | 2014
Property: os | Linux 2.6.32-431.29.2.el6.x86_64
Property: releaseNumber | 9.1.0.3.0.54
I find this very useful rather than putting tons of if/else in the code.
If you want a barebones XML recursive tree parser snippet:
from xml.etree import ElementTree
tree = ElementTree.parse('english_saheeh.xml')
root = tree.getroot()
def walk_tree_recursive(root):
#do whatever with .tags here
for child in root:
walk_tree_recursive(child)
walk_tree_recursive(root)
if you want a kind of universal xml importer, creating a record per xml element
import pandas as pd
import xml.etree.ElementTree as ET
tree = ET.parse('file.xml')
root = tree.getroot()
def rij(elem, level, tags, rtag, mtag, keys, rootkey, data):
    """Flatten *elem* and its descendants into rows appended to *data*.

    Each row is ``[rootkey, tags, rtag, otag, mtag, lkey, keys, text]``:
    ``tags`` is the path of local tag names down to the element, ``otag``
    the parent's local tag, ``mtag`` the element's own local tag.

    :param elem: element to process.
    :param level: nesting depth; the initial call passes 0.
    :param tags: list used as a stack of local tag names (mutated, restored
        on return).
    :param rtag: local tag of the enclosing level-1 element.
    :param mtag: local tag of the parent element.
    :param keys: attribute value(s) inherited from the nearest ancestor
        outside level 1 that had attributes.
    :param rootkey: attribute value(s) of the enclosing level-1 element.
    :param data: list receiving the rows.
    :return: *data*.

    NOTE(review): the original snippet lost its indentation; the nesting
    below is a reconstruction -- confirm against the author's intent.
    """
    otag = mtag
    # Strip a leading "{namespace}" from the tag.
    mtag = elem.tag[elem.tag.rfind('}') + 1:]
    tags.append(mtag)
    attr_names = list(elem.keys())
    # Always bound, so the row below never hits an undefined name.
    lkey = []
    if level == 1:
        rtag = mtag
        if len(attr_names) > 1:
            rootkey = [elem.attrib.get(key) for key in attr_names]
        elif attr_names:
            rootkey = elem.attrib.get(attr_names[0])
    else:
        if len(attr_names) > 1:
            keys = [elem.attrib.get(key) for key in attr_names]
        elif attr_names:
            lkey = attr_names[0]
            keys = elem.attrib.get(lkey)
    if elem.text is not None:
        # NOTE(review): this literal is compared verbatim; the whitespace
        # inside it may have been collapsed by the page formatting.
        if elem.text != '\n ':
            # Store a snapshot of the path: the live list keeps changing.
            data.append([rootkey, list(tags), rtag, otag, mtag, lkey, keys, elem.text])
    else:
        data.append([rootkey, list(tags), rtag, otag, mtag, lkey, keys, ''])
    # Iterating an element yields its children (getchildren() was removed
    # in Python 3.9).
    for chil in elem:
        data = rij(chil, level + 1, tags, rtag, mtag, keys, rootkey, data)
    # Pop this element's own entry; remove() would drop the first element
    # with the same name instead of the last.
    tags.pop()
    return data
data = rij(root,0,[],'','', [],[],[])

xmlns namespace breaking lxml

I am trying to open an xml file, and get values from certain tags. I have done this a lot but this particular xml is giving me some issues. Here is a section of the xml file:
<?xml version='1.0' encoding='UTF-8'?>
<package xmlns="http://apple.com/itunes/importer" version="film4.7">
<provider>filmgroup</provider>
<language>en-GB</language>
<actor name="John Smith" display="Doe John"></actor>
</package>
And here is a sample of my python code:
metadata = '/Users/mylaptop/Desktop/Python/metadata.xml'
from lxml import etree
parser = etree.XMLParser(remove_blank_text=True)
open(metadata)
tree = etree.parse(metadata, parser)
root = tree.getroot()
for element in root.iter(tag='provider'):
providerValue = tree.find('//provider')
providerValue = providerValue.text
print providerValue
tree.write('/Users/mylaptop/Desktop/Python/metadataDone.xml', pretty_print = True, xml_declaration = True, encoding = 'UTF-8')
When I run this it can't find the provider tag or its value. If I remove xmlns="http://apple.com/itunes/importer" then all work as expected.
My question is how can I remove this namespace, as i'm not at all interested in this, so I can get the tag values I need using lxml?
The provider tag is in the http://apple.com/itunes/importer namespace, so you either need to use the fully qualified name
{http://apple.com/itunes/importer}provider
or use one of the lxml methods that has the namespaces parameter, such as root.xpath. Then you can specify it with a namespace prefix (e.g. ns:provider):
from lxml import etree
parser = etree.XMLParser(remove_blank_text=True)
tree = etree.parse(metadata, parser)
root = tree.getroot()
# Any prefix can be used as long as it maps to the document's namespace URI.
namespaces = {'ns':'http://apple.com/itunes/importer'}
# '@name' selects the name attribute; the union returns provider texts and
# actor names in document order.
items = iter(root.xpath('//ns:provider/text()|//ns:actor/@name',
                        namespaces=namespaces))
# Passing the same iterator twice to zip() consumes the results in pairs.
for provider, actor in zip(*[items]*2):
    print(provider, actor)
yields
('filmgroup', 'John Smith')
Note that the XPath used above assumes that <provider> and <actor> elements always appear in alternation. If that is not true, then there are of course ways to handle it, but the code becomes a bit more verbose:
# Walk each package and query providers and actors separately, so the two
# kinds of element no longer need to alternate.
for package in root.xpath('//ns:package', namespaces=namespaces):
    for provider in package.xpath('ns:provider', namespaces=namespaces):
        providerValue = provider.text
        print(providerValue)
    for actor in package.xpath('ns:actor', namespaces=namespaces):
        print(actor.attrib['name'])
My suggestion is to not ignore the namespace but, instead, to take it into account. I wrote some related functions (copied with slight modification) for my work on the django-quickbooks library. With these functions, you should be able to do this:
providers = getels(root, 'provider', ns='http://apple.com/itunes/importer')
Here are those functions:
def get_tag_with_ns(tag_name, ns):
    """Return *tag_name* qualified with namespace *ns* as ``{ns}tag_name``.

    When *ns* is ``None`` or empty the bare tag name is returned, since
    ``'{None}tag'`` can never match an element.
    """
    if not ns:
        return tag_name
    return '{%s}%s' % (ns, tag_name)
def getel(elt, tag_name, ns=None):
    """ Gets the first tag that matches the specified tag_name taking into
    account the QB namespace.

    :param elt: element whose children are searched.
    :param tag_name: local name of the tag to look for.
    :param ns: The namespace to use if not using the default one for
        django-quickbooks.
    :type ns: string
    :raises TagNotFound: when no matching tag exists.
    """
    qualified_name = get_tag_with_ns(tag_name, ns=ns)
    match = elt.find(qualified_name)
    if match is not None:
        return match
    raise TagNotFound('Could not find tag by name "%s"' % tag_name)
def getels(elt, *path, **kwargs):
    """ Gets the first set of elements found at the specified path.

    Every component of *path* except the last is resolved with ``getel``
    (first match); the last component is resolved with ``findall``.

    :param elt: element the path starts from.
    :param path: one or more tag names, outermost first.
    :param ns: optional keyword; the namespace applied to every tag name.
    :raises IndexError: when *path* is empty.

    Example:
    >>> xml = (
    "<root>" +
    "<item>" +
    "<id>1</id>" +
    "</item>" +
    "<item>" +
    "<id>2</id>" +
    "</item>" +
    "</root>")
    >>> el = etree.fromstring(xml)
    >>> getels(el, 'root', 'item', ns='correct/namespace')
    [<Element item>, <Element item>]
    """
    # ns is optional, mirroring getel's ns=None default.
    ns = kwargs.get('ns')
    for tag_name in path[:-1]:
        elt = getel(elt, tag_name, ns=ns)
    return elt.findall(get_tag_with_ns(path[-1], ns=ns))

Accessing XMLNS attribute with Python Elementree?

How can one access NS attributes through using ElementTree?
With the following:
<data xmlns="http://www.foo.net/a" xmlns:a="http://www.foo.net/a" book="1" category="ABS" date="2009-12-22">
When I try to root.get('xmlns') I get back None, Category and Date are fine, Any help appreciated..
I think element.tag is what you're looking for. Note that your example is missing a trailing slash, so it's unbalanced and won't parse. I've added one in my example.
>>> from xml.etree import ElementTree as ET
>>> data = '''<data xmlns="http://www.foo.net/a"
... xmlns:a="http://www.foo.net/a"
... book="1" category="ABS" date="2009-12-22"/>'''
>>> element = ET.fromstring(data)
>>> element
<Element {http://www.foo.net/a}data at 1013b74d0>
>>> element.tag
'{http://www.foo.net/a}data'
>>> element.attrib
{'category': 'ABS', 'date': '2009-12-22', 'book': '1'}
If you just want to know the xmlns URI, you can split it out with a function like:
def tag_uri_and_name(elem):
    """Split ``elem.tag`` into ``(namespace_uri, local_name)``.

    The uri is ``None`` when the tag carries no ``{uri}`` prefix.
    """
    if elem.tag[0] != "{":
        return None, elem.tag
    # Drop the opening brace, then cut at the closing one.
    uri, _, local_name = elem.tag[1:].partition("}")
    return uri, local_name
For much more on namespaces and qualified names in ElementTree, see effbot's examples.
Look at the effbot namespaces documentation/examples; specifically the parse_map function. It shows you how to add an *ns_map* attribute to each element which contains the prefix/URI mapping that applies to that specific element.
However, that adds the ns_map attribute to all the elements. For my needs, I found I wanted a global map of all the namespaces used to make element look up easier and not hardcoded.
Here's what I came up with:
import elementtree.ElementTree as ET
def parse_and_get_ns(file):
    """Parse *file* and collect every namespace prefix declared in it.

    :param file: filename or file object accepted by ``ET.iterparse``.
    :return: ``(tree, ns)`` where ``tree`` is an ``ElementTree`` of the
        document and ``ns`` maps each prefix to ``"{uri}"``, ready to be
        prepended to a local tag name.
    :raises KeyError: if one prefix is bound to two different URIs.
    """
    events = "start", "start-ns"
    root = None
    ns = {}
    for event, elem in ET.iterparse(file, events):
        if event == "start-ns":
            prefix, uri = elem
            qualified = "{%s}" % uri
            # Compare like with like: the map stores the braced form, so
            # re-declaring a prefix with the same URI is not a conflict.
            if prefix in ns and ns[prefix] != qualified:
                # NOTE: It is perfectly valid to have the same prefix refer
                # to different URI namespaces in different parts of the
                # document. This exception serves as a reminder that this
                # solution is not robust. Use at your own peril.
                raise KeyError("Duplicate prefix with different URI found.")
            ns[prefix] = qualified
        elif event == "start":
            # The first element started is the document root.
            if root is None:
                root = elem
    return ET.ElementTree(root), ns
With this you can parse an xml file and obtain a dict with the namespace mappings. So, if you have an xml file like the following ("my.xml"):
<?xml version="1.0" encoding="UTF-8" ?>
<rss version="2.0"
xmlns:content="http://purl.org/rss/1.0/modules/content/"
xmlns:dc="http://purl.org/dc/elements/1.1/"
>
<feed>
<item>
<title>Foo</title>
<dc:creator>Joe McGroin</dc:creator>
<description>etc...</description>
</item>
</feed>
</rss>
You will be able to use the xml namespaces and get info for elements like dc:creator:
>>> tree, ns = parse_and_get_ns("my.xml")
>>> ns
{u'content': '{http://purl.org/rss/1.0/modules/content/}',
u'dc': '{http://purl.org/dc/elements/1.1/}'}
>>> item = tree.find("/feed/item")
>>> item.findtext(ns['dc']+"creator")
'Joe McGroin'
Try this:
import xml.etree.ElementTree as ET
import re
import sys
with open(sys.argv[1]) as f:
root = ET.fromstring(f.read())
xmlns = ''
m = re.search('{.*}', root.tag)
if m:
xmlns = m.group(0)
print(root.find(xmlns + 'the_tag_you_want').text)

Categories

Resources