from lxml import etree
import StringIO
data = StringIO.StringIO('<root xmlns="http://some.random.schema"><a>One</a><a>Two</a><a>Three</a></root>')
docs = etree.iterparse(data, tag='a')
a, b = docs.next()
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "iterparse.pxi", line 478, in lxml.etree.iterparse.__next__ (src/lxml/lxml.etree.c:95348)
File "iterparse.pxi", line 534, in lxml.etree.iterparse._read_more_events (src/lxml/lxml.etree.c:95938)
StopIteration
This works fine until I add the namespace to the root node. Any ideas for a workaround, or the correct way of doing this?
I need to be event-driven because the files are very large.
When there is a namespace attached, the tag isn't a, it's {http://some.random.schema}a. Try this (Python 3):
from lxml import etree
from io import BytesIO
xml = '''\
<root xmlns="http://some.random.schema">
<a>One</a>
<a>Two</a>
<a>Three</a>
</root>'''
data = BytesIO(xml.encode())
docs = etree.iterparse(data, tag='{http://some.random.schema}a')
for event, elem in docs:
    print(f'{event}: {elem}')
or, in Python 2:
from lxml import etree
from StringIO import StringIO
xml = '''\
<root xmlns="http://some.random.schema">
<a>One</a>
<a>Two</a>
<a>Three</a>
</root>'''
data = StringIO(xml)
docs = etree.iterparse(data, tag='{http://some.random.schema}a')
for event, elem in docs:
    print event, elem
This prints something like:
end: <Element {http://some.random.schema}a at 0x10941e730>
end: <Element {http://some.random.schema}a at 0x10941e8c0>
end: <Element {http://some.random.schema}a at 0x10941e960>
As @mihail-shcheglov pointed out, the wildcard * can also be used, which works for any namespace, or none:
from lxml import etree
from io import BytesIO
xml = '''\
<root xmlns="http://some.random.schema">
<a>One</a>
<a>Two</a>
<a>Three</a>
</root>'''
data = BytesIO(xml.encode())
docs = etree.iterparse(data, tag='{*}a')
for event, elem in docs:
    print(f'{event}: {elem}')
See lxml.etree docs for more.
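Since the question mentions very large files, note that iterparse still accumulates the whole tree in memory unless you clear elements as you go. A minimal sketch, reusing the namespaced tag from above ('big.xml' is a hypothetical file name):
from lxml import etree

# Stream the document; free each handled element so memory use stays flat.
for event, elem in etree.iterparse('big.xml', tag='{http://some.random.schema}a'):
    print(elem.text)
    elem.clear()
    # also drop already-handled siblings hanging off the root
    while elem.getprevious() is not None:
        del elem.getparent()[0]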
Why not use a regular expression?
1)
Using lxml is slower than using a regex.
from time import clock
import StringIO
from lxml import etree
times1 = []
for i in xrange(1000):
    data = StringIO.StringIO('<root ><a>One</a><a>Two</a><a>Three\nlittle pigs</a><b>Four</b><a>another</a></root>')
    te = clock()
    docs = etree.iterparse(data, tag='a')
    tf = clock()
    times1.append(tf-te)
print min(times1)
print [etree.tostring(y) for x,y in docs]
import re
regx = re.compile('<a>[\s\S]*?</a>')
times2 = []
for i in xrange(1000):
    data = StringIO.StringIO('<root ><a>One</a><a>Two</a><a>Three\nlittle pigs</a><b>Four</b><a>another</a></root>')
    te = clock()
    li = regx.findall(data.read())
    tf = clock()
    times2.append(tf-te)
print min(times2)
print li
result
0.000150298431784
['<a>One</a>', '<a>Two</a>', '<a>Three\nlittle pigs</a>', '<a>another</a>']
2.40253998762e-05
['<a>One</a>', '<a>Two</a>', '<a>Three\nlittle pigs</a>', '<a>another</a>']
0.000150298431784 / 2.40253998762e-05 is about 6.25, so in this test lxml is 6.25 times slower than the regex.
2)
Namespaces are no problem either:
import StringIO
import re
regx = re.compile('<a>[\s\S]*?</a>')
data = StringIO.StringIO('<root xmlns="http://some.random.schema"><a>One</a><a>Two</a><a>Three\nlittle pigs</a><b>Four</b><a>another</a></root>')
print regx.findall(data.read())
result
['<a>One</a>', '<a>Two</a>', '<a>Three\nlittle pigs</a>', '<a>another</a>']
Related
from xml.etree import ElementTree
t = """<collection xmlns:y="http://tail-f.com/ns/rest">
<appliance xmlns="http://versa-networks.com/vnms/nms">
<uuid>088fbb70-40d1-4aaf-8ea3-590fd8238828</uuid>
<name>SRVDHCPE1</name>
<num-cpus>0</num-cpus>
<memory-size>0</memory-size>
<num-nics>4</num-nics>
</appliance>
<appliance xmlns="http://versa-networks.com/vnms/nms">
<uuid>088fbb70-40d1-4aaf-8ea3-590fd8238828</uuid>
<name>SRVDHCPE2</name>
<num-cpus>0</num-cpus>
<memory-size>0</memory-size>
<num-nics>4</num-nics>
</appliance>
</collection>"""
dom = ElementTree.fromstring(t)
for n in dom.findall("collection/appliance/name"):
    print(n.text)
I'm looking for all the names, but nothing is printed. What am I doing wrong here?
Your case is definitely related to Parsing XML with Namespaces:
dom = ElementTree.fromstring(t)
ns = {'rest': 'http://tail-f.com/ns/rest', 'nms': 'http://versa-networks.com/vnms/nms'}
for n in dom.findall("nms:appliance/nms:name", ns):
    print(n.text)
The output:
SRVDHCPE1
SRVDHCPE2
You need to namespace your selectors:
from xml.etree import ElementTree
from xml.etree.ElementTree import Element
t = """<collection xmlns:y="http://tail-f.com/ns/rest">
<appliance xmlns="http://versa-networks.com/vnms/nms">
<uuid>088fbb70-40d1-4aaf-8ea3-590fd8238828</uuid>
<name>SRVDHCPE1</name>
<num-cpus>0</num-cpus>
<memory-size>0</memory-size>
<num-nics>4</num-nics>
</appliance>
<appliance xmlns="http://versa-networks.com/vnms/nms">
<uuid>088fbb70-40d1-4aaf-8ea3-590fd8238828</uuid>
<name>SRVDHCPE2</name>
<num-cpus>0</num-cpus>
<memory-size>0</memory-size>
<num-nics>4</num-nics>
</appliance>
</collection>"""
if __name__ == '__main__':
    dom: Element = ElementTree.fromstring(t)
    namespaces = {'n': 'http://versa-networks.com/vnms/nms'}
    for name in dom.findall("./n:appliance/n:name", namespaces=namespaces):
        print(name.text)
which prints:
SRVDHCPE1
SRVDHCPE2
For reference:
https://docs.python.org/3.7/library/xml.etree.elementtree.html#parsing-xml-with-namespaces
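As an aside, ElementTree also accepts fully qualified {uri}tag names directly in the path, so the prefix map is optional; a minimal sketch, reusing t from the question above:
from xml.etree import ElementTree

# Equivalent lookup with fully qualified tag names; no prefix map needed.
uri = '{http://versa-networks.com/vnms/nms}'
dom = ElementTree.fromstring(t)  # t as defined in the question
for name in dom.findall('./%sappliance/%sname' % (uri, uri)):
    print(name.text)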
Printing an lxml.objectify.ObjectifiedElement just prints a blank line, so I have to access it via its tags, and when I don't know the tags of the response, I'm just guessing.
How do I print the entire object, showing children names and values?
As requested, here is the code I have. Not sure how much it helps, but:
from amazonproduct import API
api = API('xxxxx', 'xxxxx', 'us', 'xxxx')
result = api.item_lookup('B00H8U93JO', ResponseGroup='OfferSummary')
print result
Using lxml.etree.tostring() seems to work, although the output is not prettified:
>>> from lxml import etree
>>> from lxml import objectify
>>> raw = '''<root>
... <foo>foo</foo>
... <bar>bar</bar>
... </root>'''
>>> root = objectify.fromstring(raw)
>>> print type(root)
<type 'lxml.objectify.ObjectifiedElement'>
>>> print etree.tostring(root)
<root><foo>foo</foo><bar>bar</bar></root>
In response to har07: you can use minidom to prettify the output.
from lxml import objectify, etree
from xml.dom import minidom
def pretty_print(elem):
    xml = etree.tostring(elem)
    pretty = minidom.parseString(xml).toprettyxml(indent=' ')
    print(pretty)
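For what it's worth, lxml can also prettify on its own via the pretty_print flag of etree.tostring, so minidom isn't strictly required; a minimal sketch:
from lxml import etree, objectify

root = objectify.fromstring('<root><foo>foo</foo><bar>bar</bar></root>')
# Ask lxml itself to indent the serialized tree.
print(etree.tostring(root, pretty_print=True))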
I'm using a Raspberry Pi 1B+ w/ Debian Linux:
Linux rbian 3.18.0-trunk-rpi #1 PREEMPT Debian 3.18.5-1~exp1+rpi16 (2015-03-28) armv6l GNU/Linux
As part of a larger Python program I'm using this code:
#!/usr/bin/env python
import time
from urllib2 import Request, urlopen
from bs4 import BeautifulSoup
_url="http://xml.buienradar.nl/"
s1 = time.time()
req = Request(_url)
print "Request = {0}".format(time.time() - s1)
s2 = time.time()
response = urlopen(req)
print "URLopen = {0}".format(time.time() - s2)
s3 = time.time()
output = response.read()
print "Read = {0}".format(time.time() - s3)
s4 = time.time()
soup = BeautifulSoup(output)
print "Soup (1) = {0}".format(time.time() - s4)
s5 = time.time()
MSwind = str(soup.buienradarnl.weergegevens.actueel_weer.weerstations.find(id=6350).windsnelheidms)
GRwind = str(soup.buienradarnl.weergegevens.actueel_weer.weerstations.find(id=6350).windrichtinggr)
ms = MSwind.replace("<"," ").replace(">"," ").split()[1]
gr = GRwind.replace("<"," ").replace(">"," ").split()[1]
print "Extracting info = {0}".format(time.time() - s5)
s6 = time.time()
soup = BeautifulSoup(urlopen(_url))
print "Soup (2) = {0}".format(time.time() - s6)
s5 = time.time()
MSwind = str(soup.buienradarnl.weergegevens.actueel_weer.weerstations.find(id=6350).windsnelheidms)
GRwind = str(soup.buienradarnl.weergegevens.actueel_weer.weerstations.find(id=6350).windrichtinggr)
ms = MSwind.replace("<"," ").replace(">"," ").split()[1]
gr = GRwind.replace("<"," ").replace(">"," ").split()[1]
print "Extracting info = {0}".format(time.time() - s5)
When I run it, I get this output:
Request = 0.00394511222839
URLopen = 0.0579500198364
Read = 0.0346400737762
Soup (1) = 23.6777830124
Extracting info = 0.183892965317
Soup (2) = 36.6107468605
Extracting info = 0.382317781448
So, the BeautifulSoup command takes about half a minute to process the _url.
I would really love it if this could be done in under 10 seconds.
Any suggestions that would significantly speed up the code (by at least 60%) would be extremely welcome.
Install the lxml library; once it is installed, BeautifulSoup will use it as the default parser.
lxml parses the page using the libxml2 C library, which is significantly faster than the default html.parser backend, implemented in pure Python.
You can then also parse the page as XML instead of as HTML:
soup = BeautifulSoup(output, 'xml')
Parsing your given page with lxml should be faster; I can parse the page almost 50 times per second:
>>> timeit("BeautifulSoup(output, 'xml')", 'from __main__ import BeautifulSoup, output', number=50)
1.1700470447540283
Still, I wonder if you are missing some other Python acceleration libraries, as I certainly cannot reproduce your results even with the built-in parser:
>>> timeit("BeautifulSoup(output, 'html.parser')", 'from __main__ import BeautifulSoup, output', number=50)
1.7218239307403564
Perhaps you are memory constrained and the large-ish document causes your OS to swap memory a lot? Memory swapping (writing pages to disk and loading other pages from disk) can bring even the fastest programs to a grinding halt.
Note that instead of using str() on tag elements and splitting off the tags, you can get the value from a tag simply by using the .string attribute:
station_6350 = soup.buienradarnl.weergegevens.actueel_weer.weerstations.find(id=6350)
ml = station_6350.windsnelheidMS.string
gr = station_6350.windrichtingGR.string
If you are using the XML parser, take into account that tag names must match case exactly (XML is case-sensitive, whereas HTML is a case-insensitive markup language).
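For instance, a small illustrative sketch of that difference (the document snippet is made up; the html.parser backend lowercases tag names, while the xml parser preserves case):
from bs4 import BeautifulSoup

doc = '<stations><windsnelheidMS>4.64</windsnelheidMS></stations>'
soup = BeautifulSoup(doc, 'xml')
print(soup.find('windsnelheidMS'))  # matches the tag
print(soup.find('windsnelheidms'))  # None: the XML parser is case-sensitive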
Since this is an XML document, another option would be to use the lxml ElementTree model; you can use XPath expressions to extract the data:
from lxml import etree
response = urlopen(_url)
for event, elem in etree.iterparse(response, tag='weerstation'):
    if elem.get('id') == '6350':
        ml = elem.find('windsnelheidMS').text
        gr = elem.find('windrichtingGR').text
        break
    # clear elements we are not interested in, adapted from
    # http://stackoverflow.com/questions/12160418/why-is-lxml-etree-iterparse-eating-up-all-my-memory
    elem.clear()
    for ancestor in elem.xpath('ancestor-or-self::*'):
        while ancestor.getprevious() is not None:
            del ancestor.getparent()[0]
This should only build the minimal object tree required, clearing out the weather stations you don't need as you go along the document.
Demo:
>>> from lxml import etree
>>> from urllib2 import urlopen
>>> _url = "http://xml.buienradar.nl/"
>>> response = urlopen(_url)
>>> for event, elem in etree.iterparse(response, tag='weerstation'):
...     if elem.get('id') == '6350':
...         ml = elem.find('windsnelheidMS').text
...         gr = elem.find('windrichtingGR').text
...         break
...     # clear elements we are not interested in
...     elem.clear()
...     for ancestor in elem.xpath('ancestor-or-self::*'):
...         while ancestor.getprevious() is not None:
...             del ancestor.getparent()[0]
...
>>> ml
'4.64'
>>> gr
'337.8'
Using requests and regular expressions can be a lot shorter and faster. For such relatively simple data gathering, regexes work fine.
#!/usr/bin/env python
from __future__ import print_function
import re
import requests
import time
_url = "http://xml.buienradar.nl/"
_regex = '<weerstation id="6391">.*?'\
'<windsnelheidMS>(.*?)</windsnelheidMS>.*?'\
'<windrichtingGR>(.*?)</windrichtingGR>'
s1 = time.time()
br = requests.get(_url)
print("Request = {0}".format(time.time() - s1))
s5 = time.time()
MSwind, GRwind = re.findall(_regex, br.text)[0]
print("Extracting info = {0}".format(time.time() - s5))
print('wind speed', MSwind, 'm/s')
print('wind direction', GRwind, 'degrees')
On my desktop (which is not a Raspberry Pi, though :-)) this runs very fast:
Request = 0.0723416805267334
Extracting info = 0.0009412765502929688
wind speed 2.35 m/s
wind direction 232.6 degrees
Of course, this particular regex would fail if the windsnelheidMS and windrichtingGR tags were reversed. But given that the XML is most probably computer-generated, that doesn't seem likely.
And there is a solution for that: first use a regex to capture the text between <weerstation id="6391"> and </weerstation>, then use two other regexes to find the wind speed and direction within that block.
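A minimal sketch of that two-step approach (station id and URL taken from the answer above; re.DOTALL is used so the matches can span line breaks):
from __future__ import print_function
import re
import requests

text = requests.get('http://xml.buienradar.nl/').text

# Step 1: isolate the block for the station of interest.
station = re.search(r'<weerstation id="6391">.*?</weerstation>', text, re.DOTALL)
if station is not None:
    block = station.group(0)
    # Step 2: pull each value out of the block, independent of tag order.
    ms = re.search(r'<windsnelheidMS>(.*?)</windsnelheidMS>', block).group(1)
    gr = re.search(r'<windrichtingGR>(.*?)</windrichtingGR>', block).group(1)
    print('wind speed', ms, 'm/s')
    print('wind direction', gr, 'degrees')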
Here are 2 similar XML files :
Long XML
<mynode>
<text>Blah</text>
<position>322,13</position>
</mynode>
Short XML
<mynode text="Blah" position="322,13" />
It seems that Python's minidom.parse doesn't like the short XML.
Is this short XML style supported by minidom?
Is it possible to write a single piece of code that will read both the short and the long XML?
from xml.dom import minidom
def getChild(n, v):
    for child in n.childNodes:
        if child.localName == v:
            yield child

def getValue(nodes, val):
    # try child elements first, then fall back to attributes
    res = None
    for n in nodes:
        for v in getChild(n, val):
            res = v.childNodes[0].nodeValue
    if not res:
        for n in nodes:
            attr = n.getAttributeNode(val)
            if attr:
                res = attr.nodeValue.strip()
    return res
xmldoc = minidom.parse('file.xml')
mynode = xmldoc.getElementsByTagName('mynode')
print getValue(mynode,'text')
print getValue(mynode,'position')
output:
Blah
322,13
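For comparison, here is a sketch of the same fallback logic using xml.etree.ElementTree instead of minidom; get_value is a hypothetical helper that tries a child element first and then falls back to an attribute of the same name:
from xml.etree import ElementTree

def get_value(elem, name):
    # Prefer a child element's text; fall back to the attribute.
    child = elem.find(name)
    if child is not None and child.text:
        return child.text
    return elem.get(name)

long_form = ElementTree.fromstring('<mynode><text>Blah</text><position>322,13</position></mynode>')
short_form = ElementTree.fromstring('<mynode text="Blah" position="322,13" />')
for node in (long_form, short_form):
    print(get_value(node, 'text'))
    print(get_value(node, 'position'))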
You need a root node
>>> from xml.dom.minidom import parseString
>>> doc = parseString('<root><mynode text="Blah" position="322,13" /></root>')
>>> print doc.firstChild.firstChild.getAttribute('text')
Blah
>>> print doc.firstChild.firstChild.getAttribute('position')
322,13
I'm using lxml to parse and objectify XML files in a path. I have a lot of models and XSDs, and each model maps to certain defined classes; for example, if the XML starts with a model tag it is a dataModel, and if it starts with a page tag it is a viewModel.
My question is how to efficiently detect which tag an XML file starts with, so that I can parse it with the appropriate XSD file and then objectify it.
files = glob(os.path.join('resources/xml', '*.xml'))
for f in files:
    xmlinput = open(f)
    xmlContent = xmlinput.read()
    if xsdPath:
        xsdFile = open(xsdPath)
        # xsdFile should be retrieved according to the xml content
        schema = etree.XMLSchema(file=xsdFile)
        xmlinput.seek(0)
        myxml = etree.parse(xmlinput)
        try:
            schema.assertValid(myxml)
        except etree.DocumentInvalid as x:
            print "In file %s error %s has occurred." % (xmlPath, x.message)
        finally:
            xsdFile.close()
            xmlinput.close()
I deliberately leave aside the file reading and validation to concentrate on your problem:
>>> from lxml.etree import fromstring
>>> # We have XMLs with different root tag
>>> tree1 = fromstring("<model><foo/><bar/></model>")
>>> tree2 = fromstring("<page><baz/><blah/></page>")
>>>
>>> # We have different treatments
>>> def modelTreatment(tree):
...     return tree.xpath('//bar')
...
>>> def pageTreatment(tree):
...     return tree.xpath('//blah')
...
>>> # Here is a recipe to read the root tag
>>> tree1.getroottree().getroot().tag
'model'
>>> tree2.getroottree().getroot().tag
'page'
>>>
>>> # So, by building an appropriated dict :
>>> tag_to_treatment_map = {'model': modelTreatment, 'page': pageTreatment}
>>> # You can run the right method on the right tree
>>> for tree in [tree1, tree2]:
...     tag_to_treatment_map[tree.getroottree().getroot().tag](tree)
...
[<Element bar at 0x24979b0>]
[<Element blah at 0x2497a00>]
Hope this will be useful to someone, even though I didn't see this question earlier.
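Applied back to the schema-selection question, the same dispatch idea can map root tags to XSD paths. A minimal sketch under the assumption that the mapping and file names look something like this (they are illustrative); iterparse stops at the first start event, so large files are not read in full just to find the root tag:
from lxml import etree

# Hypothetical mapping from root tag to schema file.
TAG_TO_XSD = {'model': 'resources/xsd/dataModel.xsd',
              'page': 'resources/xsd/viewModel.xsd'}

def root_tag(path):
    # Only the first start event is consumed; the rest of the file is untouched.
    for event, elem in etree.iterparse(path, events=('start',)):
        return elem.tag

def validate(path):
    schema = etree.XMLSchema(file=TAG_TO_XSD[root_tag(path)])
    schema.assertValid(etree.parse(path))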