from lxml import etree
import StringIO
data = StringIO.StringIO('<root xmlns="http://some.random.schema"><a>One</a><a>Two</a><a>Three</a></root>')
docs = etree.iterparse(data, tag='a')
a, b = docs.next()
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "iterparse.pxi", line 478, in lxml.etree.iterparse.__next__ (src/lxml/lxml.etree.c:95348)
File "iterparse.pxi", line 534, in lxml.etree.iterparse._read_more_events (src/lxml/lxml.etree.c:95938)
StopIteration
This works fine until I add the namespace to the root node. Any ideas for a workaround, or the correct way of doing this?
I need to be event-driven because the files are very large.
When there is a namespace attached, the tag isn't a, it's {http://some.random.schema}a. Try this (Python 3):
from lxml import etree
from io import BytesIO
xml = '''\
<root xmlns="http://some.random.schema">
<a>One</a>
<a>Two</a>
<a>Three</a>
</root>'''
data = BytesIO(xml.encode())
docs = etree.iterparse(data, tag='{http://some.random.schema}a')
for event, elem in docs:
    print(f'{event}: {elem}')
or, in Python 2:
from lxml import etree
from StringIO import StringIO
xml = '''\
<root xmlns="http://some.random.schema">
<a>One</a>
<a>Two</a>
<a>Three</a>
</root>'''
data = StringIO(xml)
docs = etree.iterparse(data, tag='{http://some.random.schema}a')
for event, elem in docs:
    print event, elem
This prints something like:
end: <Element {http://some.random.schema}a at 0x10941e730>
end: <Element {http://some.random.schema}a at 0x10941e8c0>
end: <Element {http://some.random.schema}a at 0x10941e960>
As @mihail-shcheglov pointed out, the wildcard * can also be used, which works for any namespace, or none:
from lxml import etree
from io import BytesIO
xml = '''\
<root xmlns="http://some.random.schema">
<a>One</a>
<a>Two</a>
<a>Three</a>
</root>'''
data = BytesIO(xml.encode())
docs = etree.iterparse(data, tag='{*}a')
for event, elem in docs:
    print(f'{event}: {elem}')
See lxml.etree docs for more.
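Since the question mentions very large files, note that iterparse still accumulates the whole tree in memory unless you clear elements as you go. A minimal sketch, reusing the namespaced tag from above ('big.xml' is a hypothetical file name):
from lxml import etree

# Stream the document; free each handled element so memory use stays flat.
for event, elem in etree.iterparse('big.xml', tag='{http://some.random.schema}a'):
    print(elem.text)
    elem.clear()
    # also drop already-handled siblings hanging off the root
    while elem.getprevious() is not None:
        del elem.getparent()[0]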
Why not use a regular expression?
1)
Using lxml is slower than using a regex.
from time import clock
import StringIO
from lxml import etree
times1 = []
for i in xrange(1000):
    data = StringIO.StringIO('<root ><a>One</a><a>Two</a><a>Three\nlittle pigs</a><b>Four</b><a>another</a></root>')
    te = clock()
    docs = etree.iterparse(data, tag='a')
    tf = clock()
    times1.append(tf-te)
print min(times1)
print [etree.tostring(y) for x,y in docs]
import re
regx = re.compile('<a>[\s\S]*?</a>')
times2 = []
for i in xrange(1000):
    data = StringIO.StringIO('<root ><a>One</a><a>Two</a><a>Three\nlittle pigs</a><b>Four</b><a>another</a></root>')
    te = clock()
    li = regx.findall(data.read())
    tf = clock()
    times2.append(tf-te)
print min(times2)
print li
result
0.000150298431784
['<a>One</a>', '<a>Two</a>', '<a>Three\nlittle pigs</a>', '<a>another</a>']
2.40253998762e-05
['<a>One</a>', '<a>Two</a>', '<a>Three\nlittle pigs</a>', '<a>another</a>']
0.000150298431784 / 2.40253998762e-05 is about 6.25, so in this test lxml is 6.25 times slower than the regex.
2)
Namespaces are no problem either:
import StringIO
import re
regx = re.compile('<a>[\s\S]*?</a>')
data = StringIO.StringIO('<root xmlns="http://some.random.schema"><a>One</a><a>Two</a><a>Three\nlittle pigs</a><b>Four</b><a>another</a></root>')
print regx.findall(data.read())
result
['<a>One</a>', '<a>Two</a>', '<a>Three\nlittle pigs</a>', '<a>another</a>']
Related
from xml.etree import ElementTree
t = """<collection xmlns:y="http://tail-f.com/ns/rest">
<appliance xmlns="http://versa-networks.com/vnms/nms">
<uuid>088fbb70-40d1-4aaf-8ea3-590fd8238828</uuid>
<name>SRVDHCPE1</name>
<num-cpus>0</num-cpus>
<memory-size>0</memory-size>
<num-nics>4</num-nics>
</appliance>
<appliance xmlns="http://versa-networks.com/vnms/nms">
<uuid>088fbb70-40d1-4aaf-8ea3-590fd8238828</uuid>
<name>SRVDHCPE2</name>
<num-cpus>0</num-cpus>
<memory-size>0</memory-size>
<num-nics>4</num-nics>
</appliance>
</collection>"""
dom = ElementTree.fromstring(t)
for n in dom.findall("collection/appliance/name"):
    print(n.text)
I'm looking for all the names, but nothing is printed. What am I doing wrong here?
Your case is definitely related to Parsing XML with Namespaces:
dom = ElementTree.fromstring(t)
ns = {'rest': 'http://tail-f.com/ns/rest', 'nms': 'http://versa-networks.com/vnms/nms'}
for n in dom.findall("nms:appliance/nms:name", ns):
    print(n.text)
The output:
SRVDHCPE1
SRVDHCPE2
You need to namespace your selectors:
from xml.etree import ElementTree
from xml.etree.ElementTree import Element
t = """<collection xmlns:y="http://tail-f.com/ns/rest">
<appliance xmlns="http://versa-networks.com/vnms/nms">
<uuid>088fbb70-40d1-4aaf-8ea3-590fd8238828</uuid>
<name>SRVDHCPE1</name>
<num-cpus>0</num-cpus>
<memory-size>0</memory-size>
<num-nics>4</num-nics>
</appliance>
<appliance xmlns="http://versa-networks.com/vnms/nms">
<uuid>088fbb70-40d1-4aaf-8ea3-590fd8238828</uuid>
<name>SRVDHCPE2</name>
<num-cpus>0</num-cpus>
<memory-size>0</memory-size>
<num-nics>4</num-nics>
</appliance>
</collection>"""
if __name__ == '__main__':
    dom: Element = ElementTree.fromstring(t)
    namespaces = {'n': 'http://versa-networks.com/vnms/nms'}
    for name in dom.findall("./n:appliance/n:name", namespaces=namespaces):
        print(name.text)
which prints:
SRVDHCPE1
SRVDHCPE2
For reference:
https://docs.python.org/3.7/library/xml.etree.elementtree.html#parsing-xml-with-namespaces
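As an aside, ElementTree also accepts fully qualified {uri}tag names directly in the path, so the prefix map is optional; a minimal sketch, reusing t from the question above:
from xml.etree import ElementTree

# Equivalent lookup with fully qualified tag names; no prefix map needed.
uri = '{http://versa-networks.com/vnms/nms}'
dom = ElementTree.fromstring(t)  # t as defined in the question
for name in dom.findall('./%sappliance/%sname' % (uri, uri)):
    print(name.text)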
Printing an lxml.objectify.ObjectifiedElement just prints a blank line, so I have to access it via its tags, and when I don't know the tags of the response, I'm just guessing.
How do I print the entire object, showing children names and values?
As requested, here is the code I have. Not sure how much it helps, but:
from amazonproduct import API
api = API('xxxxx', 'xxxxx', 'us', 'xxxx')
result = api.item_lookup('B00H8U93JO', ResponseGroup='OfferSummary')
print result
Using lxml.etree.tostring() seems to work, although the output is not prettified:
>>> from lxml import etree
>>> from lxml import objectify
>>> raw = '''<root>
... <foo>foo</foo>
... <bar>bar</bar>
... </root>'''
>>> root = objectify.fromstring(raw)
>>> print type(root)
<type 'lxml.objectify.ObjectifiedElement'>
>>> print etree.tostring(root)
<root><foo>foo</foo><bar>bar</bar></root>
In response to har07: you can use minidom to prettify the output.
from lxml import objectify, etree
from xml.dom import minidom
def pretty_print(elem):
    xml = etree.tostring(elem)
    pretty = minidom.parseString(xml).toprettyxml(indent=' ')
    print(pretty)
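For what it's worth, lxml can also prettify on its own via the pretty_print flag of etree.tostring, so minidom isn't strictly required; a minimal sketch:
from lxml import etree, objectify

root = objectify.fromstring('<root><foo>foo</foo><bar>bar</bar></root>')
# Ask lxml itself to indent the serialized tree.
print(etree.tostring(root, pretty_print=True))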
I'm using a Raspberry Pi 1B+ w/ Debian Linux:
Linux rbian 3.18.0-trunk-rpi #1 PREEMPT Debian 3.18.5-1~exp1+rpi16 (2015-03-28) armv6l GNU/Linux
As part of a larger Python program I'm using this code:
#!/usr/bin/env python
import time
from urllib2 import Request, urlopen
from bs4 import BeautifulSoup
_url="http://xml.buienradar.nl/"
s1 = time.time()
req = Request(_url)
print "Request = {0}".format(time.time() - s1)
s2 = time.time()
response = urlopen(req)
print "URLopen = {0}".format(time.time() - s2)
s3 = time.time()
output = response.read()
print "Read = {0}".format(time.time() - s3)
s4 = time.time()
soup = BeautifulSoup(output)
print "Soup (1) = {0}".format(time.time() - s4)
s5 = time.time()
MSwind = str(soup.buienradarnl.weergegevens.actueel_weer.weerstations.find(id=6350).windsnelheidms)
GRwind = str(soup.buienradarnl.weergegevens.actueel_weer.weerstations.find(id=6350).windrichtinggr)
ms = MSwind.replace("<"," ").replace(">"," ").split()[1]
gr = GRwind.replace("<"," ").replace(">"," ").split()[1]
print "Extracting info = {0}".format(time.time() - s5)
s6 = time.time()
soup = BeautifulSoup(urlopen(_url))
print "Soup (2) = {0}".format(time.time() - s6)
s5 = time.time()
MSwind = str(soup.buienradarnl.weergegevens.actueel_weer.weerstations.find(id=6350).windsnelheidms)
GRwind = str(soup.buienradarnl.weergegevens.actueel_weer.weerstations.find(id=6350).windrichtinggr)
ms = MSwind.replace("<"," ").replace(">"," ").split()[1]
gr = GRwind.replace("<"," ").replace(">"," ").split()[1]
print "Extracting info = {0}".format(time.time() - s5)
When I run it, I get this output:
Request = 0.00394511222839
URLopen = 0.0579500198364
Read = 0.0346400737762
Soup (1) = 23.6777830124
Extracting info = 0.183892965317
Soup (2) = 36.6107468605
Extracting info = 0.382317781448
So, the BeautifulSoup command takes about half a minute to process the _url.
I would really love it if this could be done in under 10 seconds.
Any suggestions that would significantly speed up the code (by at least 60%) would be extremely welcome.
Install the lxml library; once it is installed, BeautifulSoup will use it as the default parser.
lxml parses the page using the libxml2 C library, which is significantly faster than the default html.parser backend, implemented in pure Python.
You can then also parse the page as XML instead of as HTML:
soup = BeautifulSoup(output, 'xml')
Parsing your given page with lxml should be faster; I can parse the page almost 50 times per second:
>>> timeit("BeautifulSoup(output, 'xml')", 'from __main__ import BeautifulSoup, output', number=50)
1.1700470447540283
Still, I wonder if you are missing some other Python acceleration libraries, as I certainly cannot reproduce your results even with the built-in parser:
>>> timeit("BeautifulSoup(output, 'html.parser')", 'from __main__ import BeautifulSoup, output', number=50)
1.7218239307403564
Perhaps you are memory constrained and the large-ish document causes your OS to swap memory a lot? Memory swapping (writing pages to disk and loading other pages from disk) can bring even the fastest programs to a grinding halt.
Note that instead of using str() on tag elements and splitting off the tags, you can get the value from a tag simply by using the .string attribute:
station_6350 = soup.buienradarnl.weergegevens.actueel_weer.weerstations.find(id=6350)
ml = station_6350.windsnelheidMS.string
gr = station_6350.windrichtingGR.string
If you are using the XML parser, take into account that tag names must match case exactly (XML is case-sensitive, whereas HTML is a case-insensitive markup language).
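For instance, a small illustrative sketch of that difference (the document snippet is made up; the html.parser backend lowercases tag names, while the xml parser preserves case):
from bs4 import BeautifulSoup

doc = '<stations><windsnelheidMS>4.64</windsnelheidMS></stations>'
soup = BeautifulSoup(doc, 'xml')
print(soup.find('windsnelheidMS'))  # matches the tag
print(soup.find('windsnelheidms'))  # None: the XML parser is case-sensitive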
Since this is an XML document, another option would be to use the lxml ElementTree model; you can use XPath expressions to extract the data:
from lxml import etree
response = urlopen(_url)
for event, elem in etree.iterparse(response, tag='weerstation'):
    if elem.get('id') == '6350':
        ml = elem.find('windsnelheidMS').text
        gr = elem.find('windrichtingGR').text
        break
    # clear elements we are not interested in, adapted from
    # http://stackoverflow.com/questions/12160418/why-is-lxml-etree-iterparse-eating-up-all-my-memory
    elem.clear()
    for ancestor in elem.xpath('ancestor-or-self::*'):
        while ancestor.getprevious() is not None:
            del ancestor.getparent()[0]
This should only build the minimal object tree required, clearing out the weather stations you don't need as you go along the document.
Demo:
>>> from lxml import etree
>>> from urllib2 import urlopen
>>> _url = "http://xml.buienradar.nl/"
>>> response = urlopen(_url)
>>> for event, elem in etree.iterparse(response, tag='weerstation'):
...     if elem.get('id') == '6350':
...         ml = elem.find('windsnelheidMS').text
...         gr = elem.find('windrichtingGR').text
...         break
...     # clear elements we are not interested in
...     elem.clear()
...     for ancestor in elem.xpath('ancestor-or-self::*'):
...         while ancestor.getprevious() is not None:
...             del ancestor.getparent()[0]
...
>>> ml
'4.64'
>>> gr
'337.8'
Using requests and regular expressions can be a lot shorter and faster. For such relatively simple data gathering, regexes work fine.
#!/usr/bin/env python
from __future__ import print_function
import re
import requests
import time
_url = "http://xml.buienradar.nl/"
_regex = '<weerstation id="6391">.*?'\
'<windsnelheidMS>(.*?)</windsnelheidMS>.*?'\
'<windrichtingGR>(.*?)</windrichtingGR>'
s1 = time.time()
br = requests.get(_url)
print("Request = {0}".format(time.time() - s1))
s5 = time.time()
MSwind, GRwind = re.findall(_regex, br.text)[0]
print("Extracting info = {0}".format(time.time() - s5))
print('wind speed', MSwind, 'm/s')
print('wind direction', GRwind, 'degrees')
On my desktop (which is not a Raspberry Pi, though :-)) this runs very fast:
Request = 0.0723416805267334
Extracting info = 0.0009412765502929688
wind speed 2.35 m/s
wind direction 232.6 degrees
Of course, this particular regex would fail if the windsnelheidMS and windrichtingGR tags were reversed. But given that the XML is most probably computer-generated, that doesn't seem likely.
And there is a solution for that: first use a regex to capture the text between <weerstation id="6391"> and </weerstation>, then use two other regexes to find the wind speed and direction within that block.
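A minimal sketch of that two-step approach (station id and URL taken from the answer above; re.DOTALL is used so the matches can span line breaks):
from __future__ import print_function
import re
import requests

text = requests.get('http://xml.buienradar.nl/').text

# Step 1: isolate the block for the station of interest.
station = re.search(r'<weerstation id="6391">.*?</weerstation>', text, re.DOTALL)
if station is not None:
    block = station.group(0)
    # Step 2: pull each value out of the block, independent of tag order.
    ms = re.search(r'<windsnelheidMS>(.*?)</windsnelheidMS>', block).group(1)
    gr = re.search(r'<windrichtingGR>(.*?)</windrichtingGR>', block).group(1)
    print('wind speed', ms, 'm/s')
    print('wind direction', gr, 'degrees')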
Here are 2 similar XML files :
Long XML
<mynode>
<text>Blah</text>
<position>322,13</position>
</mynode>
Short XML
<mynode text="Blah" position="322,13" />
It seems that Python's minidom.parse doesn't like the short XML.
Is this short XML style supported by minidom?
Is it possible to write a single piece of code that will read both the short and the long XML?
from xml.dom import minidom
def getChild(n, v):
    for child in n.childNodes:
        if child.localName == v:
            yield child

def getValue(nodes, val):
    # try child elements first, then fall back to attributes
    res = None
    for n in nodes:
        for v in getChild(n, val):
            res = v.childNodes[0].nodeValue
    if not res:
        for n in nodes:
            attr = n.getAttributeNode(val)
            if attr:
                res = attr.nodeValue.strip()
    return res
xmldoc = minidom.parse('file.xml')
mynode = xmldoc.getElementsByTagName('mynode')
print getValue(mynode,'text')
print getValue(mynode,'position')
output:
Blah
322,13
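For comparison, here is a sketch of the same fallback logic using xml.etree.ElementTree instead of minidom; get_value is a hypothetical helper that tries a child element first and then falls back to an attribute of the same name:
from xml.etree import ElementTree

def get_value(elem, name):
    # Prefer a child element's text; fall back to the attribute.
    child = elem.find(name)
    if child is not None and child.text:
        return child.text
    return elem.get(name)

long_form = ElementTree.fromstring('<mynode><text>Blah</text><position>322,13</position></mynode>')
short_form = ElementTree.fromstring('<mynode text="Blah" position="322,13" />')
for node in (long_form, short_form):
    print(get_value(node, 'text'))
    print(get_value(node, 'position'))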
You need a root node
>>> from xml.dom.minidom import parseString
>>> doc = parseString('<root><mynode text="Blah" position="322,13" /></root>')
>>> print doc.firstChild.firstChild.getAttribute('text')
Blah
>>> print doc.firstChild.firstChild.getAttribute('position')
322,13
I'm using lxml to parse and objectify XML files in a path. I have a lot of models and XSDs, and each model maps to certain defined classes; for example, if the XML starts with a model tag it is a dataModel, and if it starts with a page tag it is a viewModel.
My question is how to efficiently detect which tag an XML file starts with, so that I can parse it with the appropriate XSD file and then objectify it.
files = glob(os.path.join('resources/xml', '*.xml'))
for f in files:
    xmlinput = open(f)
    xmlContent = xmlinput.read()
    if xsdPath:
        xsdFile = open(xsdPath)
        # xsdFile should be retrieved according to the xml content
        schema = etree.XMLSchema(file=xsdFile)
        xmlinput.seek(0)
        myxml = etree.parse(xmlinput)
        try:
            schema.assertValid(myxml)
        except etree.DocumentInvalid as x:
            print "In file %s error %s has occurred." % (xmlPath, x.message)
        finally:
            xsdFile.close()
            xmlinput.close()
I deliberately leave aside the file reading and validation to concentrate on your problem:
>>> from lxml.etree import fromstring
>>> # We have XMLs with different root tag
>>> tree1 = fromstring("<model><foo/><bar/></model>")
>>> tree2 = fromstring("<page><baz/><blah/></page>")
>>>
>>> # We have different treatments
>>> def modelTreatment(tree):
...     return tree.xpath('//bar')
...
>>> def pageTreatment(tree):
...     return tree.xpath('//blah')
...
>>> # Here is a recipe to read the root tag
>>> tree1.getroottree().getroot().tag
'model'
>>> tree2.getroottree().getroot().tag
'page'
>>>
>>> # So, by building an appropriated dict :
>>> tag_to_treatment_map = {'model': modelTreatment, 'page': pageTreatment}
>>> # You can run the right method on the right tree
>>> for tree in [tree1, tree2]:
...     tag_to_treatment_map[tree.getroottree().getroot().tag](tree)
...
[<Element bar at 0x24979b0>]
[<Element blah at 0x2497a00>]
Hope this will be useful to someone, even though I didn't see this question earlier.
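Applied back to the schema-selection question, the same dispatch idea can map root tags to XSD paths. A minimal sketch under the assumption that the mapping and file names look something like this (they are illustrative); iterparse stops at the first start event, so large files are not read in full just to find the root tag:
from lxml import etree

# Hypothetical mapping from root tag to schema file.
TAG_TO_XSD = {'model': 'resources/xsd/dataModel.xsd',
              'page': 'resources/xsd/viewModel.xsd'}

def root_tag(path):
    # Only the first start event is consumed; the rest of the file is untouched.
    for event, elem in etree.iterparse(path, events=('start',)):
        return elem.tag

def validate(path):
    schema = etree.XMLSchema(file=TAG_TO_XSD[root_tag(path)])
    schema.assertValid(etree.parse(path))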