How to parse xml file with <pair key = "..."> </pair> format

How to parse xml file with <pair key = "..."> </pair> format - python

I hope to parse a '.xml' file using python. The format of the file is as follows:
<root><dm_log_packet>
<pair key ="type_id">LTE_PHY_Serv_Cell_Measurement</pair>
</dm_log_packet>
</root>
I tried to parse it using ElementTree but failed.
Here is my code:
from xml.etree import ElementTree
class Log:
def __init__(self,type_id=None):
self.type_id=type_id
def __str__(self):
return self.type_id
roota=ElementTree.parse("file.xml")
log_file = roota.findall("dm_log_packet")
lo = []
for aa in log_file:
log = Log()
log.type_id = aa.find("type_id").text
lo.append(log)
I expect to parse each pair, but it can't do it like I have a <type_id>...</type_id> pair.

You can use BeautifulSoup
xml = """
<root>
<dm_log_packet>
<pair key ="type_id">LTE_PHY_Serv_Cell_Measurement</pair>
</dm_log_packet>
</root>
"""
soup_obj = BeautifulSoup(xml)
soup_obj.html.body.foo.bar.findAll('type')[0]['foobar']
Output will
'1'
More Descriptive Answer

.find() and .findall() expect XPath as arguments, plain strings like "dm_log_packet" will not find anything.
from xml.etree import ElementTree
class Log:
def __init__(self, type_id=None):
self.type_id=type_id
def __str__(self):
return self.type_id
tree = ElementTree.parse("file.xml")
lo = []
for dm_log_packet in tree.findall(".//dm_log_packet"):
pair = dm_log_packet.find("./pair/[#key='type_id']")
if pair is not None:
lo.append(Log(pair.text))
Note that dm_log_packet.find("./pair/[#key='type_id']") will return None when there is no <pair key="type_id">, hence the extra check.

Related

XML data extraction in python

I have an XML file like the following:
<AreaModel>
...
<RecipePhase>
<UniqueName>PHASE1</UniqueName>
...
<NumberOfParameterTags>7</NumberOfParameterTags>
...
<DefaultRecipeParameter>
<Name>PARAM1</Name>
----
</DefaultRecipeParameter>
<DefaultRecipeParameter>
<Name>PARAM2</Name>
----
</DefaultRecipeParameter>
<DefaultRecipeParameter>
<Name>PARAM3</Name>
----
</DefaultRecipeParameter>
</RecipePhase>
<RecipePhase>
....
</RecipePhase>
</AreaModel>
I would like to read this file in sequential order and generate different list. One for the texts of UniqueName TAGs and a list of lists containing for each list the set of texts for tag Name under each RecipePhase element.
For example, I might have 10 RecipePhase elements, each one with TAG UniqueName and each one containing a different set of children with tag DefaultRecipeParameter.
How can I take into account when I enter into RecipePhase and when I go out of the element during parsing?
I am trying ElementTree but I am not able to find a solution.
cheers,
m

You can use xml python module:
See my example:
from xml.dom import minidom as dom
import urllib2
def fetchPage(url):
a = urllib2.urlopen(url)
return ''.join(a.readlines())
def extract(page):
a = dom.parseString(page)
item = a.getElementsByTagName('Rate')
for i in item:
if i.hasChildNodes() == True:
print i.getAttribute('currency')+"-"+ i.firstChild.nodeValue
if __name__=='__main__':
page = fetchPage("http://www.bnro.ro/nbrfxrates.xml")
extract(page)

I solved partially my problem with the following code:
import xml.etree.ElementTree as ET
tree = ET.parse('control_strategies.axml')
root = tree.getroot()
phases=[]
for recipephase in root.findall('./RecipePhase/UniqueName'):
phases.append(recipephase.text)
n_elem = len(phases)
param=[[] for _ in range(n_elem)]
i = 0
for recipephase in root.findall('./RecipePhase'):
for defparam in recipephase.findall('./DefaultRecipeParameter'):
for paramname in defparam.findall('./Name'):
param[i].append(paramname.text)
i = i + 1

Write 'xsi:' in front of attribute with lxml for python 3

I'm adding elements to an xml file.
The document's root is as follows
<Root xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema">
And elements to add look like
<Element xsi:type="some type">
<Sub1>Some text</Sub1>
<Sub2>More text</Sub2>
...
</Element>
I'm trying to find a way for lxml to write 'xsi:' in front of my Element's attibute. This xml file is used by a program to which's source code I do not have access to. I read in a few other questions how to do it by declaring the nsmap of the xml's root, and then again in the child's attribute, which I tried but it didn't work. So far I have (that's what didn't work, the ouput file did not contain the xsi prefix):
element = SubElement(_parent=parent,
_tag='some tag',
attrib={'{%s}type' % XSI: 'some type'}
nsmap={'xsi': XSI}) # Where XSI = namespace address
The namespace is declared properly in the xml file I parse, so I don't know why this isn't working.
The output I get is the element as shown above without the 'xsi:' prefix and all on one line:
<Element type="some type"><Sub1>Some text</Sub1><Sub2>More text</Sub2>...</Element>
If anyone can also point out why in this line
self.tree.write(self.filename, pretty_print=True, encoding='utf-8')
the 'pretty_print' option doesn't work (all printed out in one line), it would be greatly appreciated.
Here is a code example of my script:
from math import floor
from lxml import etree
from lxml.etree import SubElement
def Element(root, sub1: str):
if not isinstance(sub1, str):
raise TypeError
else:
element = SubElement(root, 'Element')
element_sub1 = SubElement(element, 'Sub1')
element_sub1.text = sub1
# ...
# Omitted additional SubElements
# ...
return element
def Sub(root, sub5_sub: str):
XSI = "http://www.w3.org/2001/XMLSchema-instance"
if not isinstance(sub5_sub, str):
raise TypeError
else:
sub = SubElement(root, 'Sub5_Sub', {'{%s}type' % XSI: 'SomeType'}, nsmap={'xsi': XSI})
# ...
# Omitted additional SubElements
# ...
return sub
class Generator:
def __init__(self) -> None:
self.filename = None
self.csv_filename = None
self.csv_content = []
self.tree = None
self.root = None
self.panel = None
self.panels = None
def mainloop(self) -> None:
"""App's mainloop"""
while True:
# Getting files from user
xml_filename = input('Enter path to xml file : ')
# Parsing files
csv_content = [{'field1': 'ElementSub1', 'field2': 'something'},
{'field1': 'ElementSub1', 'field2': 'something'},
{'field1': 'ElementSub2', 'field2': 'something'}] # Replaces csv file that I use
tree = etree.parse(xml_filename)
root = tree.getroot()
elements = root.find('Elements')
for element in elements:
if element.find('Sub1').text in ['ElementSub1', 'ElementSub2']:
for line in csv_content:
if element.find('Sub5') is not None:
Sub(root=element.find('Sub5'),
sub5_sub=line['field2'])
tree.write(xml_filename, pretty_print=True, encoding='utf-8')
if input('Continue? (Y) Quit (n)').upper().startswith('Y'):
elements.clear()
continue
else:
break
#staticmethod
def get_x(x: int) -> str:
if not isinstance(x, int):
x = int(x)
return str(int(floor(9999 / 9 * x)))
#staticmethod
def get_y(y: int) -> str:
if not isinstance(y, int):
y = int(y)
return str(int(floor(999 / 9 * y)))
def quit(self) -> None:
quit()
if __name__ == "__main__":
app = Generator()
app.mainloop()
app.quit()
Here is what it outputs:
<Root xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema">
<Elements>
<Element>
<Sub1>ElementSub1</Sub1>
<Sub5>
<Sub5_Sub xsi:type="SomeType"/>
<Sub5_Sub xsi:type="SomeType"/><Sub5_Sub xsi:type="SomeType"/><Sub5_Sub xsi:type="SomeType"/></Sub5>
</Element>
<Element>
<Sub1>ElementSub1</Sub1>
<Sub5>
<Sub5_Sub xsi:type="SomeType"/>
<Sub5_Sub xsi:type="SomeType"/>
<Sub5_Sub xsi:type="SomeType"/><Sub5_Sub xsi:type="SomeType"/><Sub5_Sub xsi:type="SomeType"/></Sub5>
</Element>
<Element>
<Sub1>ElementSub1</Sub1>
</Element>
</Elements>
</Root>
For some reason, this piece of code does what I want but my real code doesn't. I've come to realize that it does put a prefix on some sub elements with the type attribute, but not all and on those it puts the prefix, it isn't always just 'xsi:'. I found a quick and dirty way to fix this problem which is less than ideal (find and replace through the file for xsi-type -> accepted by lxml's api to xsi:type). What still isn't working though is that it's all printed out in one line despite the pretty_print parameter being true.

I just recently encountered this scenario and was able to successfully create an attribute with the xsi:
qname = etree.QName("http://www.w3.org/2001/XMLSchema-instance", "type")
element = etree.Element('Element', {qname: "some type")
root.append(element)
this outputs something like
<Element xsi:type="some type">

Python ElementTree parse xml with NoneType exception

I meet a problem that is so werid. Here is the code.
#!/usr/bin/python
# filename: parse_dblp.py
# author: ivanchou
import codecs, os
import xml.etree.ElementTree as ET
paper_tag = ('article','inproceedings','proceedings','book',
'incollection','phdthesis','mastersthesis','www')
class AllEntities:
def __getitem__(self, key):
return key
print ('----------parse begin----------')
# the parse result store to authors
result = codecs.open('authors','w','utf-8')
parser = ET.XMLParser()
parser.parser.UseForeignDTD(True)
parser.entity = AllEntities()
for event, article in ET.iterparse('dblp_part.xml', events=("start",
"end"), parser=parser):
for author in article.findall('author'):
result.write(author.text + u'|')
if event == 'end' and article.tag in paper_tag:
result.write(os.linesep)
article.clear()
print ('----------parse end----------')
The file dblp_part.xml i have create a gist here:
dblp_part.xml
It contains the head 2336 line of the dblp.xml, and the last article element it returns me NoneType error, and if i exchange the last two element, it's all ok. So is this a bug of ElementTree?
I am newer of python, search of help.

minidom.parse reading short XML?

Here are 2 similar XML files :
Long XML
<mynode>
<text>Blah</text>
<position>322,13</position>
</mynode>
Short XML
<mynode text="Blah" position="322,13" />
It seems that Python's minidom.parse doesn't like the short XML.
Is this short XML style available with minidom (XML) ?
Is it possible to write a unique code that will read both short and long XML ?

from xml.dom import minidom
def getChild(n,v):
for child in n.childNodes:
if child.localName==v:
yield child
def getValue(n, val):
res = None
for n in mynode:
rv = getChild(n,val)
for v in rv:
var = v.childNodes[0].nodeValue
res = var
if not res:
for n in mynode:
attr = n.getAttributeNode(val)
if attr:
res = attr.nodeValue.strip()
return res
xmldoc = minidom.parse('file.xml')
mynode = xmldoc.getElementsByTagName('mynode')
print getValue(mynode,'text')
print getValue(mynode,'position')
output:
Blah
322,13

You need a root node
>>> from xml.dom.minidom import parseString
>>> doc = parseString('<root><mynode text="Blah" position="322,13" /></root>')
>>> print d.firstChild.firstChild.getAttribute('text')
Blah
>>> print d.firstChild.firstChild.getAttribute('position')
322,13

xmlns namespace breaking lxml

I am trying to open an xml file, and get values from certain tags. I have done this a lot but this particular xml is giving me some issues. Here is a section of the xml file:
<?xml version='1.0' encoding='UTF-8'?>
<package xmlns="http://apple.com/itunes/importer" version="film4.7">
<provider>filmgroup</provider>
<language>en-GB</language>
<actor name="John Smith" display="Doe John"</actor>
</package>
And here is a sample of my python code:
metadata = '/Users/mylaptop/Desktop/Python/metadata.xml'
from lxml import etree
parser = etree.XMLParser(remove_blank_text=True)
open(metadata)
tree = etree.parse(metadata, parser)
root = tree.getroot()
for element in root.iter(tag='provider'):
providerValue = tree.find('//provider')
providerValue = providerValue.text
print providerValue
tree.write('/Users/mylaptop/Desktop/Python/metadataDone.xml', pretty_print = True, xml_declaration = True, encoding = 'UTF-8')
When I run this it can't find the provider tag or its value. If I remove xmlns="http://apple.com/itunes/importer" then all work as expected.
My question is how can I remove this namespace, as i'm not at all interested in this, so I can get the tag values I need using lxml?

The provider tag is in the http://apple.com/itunes/importer namespace, so you either need to use the fully qualified name
{http://apple.com/itunes/importer}provider
or use one of the lxml methods that has the namespaces parameter, such as root.xpath. Then you can specify it with a namespace prefix (e.g. ns:provider):
from lxml import etree
parser = etree.XMLParser(remove_blank_text=True)
tree = etree.parse(metadata, parser)
root = tree.getroot()
namespaces = {'ns':'http://apple.com/itunes/importer'}
items = iter(root.xpath('//ns:provider/text()|//ns:actor/#name',
namespaces=namespaces))
for provider, actor in zip(*[items]*2):
print(provider, actor)
yields
('filmgroup', 'John Smith')
Note that the XPath used above assumes that <provider> and <actor> elements always appear in alternation. If that is not true, then there are of course ways to handle it, but the code becomes a bit more verbose:
for package in root.xpath('//ns:package', namespaces=namespaces):
for provider in package.xpath('ns:provider', namespaces=namespaces):
providerValue = provider.text
print providerValue
for actor in package.xpath('ns:actor', namespaces=namespaces):
print actor.attrib['name']

My suggestion is to not ignore the namespace but, instead, to take it into account. I wrote some related functions (copied with slight modification) for my work on the django-quickbooks library. With these functions, you should be able to do this:
providers = getels(root, 'provider', ns='http://apple.com/itunes/importer')
Here are those functions:
def get_tag_with_ns(tag_name, ns):
return '{%s}%s' % (ns, tag_name)
def getel(elt, tag_name, ns=None):
""" Gets the first tag that matches the specified tag_name taking into
account the QB namespace.
:param ns: The namespace to use if not using the default one for
django-quickbooks.
:type ns: string
"""
res = elt.find(get_tag_with_ns(tag_name, ns=ns))
if res is None:
raise TagNotFound('Could not find tag by name "%s"' % tag_name)
return res
def getels(elt, *path, **kwargs):
""" Gets the first set of elements found at the specified path.
Example:
>>> xml = (
"<root>" +
"<item>" +
"<id>1</id>" +
"</item>" +
"<item>" +
"<id>2</id>"* +
"</item>" +
"</root>")
>>> el = etree.fromstring(xml)
>>> getels(el, 'root', 'item', ns='correct/namespace')
[<Element item>, <Element item>]
"""
ns = kwargs['ns']
i=-1
for i in range(len(path)-1):
elt = getel(elt, path[i], ns=ns)
tag_name = path[i+1]
return elt.findall(get_tag_with_ns(tag_name, ns=ns))

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

How to parse xml file with <pair key = "..."> </pair> format - python

You can use BeautifulSoup xml = """ <root> <dm_log_packet> <pair key ="type_id">LTE_PHY_Serv_Cell_Measurement</pair> </dm_log_packet> </root> """ soup_obj = BeautifulSoup(xml) soup_obj.html.body.foo.bar.findAll('type')[0]['foobar'] Output will '1' More Descriptive Answer

Related

XML data extraction in python

Write 'xsi:' in front of attribute with lxml for python 3

Python ElementTree parse xml with NoneType exception

minidom.parse reading short XML?

xmlns namespace breaking lxml

Categories

Resources