Using BeautifulSoup to count xml elements in a function - python

I often use len(find_all("some_element")) to count the number of elements in an XML file. I tried to wrap this in a function, but it doesn't work: it always gives me "None".
The XML file:
<parent>
<some>
<child>text</child>
<child>text</child>
<child>text</child>
</some>
</parent>
my python code:
def return_len(para1,para2): # doesn't work
if bool(suppe.para1): # the element isn't always present in the xml
return len(suppe.para1.find_all(para2))
def return_len1(): # does work
if bool(suppe.some):
return len(suppe.some.find_all("child"))
print(return_len("some","child")) # doesnt work
print(return_len1()) # does work
How must I modify my function return_len to get it working, and what did I do wrong?

You can do it like this.
from bs4 import BeautifulSoup
s = """<parent>
<some>
<child>text</child>
<child>text</child>
<child>text</child>
</some>
</parent>
"""
soup = BeautifulSoup(s, 'xml')
def return_len(para1, para2, soup):
    """Count the ``para2`` tags inside the first ``para1`` tag of ``soup``.

    A line describing the lookup is printed before searching. Returns the
    number of matches, or ``None`` when ``soup.find(para1)`` finds nothing.
    """
    print(f'No. of <{para2}> tags inside <{para1}> tag.')
    parent = soup.find(para1)
    # Guard clause: the parent element is not always present in the document.
    if not parent:
        return None
    matches = parent.find_all(para2)
    return len(matches)
print(return_len('some', 'child', soup))
print(return_len('parent', 'some', soup))
No. of <child> tags inside <some> tag.
3
No. of <some> tags inside <parent> tag.
1

Without any external library - see below.
import xml.etree.ElementTree as ET

# Sample document: one <some> element holding three <child> elements.
xml = '''<parent>
<some>
<child>text</child>
<child>text</child>
<child>text</child>
</some>
</parent>'''

root = ET.fromstring(xml)
# ".//child" matches <child> elements at any depth below the root.
child_count = len(root.findall(".//child"))
print(f'Number of child elements is {child_count}')
output
Number of child elements is 3

Related

XML not returning correct child tags/data in Python

Hello, I am making a requests call to return order data from an online store. My issue is that once I have passed my data to a root variable, the iter method does not return the correct results, e.g. it displays multiple tags of the same name rather than one and does not show the data within the tag.
I thought this was due to the XML not being correctly formatted so I formatted it by saving it to a file using pretty_print but that hasn't fixed the error.
How do I fix this? - Thanks in advance
Code:
import requests, xml.etree.ElementTree as ET, lxml.etree as etree
url="http://publicapi.ekmpowershop24.com/v1.1/publicapi.asmx"
headers = {'content-type': 'application/soap+xml'}
body = """<?xml version="1.0" encoding="utf-8"?>
<soap12:Envelope xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:soap12="http://www.w3.org/2003/05/soap-envelope">
<soap12:Body>
<GetOrders xmlns="http://publicapi.ekmpowershop.com/">
<GetOrdersRequest>
<APIKey>my_api_key</APIKey>
<FromDate>01/07/2018</FromDate>
<ToDate>04/07/2018</ToDate>
</GetOrdersRequest>
</GetOrders>
</soap12:Body>
</soap12:Envelope>"""
#send request to ekm
r = requests.post(url,data=body,headers=headers)
#save output to file
file = open("C:/Users/Mark/Desktop/test.xml", "w")
file.write(r.text)
file.close()
#take the file and format the xml
x = etree.parse("C:/Users/Mark/Desktop/test.xml")
newString = etree.tostring(x, pretty_print=True)
file = open("C:/Users/Mark/Desktop/test.xml", "w")
file.write(newString.decode('utf-8'))
file.close()
#parse the file to get the roots
tree = ET.parse("C:/Users/Mark/Desktop/test.xml")
root = tree.getroot()
#access elements names in the data
for child in root.iter('*'):
print(child.tag)
#show orders elements attributes
tree = ET.parse("C:/Users/Mark/Desktop/test.xml")
root = tree.getroot()
for order in root.iter('{http://publicapi.ekmpowershop.com/}Order'):
out = {}
for child in order:
if child.tag in ('OrderID'):
out[child.tag] = child.text
print(out)
Elements output:
{http://publicapi.ekmpowershop.com/}Orders
{http://publicapi.ekmpowershop.com/}Order
{http://publicapi.ekmpowershop.com/}OrderID
{http://publicapi.ekmpowershop.com/}OrderNumber
{http://publicapi.ekmpowershop.com/}CustomerID
{http://publicapi.ekmpowershop.com/}CustomerUserID
{http://publicapi.ekmpowershop.com/}Order
{http://publicapi.ekmpowershop.com/}OrderID
{http://publicapi.ekmpowershop.com/}OrderNumber
{http://publicapi.ekmpowershop.com/}CustomerID
{http://publicapi.ekmpowershop.com/}CustomerUserID
Orders Output:
{http://publicapi.ekmpowershop.com/}Order {}
{http://publicapi.ekmpowershop.com/}Order {}
XML Structure after formatting:
<soap:Envelope xmlns:soap="http://www.w3.org/2003/05/soap-envelope" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema">
<soap:Body>
<GetOrdersResponse xmlns="http://publicapi.ekmpowershop.com/">
<GetOrdersResult>
<Status>Success</Status>
<Errors/>
<Date>2018-07-10T13:47:00.1682029+01:00</Date>
<TotalOrders>10</TotalOrders>
<TotalCost>100</TotalCost>
<Orders>
<Order>
<OrderID>100</OrderID>
<OrderNumber>102/040718/67</OrderNumber>
<CustomerID>6910</CustomerID>
<CustomerUserID>204</CustomerUserID>
<FirstName>TestFirst</FirstName>
<LastName>TestLast</LastName>
<CompanyName>Test Company</CompanyName>
<EmailAddress>test#Test.com</EmailAddress>
<OrderStatus>Dispatched</OrderStatus>
<OrderStatusColour>#00CC00</OrderStatusColour>
<TotalCost>85.8</TotalCost>
<OrderDate>10/07/2018 14:30:43</OrderDate>
<OrderDateISO>2018-07-10T14:30:43</OrderDateISO>
<AbandonedOrder>false</AbandonedOrder>
<EkmStatus>SUCCESS</EkmStatus>
</Order>
</Orders>
<Currency>GBP</Currency>
</GetOrdersResult>
</GetOrdersResponse>
</soap:Body>
</soap:Envelope>
You need to consider the namespace when checking for tags.
>>> # Include the namespace part of the tag in the tag values that we check.
>>> tags = ('{http://publicapi.ekmpowershop.com/}OrderID', '{http://publicapi.ekmpowershop.com/}OrderNumber')
>>> for order in root.iter('{http://publicapi.ekmpowershop.com/}Order'):
... out = {}
... for child in order:
... if child.tag in tags:
... out[child.tag] = child.text
... print(out)
...
{'{http://publicapi.ekmpowershop.com/}OrderID': '100', '{http://publicapi.ekmpowershop.com/}OrderNumber': '102/040718/67'}
If you don't want the namespace prefixes in the output, you can strip them by only including that part of the tag after the } character.
>>> for order in root.iter('{http://publicapi.ekmpowershop.com/}Order'):
... out = {}
... for child in order:
... if child.tag in tags:
... out[child.tag[child.tag.index('}')+1:]] = child.text
... print(out)
...
{'OrderID': '100', 'OrderNumber': '102/040718/67'}

Extracting similar XML attributes with BeautifulSoup

Let's assume I have the following XML:
<time from="2017-07-29T08:00:00" to="2017-07-29T09:00:00">
<!-- Valid from 2017-07-29T08:00:00 to 2017-07-29T09:00:00 -->
<symbol number="4" numberEx="4" name="Cloudy" var="04"/>
<precipitation value="0"/>
<!-- Valid at 2017-07-29T08:00:00 -->
<windDirection deg="300.9" code="WNW" name="West-northwest"/>
<windSpeed mps="1.3" name="Light air"/>
<temperature unit="celsius" value="15"/>
<pressure unit="hPa" value="1002.4"/>
</time>
<time from="2017-07-29T09:00:00" to="2017-07-29T10:00:00">
<!-- Valid from 2017-07-29T09:00:00 to 2017-07-29T10:00:00 -->
<symbol number="4" numberEx="4" name="Partly cloudy" var="04"/>
<precipitation value="0"/>
<!-- Valid at 2017-07-29T09:00:00 -->
<windDirection deg="293.2" code="WNW" name="West-northwest"/>
<windSpeed mps="0.8" name="Light air"/>
<temperature unit="celsius" value="17"/>
<pressure unit="hPa" value="1002.6"/>
</time>
And I want to collect time from, symbol name and temperature value from it, and then print it out in the following manner: time from: symbol name, temperature value -- like this: 2017-07-29, 08:00:00: Cloudy, 15°.
(And there are a few name and value attributes in this XML, as you see.)
As of now, my approach was quite straightforward:
#!/usr/bin/env python
# coding: utf-8
import re
from BeautifulSoup import BeautifulSoup
# data is set to the above XML
soup = BeautifulSoup(data)
# collect the tags of interest into lists. can it be done wiser?
time_l = []
symb_l = []
temp_l = []
for i in soup.findAll('time'):
i_time = str(i.get('from'))
time_l.append(i_time)
for i in soup.findAll('symbol'):
i_symb = str(i.get('name'))
symb_l.append(i_symb)
for i in soup.findAll('temperature'):
i_temp = str(i.get('value'))
temp_l.append(i_temp)
# join the forecast lists to a dict
forc_l = []
for i, j in zip(symb_l, temp_l):
forc_l.append([i, j])
rez = dict(zip(time_l, forc_l))
# combine and format the rezult. can this dict be printed simpler?
wew = ''
for key in sorted(rez):
wew += re.sub("T", ", ", key) + str(rez[key])
wew = re.sub("'", "", wew)
wew = re.sub("\[", ": ", wew)
wew = re.sub("\]", "°\n", wew)
# print the rezult
print wew
But I imagine there must be some better, more intelligent approach? Mostly, I'm interested in collecting the attributes from the XML, my way seems rather dumb to me, actually. Also, is there any simpler way to print out a dict {'a': '[b, c]'} nicely?
Would be grateful for any hints or suggestions.
from bs4 import BeautifulSoup
with open("sample.xml", "r") as f: # opening xml file
content = f.read() # xml content stored in this variable
soup = BeautifulSoup(content, "lxml")
for values in soup.findAll("time"):
print("{} : {}, {}°".format(values["from"], values.find("symbol")["name"], values.find("temperature")["value"]))
Output:
2017-07-29T08:00:00 : Cloudy, 15°
2017-07-29T09:00:00 : Partly cloudy, 17°
Alternatively, you can fetch the XML data with the xml.dom.minidom module.
Here is the data you want:
from xml.dom.minidom import parse

# Print "<from> : <symbol name>, <temperature value>°" for each <time> element.
doc = parse("path/to/xmlfile.xml")  # parse an XML file by name
for time_node in doc.getElementsByTagName('time'):
    from_tag = time_node.getAttribute('from')
    # Only the first <symbol> and <temperature> inside each <time> are used.
    first_symbol = time_node.getElementsByTagName('symbol')[0]
    symbol_name = first_symbol.getAttribute('name')
    first_temperature = time_node.getElementsByTagName('temperature')[0]
    temp_value = first_temperature.getAttribute('value')
    print("{} : {}, {}°".format(from_tag, symbol_name, temp_value))
Output will be as follows:
2017-07-29T08:00:00 : Cloudy, 15°
2017-07-29T09:00:00 : Partly cloudy, 17°
Hope it is useful.
You can also use an alternative approach with a built-in module (I'm using Python 3.6.2):
import xml.etree.ElementTree as et # this is built-in module in python3
tree = et.parse("sample.xml")
root = tree.getroot()
for temp in root.iter("time"): # iterate time element in xml
print(temp.attrib["from"], end=": ") # prints attribute of time element
for sym in temp.iter("symbol"): # iterate symbol element within time element
print(sym.attrib["name"], end=", ")
for t in temp.iter("temperature"): # iterate temperature element within time element
print(t.attrib["value"], end="°\n")

Python XML check next item

Here is a little xml example:
<?xml version="1.0" encoding="UTF-8"?>
<list>
<person id="1">
<name>Smith</name>
<city>New York</city>
</person>
<person id="2">
<name>Pitt</name>
</person>
...
...
</list>
Now I need all Persons with a name and city.
I tried:
#!/usr/bin/python
# coding: utf8
import xml.dom.minidom as dom
tree = dom.parse("test.xml")
for listItems in tree.firstChild.childNodes:
for personItems in listItems.childNodes:
if personItems.nodeName == "name" and personItems.nextSibling == "city":
print personItems.firstChild.data.strip()
But the output is empty. Without the "and" condition I get all names. How can I check that the next tag after "name" is "city"?
You can do this in minidom:
import xml.dom.minidom as minidom
def getChild(n, v):
    """Yield each direct child of node ``n`` whose ``localName`` equals ``v``."""
    matching = (node for node in n.childNodes if node.localName == v)
    for node in matching:
        yield node
xmldoc = minidom.parse('test.xml')
person = getChild(xmldoc, 'list')
for p in person:
for v in getChild(p,'person'):
attr = v.getAttributeNode('id')
if attr:
print attr.nodeValue.strip()
This prints id of person nodes:
1
2
Use ElementTree; check this ElementTree example:
import xml.etree.ElementTree as ET

# Print "<name> <city>" for every <person> that has both sub-elements.
tree = ET.parse('a.xml')
root = tree.getroot()
for person in root.findall('person'):
    # find() returns None for a missing child, so test for that explicitly
    # instead of hiding every possible error behind a bare ``except:``.
    name = person.find('name')
    city = person.find('city')
    if name is None or city is None:
        continue
    # str.format produces the same output under Python 2 and Python 3.
    print('{} {}'.format(name.text, city.text))
To get the id, use id = person.get('id').
output:Smith New York
Using lxml, you can use xpath to get in one step what you need:
from lxml import etree
xmlstr = """
<list>
<person id="1">
<name>Smith</name>
<city>New York</city>
</person>
<person id="2">
<name>Pitt</name>
</person>
</list>
"""
xml = etree.fromstring(xmlstr)
xp = "//person[city]"
for person in xml.xpath(xp):
print etree.tostring(person)
lxml is an external Python package, but it is so useful that, to me, it is always worth installing.
The XPath expression searches for any (//) person element that has (as declared by the content of []) a city subelement.

How do I set attributes for an XML element with Python?

I am using ElementTree to build an XML file.
When I try to set an element's attribute with ET.SubElement().__setattr__(), I get the error AttributeError: __setattr__.
import xml.etree.cElementTree as ET
summary = open(Summary.xml, 'w')
root = ET.Element('Summary')
ET.SubElement(root, 'TextSummary')
ET.SubElement(root,'TextSummary').__setattr__('Status','Completed') # Error occurs here
tree = ET.ElementTree(root)
tree.write(summary)
summary.close()
After code execution, my XML should resemble the following:
<Summary>
<TextSummary Status = 'Completed'/>
</Summary>
How do I add attributes to an XML element with Python using xml.etree.cElementTree?
You should be doing:
ET.SubElement(root,'TextSummary').set('Status','Completed')
The Etree documentation shows usage.
You can specify attributes for an Element or SubElement during creation with keyword arguments.
import xml.etree.ElementTree as ET
root = ET.Element('Summary')
ET.SubElement(root, 'TextSummary', Status='Completed')
XML:
<Summary>
<TextSummary Status="Completed"/>
</Summary>
Alternatively, you can use .set to add attributes to an existing element.
import xml.etree.ElementTree as ET
root = ET.Element('Summary')
sub = ET.SubElement(root, 'TextSummary')
sub.set('Status', 'Completed')
XML:
<Summary>
<TextSummary Status="Completed"/>
</Summary>
Technical Explanation:
The constructors for Element and SubElement include **extra, which accepts attributes as keyword arguments.
xml.etree.ElementTree.Element(tag, attrib={}, **extra)
xml.etree.ElementTree.SubElement(parent, tag, attrib={}, **extra)
This allows you to add an arbitrary number of attributes.
root = ET.Element('Summary', Date='2018/07/02', Timestamp='11:44am')
# <Summary Date = "2018/07/02" Timestamp = "11:44am">
You can also use .set to add attributes to a pre-existing element. However, this can only add one attribute at a time. (As suggested by Thomas Orozco).
root = ET.Element('Summary')
root.set('Date', '2018/07/02')
root.set('Timestamp', '11:44am')
# <Summary Date = "2018/07/02" Timestamp = "11:44am">
Full Example:
import xml.etree.ElementTree as ET
# Attributes are supplied as keyword arguments when each element is created.
root = ET.Element('school', name='Willow Creek High')
ET.SubElement(root, 'student', name='Jane Doe', grade='9')
print(ET.tostring(root).decode())
# Python 3.8+ keeps the attributes in the order they were given:
# <school name="Willow Creek High"><student name="Jane Doe" grade="9" /></school>
# Older versions sort the attributes by name:
# <school name="Willow Creek High"><student grade="9" name="Jane Doe" /></school>
The best way to set multiple attributes in a single line is shown below.
I wrote this code for SVG XML creation:
from xml.etree import ElementTree as ET

# Build a small SVG tree, handing every element all of its attributes in a
# single ``attrib`` mapping at creation time.
svg_attrs = {'height': '210', 'width': '500'}
group_attrs = {'x': '10', 'y': '12', 'id': 'groupName'}
line_attrs = {
    'x1': '0',
    'y1': '0',
    'x2': '200',
    'y2': '200',
    'stroke': 'red',
}

svg = ET.Element('svg', attrib=svg_attrs)
g = ET.SubElement(svg, 'g', attrib=group_attrs)
line = ET.SubElement(g, 'line', attrib=line_attrs)

serialized = ET.tostring(svg, encoding="us-ascii", method="xml")
print(serialized)

Parsing nested xml with lxml and Python

I am having trouble parsing XML when it is in the form of:
<Cars>
<Car>
<Color>Blue</Color>
<Make>Ford</Make>
<Model>Mustant</Model>
</Car>
<Car>
<Color>Red</Color>
<Make>Chevy</Make>
<Model>Camaro</Model>
</Car>
</Cars>
I have figured out how to parse 1st level children like this:
<Car>
<Color>Blue</Color>
<Make>Chevy</Make>
<Model>Camaro</Model>
</Car>
With this kind of code:
from lxml import etree
a = os.path.join(localPath,file)
element = etree.parse(a)
cars = element.xpath('//Root/Foo/Bar/Car/node()[text()]')
parsedCars = [{field.tag: field.text for field in cars} for action in cars]
print parsedCars[0]['Make'] #Chevy
How can I parse out multiple "Car" tags that are child tags of "Cars"?
Try this
from lxml import etree
a = os.path.join(localPath,file)
element = etree.parse(a)
cars = element.xpath('//Root/Foo/Bar/Car')
for car in cars:
colors = car.xpath('./Color')
makes = car.xpath('./Make')
models = car.xpath('./Model')

Categories

Resources