Parsing nested xml with lxml and Python - python

I am having trouble parsing XML when it is in the form of:
<Cars>
<Car>
<Color>Blue</Color>
<Make>Ford</Make>
<Model>Mustant</Model>
</Car>
<Car>
<Color>Red</Color>
<Make>Chevy</Make>
<Model>Camaro</Model>
</Car>
</Cars>
I have figured out how to parse 1st level children like this:
<Car>
<Color>Blue</Color>
<Make>Chevy</Make>
<Model>Camaro</Model>
</Car>
With this kind of code:
from lxml import etree
a = os.path.join(localPath,file)
element = etree.parse(a)
cars = element.xpath('//Root/Foo/Bar/Car/node()[text()]')
parsedCars = [{field.tag: field.text for field in cars} for action in cars]
print parsedCars[0]['Make'] #Chevy
How can I parse out multiple "Car" tags that are children of a "Cars" tag?

Try this
import os

from lxml import etree

# localPath and file are expected to be defined by the surrounding script,
# exactly as in the question.
a = os.path.join(localPath, file)
element = etree.parse(a)
# Select the <Car> elements themselves rather than their child nodes, so that
# each car can be processed on its own.
cars = element.xpath('//Root/Foo/Bar/Car')
for car in cars:
    # The leading "./" makes each query relative to the current <Car>; every
    # call returns a list of the matching child elements.
    colors = car.xpath('./Color')
    makes = car.xpath('./Make')
    models = car.xpath('./Model')

Related

Using BeautifulSoup to count xml elements in a function

I often use len(find_all("some_element")) to count the number of elements in an XML file. I tried to wrap this in a function, but it doesn't work: it always gives me "None".
The XML file:
<parent>
<some>
<child>text</child>
<child>text</child>
<child>text</child>
</some>
</parent>
my python code:
def return_len(para1,para2): # doesn't work
if bool(suppe.para1): # the element isn't always present in the xml
return len(suppe.para1.find_all(para2))
def return_len1(): # does work
if bool(suppe.some):
return len(suppe.some.find_all("child"))
print(return_len("some","child")) # doesnt work
print(return_len1()) # does work
How must I modify my function return_len to get it working, and what did I do wrong?
You can do like this.
from bs4 import BeautifulSoup
s = """<parent>
<some>
<child>text</child>
<child>text</child>
<child>text</child>
</some>
</parent>
"""
soup = BeautifulSoup(s, 'xml')
def return_len(para1, para2, soup):
    """Count the <para2> tags found inside the first <para1> tag of soup.

    Args:
        para1: Name of the enclosing tag, looked up with ``soup.find``.
        para2: Name of the tags to count with ``find_all`` inside it.
        soup: Parsed document exposing ``find`` (e.g. a BeautifulSoup object).

    Returns:
        The number of matching tags, or None when no <para1> tag exists.
    """
    print(f'No. of <{para2}> tags inside <{para1}> tag.')
    temp = soup.find(para1)
    # find() signals "no such tag" with None; compare against None explicitly
    # instead of relying on the truthiness of the returned object.
    if temp is None:
        return None
    return len(temp.find_all(para2))
print(return_len('some', 'child', soup))
print(return_len('parent', 'some', soup))
No. of <child> tags inside <some> tag.
3
No. of <some> tags inside <parent> tag.
1
Without any external library - see the below
import xml.etree.ElementTree as ET
# Sample document: one <some> element wrapping three <child> elements.
xml = '''<parent>
<some>
<child>text</child>
<child>text</child>
<child>text</child>
</some>
</parent>'''
# ET.XML parses a document held in a string and returns its root element.
root = ET.XML(xml)
# ".//child" matches <child> elements at any depth below the root.
matches = root.findall(".//child")
print(f'Number of child elements is {len(matches)}')
output
Number of child elements is 3

Python XML Parse and getElementsByTagName

I am trying to parse the following XML and fetch the specific tags I need for my business case, but I guess I'm doing something wrong and I'm not sure how to parse the required tags. I want to leverage pandas so that I can filter the data further. I appreciate all the support.
My XMl coming from URI
<couponfeed>
<TotalMatches>1459</TotalMatches>
<TotalPages>3</TotalPages>
<PageNumberRequested>1</PageNumberRequested>
<link type="TEXT">
<categories>
<category id="1">Apparel</category>
</categories>
<promotiontypes>
<promotiontype id="11">Percentage off</promotiontype>
</promotiontypes>
<offerdescription>25% Off Boys Quiksilver Apparel. Shop now at Macys.com! Valid 7/23 through 7/25!</offerdescription>
<offerstartdate>2020-07-24</offerstartdate>
<offerenddate>2020-07-26</offerenddate>
<clickurl>https://click.synergy.com/fs-bin/click?id=Z&offerid=777210.100474694&type=3&subid=0</clickurl>
<impressionpixel>https://ad.synergy.com/fs-bin/show?id=ZNAweM&bids=777210.100474694&type=3&subid=0</impressionpixel>
<advertiserid>3184</advertiserid>
<advertisername>cys.com</advertisername>
<network id="1">US Network</network>
</link>
<link type="TEXT">
<categories>
<category id="1">Apparel</category>
</categories>
<promotiontypes>
<promotiontype id="11">Percentage off</promotiontype>
</promotiontypes>
<offerdescription>25% Off Boys' Quiksilver Apparel. Shop now at Macys.com! Valid 7/23 through 7/25!</offerdescription>
<offerstartdate>2020-07-24</offerstartdate>
<offerenddate>2020-07-26</offerenddate>
<clickurl>https://click.synergy.com/fs-bin/click?id=ZZvk49eM&offerid=777210.100474695&type=3&subid=0</clickurl>
<impressionpixel>https://ad.synergy.com/fs-bin/show?id=ZZvk49NAwbids=777210.100474695&type=3&subid=0</impressionpixel>
<advertiserid>3184</advertiserid>
<advertisername>cys.com</advertisername>
<network id="1">US Network</network>
</link>
My Code
from xml.dom import minidom
import urllib
import pandas as pd
url = "http://couponfeed.synergy.com/coupon?token=xxxxxxxxx122b&network=1&resultsperpage=500"
xmldoc = minidom.parse(urllib.request.urlopen(url))
#itemlist = xmldoc.getElementsByTagName('clickurl')
df_cols = ["promotiontype","category","offerdescription", "offerstartdate", "offerenddate", "clickurl","impressionpixel","advertisername","network"]
rows = []
for entry in xmldoc.couponfeed:
s_promotiontype = couponfeed.get("promotiontype","")
s_category = couponfeed.get("category","")
s_offerdescription = couponfeed.get("offerdescription", "")
s_offerstartdate = couponfeed.get("offerstartdate", "")
s_offerenddate = couponfeed.get("offerenddate", "")
s_clickurl = couponfeed.get("clickurl", "")
s_impressionpixel = couponfeed.get("impressionpixel", "")
s_advertisername = couponfeed.get("advertisername","")
s_network = couponfeed.get ("network","")
rows.append({"promotiontype":s_promotiontype, "category": s_category, "offerdescription": s_offerdescription,
"offerstartdate": s_offerstartdate, "offerenddate": s_offerenddate,"clickurl": s_clickurl,"impressionpixel":s_impressionpixel,
"advertisername": s_advertisername,"network": s_network})
out_df = pd.DataFrame(rows, columns=df_cols)
out_df.to_csv(r"C:\\Users\rai\Downloads\\merchants_offers_share.csv", index=False)
Trying easy way but i dont get any results
import lxml.etree as ET
import urllib
response = urllib.request.urlopen('http://couponfeed.synergy.com/coupon?token=xxxxxd39f4e5fe392a25538bb122b&network=1&resultsperpage=500')
xml = response.read()
root = ET.fromstring(xml)
for item in root.findall('.//item'):
title = item.find('category').text
print (title)
another try
from lxml import etree
import pandas as pd
import urllib
url = "http://couponfeed.synergy.com/coupon?token=xxxxxxd39f4e5fe392a25538bb122b&network=1&resultsperpage=500"
xtree = etree.parse(urllib.request.urlopen(url))
for value in xtree.xpath("/root/couponfeed/categories"):
print(value.text)
Another method.
from simplified_scrapy import SimplifiedDoc, utils, req
# html = req.get('http://couponfeed.synergy.com/coupon?token=xxxxxxxxx122b&network=1&resultsperpage=500')
html = '''
<couponfeed>
<TotalMatches>1459</TotalMatches>
<TotalPages>3</TotalPages>
<PageNumberRequested>1</PageNumberRequested>
<link type="TEXT">
<categories>
<category id="1">Apparel</category>
</categories>
<promotiontypes>
<promotiontype id="11">Percentage off</promotiontype>
</promotiontypes>
<offerdescription>25% Off Boys Quiksilver Apparel. Shop now at Macys.com! Valid 7/23 through 7/25!</offerdescription>
<offerstartdate>2020-07-24</offerstartdate>
<offerenddate>2020-07-26</offerenddate>
<clickurl>https://click.synergy.com/fs-bin/click?id=Z&offerid=777210.100474694&type=3&subid=0</clickurl>
<impressionpixel>https://ad.synergy.com/fs-bin/show?id=ZNAweM&bids=777210.100474694&type=3&subid=0</impressionpixel>
<advertiserid>3184</advertiserid>
<advertisername>cys.com</advertisername>
<network id="1">US Network</network>
</link>
</couponfeed>
'''
doc = SimplifiedDoc(html)
df_cols = [
    "promotiontype", "category", "offerdescription", "offerstartdate",
    "offerenddate", "clickurl", "impressionpixel", "advertisername", "network"
]
# The first row is the CSV header.
rows = [df_cols]
links = doc.couponfeed.links  # Get all links
for link in links:
    # One CSV row per <link>: the text of each requested column, in order.
    rows.append([link.select(col).text for col in df_cols])
utils.save2csv('merchants_offers_share.csv', rows)  # Save to csv file
Result:
promotiontype,category,offerdescription,offerstartdate,offerenddate,clickurl,impressionpixel,advertisername,network
Percentage off,Apparel,25% Off Boys Quiksilver Apparel. Shop now at Macys.com! Valid 7/23 through 7/25!,2020-07-24,2020-07-26,https://click.synergy.com/fs-bin/click?id=Z&offerid=777210.100474694&type=3&subid=0,https://ad.synergy.com/fs-bin/show?id=ZNAweM&bids=777210.100474694&type=3&subid=0,cys.com,US Network
Here are more examples: https://github.com/yiyedata/simplified-scrapy-demo/tree/master/doc_examples
Remove the last empty row
import io

with io.open('merchants_offers_share.csv', "rb+") as f:
    # Inspect at most the last two bytes so both CRLF and LF endings are
    # recognised; an empty file is simply left untouched.
    size = f.seek(0, io.SEEK_END)
    f.seek(size - min(2, size))
    tail = f.read()
    # Remove exactly the final line terminator and nothing else, so a file
    # ending in a bare "\n" does not lose its last data byte.
    if tail.endswith(b"\r\n"):
        f.truncate(size - 2)
    elif tail.endswith(b"\n"):
        f.truncate(size - 1)
First, the XML document wasn't parsing because you copied a raw ampersand (&) from the source page; & is a reserved character in XML and has to be escaped. When your browser renders XML (or HTML), it converts the escaped form &amp; back into a literal &.
As for the code, the easiest way to get the data is to iterate over df_cols, then execute getElementsByTagName for each column, which will return a list of elements for the given column.
from xml.dom import minidom
import urllib.request  # "import urllib" alone does not load the request submodule

import pandas as pd

limit = 500
url = f"http://couponfeed.synergy.com/coupon?token=xxxxxxxxx122b&network=1&resultsperpage={limit}"
# Close the HTTP response as soon as the document has been parsed.
with urllib.request.urlopen(url) as response:
    xmldoc = minidom.parse(response)
df_cols = ["promotiontype","category","offerdescription", "offerstartdate", "offerenddate", "clickurl","impressionpixel","advertisername","network"]
# Build one row per <link> element. Looking each column up inside its own
# <link> keeps the values of one offer together even when a tag is missing,
# and yields exactly as many rows as the feed returned.
rows = []
for link in xmldoc.getElementsByTagName("link"):
    row = {}
    for row_name in df_cols:
        # get the first matching element for this column within the link
        nodes = link.getElementsByTagName(row_name)
        text_node = nodes[0].firstChild if nodes else None
        # An absent or empty element has no text node; store "" for it.
        row[row_name] = text_node.nodeValue if text_node is not None else ""
    rows.append(row)
out_df = pd.DataFrame(rows, columns=df_cols)
This isn't the most efficient way to do this, but I'm not sure how else to do it using minidom. If efficiency is a concern, I'd recommend using lxml instead.
Assuming no issue with parsing your XML from URL (since link is not available on our end), your first lxml can work if you parse on actual nodes. Specifically, there is no <item> node in XML document.
Instead use link. And consider a nested list/dict comprehension to migrate content to a data frame. For lxml you can swap out findall and xpath to return same result.
# One dict per <link>: each direct child contributes its own text, or, when it
# only wraps another element (e.g. <categories>), its first child's text.
# item.text can be None for such wrappers, hence the "or ''" guard; findtext
# returns None instead of raising when a wrapper has no child at all.
df = pd.DataFrame([
    {
        item.tag: item.text if (item.text or "").strip() else item.findtext("*")
        for item in lnk.findall("*")
    }
    for lnk in root.findall('.//link')
])
print(df)
# categories promotiontypes offerdescription ... advertiserid advertisername network
# 0 Apparel Percentage off 25% Off Boys Quiksilver Apparel. Shop now at M... ... 3184 cys.com US Network
# 1 Apparel Percentage off 25% Off Boys' Quiksilver Apparel. Shop now at ... ... 3184 cys.com US Network

How to add attribute to lxml Element

I would like to add attribute to a lxml Element like this
<outer xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<Header>
<field1 name="blah">some value1</field1>
<field2 name="asdfasd">some value2</field2>
</Header>
</outer>
Here is what I have
E = lxml.builder.ElementMaker()
outer = E.outer
header = E.Header
FIELD1 = E.field1
FIELD2 = E.field2
the_doc = outer(
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance",
XML_2_HEADER(
FIELD1('some value1', name='blah'),
FIELD2('some value2', name='asdfasd'),
),
)
seems like this line is causing some problem
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance",
even if I replace it with
'xmlns:xsi'="http://www.w3.org/2001/XMLSchema-instance",
it won't work.
What is a way to add attribute to lxml Element?
That's a namespace definition, not an ordinary XML attribute. You can pass namespace information to ElementMaker() as a dictionary, for example :
from lxml import etree as ET
import lxml.builder

# xmlns:xsi is a namespace declaration rather than an ordinary attribute, so
# it is supplied through nsmap instead of as a keyword argument.
nsdef = {'xsi': 'http://www.w3.org/2001/XMLSchema-instance'}
E = lxml.builder.ElementMaker(nsmap=nsdef)
doc = E.outer(
    E.Header(
        E.field1('some value1', name='blah'),
        E.field2('some value2', name='asdfasd'),
    ),
)
# encoding="unicode" makes tostring() return text instead of bytes, so the
# markup prints as-is rather than as a b'...' literal.
print(ET.tostring(doc, pretty_print=True, encoding="unicode"))
output :
<outer xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<Header>
<field1 name="blah">some value1</field1>
<field2 name="asdfasd">some value2</field2>
</Header>
</outer>
Link to the docs: http://lxml.de/api/lxml.builder.ElementMaker-class.html

Extracting similar XML attributes with BeautifulSoup

Let's assume I have the following XML:
<time from="2017-07-29T08:00:00" to="2017-07-29T09:00:00">
<!-- Valid from 2017-07-29T08:00:00 to 2017-07-29T09:00:00 -->
<symbol number="4" numberEx="4" name="Cloudy" var="04"/>
<precipitation value="0"/>
<!-- Valid at 2017-07-29T08:00:00 -->
<windDirection deg="300.9" code="WNW" name="West-northwest"/>
<windSpeed mps="1.3" name="Light air"/>
<temperature unit="celsius" value="15"/>
<pressure unit="hPa" value="1002.4"/>
</time>
<time from="2017-07-29T09:00:00" to="2017-07-29T10:00:00">
<!-- Valid from 2017-07-29T09:00:00 to 2017-07-29T10:00:00 -->
<symbol number="4" numberEx="4" name="Partly cloudy" var="04"/>
<precipitation value="0"/>
<!-- Valid at 2017-07-29T09:00:00 -->
<windDirection deg="293.2" code="WNW" name="West-northwest"/>
<windSpeed mps="0.8" name="Light air"/>
<temperature unit="celsius" value="17"/>
<pressure unit="hPa" value="1002.6"/>
</time>
And I want to collect the time's from attribute, the symbol's name and the temperature's value from it, and then print them out in the following manner: time from: symbol name, temperature value -- like this: 2017-07-29, 08:00:00: Cloudy, 15°.
(And there are a few name and value attributes in this XML, as you see.)
As of now, my approach was quite straightforward:
#!/usr/bin/env python
# coding: utf-8
import re
from BeautifulSoup import BeautifulSoup
# data is set to the above XML
soup = BeautifulSoup(data)
# collect the tags of interest into lists. can it be done wiser?
time_l = []
symb_l = []
temp_l = []
for i in soup.findAll('time'):
i_time = str(i.get('from'))
time_l.append(i_time)
for i in soup.findAll('symbol'):
i_symb = str(i.get('name'))
symb_l.append(i_symb)
for i in soup.findAll('temperature'):
i_temp = str(i.get('value'))
temp_l.append(i_temp)
# join the forecast lists to a dict
forc_l = []
for i, j in zip(symb_l, temp_l):
forc_l.append([i, j])
rez = dict(zip(time_l, forc_l))
# combine and format the rezult. can this dict be printed simpler?
wew = ''
for key in sorted(rez):
wew += re.sub("T", ", ", key) + str(rez[key])
wew = re.sub("'", "", wew)
wew = re.sub("\[", ": ", wew)
wew = re.sub("\]", "°\n", wew)
# print the rezult
print wew
But I imagine there must be some better, more intelligent approach? Mostly, I'm interested in collecting the attributes from the XML, my way seems rather dumb to me, actually. Also, is there any simpler way to print out a dict {'a': '[b, c]'} nicely?
Would be grateful for any hints or suggestions.
from bs4 import BeautifulSoup

with open("sample.xml", "r") as f:  # opening xml file
    content = f.read()  # xml content stored in this variable
soup = BeautifulSoup(content, "lxml")
# Work per <time> element so the symbol and temperature always belong to the
# same forecast entry as the "from" timestamp.
for values in soup.find_all("time"):
    print("{} : {}, {}°".format(values["from"], values.find("symbol")["name"], values.find("temperature")["value"]))
Output:
2017-07-29T08:00:00 : Cloudy, 15°
2017-07-29T09:00:00 : Partly cloudy, 17°
One more, also you can fetch xml data by importing xml.dom.minidom module.
Here is the data you want:
from xml.dom.minidom import parse

doc = parse("path/to/xmlfile.xml")  # parse an XML file by name
itemlist = doc.getElementsByTagName('time')
for items in itemlist:
    from_tag = items.getAttribute('from')
    symbol_list = items.getElementsByTagName('symbol')
    temperature_list = items.getElementsByTagName('temperature')
    # Skip entries lacking either tag instead of failing on the [0] lookup.
    if not symbol_list or not temperature_list:
        continue
    # Only the first <symbol>/<temperature> of each <time> is reported.
    symbol_name = symbol_list[0].getAttribute('name')
    temp_value = temperature_list[0].getAttribute('value')
    print("{} : {}, {}°".format(from_tag, symbol_name, temp_value))
Output will be as follows:
2017-07-29T08:00:00 : Cloudy, 15°
2017-07-29T09:00:00 : Partly cloudy, 17°
Hope it is useful.
Here you can also use an alternate way using builtin module(i'm using python 3.6.2):
import xml.etree.ElementTree as et  # this is built-in module in python3

tree = et.parse("sample.xml")
root = tree.getroot()
for temp in root.iter("time"):  # iterate time element in xml
    # First <symbol> / <temperature> anywhere below this <time>, if any.
    sym = next(temp.iter("symbol"), None)
    t = next(temp.iter("temperature"), None)
    # Emit a line only for complete entries so the output never ends up with
    # a half-printed, unterminated row.
    if sym is None or t is None:
        continue
    print("{}: {}, {}°".format(temp.attrib["from"], sym.attrib["name"], t.attrib["value"]))

Python XML check next item

Here is a little xml example:
<?xml version="1.0" encoding="UTF-8"?>
<list>
<person id="1">
<name>Smith</name>
<city>New York</city>
</person>
<person id="2">
<name>Pitt</name>
</person>
...
...
</list>
Now I need all Persons with a name and city.
I tried:
#!/usr/bin/python
# coding: utf8
import xml.dom.minidom as dom
tree = dom.parse("test.xml")
for listItems in tree.firstChild.childNodes:
for personItems in listItems.childNodes:
if personItems.nodeName == "name" and personItems.nextSibling == "city":
print personItems.firstChild.data.strip()
But the output is empty. Without the "and" condition I get all names. How can I check that the next tag after "name" is "city"?
You can do this in minidom:
import xml.dom.minidom as minidom
def getChild(n, v):
    """Yield each direct child of DOM node n whose localName equals v.

    Args:
        n: A DOM node exposing ``childNodes``.
        v: The local tag name to match.

    Yields:
        The matching child nodes, in document order.
    """
    for child in n.childNodes:
        if child.localName == v:
            yield child
xmldoc = minidom.parse('test.xml')
# The document node's only matching child is the <list> root element.
person = getChild(xmldoc, 'list')
for p in person:
    for v in getChild(p, 'person'):
        # getAttributeNode returns None when the attribute is absent.
        attr = v.getAttributeNode('id')
        if attr is not None:
            print(attr.nodeValue.strip())
This prints id of person nodes:
1
2
Use ElementTree (xml.etree.ElementTree from the standard library):
import xml.etree.ElementTree as ET

tree = ET.parse('a.xml')
root = tree.getroot()
for person in root.findall('person'):
    # findtext returns None when the child element is missing, so no
    # exception handling is needed to detect incomplete entries.
    name = person.findtext('name')
    city = person.findtext('city')
    # Keep only persons that have both a name and a city.
    if name is None or city is None:
        continue
    print(name, city)
To get the id as well, use person.get('id').
output:Smith New York
Using lxml, you can use xpath to get in one step what you need:
from lxml import etree

xmlstr = """
<list>
<person id="1">
<name>Smith</name>
<city>New York</city>
</person>
<person id="2">
<name>Pitt</name>
</person>
</list>
"""
xml = etree.fromstring(xmlstr)
# The predicate [city] keeps only <person> elements that have a <city> child.
xp = "//person[city]"
for person in xml.xpath(xp):
    # encoding="unicode" yields text rather than bytes, so the element prints
    # as plain markup.
    print(etree.tostring(person, encoding="unicode"))
lxml is an external Python package, but it is so useful that, to me, it is always worth installing.
xpath is searching for any (//) element person having (declared by content of []) subelement city.

Categories

Resources