Python XML Parse and getElementsByTagName - python

I am trying to parse the following XML and fetch the specific tags I'm interested in for my business need, but I guess I'm doing something wrong. I'm not sure how to parse the tags I need. I want to leverage pandas so that I can filter further for specifics. Appreciate all the support.
My XML, coming from a URI:
<couponfeed>
<TotalMatches>1459</TotalMatches>
<TotalPages>3</TotalPages>
<PageNumberRequested>1</PageNumberRequested>
<link type="TEXT">
<categories>
<category id="1">Apparel</category>
</categories>
<promotiontypes>
<promotiontype id="11">Percentage off</promotiontype>
</promotiontypes>
<offerdescription>25% Off Boys Quiksilver Apparel. Shop now at Macys.com! Valid 7/23 through 7/25!</offerdescription>
<offerstartdate>2020-07-24</offerstartdate>
<offerenddate>2020-07-26</offerenddate>
<clickurl>https://click.synergy.com/fs-bin/click?id=Z&offerid=777210.100474694&type=3&subid=0</clickurl>
<impressionpixel>https://ad.synergy.com/fs-bin/show?id=ZNAweM&bids=777210.100474694&type=3&subid=0</impressionpixel>
<advertiserid>3184</advertiserid>
<advertisername>cys.com</advertisername>
<network id="1">US Network</network>
</link>
<link type="TEXT">
<categories>
<category id="1">Apparel</category>
</categories>
<promotiontypes>
<promotiontype id="11">Percentage off</promotiontype>
</promotiontypes>
<offerdescription>25% Off Boys' Quiksilver Apparel. Shop now at Macys.com! Valid 7/23 through 7/25!</offerdescription>
<offerstartdate>2020-07-24</offerstartdate>
<offerenddate>2020-07-26</offerenddate>
<clickurl>https://click.synergy.com/fs-bin/click?id=ZZvk49eM&offerid=777210.100474695&type=3&subid=0</clickurl>
<impressionpixel>https://ad.synergy.com/fs-bin/show?id=ZZvk49NAwbids=777210.100474695&type=3&subid=0</impressionpixel>
<advertiserid>3184</advertiserid>
<advertisername>cys.com</advertisername>
<network id="1">US Network</network>
</link>
My Code
from xml.dom import minidom
import urllib
import pandas as pd
url = "http://couponfeed.synergy.com/coupon?token=xxxxxxxxx122b&network=1&resultsperpage=500"
xmldoc = minidom.parse(urllib.request.urlopen(url))
#itemlist = xmldoc.getElementsByTagName('clickurl')
df_cols = ["promotiontype","category","offerdescription", "offerstartdate", "offerenddate", "clickurl","impressionpixel","advertisername","network"]
rows = []
for entry in xmldoc.couponfeed:
s_promotiontype = couponfeed.get("promotiontype","")
s_category = couponfeed.get("category","")
s_offerdescription = couponfeed.get("offerdescription", "")
s_offerstartdate = couponfeed.get("offerstartdate", "")
s_offerenddate = couponfeed.get("offerenddate", "")
s_clickurl = couponfeed.get("clickurl", "")
s_impressionpixel = couponfeed.get("impressionpixel", "")
s_advertisername = couponfeed.get("advertisername","")
s_network = couponfeed.get ("network","")
rows.append({"promotiontype":s_promotiontype, "category": s_category, "offerdescription": s_offerdescription,
"offerstartdate": s_offerstartdate, "offerenddate": s_offerenddate,"clickurl": s_clickurl,"impressionpixel":s_impressionpixel,
"advertisername": s_advertisername,"network": s_network})
out_df = pd.DataFrame(rows, columns=df_cols)
out_df.to_csv(r"C:\\Users\rai\Downloads\\merchants_offers_share.csv", index=False)
Trying easy way but i dont get any results
import lxml.etree as ET
import urllib
response = urllib.request.urlopen('http://couponfeed.synergy.com/coupon?token=xxxxxd39f4e5fe392a25538bb122b&network=1&resultsperpage=500')
xml = response.read()
root = ET.fromstring(xml)
for item in root.findall('.//item'):
title = item.find('category').text
print (title)
another try
from lxml import etree
import pandas as pd
import urllib
url = "http://couponfeed.synergy.com/coupon?token=xxxxxxd39f4e5fe392a25538bb122b&network=1&resultsperpage=500"
xtree = etree.parse(urllib.request.urlopen(url))
for value in xtree.xpath("/root/couponfeed/categories"):
print(value.text)

Another method.
from simplified_scrapy import SimplifiedDoc, utils, req
# html = req.get('http://couponfeed.synergy.com/coupon?token=xxxxxxxxx122b&network=1&resultsperpage=500')
html = '''
<couponfeed>
<TotalMatches>1459</TotalMatches>
<TotalPages>3</TotalPages>
<PageNumberRequested>1</PageNumberRequested>
<link type="TEXT">
<categories>
<category id="1">Apparel</category>
</categories>
<promotiontypes>
<promotiontype id="11">Percentage off</promotiontype>
</promotiontypes>
<offerdescription>25% Off Boys Quiksilver Apparel. Shop now at Macys.com! Valid 7/23 through 7/25!</offerdescription>
<offerstartdate>2020-07-24</offerstartdate>
<offerenddate>2020-07-26</offerenddate>
<clickurl>https://click.synergy.com/fs-bin/click?id=Z&offerid=777210.100474694&type=3&subid=0</clickurl>
<impressionpixel>https://ad.synergy.com/fs-bin/show?id=ZNAweM&bids=777210.100474694&type=3&subid=0</impressionpixel>
<advertiserid>3184</advertiserid>
<advertisername>cys.com</advertisername>
<network id="1">US Network</network>
</link>
</couponfeed>
'''
doc = SimplifiedDoc(html)
df_cols = [
"promotiontype", "category", "offerdescription", "offerstartdate",
"offerenddate", "clickurl", "impressionpixel", "advertisername", "network"
]
rows = [df_cols]
links = doc.couponfeed.links # Get all links
for link in links:
row = []
for col in df_cols:
row.append(link.select(col).text) # Get col text
rows.append(row)
utils.save2csv('merchants_offers_share.csv', rows) # Save to csv file
Result:
promotiontype,category,offerdescription,offerstartdate,offerenddate,clickurl,impressionpixel,advertisername,network
Percentage off,Apparel,25% Off Boys Quiksilver Apparel. Shop now at Macys.com! Valid 7/23 through 7/25!,2020-07-24,2020-07-26,https://click.synergy.com/fs-bin/click?id=Z&offerid=777210.100474694&type=3&subid=0,https://ad.synergy.com/fs-bin/show?id=ZNAweM&bids=777210.100474694&type=3&subid=0,cys.com,US Network
Here are more examples: https://github.com/yiyedata/simplified-scrapy-demo/tree/master/doc_examples
Remove the last empty row
import io
with io.open('merchants_offers_share.csv', "rb+") as f:
f.seek(-1,2)
l = f.read()
if l == b"\n":
f.seek(-2,2)
f.truncate()

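First, the XML document wasn't parsing because you copied a raw ampersand (&) from the rendered source page; a bare & is a reserved character in XML and must be written as the entity &amp;. When your browser renders XML (or HTML), it converts &amp; into &, so text copied from the rendered page contains the unescaped form.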
As for the code, the easiest way to get the data is to iterate over df_cols, then execute getElementsByTagName for each column, which will return a list of elements for the given column.
from xml.dom import minidom
import urllib.request

import pandas as pd

limit = 500
url = f"http://couponfeed.synergy.com/coupon?token=xxxxxxxxx122b&network=1&resultsperpage={limit}"
df_cols = ["promotiontype", "category", "offerdescription", "offerstartdate", "offerenddate",
           "clickurl", "impressionpixel", "advertisername", "network"]


def _first_text(parent, tag_name):
    """Return the text of the first <tag_name> descendant of parent.

    Returns "" when parent has no such descendant or when that element is
    empty (its firstChild is None).
    """
    nodes = parent.getElementsByTagName(tag_name)
    if not nodes or nodes[0].firstChild is None:
        return ""
    value = nodes[0].firstChild.nodeValue
    return value if value is not None else ""


def links_to_rows(xmldoc, columns):
    """Build one dict per <link> element of xmldoc, keyed by the names in columns.

    Looking each column up inside its own <link> keeps values from different
    offers from being paired together when a link repeats a tag (several
    <category> entries) or omits one.
    """
    return [
        {col: _first_text(link, col) for col in columns}
        for link in xmldoc.getElementsByTagName("link")
    ]


if __name__ == "__main__":
    # `import urllib` alone does not load the `request` submodule, hence the
    # explicit `import urllib.request` above.
    with urllib.request.urlopen(url) as response:
        xmldoc = minidom.parse(response)
    out_df = pd.DataFrame(links_to_rows(xmldoc, df_cols), columns=df_cols)
This isn't the most efficient way to do this, but I'm not sure how else to do it using minidom. If efficiency is a concern, I'd recommend using lxml instead.

Assuming no issue with parsing your XML from URL (since link is not available on our end), your first lxml can work if you parse on actual nodes. Specifically, there is no <item> node in XML document.
Instead use link. And consider a nested list/dict comprehension to migrate content to a data frame. For lxml you can swap out findall and xpath to return same result.
def _link_child_text(item):
    """Return item's own text, or its first child's text when item has no text of its own.

    item.text is None (not "") when an element holds only child elements with
    no whitespace before them, so it is normalised before calling strip().
    Returns None when item has neither text nor children.
    """
    if (item.text or "").strip():
        return item.text
    child = item.find("*")
    return child.text if child is not None else None


# One row per <link>; one column per direct child tag of that link.
df = pd.DataFrame([{item.tag: _link_child_text(item) for item in lnk.findall("*")}
                   for lnk in root.findall('.//link')])
print(df)
# categories promotiontypes offerdescription ... advertiserid advertisername network
# 0 Apparel Percentage off 25% Off Boys Quiksilver Apparel. Shop now at M... ... 3184 cys.com US Network
# 1 Apparel Percentage off 25% Off Boys' Quiksilver Apparel. Shop now at ... ... 3184 cys.com US Network

Related

Creating dataframe from XML file with non-unique tags

I have a directory of XML files, and I need to extract 4 values from each file and store to a dataframe/CSV.
The problem is some of the data I need to extract uses redundant tags (e.g., <PathName>) so I'm not sure of the best way to do this. I could specify the exact line # to extract, because it appears consistent with the files I have seen; but I am not certain that will always be the case, so doing it that way is too brittle.
<?xml version="1.0" encoding="utf-8"?>
<BxfMessage xsi:schemaLocation="http://smpte-ra.org/schemas/2021/2019/BXF BxfSchema.xsd" id="jffsdfs" dateTime="2023-02-02T20:11:38Z" messageType="Info" origin="url" originType="Delivery" userName="ABC Corp User" destination=" System" xmlns="http://sffe-ra.org/schema/1999/2023/BXF" xmlns:xsi="http://www.w9.org/4232/XMLSchema-instance">
<BxfData action="Spotd">
<Content timestamp="2023-02-02T20:11:38Z">
<NonProgramContent>
<Details>
<SpotType>Paid</SpotType>
<SpotType>Standard</SpotType>
<Spotvertiser>
<SpotvertiserName>Spot Plateau</SpotvertiserName>
</Spotvertiser>
<Agency>
<AgencyName>Spot Plateau</AgencyName>
</Agency>
<Product>
<Name></Name>
<BrandName>zzTop</BrandName>
<DirectResponse>
<PhoneNo></PhoneNo>
<PCode></PCode>
<DR_URL></DR_URL>
</DirectResponse>
</Product>
</Details>
<ContentMetSpotata>
<ContentId>
<BHGXId idType="CISC" auth="Agency">AAAA1111999Z</BHGXId>
</ContentId>
<Name>Pill CC Dutch</Name>
<Policy>
<PlatformType>Spotcast</PlatformType>
</Policy>
<Media>
<BaseBand>
<Audio VO="true">
<AnalogAudio primAudio="false" />
<DigitalAudio>
<MPEGLayerIIAudio house="false" audioId="1" dualMono="false" />
</DigitalAudio>
</Audio>
<Video withlate="false" sidebend="false">
<Format>1182v</Format>
<CCs>true</CCs>
</Video>
<AccessServices>
<AudioDescription_DVS>false</AudioDescription_DVS>
</AccessServices>
<QC>Passed QC (AAAA1111103H )</QC>
</BaseBand>
<MediaLocation sourceType="Primary">
<Location>
<AssetServer PAA="true" FTA="true">
<PathName>zzTap_zzTop_AAAA1111999Z_30s_Pill_aa-bb.mp4</PathName>
</AssetServer>
</Location>
<SOM>
<SmpteTimeCode>00:00:00;00</SmpteTimeCode>
</SOM>
<Duration>
<SmpteDuration>
<SmpteTimeCode>00:00:30;00</SmpteTimeCode>
</SmpteDuration>
</Duration>
</MediaLocation>
<MediaLocation sourceType="Proxy" qualifer="Low-res">
<Location>
<AssetServer PAA="true" FTA="true">
<PathName>https://app.url.com/DMM/DL/wew52f</PathName>
</AssetServer>
</Location>
<SOM>
<SmpteTimeCode>00:00:00;00</SmpteTimeCode>
</SOM>
<Duration>
<SmpteDuration>
<SmpteTimeCode>00:00:30;00</SmpteTimeCode>
</SmpteDuration>
</Duration>
</MediaLocation>
<MediaLocation sourceType="Preview" qualifer="Thumbnail">
<Location>
<AssetServer PAA="true" FTA="true">
<PathName>https://f9-int-5.rainxyz.com/url.com/media/t43fs/423gs-389a-40a4.jpg?inline</PathName>
</AssetServer>
</Location>
<SOM>
<SmpteTimeCode>00:00:00;00</SmpteTimeCode>
</SOM>
<Duration>
<SmpteDuration>
<SmpteTimeCode>00:00:00;00</SmpteTimeCode>
</SmpteDuration>
</Duration>
</MediaLocation>
</Media>
</ContentMetSpotata>
</NonProgramContent>
</Content>
</BxfData>
</BxfMessage>
Is there a more flexible method so that I can get consistent output like:
FileName Brand ID URL
zzTap_zzTop_AAAA1111999Z_30s_Pill_aa-bb zzTop AAAA1111999Z https://app.url.com/DMM/DL/wew52f
zzTap_zzTab_BAAA1111999Z_30s_Pill_aa-cc zzTab BAAA1111999Z https://app.url.com/DMM/DL/wew52c
zzTap_zzTan_CAAA1111999Z_30s_Pill_aa-dd zzTan CAAA1111999Z https://app.url.com/DMM/DL/wew523
zzTap_zzTon_DAAA1111999Z_30s_Pill_aa-zz zzTon DAAA1111999Z https://app.url.com/DMM/DL/wew52y
To parse one XML file using beautifulsoup you can use this example:
from bs4 import BeautifulSoup
def get_info(xml_file):
    """Extract (file_name, brand_name, id_, url) from one XML file.

    file_name is the text of the PathName element containing '.mp4', with
    everything from the last '.mp4' onward removed; url is the PathName under
    the element whose sourceType attribute is "Proxy"; brand_name and id_ are
    the texts of the first BrandName and BHGXId elements.

    Raises AttributeError if any of those elements is not found (the lookup
    returns None and `.text` is then accessed on it).
    """
    # Open in binary so the parser decodes using the encoding named in the XML
    # declaration instead of the platform's default text encoding.
    with open(xml_file, 'rb') as f_in:
        soup = BeautifulSoup(f_in.read(), 'xml')

    file_name = soup.find(lambda tag: tag.name == 'PathName' and '.mp4' in tag.text).text.rsplit('.mp4', maxsplit=1)[0]
    url = soup.select_one('[sourceType="Proxy"] PathName').text
    brand_name = soup.select_one('BrandName').text
    id_ = soup.select_one('BHGXId').text

    return file_name, brand_name, id_, url
print(get_info('your_file.xml'))
Prints:
('zzTap_zzTop_AAAA1111999Z_30s_Pill_aa-bb', 'zzTop', 'AAAA1111999Z', 'https://app.url.com/DMM/DL/wew52f')
What does your code look like? Here is my attempt.
import xml.etree.ElementTree as ET
import pandas as pd
# Walk every element of zzTab.xml, collect PathName / BHGXId / BrandName
# texts into parallel lists, then tabulate them and write file_list.csv.
tree = ET.parse("zzTab.xml")
root = tree.getroot()
ns = "{http://sffe-ra.org/schema/1999/2023/BXF}"
list_of_interest = [f"{ns}PathName", f"{ns}BHGXId", f"{ns}BrandName"]
PathName_dir_list = []
PathName_file_list = []
BHGXId_list = []
BrandName_list = []

for elem in root.iter():
    if elem.tag not in list_of_interest:
        continue
    if elem.tag == f"{ns}PathName":
        # elem.text is None for an empty element; treat that as "no .mp4"
        # rather than letting the `in` test raise TypeError.
        if '.mp4' in (elem.text or ''):
            PathName_file_list.append(elem.text)
        else:
            PathName_dir_list.append(elem.text)
    elif elem.tag == f"{ns}BHGXId":
        BHGXId_list.append(elem.text)
    else:  # BrandName is the only remaining tag of interest
        print("Brand", elem.text)
        BrandName_list.append(elem.text)

# zip stops at the shortest list, so surplus PathName entries are dropped.
t = zip(PathName_dir_list, PathName_file_list, BHGXId_list, BrandName_list,)
list_of_tuples = list(t)
df = pd.DataFrame(list_of_tuples, columns = ['Path', 'File', 'ID', 'Brand'])
df.to_csv('file_list.csv')
print(df)
If working with BeautifulSoup, I suggest looking into using .select with CSS selectors so that you can do something like
# from bs4 import BeautifulSoup
def getXMLdata(xmlFile:str, defaultVal=None):
    """Return a dict with 'FileName', 'Brand', 'ID' and 'URL' read from xmlFile.

    Each value is the stripped text of the first element matching the CSS
    selector in selRef, or defaultVal when nothing matches. A trailing
    extension (everything after the last '.') is removed from 'FileName'.
    """
    # Binary mode: let the parser pick the encoding from the XML declaration
    # rather than decoding with the platform default.
    with open(xmlFile, 'rb') as f:
        xSoup = BeautifulSoup(f, 'xml')

    selRef = {
        'FileName': 'MediaLocation[sourceType="Primary"] Location',
        'Brand': 'BrandName', 'ID': 'ContentId',
        'URL': 'MediaLocation[sourceType="Proxy"] Location'
    }

    xfDets = {} # {'fromFile': xmlFile}
    for k, sel in selRef.items():
        t = xSoup.select_one(sel)
        xfDets[k] = t.get_text(' ').strip() if t else defaultVal

    fn = xfDets.get('FileName')
    if isinstance(fn, str) and '.' in fn: # remove extensions like ".mp4"
        xfDets['FileName'] = fn.rsplit('.', 1)[0]
    return xfDets
Since I've seen only one example, I can't know for sure if the selectors in selRef will apply for all your files; but I saved the snippet from your question to a file name x.xml, and getXMLdata('x.xml') returned
{'FileName': 'zzTap_zzTop_AAAA1111999Z_30s_Pill_aa-bb',
'Brand': 'zzTop',
'ID': 'AAAA1111999Z',
'URL': 'https://app.url.com/DMM/DL/wew52f'}
If you had a list of paths to XML files (let's say filesList), you could tabulate their outputs with pandas like
# import pandas
# filesList = ['x.xml', ...] ## LIST OF XML FILES
xDF = pandas.DataFrame([getXMLdata(x) for x in filesList])
[ If you wanted to save that output to a csv file, you can use .to_csv like xDF.to_csv('xmldata.csv'). ]

Python - Construct DF From XML API Response

I'm receiving the below XML response from an API call and am looking to iterate through the "Results" and store all of the data points as a pandas dataframe.
I was successfully able to grab my data points of interest by chaining .find() methods shown below, but don't know how to loop through all of the Results block within the body given the structure of the XML response.
I am using Python 3.7+ in Jupyter on Windows.
What I've Tried:
import pandas as pd
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
soup = BeautifulSoup(soap_response.text, "xml")
# print(soup.prettify())
objectid_field = soup.find('Results').find('ObjectID').text
customerkey_field = soup.find('Results').find('CustomerKey').text
name_field = soup.find('Results').find('Name').text
issendable_field = name_field = soup.find('Results').find('IsSendable').text
sendablesubscribe_field = soup.find('Results').find('SendableSubscriberField').text
# for de in soup:
# de_name = soup.find('Results').find('Name').text
# print(de_name)
# test_df = pd.read_xml(soup,
# xpath="//Results",
# namespaces={""})
Sample XML Data Structure:
<?xml version="1.0" encoding="utf-8"?>
<soap:Envelope
xmlns:soap="http://www.w3.org/2003/soap-envelope"
xmlns:xsi="http://www.w3.org/2001/XMLSchema"
xmlns:xsd="http://www.w3.org/XMLSchema"
xmlns:wsa="http://schemas.xmlsoap.org/ws/2004/08/addressing"
xmlns:wsse="http://docs.oasis-open.org/wss/2004/01/oasis-201-wss-wssecurity-secext-1.0.xsd"
xmlns:wsu="http://docs.oasis-open.org/wss/2004/01/oasis-201-wss-security-1.0.xsd">
<env:Header
xmlns:env="http://www.w3.org/2003/05/soap-envelope">
<wsa:Action>RetrieveResponse</wsa:Action>
<wsa:MessageID>urn:uuid:1234</wsa:MessageID>
<wsa:RelatesTo>urn:uuid:1234</wsa:RelatesTo>
<wsa:To>http://schemas.xmlsoap.org/ws/2004/08/dressing/role/anonymous</wsa:To>
<wsse:Security>
<wsu:Timestamp wsu:Id="Timestamp-1234">
<wsu:Created>2021-11-07T13:10:54Z</wsu:Created>
<wsu:Expires>2021-11-07T13:15:54Z</wsu:Expires>
</wsu:Timestamp>
</wsse:Security>
</env:Header>
<soap:Body>
<RetrieveResponseMsg
xmlns="http://partnerAPI">
<OverallStatus>OK</OverallStatus>
<RequestID>f9876</RequestID>
<Results xsi:type="Data">
<PartnerKey xsi:nil="true" />
<ObjectID>Object1</ObjectID>
<CustomerKey>Customer1</CustomerKey>
<Name>Test1</Name>
<IsSendable>true</IsSendable>
<SendableSubscriberField>
<Name>_Something1</Name>
</SendableSubscriberField>
</Results>
<Results xsi:type="Data">
<PartnerKey xsi:nil="true" />
<ObjectID>Object2</ObjectID>
<CustomerKey>Customer2</CustomerKey>
<Name>Name2</Name>
<IsSendable>true</IsSendable>
<SendableSubscriberField>
<Name>_Something2</Name>
</SendableSubscriberField>
</Results>
<Results xsi:type="Data">
<PartnerKey xsi:nil="true" />
<ObjectID>Object3</ObjectID>
<CustomerKey>AnotherKey</CustomerKey>
<Name>Something3</Name>
<IsSendable>false</IsSendable>
</Results>
</RetrieveResponseMsg>
</soap:Body>
</soap:Envelope>'
You're super close, you need to find all of the Results tags, then iterate over them, last grabbing the elements you want:
# Visit every <Results> element and read its fields relative to that element.
for el in soup.find_all('Results'):
    objectid_field = el.find('ObjectID').text
    customerkey_field = el.find('CustomerKey').text
    name_field = el.find('Name').text
    # Assign IsSendable to its own variable only; chaining name_field into
    # this assignment would overwrite the name read on the previous line.
    issendable_field = el.find('IsSendable').text
    sendablesubscribe_field = el.find('SendableSubscriberField').text
However, SendableSubscriberField isn't always there, so you might need to check if sendable is True first:
for el in soup.find_all('Results'):
objectid_field = el.find('ObjectID').text
customerkey_field = el.find('CustomerKey').text
name_field = el.find('Name').text
issendable_field = el.find('IsSendable').text
# skip if not sendable
if issendable_field == 'false':
sendablesubscribe_field = None
continue
sendablesubscribe_field = el.find('SendableSubscriberField').find('Name').text
Edit: Constructing the dataframe
To build the dataframe from this, I'd collect everything into a list of dictionaries:
import pandas as pd
from bs4 import BeautifulSoup
soup = BeautifulSoup(...)
# Collect one dict per <Results> element, then build the DataFrame from them.
data = []
for el in soup.find_all('Results'):
    record = {
        'ObjectID': el.find('ObjectID').text,
        'CustomerKey': el.find('CustomerKey').text,
        'Name': el.find('Name').text,
        'IsSendable': el.find('IsSendable').text,
    }
    # SendableSubscriberField is missing on non-sendable results. Store None
    # for it but still append the record, so every result gets a row.
    sendable = el.find('SendableSubscriberField')
    record['SendableSubscriberField'] = (
        sendable.find('Name').text if sendable is not None else None
    )
    data.append(record)
df = pd.DataFrame(data)
Reconsider use of pandas.read_xml by acknowledging the default namespace (http://partnerAPI). Also, since you need a lower-level value, run read_xml twice and join the results. Notice all attribute and element values are returned even if missing.
soap_df = (
pd.read_xml(
soap_response.text,
xpath = ".//rrm:RetrieveResponseMsg/rrm:Results",
namespaces = {"rrm": "http://partnerAPI"}
).join(
pd.read_xml(
soap_response.text,
xpath = ".//rrm:RetrieveResponseMsg/rrm:Results/rrm:SendableSubscriberField",
namespaces = {"rrm": "http://partnerAPI"},
names = ["SendableSubscriberField_Name", ""]
),
)
)
print(soap_df)
# type PartnerKey ObjectID CustomerKey Name IsSendable SendableSubscriberField SendableSubscriberField_Name
# 0 Data NaN Object1 Customer1 Test1 True NaN _Something1
# 1 Data NaN Object2 Customer2 Name2 True NaN _Something2
# 2 Data NaN Object3 AnotherKey Something3 False NaN NaN

Extracting similar XML attributes with BeautifulSoup

Let's assume I have the following XML:
<time from="2017-07-29T08:00:00" to="2017-07-29T09:00:00">
<!-- Valid from 2017-07-29T08:00:00 to 2017-07-29T09:00:00 -->
<symbol number="4" numberEx="4" name="Cloudy" var="04"/>
<precipitation value="0"/>
<!-- Valid at 2017-07-29T08:00:00 -->
<windDirection deg="300.9" code="WNW" name="West-northwest"/>
<windSpeed mps="1.3" name="Light air"/>
<temperature unit="celsius" value="15"/>
<pressure unit="hPa" value="1002.4"/>
</time>
<time from="2017-07-29T09:00:00" to="2017-07-29T10:00:00">
<!-- Valid from 2017-07-29T09:00:00 to 2017-07-29T10:00:00 -->
<symbol number="4" numberEx="4" name="Partly cloudy" var="04"/>
<precipitation value="0"/>
<!-- Valid at 2017-07-29T09:00:00 -->
<windDirection deg="293.2" code="WNW" name="West-northwest"/>
<windSpeed mps="0.8" name="Light air"/>
<temperature unit="celsius" value="17"/>
<pressure unit="hPa" value="1002.6"/>
</time>
And I want to collect time from, symbol name and temperature value from it, and then print it out in the following manner: time from: symbol name, temperature value -- like this: 2017-07-29, 08:00:00: Cloudy, 15°.
(And there are a few name and value attributes in this XML, as you see.)
As of now, my approach was quite straightforward:
#!/usr/bin/env python
# coding: utf-8
import re
from BeautifulSoup import BeautifulSoup
# data is set to the above XML
soup = BeautifulSoup(data)
# collect the tags of interest into lists. can it be done wiser?
time_l = []
symb_l = []
temp_l = []
for i in soup.findAll('time'):
i_time = str(i.get('from'))
time_l.append(i_time)
for i in soup.findAll('symbol'):
i_symb = str(i.get('name'))
symb_l.append(i_symb)
for i in soup.findAll('temperature'):
i_temp = str(i.get('value'))
temp_l.append(i_temp)
# join the forecast lists to a dict
forc_l = []
for i, j in zip(symb_l, temp_l):
forc_l.append([i, j])
rez = dict(zip(time_l, forc_l))
# combine and format the rezult. can this dict be printed simpler?
wew = ''
for key in sorted(rez):
wew += re.sub("T", ", ", key) + str(rez[key])
wew = re.sub("'", "", wew)
wew = re.sub("\[", ": ", wew)
wew = re.sub("\]", "°\n", wew)
# print the rezult
print wew
But I imagine there must be some better, more intelligent approach? Mostly, I'm interested in collecting the attributes from the XML, my way seems rather dumb to me, actually. Also, is there any simpler way to print out a dict {'a': '[b, c]'} nicely?
Would be grateful for any hints or suggestions.
from bs4 import BeautifulSoup
with open("sample.xml", "r") as f: # opening xml file
content = f.read() # xml content stored in this variable
soup = BeautifulSoup(content, "lxml")
for values in soup.findAll("time"):
print("{} : {}, {}°".format(values["from"], values.find("symbol")["name"], values.find("temperature")["value"]))
Output:
2017-07-29T08:00:00 : Cloudy, 15°
2017-07-29T09:00:00 : Partly cloudy, 17°
One more, also you can fetch xml data by importing xml.dom.minidom module.
Here is the data you want:
from xml.dom.minidom import parse
doc = parse("path/to/xmlfile.xml") # parse an XML file by name
itemlist = doc.getElementsByTagName('time')
for items in itemlist:
from_tag = items.getAttribute('from')
symbol_list = items.getElementsByTagName('symbol')
symbol_name = [d.getAttribute('name') for d in symbol_list ][0]
temperature_list = items.getElementsByTagName('temperature')
temp_value = [d.getAttribute('value') for d in temperature_list ][0]
print ("{} : {}, {}°". format(from_tag, symbol_name, temp_value))
Output will be as follows:
2017-07-29T08:00:00 : Cloudy, 15°
2017-07-29T09:00:00 : Partly cloudy, 17°
Hope it is useful.
Here you can also use an alternate way using builtin module(i'm using python 3.6.2):
import xml.etree.ElementTree as et # this is built-in module in python3
tree = et.parse("sample.xml")
root = tree.getroot()
for temp in root.iter("time"): # iterate time element in xml
print(temp.attrib["from"], end=": ") # prints attribute of time element
for sym in temp.iter("symbol"): # iterate symbol element within time element
print(sym.attrib["name"], end=", ")
for t in temp.iter("temperature"): # iterate temperature element within time element
print(t.attrib["value"], end="°\n")

Python using XML reading multiple inner tags

I am trying to read the following XML code using python.
<Product productCode="2" productCategory="ABC" productClass="SOMETHING" salable="true" statusCode="ACTIVE" outage="false">
<PriceList>
<Currency type ="NATIVE" symbol="US$">
<Pricing priceCode="EATIN" catalogPrice="2.00" netPrice="2.00" tax="0.09" grossPrice="2.09"/>
</Currency>
</PriceList>
</Product>
I need to get the catalog price.
Here is my code. I am not sure how to get catalog price. I guess i do not know how to get this data. Any help will be appreciated.
from xml.dom import minidom
doc = minidom.parse("US_2171_ProductPricing_20170206233707.xml")
# doc.getElementsByTagName returns NodeList
# name = doc.getElementsByTagName("name")[0]
# print(name.firstChild.data)
products = doc.getElementsByTagName("Product")
for product in products:
productCodeID = product.getAttribute("productCode")
statusCode = product.getAttribute("statusCode")
catalogPrice = pricing.getElementsByTagName("catalogPrice")
print("productCode:%s , statusCode:%s, catalogPrice:%s" % (productCodeID, statusCode, catalogPrice))
I guess you already got everything figured out. Just get all the pricing inside every product and loop through them.
# For every product, print the catalogPrice attribute of each nested <Pricing>.
for product in products:
    for pricing in product.getElementsByTagName("Pricing"):
        # print() call form: the bare print statement is a SyntaxError on Python 3.
        print(pricing.getAttribute("catalogPrice"))
You could use the BeautifulSoup library. This should work:
from bs4 import BeautifulSoup
# Read the file inside a context manager so the handle is closed promptly.
with open('US_2171_ProductPricing_20170206233707.xml') as xml_file:
    xml_text = xml_file.read()

# With 'html.parser' the tag and attribute names are looked up in lowercase
# ('product', 'productcode', ...), not in the XML's original casing.
soup = BeautifulSoup(xml_text, 'html.parser')
products = soup.findAll('product')

# One (productCode, statusCode, catalogPrice) tuple per product, taking the
# first <Pricing> element found under it.
data = [(p['productcode'],
         p['statuscode'],
         p.findAll('pricing')[0]['catalogprice'])
        for p in products]
for items in data:
    print(', '.join(items))

Retrieve all content between a closing and opening html tag using Beautiful Soup

I am parsing content using Python and Beautiful Soup, then writing it to a CSV file, and have run into a bugger of a problem getting a certain set of data. The data is run through an implementation of TidyHTML that I have crafted, and then other unneeded data is stripped out.
The issue is that I need to retrieve all data between a set of <h3> tags.
Sample Data:
<h3>Pages 1-18</h3>
<ul><li>September 13 1880. First regular meeting of the faculty;
September 14 1880. Discussion of curricular matters. Students are
debarred from taking algebra until they have completed both mental
and fractional arithmetic; October 4 1880.</li><li>All members present.</li></ul>
<ul><li>Moved the faculty henceforth hold regular weekkly meetings in the
President's room of the University building; 11 October 1880. All
members present; 18 October 1880. Regular meeting 2. Moved that the
President wait on the property holders on 12th street and request
them to abate the nuisance on their property; 25 October 1880.
Moved that the senior and junior classes for rhetoricals be...</li></ul>
<h3>Pages 19-33</h3>`
I need to retrieve all of the content between the first closing </h3> tag and the next opening <h3> tag. This shouldn't be hard, but my thick head isn't making the necessary connections. I can grab all of the <ul> tags but that doesn't work because there is not a one to one relationship between <h3> tags and <ul> tags.
The output I am looking to achieve is:
Pages 1-18|Vol-1-pages-001.pdf|content between </h3> and <h3> tags.
The first two parts have not been a problem but content between a set of tags is difficult for me.
My current code is as follows:
import glob, re, os, csv
from BeautifulSoup import BeautifulSoup
from tidylib import tidy_document
from collections import deque
html_path = 'Z:\\Applications\\MAMP\\htdocs\\uoassembly\\AssemblyRecordsVol1'
csv_path = 'Z:\\Applications\\MAMP\\htdocs\\uoassembly\\AssemblyRecordsVol1\\archiveVol1.csv'
html_cleanup = {'\r\r\n':'', '\n\n':'', '\n':'', '\r':'', '\r\r': '', '<img src="UOSymbol1.jpg" alt="" />':''}
for infile in glob.glob( os.path.join(html_path, '*.html') ):
print "current file is: " + infile
html = open(infile).read()
for i, j in html_cleanup.iteritems():
html = html.replace(i, j)
#parse cleaned up html with Beautiful Soup
soup = BeautifulSoup(html)
#print soup
html_to_csv = csv.writer(open(csv_path, 'a'), delimiter='|',
quoting=csv.QUOTE_NONE, escapechar=' ')
#retrieve the string that has the page range and file name
volume = deque()
fileName = deque()
summary = deque()
i = 0
for title in soup.findAll('a'):
if title['href'].startswith('V'):
#print title.string
volume.append(title.string)
i+=1
#print soup('a')[i]['href']
fileName.append(soup('a')[i]['href'])
#print html_to_csv
#html_to_csv.writerow([volume, fileName])
#retrieve the summary of each archive and store
#for body in soup.findAll('ul') or soup.findAll('ol'):
# summary.append(body)
for body in soup.findAll('h3'):
body.findNextSibling(text=True)
summary.append(body)
#print out each field into the csv file
for c in range(i):
pages = volume.popleft()
path = fileName.popleft()
notes = summary
if not summary:
notes = "help"
if summary:
notes = summary.popleft()
html_to_csv.writerow([pages, path, notes])
Extract content between </h3> and <h3> tags:
from itertools import takewhile
h3s = soup('h3') # find all <h3> elements
for h3, h3next in zip(h3s, h3s[1:]):
# get elements in between
between_it = takewhile(lambda el: el is not h3next, h3.nextSiblingGenerator())
# extract text
print(''.join(getattr(el, 'text', el) for el in between_it))
The code assumes that all <h3> elements are siblings. If it is not the case then you could use h3.nextGenerator() instead of h3.nextSiblingGenerator().
If you want to extract the data between <ul><li>…</li></ul> tags with lxml, it provides great functionality for this through CSSSelector:
import lxml.html
import urllib.request

# The file contains your HTML snippet.
# (`#` starts a Python comment; `//` is a syntax error.)
data = urllib.request.urlopen('file:///C:/Users/ranveer/st.html').read()
doc = lxml.html.fromstring(data)
elements = doc.cssselect('ul li')  # CSS path (found using the Firebug extension)
for element in elements:
    print(element.text_content())
after executing the above code you will get all text between the ul,li tags. It is much cleaner than beautiful soup.
If you by any chance plan to use lxml, then you can evaluate XPath expressions in the following way:
import urllib.request

import lxml
from lxml import etree

content = etree.HTML(urllib.request.urlopen("file:///C:/Users/ranveer/st.html").read())
# Attributes are selected with `@href` in XPath (`#href` is not valid XPath).
# The first branch is made absolute because `content` is already the <html>
# element, so a relative `html/body/...` path would match nothing.
content_text = content.xpath("/html/body/h3[1]/a/@href | //ul[1]/li/text() | //ul[2]/li/text() | //h3[2]/a/@href")
print(content_text)
You can change XPath according to your need.

Categories

Resources