I'm reading XML METAR (Weather) Data using Python. I can read the data, and have also added error checking (only for visibility_statute_mi below!). Here is an example of the XML data:
<METAR>
<raw_text>
FALE 201800Z VRB01KT 9999 FEW016 BKN028 23/22 Q1010 NOSIG
</raw_text>
<station_id>FALE</station_id>
<observation_time>2013-01-20T18:00:00Z</observation_time>
<temp_c>23.0</temp_c>
<dewpoint_c>22.0</dewpoint_c>
<wind_dir_degrees>0</wind_dir_degrees>
<wind_speed_kt>1</wind_speed_kt>
<altim_in_hg>29.822834</altim_in_hg>
<quality_control_flags>
<no_signal>TRUE</no_signal>
</quality_control_flags>
<sky_condition sky_cover="FEW" cloud_base_ft_agl="1600"/>
<sky_condition sky_cover="BKN" cloud_base_ft_agl="2800"/>
<flight_category>MVFR</flight_category>
<metar_type>METAR</metar_type>
</METAR>
Here is my Python 2.7 code to parse the data:
# Output the XML in an HTML-friendly manner
def outputHTML(xml):
    # Get the METAR data list
    metar_data = xml.getElementsByTagName("data")
    # Our return string
    outputString = ""
    # Cycle through the metar_data
    for state in metar_data:
        # Get the stations and cycle through them
        stations = state.getElementsByTagName("METAR")
        for station in stations:
            # Grab data from the station element
            raw_text = station.getElementsByTagName("raw_text")[0].firstChild.data
            station_id = station.getElementsByTagName("station_id")[0].firstChild.data
            observation_time = station.getElementsByTagName('observation_time')[0].firstChild.data
            temp_c = station.getElementsByTagName('temp_c')[0].firstChild.data
            dewpoint_c = station.getElementsByTagName('dewpoint_c')[0].firstChild.data
            wind_dir_degrees = station.getElementsByTagName('wind_dir_degrees')[0].firstChild.data
            wind_speed_kt = station.getElementsByTagName('wind_speed_kt')[0].firstChild.data
            visibility_statute_mi = station.getElementsByTagName('visibility_statute_mi')
            if len(visibility_statute_mi) > 0:
                visibility_statute_mi = visibility_statute_mi[0].firstChild.data
            altim_in_hg = station.getElementsByTagName('altim_in_hg')[0].firstChild.data
            metar_type = station.getElementsByTagName('metar_type')[0].firstChild.data
            # Append the data onto the string
            string = "<tr><td>" + str(station_id) + "</td><td>" + str(observation_time) + "</td><td>" + str(raw_text) + "</td><td>" + str(temp_c) + "</td><td>" + str(dewpoint_c) + "</td></tr>"
            outputString += string
    # Output string
    return outputString
How do I read the sky_condition data and loop to get the sky_cover and cloud_base_ft_agl values?
I'll also need to check if there are any sky-condition values, because quite often there is no cloud cover and then no data.
Andre
I would parse the xml into a tree and query it, e.g. like this:
import xml.etree.ElementTree as et
xmltext = """
<METAR>
<raw_text>
FALE 201800Z VRB01KT 9999 FEW016 BKN028 23/22 Q1010 NOSIG
</raw_text>
<station_id>FALE</station_id>
<observation_time>2013-01-20T18:00:00Z</observation_time>
<temp_c>23.0</temp_c>
<dewpoint_c>22.0</dewpoint_c>
<wind_dir_degrees>0</wind_dir_degrees>
<wind_speed_kt>1</wind_speed_kt>
<altim_in_hg>29.822834</altim_in_hg>
<quality_control_flags>
<no_signal>TRUE</no_signal>
</quality_control_flags>
<sky_condition sky_cover="FEW" cloud_base_ft_agl="1600"/>
<sky_condition sky_cover="BKN" cloud_base_ft_agl="2800"/>
<flight_category>MVFR</flight_category>
<metar_type>METAR</metar_type>
</METAR>
"""
tree = et.fromstring(xmltext)
for sky_con in tree.iterfind('sky_condition'):
    print sky_con.attrib["cloud_base_ft_agl"]
    print sky_con.attrib.keys()
by reading the keys() you can check the presence of the attribute you're interested in.
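Since attrib is a plain dict, .get() with a default is another way to guard against a missing attribute - a minimal sketch (the fallback values are just an illustration):

for sky_con in tree.iterfind('sky_condition'):
    cover = sky_con.attrib.get("sky_cover", "n/a")    # default if attribute is absent
    base = sky_con.attrib.get("cloud_base_ft_agl")    # None if attribute is absent
    if base is not None:
        print cover, base
    else:
        print cover, "(no cloud base reported)"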
edit: if you want to use xml.dom.minidom you can add these lines to your stations-loop to extract the same attributes:
for sky_con in station.getElementsByTagName("sky_condition"):
    # getAttribute() is the public minidom API (._attrs is an internal detail)
    print sky_con.getAttribute("cloud_base_ft_agl")
    print sky_con.getAttribute("sky_cover")
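If you want this inside your outputHTML loop, here is a hedged sketch; the extra table cell and the "No data" fallback are my assumptions about the layout you want:

# inside the stations-loop, before building the row string
sky_parts = []
for sky_con in station.getElementsByTagName("sky_condition"):
    cover = sky_con.getAttribute("sky_cover")
    base = sky_con.getAttribute("cloud_base_ft_agl")   # "" when the attribute is absent
    sky_parts.append(cover + " " + base if base else cover)
# an empty list means no sky_condition elements were present at all
sky_condition = ", ".join(sky_parts) if sky_parts else "No data"
# then add "<td>" + sky_condition + "</td>" to your row string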
Hi everyone, this is my first time here, and I am a beginner in Python. I am in the middle of writing a program that returns a txt document containing information about a stock (Watchlist Info.txt), based on the input of another txt document containing the company names (Watchlist).
To achieve this, I have written three functions, of which two, reuters_ticker() and stock_price(), are complete, as shown below:
import re  # needed for re.search below

def reuters_ticker(desired_search):
    # from the company name, run a Google search and return the Reuters stock ticker
    try:
        from googlesearch import search
    except ImportError:
        print('No module named google found')
    query = desired_search + ' reuters'
    for j in search(query, tld="com.sg", num=1, stop=1, pause=2):
        result = j
    ticker = re.search(r'\w+\.\w+$', result)
    return ticker.group()
Stock Price:
def stock_price(company, doc=None):
    ticker = reuters_ticker(company)
    request = 'https://www.reuters.com/companies/' + ticker
    raw_main = pd.read_html(request)
    data1 = raw_main[0]
    data1.set_index(0, inplace=True)
    data1 = data1.transpose()
    data2 = raw_main[1]
    data2.set_index(0, inplace=True)
    data2 = data2.transpose()
    stock_info = pd.concat([data1, data2], axis=1)
    if doc == None:
        print(company + '\n')
        print('Previous Close: ' + str(stock_info['Previous Close'][1]))
        print('Forward PE: ' + str(stock_info['Forward P/E'][1]))
        print('Div Yield(%): ' + str(stock_info['Dividend (Yield %)'][1]))
    else:
        from datetime import date
        with open(doc, 'a') as output:
            output.write(date.today().strftime('%d/%m/%y') + '\t' + str(stock_info['Previous Close'][1]) + '\t' + str(stock_info['Forward P/E'][1]) + '\t' + '\t' + str(stock_info['Dividend (Yield %)'][1]) + '\n')
            output.close()
The third function, watchlist_report(), is where I am having problems writing the information in the desired format.
def watchlist_report(watchlist):
    with open(watchlist, 'r') as companies, open('Watchlist Info.txt', 'a') as output:
        searches = companies.read()
        x = searches.split('\n')
        for i in x:
            output.write(i + ':\n')
            stock_price(i, doc='Watchlist Info.txt')
            output.write('\n')
When I run watchlist_report('Watchlist.txt'), where Watchlist.txt contains 'Apple' and 'Facebook' each on new lines, my output is this:
26/04/20 275.03 22.26 1.12
26/04/20 185.13 24.72 --
Apple:
Facebook:
Instead of what I want and would expect based on the code I have written in watchlist_report():
Apple:
26/04/20 275.03 22.26 1.12
Facebook:
26/04/20 185.13 24.72 --
Therefore, my questions are:
1) Why is my output formatted this way?
2) Which part of my code do I have to change to make the written output in my desired format?
Any other suggestions about how I can clean my code and any libraries I can use to make my code nicer are also appreciated!
You are handling two different file handles on the same file - the handle opened inside stock_price gets closed (and therefore flushed) earlier, so its content is written first, before the outer function's file handle gets closed, flushed and written.
Instead of creating a new open(..) in your function, pass the current file handle:
def watchlist_report(watchlist):
    with open(watchlist, 'r') as companies, open('Watchlist Info.txt', 'a') as output:
        searches = companies.read()
        x = searches.split('\n')
        for i in x:
            output.write(i + ':\n')
            stock_price(i, doc=output)  # pass the file handle
            output.write('\n')
Inside def stock_price(company, doc=None): use the provided filehandle:
def stock_price(company, output=None):  # changed name here
    # [snip] - removed code unrelated to this answer for brevity's sake
    if output is None:  # check for None using IS
        print( ... )  # print whatever you like here
    else:
        from datetime import date
        output.write( .... )  # write whatever you want it to write
        # output.close()  # do not close, the outer function does this
Do not close the file handle in the inner function; the context handling with(..) of the outer function does that for you.
The main takeaway for file handling is that things you write(..) to your file are not necessarily placed there immediately. The file handle chooses when to actually persist data to your disk; at the latest it does that when it goes out of scope (of the context handler) or when its internal buffer reaches some threshold and it decides it is now prudent to write the data to your disk. See How often does python flush to a file? for more info.
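A tiny demonstration of that buffering effect (the file name is arbitrary):

# two handles on the same file: content lands in flush order, not write order
a = open('demo.txt', 'a')
a.write('first\n')              # sits in a's buffer for now
with open('demo.txt', 'a') as b:
    b.write('second\n')         # flushed when the with-block closes b
a.close()                       # only now is 'first' flushed - it lands after 'second'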
I've got a script here which (ideally) iterates through multiple pages X of JSON data for each entity Y (in this case, multiple loans X for each team Y). The way that the api is constructed, I believe I must physically change a subdirectory within the URL in order to iterate through multiple entities. Here is the explicit documentation and URL:
GET /teams/:id/loans
Returns loans belonging to a particular team.
Example http://api.kivaws.org/v1/teams/2/loans.json
Parameters:
id (number) - Required. The team ID for which to return loans.
page (number) - The page position of results to return. Default: 1
sort_by (string) - The order by which to sort results. One of: oldest, newest. Default: newest
app_id (string) - The application id in reverse DNS notation.
ids_only (string) - Return IDs only to make the return object smaller. One of: true, false. Default: false
Response: loan_listing – HTML, JSON, XML, RSS
Status: Production
And here is my script, which does run and appears to extract the correct data, but doesn't seem to write any data to the outfile:
# -*- coding: utf-8 -*-
import urllib.request as urllib
import json
import time

# dict storing team loans: the key is the team id, the value is the list of loans
team_loans = {}
url = "http://api.kivaws.org/v1/teams/"

# team ids range 1 - 11885
for i in range(1, 100):
    params = dict(id=i)
    try:
        handle = urllib.urlopen(str(url + str(i) + "/loans.json"))
        print(handle)
    except:
        print("Could not handle url")
        continue
    # reading response
    item_html = handle.read().decode('utf-8')
    # converting bytes to str
    data = str(item_html)
    # converting to json
    data = json.loads(data)
    # getting number of pages to crawl
    numPages = data['paging']['pages']
    # deleting paging data
    data.pop('paging')
    # calling additional pages
    if numPages > 1:
        for pa in range(2, numPages + 1, 1):
            handle = urllib.urlopen(str(url + str(i) + "/loans.json?page=" + str(pa)))
            print("Pulling loan data from team " + str(i) + "...")
            # reading response
            item_html = handle.read().decode('utf-8')
            # converting bytes to str
            datatemp = str(item_html)
            # converting to json
            datatemp = json.loads(datatemp)
            # the paging data is a redundant header here
            datatemp.pop('paging')
            # adding data to the initial list
            for loan in datatemp['loans']:
                data['loans'].append(loan)
            time.sleep(2)
    # recording loans by team in dict
    team_loans[i] = data['loans']
    if (data['loans']):
        print("===Data added to the team_loan dictionary===")
    else:
        print("!!!FAILURE to add data to team_loan dictionary!!!")
    # recording data to file when 10 teams are read
    print("===Finished pulling from page " + str(i) + "===")
    if (int(i) % 10 == 0):
        outfile = open("team_loan.json", "w")
        print("===Now writing data to outfile===")
        json.dump(team_loans, outfile, sort_keys=True, indent=2, ensure_ascii=True)
        outfile.close()
    else:
        print("!!!FAILURE to write data to outfile!!!")
    # compliance with API request limits
    time.sleep(2)

print('Done! Check your outfile (team_loan.json)')
I know that may be a heady amount of code to throw in your faces, but it's a pretty sequential process.
Again, this program is pulling the correct data, but it is not writing this data to the outfile. Can anyone understand why?
For others who may read this post: the script does in fact write data to the outfile. It was simply the test logic around my print statements that was wrong, so ignore the FAILURE messages I had put in place.
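For reference, a minimal fix of that branch (names unchanged from the script above) - the else fires on every team id that is not a multiple of 10, which is expected behaviour, not a failure:

if int(i) % 10 == 0:
    # only every 10th team triggers a write; the other iterations are fine too
    print("===Now writing data to outfile===")
    with open("team_loan.json", "w") as outfile:
        json.dump(team_loans, outfile, sort_keys=True, indent=2, ensure_ascii=True)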
I'm new here to StackOverflow, but I have found a LOT of answers on this site. I'm also a programming newbie, so I figured I'd join and finally become part of this community - starting with a question about a problem that's been plaguing me for hours.
I log in to a website and scrape a big body of text within the b tag, to be converted into a proper table. The layout of the resulting Output.txt looks like this:
BIN STATUS
8FHA9D8H 82HG9F RECEIVED SUCCESSFULLY AWAITING STOCKING PROCESS
INVENTORY CODE: FPBC *SOUP CANS LENTILS
BIN STATUS
HA8DHW2H HD0138 RECEIVED SUCCESSFULLY AWAITING STOCKING PROCESS
8SHDNADU 00A123 #2956- INVALID STOCK COUPON CODE (MISSING).
93827548 096DBR RECEIVED SUCCESSFULLY AWAITING STOCKING PROCESS
There are a bunch of pages with the exact same blocks, but I need them to be combined into an ACTUAL table that looks like this:
BIN INV CODE STATUS
HA8DHW2HHD0138 FPBC-*SOUP CANS LENTILS RECEIVED SUCCESSFULLY AWAITING STOCKING PROCESS
8SHDNADU00A123 FPBC-*SOUP CANS LENTILS #2956- INVALID STOCK COUPON CODE (MISSING).
93827548096DBR FPBC-*SOUP CANS LENTILS RECEIVED SUCCESSFULLY AWAITING STOCKING PROCESS
8FHA9D8H82HG9F SSXR-98-20LM NM CORN CREAM RECEIVED SUCCESSFULLY AWAITING STOCKING PROCESS
Essentially, all separate text blocks in this example would become part of this table, with the inv code repeating alongside its bin values. I would post my attempts at parsing this data (I have tried Pandas/bs/openpyxl/csv writer), but I'll admit they are a little embarrassing, as I cannot find any information on this specific problem. Is there any benevolent soul out there who can help me out? :)
(Also, I am using Python 2.7.)
A simple custom parser like the following should do the trick.
from __future__ import print_function

def parse_body(s):
    line_sep = '\n'
    getting_bins = False
    inv_code = ''
    for l in s.split(line_sep):
        if l.startswith('INVENTORY CODE:') and not getting_bins:
            inv_data = l.split()
            inv_code = inv_data[2] + '-' + ' '.join(inv_data[3:])
        elif l.startswith('INVENTORY CODE:') and getting_bins:
            print("unexpected inventory code while reading bins:", l)
        elif l.startswith('BIN') and l.endswith('STATUS'):
            # a header line ("BIN ... STATUS") starts a block of bin rows
            getting_bins = True
        elif getting_bins and l:
            bin_data = l.split()
            # need to add exception handling here to make sure:
            # 1) we have an inv_code
            # 2) bin_data is at least 3 items big (assuming two for
            #    bin_id and at least one for the status message)
            # 3) maybe some constraint checking to ensure that we have
            #    a valid instance of an inventory code and bin id
            bin_id = ''.join(bin_data[0:2])
            message = ' '.join(bin_data[2:])
            # we now have a bin, an inv_code, and a message to add to our table
            print(bin_id.ljust(20), inv_code.ljust(30), message, sep='\t')
        elif getting_bins and not l:
            # an empty line ends the bins for the current inventory code
            getting_bins = False
            inv_code = ''
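A quick usage sketch, assuming the inventory code precedes its bin block and that blocks are separated by blank lines (in your sample the first code appears after its bins, so you may need to buffer rows until the code arrives):

sample = """INVENTORY CODE: FPBC *SOUP CANS LENTILS
BIN STATUS
HA8DHW2H HD0138 RECEIVED SUCCESSFULLY AWAITING STOCKING PROCESS
8SHDNADU 00A123 #2956- INVALID STOCK COUPON CODE (MISSING).

"""
parse_body(sample)
# prints roughly:
# HA8DHW2HHD0138    FPBC-*SOUP CANS LENTILS    RECEIVED SUCCESSFULLY AWAITING STOCKING PROCESS
# 8SHDNADU00A123    FPBC-*SOUP CANS LENTILS    #2956- INVALID STOCK COUPON CODE (MISSING).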
A rather complex one, but this might get you started:
import re
import pandas as pd
from pandas import DataFrame

rx = re.compile(r'''
    (?:INVENTORY\ CODE:)\s*
    (?P<inv>.+\S)
    [\s\S]+?
    ^BIN.+[\n\r]
    (?P<bin_msg>(?:(?!^\ ).+[\n\r])+)
    ''', re.MULTILINE | re.VERBOSE)

# splits each captured bin/message line into its two columns
rxbinmsg = re.compile(r'^(?P<bin>(?:(?!\ {2}).)+)\s+(?P<message>.+\S)\s*$', re.MULTILINE)

string = your_string_here

# set up the dataframe
df = DataFrame(columns=['BIN', 'INV', 'MESSAGE'])

for match in rx.finditer(string):
    inv = match.group('inv')
    bin_msg_raw = match.group('bin_msg').split("\n")
    for item in bin_msg_raw:
        for m in rxbinmsg.finditer(item):
            # append the row to the dataframe
            df.loc[len(df.index)] = [m.group('bin'), inv, m.group('message')]

print(df)
Explanation
It looks for INVENTORY CODE and captures the two groups (inv and bin_msg) for further processing (note: this would be easier if you had only one bin/message line each, since the bin_msg group has to be split afterwards).
Afterwards, it splits each captured line into its bin and message parts and appends everything to the df object.
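To try it out, string could be the sample text from the question, e.g. (columns separated by at least two spaces, which is what the bin regex keys on):

string = """INVENTORY CODE: FPBC *SOUP CANS LENTILS
BIN                 STATUS
HA8DHW2H HD0138     RECEIVED SUCCESSFULLY AWAITING STOCKING PROCESS
8SHDNADU 00A123     #2956- INVALID STOCK COUPON CODE (MISSING).
"""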
I have some code written for website scraping that may help you.
Basically, what you need to do is right-click on the web page, inspect the HTML, find the tag for the table you are looking for, and extract the information using a module (I am using BeautifulSoup). I am creating a JSON document because I need to store it in MongoDB; you can build a table instead.
#! /usr/bin/python
import sys
import requests
import re
from BeautifulSoup import BeautifulSoup
import pymongo

def req_and_parsing():
    url2 = 'http://businfo.dimts.in/businfo/Bus_info/EtaByRoute.aspx?ID='
    list1 = ['534UP', '534DOWN']
    outdict = [parsing_file(requests.get(url2 + Route).text, Route) for Route in list1]
    print outdict
    conn = f_connection()
    for i in range(len(outdict)):
        insert_records(conn, outdict[i])

def parsing_file(txt, Route):
    soup = BeautifulSoup(txt)
    table = soup.findAll("table", {"id": "ctl00_ContentPlaceHolder1_GridView2"})
    trtddict = {}
    divtags = soup.findAll("span", {"id": "ctl00_ContentPlaceHolder1_ErrorLabel"})
    for divtag in divtags:
        print "div tag - ", divtag.text
        # compare against each message explicitly; a bare `or "..."` is always truthy
        if divtag.text in ("Currently no bus is running on this route",
                           "This is not a cluster (orange bus) route"):
            print "Page not displayed. Errored with below message for Route-", Route, ",", divtag.text
            sys.exit()
    trtags = table[0].findAll('tr')
    for trtag in trtags:
        tdtags = trtag.findAll('td')
        if len(tdtags) == 2:
            trtddict[tdtags[0].text] = sub_colon(tdtags[1].text)
    return trtddict

def sub_colon(tag_str):
    return re.sub(';', ',', tag_str)

def f_connection():
    try:
        conn = pymongo.MongoClient()
        print "Connected successfully!!!"
    except pymongo.errors.ConnectionFailure, e:
        print "Could not connect to MongoDB: %s" % e
    return conn

def insert_records(conn, stop_dict):
    db = conn.test
    print db.collection_names()
    mycoll = db.stopsETA
    mycoll.insert(stop_dict)

if __name__ == "__main__":
    req_and_parsing()
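One caveat worth noting: from BeautifulSoup import BeautifulSoup is the legacy BeautifulSoup 3 package (Python 2 only). If you are on the current bs4 package, the import and constructor change slightly - a sketch:

from bs4 import BeautifulSoup               # bs4 replacement for the legacy import
soup = BeautifulSoup(txt, "html.parser")    # naming the parser explicitly avoids a warning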
I'm having trouble getting some values out of an XML file. The error is IndexError: list index out of range.
XML
<?xml version="1.0" encoding="UTF-8"?>
<nfeProc xmlns="http://www.portalfiscal.inf.br/nfe" versao="3.10">
<NFe xmlns="http://www.portalfiscal.inf.br/nfe">
<infNFe Id="NFe35151150306471000109550010004791831003689145" versao="3.10">
<ide>
<nNF>479183</nNF>
</ide>
<emit>
<CNPJ>3213213212323</CNPJ>
</emit>
<det nItem="1">
<prod>
<cProd>7030-314</cProd>
</prod>
<imposto>
<ICMS>
<ICMS10>
<orig>1</orig>
<CST>10</CST>
<vICMS>10.35</vICMS>
<vICMSST>88.79</vICMSST>
</ICMS10>
</ICMS>
</imposto>
</det>
<det nItem="2">
<prod>
<cProd>7050-6</cProd>
</prod>
<imposto>
<ICMS>
<ICMS00>
<orig>1</orig>
<CST>00</CST>
<vICMS>7.49</vICMS>
</ICMS00>
</ICMS>
</imposto>
</det>
</infNFe>
</NFe>
</nfeProc>
I'm getting the values from the XML, and it works for documents that have both the vICMS and vICMSST tags:
vicms = doc.getElementsByTagName('vICMS')[i].firstChild.nodeValue
vicmsst = doc.getElementsByTagName('vICMSST')[1].firstChild.nodeValue
For the first imposto this returns:
print vicms
>> 10.35
print vicmsst
>> 88.79
The second imposto CRASHES because it doesn't find a vICMSST tag...
IndexError: list index out of range
What's the best way to test for this? I'm using xml.dom.minidom:
My code:
import os
import sys
import subprocess
import base64, xml.dom.minidom
from xml.dom.minidom import Node
import glob
import xml.etree.ElementTree as ET

origem = 0

# only loops over XML documents in folder
for file in glob.glob("*.xml"):
    f = open("%s" % file, 'r')
    data = f.read()
    i = 0
    doc = xml.dom.minidom.parseString(data)
    for topic in doc.getElementsByTagName('emit'):
        # Get Fiscal Number
        nnf = doc.getElementsByTagName('nNF')[i].firstChild.nodeValue
        print 'Fiscal Number %s' % nnf
        print '\n'
        for prod in doc.getElementsByTagName('det'):
            vicms = 0
            vicmsst = 0
            # Get value of ICMS
            vicms = doc.getElementsByTagName('vICMS')[i].firstChild.nodeValue
            # Get value of VICMSST
            vicmsst = doc.getElementsByTagName('vICMSST')[i].firstChild.nodeValue
            # PRINT INFO
            print 'ICMS %s' % vicms
            print 'Valor do ICMSST: %s' % vicmsst
            print '\n\n'
            i += 1
    print '\n\n'
There is only one vICMSST tag in your XML document. So, when i=1, the following line raises an IndexError.
vicmsst = doc.getElementsByTagName('vICMSST')[1].firstChild.nodeValue
You can restructure this to:
try:
    vicmsst = doc.getElementsByTagName('vICMSST')[i].firstChild.nodeValue
except IndexError:
    vicmsst = None  # set a default value or deal with this however you like
It's hard to say what you should do upon an exception without knowing more about what you're trying to do.
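A hedged alternative (the helper name and the '0' default are mine, not from your code): scope the lookup to each det element rather than indexing the whole document, so a missing tag in one item cannot shift the others:

def get_tag_value(parent, tag, default='0'):
    # search only below `parent`, so each <det> stays self-contained
    nodes = parent.getElementsByTagName(tag)
    if nodes and nodes[0].firstChild:
        return nodes[0].firstChild.nodeValue
    return default

for prod in doc.getElementsByTagName('det'):
    vicms = get_tag_value(prod, 'vICMS')
    vicmsst = get_tag_value(prod, 'vICMSST')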
You are making several general mistakes in your code.
Don't use counters to index into lists you don't know the length of. Normally, iteration with for .. in is a lot better than using indexes anyway.
You have many imports you don't seem to use, get rid of them.
You can use minidom, but ElementTree is better for your task because it supports searching for nodes with XPath and it supports XML namespaces.
Don't read an XML file as a string and then use parseString. Let the XML parser handle the file directly. This way all file encoding related issues will be handled without errors.
The following is a lot better than your original approach.
import glob
import xml.etree.ElementTree as ET

def get_text(context_elem, xpath, xmlns=None):
    """ helper function that gets the text value of a node """
    node = context_elem.find(xpath, xmlns)
    if node is not None:
        return node.text
    else:
        return ""

# set up XML namespace URIs
xmlns = {
    "nfe": "http://www.portalfiscal.inf.br/nfe"
}

for path in glob.glob("*.xml"):
    doc = ET.parse(path)
    for infNFe in doc.iterfind('.//nfe:infNFe', xmlns):
        print 'Fiscal Number\t%s' % get_text(infNFe, ".//nfe:nNF", xmlns)
        for det in infNFe.iterfind(".//nfe:det", xmlns):
            print ' ICMS\t%s' % get_text(det, ".//nfe:vICMS", xmlns)
            print ' Valor do ICMSST:\t%s' % get_text(det, ".//nfe:vICMSST", xmlns)
        print '\n\n'
As you can see from the XML here, there are multiple <item> nodes with a set of children such as <summary>, <status> and <key>.
The problem I've encountered is that with minidom it's possible to get the values of the firstChild and lastChild, but not necessarily any of the values in between.
I've created the code below, which doesn't work, but which I think is a close approximation of what I need to be doing:
import xml.dom.minidom

xml = xml.dom.minidom.parse(result)  # or xml.dom.minidom.parseString(xml_string)
itemList = xml.getElementsByTagName('item')

for item in itemList[1:]:
    summaryList = item.getElementsByTagName('summary')
    statusList = item.getElementsByTagName('status')
    keyList = item.getElementsByTagName('key')
    lineText = (summaryList[0].nodeValue + " " + statusList[0].nodeValue + " " + keyList[0].nodeValue)
    p = Paragraph(lineText, style)
    Story.append(p)
Define a get_text() function that joins all of the text child nodes (see this answer):

def get_text(node_list):
    # node_list is a NodeList; join the text children of its first element
    return " ".join(t.nodeValue for t in node_list[0].childNodes
                    if t.nodeType == t.TEXT_NODE)
dom = xml.dom.minidom.parseString(data)
itemList = dom.getElementsByTagName('item')

for item in itemList[1:]:
    summaryList = item.getElementsByTagName('summary')
    statusList = item.getElementsByTagName('status')
    keyList = item.getElementsByTagName('key')
    print get_text(summaryList)
    print get_text(statusList)
    print get_text(keyList)
    print "----"
prints:
Unapprove all pull request reviewers after major change
Needs Triage
STASH-4473
----
Allow using left/right arrow to move side by side diff left/right
Needs Triage
STASH-4478
----
Hope that helps.
How about something like:

for item in itemList:
    # nodeValue is None for element nodes, so pull each child element's text instead
    lineText = ' '.join(child.firstChild.nodeValue for child in item.childNodes
                        if child.nodeType == child.ELEMENT_NODE and child.firstChild)
    p = Paragraph(lineText, style)
    Story.append(p)