Xml parsing from web response - python

I'm trying to get response from nominatim to geo-code few thousands of cities.
import os
import requests
import xml.etree.ElementTree as ET
txt = open('input.txt', 'r').readlines()
for line in txt:
lp, region, district, municipality, city = line.split('\t')
baseUrl = 'http://nominatim.openstreetmap.org/search/gb/'+region+'/'+district+'/'+municipality+'/'+city+'/?format=xml'
# eg. http://nominatim.openstreetmap.org/search/pl/podkarpackie/stalowowolski/Bojan%C3%B3w/Zapu%C5%9Bcie/?format=xml
resp = requests.get(baseUrl)
resp.encoding = 'UTF-8' # special diacritics
msg = resp.text
# parse response to get lat & long
tree = ET.parse(msg)
root = tree.getroot()
print tree
but the result is:
Traceback (most recent call last):
File "geo_miasta.py", line 17, in <module>
tree = ET.parse(msg)
File "/usr/lib/python2.7/xml/etree/ElementTree.py", line 1182, in parse
tree.parse(source, parser)
File "/usr/lib/python2.7/xml/etree/ElementTree.py", line 647, in parse
source = open(source, "rb")
IOError: [Errno 2] No such file or directory: u'<?xml version="1.0" encoding="UTF-8" ?>\n<searchresults timestamp=\'Tue, 11 Feb 14 21:13:50 +0000\' attribution=\'Data \xa9 OpenStreetMap contributors, ODbL 1.0. http://www.openstreetmap.org/copyright\' querystring=\'\u015awierczyna, Drzewica, opoczy\u0144ski, \u0142\xf3dzkie, gb\' polygon=\'false\' more_url=\'http://nominatim.openstreetmap.org/search?format=xml&exclude_place_ids=&q=%C5%9Awierczyna%2C+Drzewica%2C+opoczy%C5%84ski%2C+%C5%82%C3%B3dzkie%2C+gb\'>\n</searchresults>'
What is wrong with this?
Edit:
Thant to #rob my solution is:
#! /usr/bin/env python2.7
# -*- coding: utf-8 -*-
import os
import requests
import xml.etree.ElementTree as ET
txt = open('input.txt', 'r').read().split('\n')
for line in txt:
lp, region, district, municipality, city = line.split('\t')
baseUrl = 'http://nominatim.openstreetmap.org/search/pl/'+region+'/'+district+'/'+municipality+'/'+city+'/?format=xml'
resp = requests.get(baseUrl)
msg = resp.content
tree = ET.fromstring(msg)
for place in tree.findall('place'):
location = '{:5f}\t{:5f}'.format(
float(place.get('lat')),
float(place.get('lon')))
f = open('result.txt', 'a')
f.write(location+'\t'+region+'\t'+district+'\t'+municipality+'\t'+city)
f.close()

You are using xml.etree.ElementTree.parse(), which takes a filename or a file object as an argument. But, you are not passing a file or file object in, you are passing a unicode string.
Try xml.etree.ElementTree.fromstring(text).
Like this:
tree = ET.fromstring(msg)
Here is a complete sample program:
import os
import requests
import xml.etree.ElementTree as ET
baseUrl = 'http://nominatim.openstreetmap.org/search/pl/podkarpackie/stalowowolski/Bojan%C3%B3w/Zapu%C5%9Bcie\n/?format=xml'
resp = requests.get(baseUrl)
msg = resp.content
tree = ET.fromstring(msg)
for place in tree.findall('place'):
print u'{:s}: {:+.2f}, {:+.2f}'.format(
place.get('display_name'),
float(place.get('lon')),
float(place.get('lat'))).encode('utf-8')

import os,sys,time
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import parse
tree = ET.parse('D:\Reddy\BankLoanAcctService_transactionInq.xml')
root=tree.getroot()
for TrxnEffDt in root.iter('TrxnEffDt'):
new_TrxnEffDt= str(time.strftime("%y-%m-%d"))
TrxnEffDt=str(new_TrxnEffDt)
filename2 ="D:\Reddy\BankLoanAcctService_transactionInq2.txt"
r=open(filename2,'w')
sys.stdout =r

Related

Unable to edit powerpoint XML data points

I'm utilizing python to update data within the xml below, what im trying to do in all is update text within a powerpoint dynamically using an incoming dataframe, by doing so i pull out the xml from the pptx file and can't figure out how to change the text within the xml.
Dataframe:
Old New
0 A.1 Valuation
1 A.2 12000
2 A.3 5.23
3 A.4 Test,Complete
XMLFile: Github Link
XML Snippit:
<a:tc>
<a:txBody>
<a:bodyPr/>
<a:lstStyle/>
<a:p>
<a:pPr algn="l" fontAlgn="auto"/>
<a:r>
<a:rPr lang="en-US" sz="1800" dirty="0">
<a:effectLst/>
</a:rPr>
<a:t>A.1​</a:t>
</a:r>
<a:endParaRPr lang="en-US" sz="1800" b="0" i="0" dirty="0">
<a:solidFill>
<a:srgbClr val="000000"/>
</a:solidFill>
<a:effectLst/>
<a:latin typeface="Calibri" panose="020F0502020204030204" pitchFamily="34" charset="0"/>
</a:endParaRPr>
</a:p>
</a:txBody>
<a:tcPr/>
</a:tc>
Python Code:
import shutil
from pathlib import Path
import lxml.etree as ET
import pandas as pd
xml_file = r'\Desktop\PowerPoint XML\Test\ppt\slides\slide1.xml'
Dataframe = r'\Desktop\PowerPoint XML\Dataframe.xlsx'
df = pd.read_excel(Dataframe)
df['Old'] = df['Old'].astype(str)
df['New'] = df['New'].astype(str)
# open xml file that contains slide data
tree = ET.parse(xml_file)
treeRoot = tree.getroot()
rootString = ET.tostring(treeRoot)
decodedRootString = bytes.decode(rootString)
old = df.Old; new = df.New
# search for old value then replace with new
for elem in treeRoot.iter():
elem.text.replace(old[elem],new[elem])
FinalString=str.encode(decodedRootString)
#save the XML File
root = ET.fromstring(FinalString)
my_tree = ET.ElementTree(root)
with open(xml_file, 'wb') as f:
f.write(ET.tostring(my_tree))
Error Log:
File "\tempCodeRunnerFile.py", line 36, in <module>
elem.text.replace(old[elem],new[elem])
AttributeError: 'NoneType' object has no attribute 'replace'

Processing files with listdir() breakes when directory contains subdirectories

Following code should walk through directory and grab XML files and process them (i.e. prefixing HTML classes stored in XML elements — however, this is not important in relation to the question). The code works as long as there are no subdirectories inside "/input-dir", but as soon as there are subdirectories, an error message gets thrown out:
Traceback (most recent call last):
File "/Users/ab/Code/SHCprefixer-2022/shc-prefixer_upwork.py", line 22, in content = file.readlines(); File "/codecs.py", line 322, in decode (result, consumed) = self._buffer_decode(data, self.errors, final) UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 566: invalid start byte
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
import os
import lxml
import re
input_path = "./input-dir";
output_path = "./output-dir";
ls = os.listdir(input_path);
print(ls);
with open("classes.txt", "r") as cls:
clss = cls.readlines()
for i in range(len(clss)):
clss[i] = clss[i].strip()
print(clss);
for d in range(len(ls)):
with open(f"{input_path}/{ls[d]}", "r") as file:
content = file.readlines();
content = "".join(content)
bs_content = BeautifulSoup(content, "lxml")
str_bs_content = str(bs_content)
str_bs_content = str_bs_content.replace("""<?xml version="1.0" encoding="UTF-8"?><html><body>""", "");
str_bs_content = str_bs_content.replace("</body></html>", "");
for j in range(len(clss)):
str_bs_content = str_bs_content.replace(clss[j], f"prefix-{clss[j]}")
with open(f"{output_path}/{ls[d]}", "w") as f:
f.write(str_bs_content)
Probably the error is related to the listdir() command, and as indicated in "IsADirectoryError: [Errno 21] Is a directory: " It is a file, I should use os.walk(), but I wasn't able to implement it. Would be great if someone could help.
You need to test whether the returned file system name is a file. You also want to search the entire subtree. Instead of listdir you could use os.walk, but I think that the newer pathlib module better suites your needs. Its .glob method, when used with "**", will search the subtree and filter for a known file extension at the same time.
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
import lxml
import re
from pathlib import Path
input_path = Path("./input-dir")
output_path = Path("./output-dir")
ls = [p for p in input_path.glob("**/*.xml") if p.is_file()]
print(", ".join(str(p) for p in ls))
with open("classes.txt", "r") as cls:
clss = cls.readlines()
for i in range(len(clss)):
clss[i] = clss[i].strip()
print(clss)
for infile in ls:
with infile.open() as file:
bs_content = BeautifulSoup(file.read(), "lxml")
str_bs_content = str(bs_content)
str_bs_content = str_bs_content.replace("""<?xml version="1.0" encoding="UTF-8"?><html><body>""", "");
str_bs_content = str_bs_content.replace("</body></html>", "");
for j in range(len(clss)):
str_bs_content = str_bs_content.replace(clss[j], f"prefix-{clss[j]}")
outfile = output_path / infile.relative_to(input_path)
outfile.parent.mkdir(parents=True, exist_ok=True)
with outfile.open("w") as f:
f.write(str_bs_content)
Looks like you will need to filter out directories from the input path dir. You could use os.path.isfile(x) to check it. Using list comprehension you can get the filtered list in one line:
ls = [f for f in os.listdir(input_path) if os.path.isfile(f)]

python getting unicode encode error when saving file

i'm trying to get text from a webpage and it makes 'Traceback (most recent call last):
File "C:\Users\username\Desktop\Python\parsing.py", line 21, in
textFile.write(str(results))
UnicodeEncodeError: 'cp949' codec can't encode character '\xa9' in position 37971: illegal multibyte sequence'
I've searched and tried
textFile.write(str(results).decode('utf-8'))
and it makes no attribute arror.
import requests
import os
from bs4 import BeautifulSoup
outputFolderName = "output"
currentPath = os.path.dirname(os.path.realpath(__file__))
outputDir = currentPath + "/" +outputFolderName
r = requests.get('https://yahoo.com/')
soup = BeautifulSoup(r.text, 'html.parser')
results = soup.findAll(text=True)
try :
os.mkdir(outputDir)
print("output directory generated")
except :
print("using existing directory")
textFile = open(outputDir + '/output.txt', 'w')
textFile.write(str(results))
textFile.close()
Is there any way to convert the codec of str(results) and save it properly??
python version is 3.7.3
Please specify the encoding like in this example
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import requests
import os
from bs4 import BeautifulSoup
outputFolderName = "output"
currentPath = os.path.dirname(os.path.realpath(__file__))
outputDir = currentPath + "/" +outputFolderName
r = requests.get('https://yahoo.com')
soup = BeautifulSoup(r.text, 'html.parser')
results = soup.findAll(text=True)
try :
os.mkdir(outputDir)
print("output directory generated")
except :
print("using existing directory")
textFile = open(outputDir + '/output.txt', mode='w', encoding='utf8')
textFile.write(str(results))
textFile.close()

Parse xml and write a csv with header columns

I am trying to parse an xml file containing meteo data and to write some value in a csv file.
I'm not sure that this code is elegant but it works.
from qgis.PyQt.QtCore import *
import requests
import xml.etree.ElementTree as ET
# url of xml to parse
baseUrl = ('http://www.arpa.veneto.it/bollettini/meteo/h24/img08/0144.xml')
resp = requests.get(baseUrl)
msg = resp.content
tree = ET.fromstring(msg)
for stazione in tree.iter('STAZIONE'):
idstaz= stazione.find('IDSTAZ').text
for sensore in stazione.iter('SENSORE'):
id= sensore.find('ID').text
for dati in sensore.iter('DATI'):
ist = dati.get('ISTANTE')
vm = dati.find('VM').text
f = open('D:/GIS/_Temp/result.csv', 'a')
print >> f, idstaz, id, ist, vm
f.close()
I'm not sure that this code is elegant but it works.
144 300000864 201701080100 -4.2
144 300000864 201701080200 -4.5
144 300000864 201701080300 -4.8
144 300000864 201701080400 -5.5
...
but I don't know how to add the headers to the columns.
Open the file before the for loop and add header to file
from qgis.PyQt.QtCore import *
import requests
import xml.etree.ElementTree as ET
# url of xml to parse
baseUrl = ('http://www.arpa.veneto.it/bollettini/meteo/h24/img08/0144.xml')
resp = requests.get(baseUrl)
msg = resp.content
tree = ET.fromstring(msg)
f = open('D:/GIS/_Temp/result.cvs', 'a')
f.write('STAZIONE,IDSTAZ,SENSORE,ISTANTE')
for stazione in tree.iter('STAZIONE'):
idstaz= stazione.find('IDSTAZ').text
for sensore in stazione.iter('SENSORE'):
id= sensore.find('ID').text
for dati in sensore.iter('DATI'):
ist = dati.get('ISTANTE')
vm = dati.find('VM').text
print >> f, idstaz, id, ist, vm
f.close()

Iterate through multiple files and append text from HTML using Beautiful Soup

I have a directory of downloaded HTML files (46 of them) and I am attempting to iterate through each of them, read their contents, strip the HTML, and append only the text into a text file. However, I'm unsure where I'm messing up, though, as nothing gets written to my text file?
import os
import glob
from bs4 import BeautifulSoup
path = "/"
for infile in glob.glob(os.path.join(path, "*.html")):
markup = (path)
soup = BeautifulSoup(markup)
with open("example.txt", "a") as myfile:
myfile.write(soup)
f.close()
-----update----
I've updated my code as below, however the text file still doesn't get created.
import os
import glob
from bs4 import BeautifulSoup
path = "/"
for infile in glob.glob(os.path.join(path, "*.html")):
markup = (infile)
soup = BeautifulSoup(markup)
with open("example.txt", "a") as myfile:
myfile.write(soup)
myfile.close()
-----update 2-----
Ah, I caught that I had my directory incorrect, so now I have:
import os
import glob
from bs4 import BeautifulSoup
path = "c:\\users\\me\\downloads\\"
for infile in glob.glob(os.path.join(path, "*.html")):
markup = (infile)
soup = BeautifulSoup(markup)
with open("example.txt", "a") as myfile:
myfile.write(soup)
myfile.close()
When this is executed, I get this error:
Traceback (most recent call last):
File "C:\Users\Me\Downloads\bsoup.py, line 11 in <module>
myfile.write(soup)
TypeError: must be str, not BeautifulSoup
I fixed this last error by changing
myfile.write(soup)
to
myfile.write(soup.get_text())
-----update 3 ----
It's working properly now, here's the working code:
import os
import glob
from bs4 import BeautifulSoup
path = "c:\\users\\me\\downloads\\"
for infile in glob.glob(os.path.join(path, "*.html")):
markup = (infile)
soup = BeautifulSoup(open(markup, "r").read())
with open("example.txt", "a") as myfile:
myfile.write(soup.get_text())
myfile.close()
actually you are not reading html file, this should work,
soup=BeautifulSoup(open(webpage,'r').read(), 'lxml')
If you want to use lxml.html directly here is a modified version of some code I've been using for a project. If you want to grab all the text, just don't filter by tag. There may be a way to do it without iterating, but I don't know. It saves the data as unicode, so you will have to take that into account when opening the file.
import os
import glob
import lxml.html
path = '/'
# Whatever tags you want to pull text from.
visible_text_tags = ['p', 'li', 'td', 'h1', 'h2', 'h3', 'h4',
'h5', 'h6', 'a', 'div', 'span']
for infile in glob.glob(os.path.join(path, "*.html")):
doc = lxml.html.parse(infile)
file_text = []
for element in doc.iter(): # Iterate once through the entire document
try: # Grab tag name and text (+ tail text)
tag = element.tag
text = element.text
tail = element.tail
except:
continue
words = None # text words split to list
if tail: # combine text and tail
text = text + " " + tail if text else tail
if text: # lowercase and split to list
words = text.lower().split()
if tag in visible_text_tags:
if words:
file_text.append(' '.join(words))
with open('example.txt', 'a') as myfile:
myfile.write(' '.join(file_text).encode('utf8'))

Categories

Resources