Combined Exceptions With Beautiful Soup HTTPError Not Defined - python

I'm trying to write code so that it scrapes stock symbol data into a CSV file. However, I get the following error:
Traceback (most recent call last):
  File "company_data_v3.py", line 23, in <module>
    page = urllib2.urlopen("http://finance.yahoo.com/q/ks?s="+newsymbolslist[i] +"%20Key%20Statistics").read()
  File "C:\Python27\lib\urllib2.py", line 127, in urlopen
    return _opener.open(url, data, timeout)
  File "C:\Python27\lib\urllib2.py", line 410, in open
    response = meth(req, response)
  File "C:\Python27\lib\urllib2.py", line 523, in http_response
    'http', request, response, code, msg, hdrs)
  File "C:\Python27\lib\urllib2.py", line 448, in error
    return self._call_chain(*args)
  File "C:\Python27\lib\urllib2.py", line 382, in _call_chain
    result = func(*args)
  File "C:\Python27\lib\urllib2.py", line 531, in http_error_default
    raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 400: Bad Request
I have tried this suggestion, which imports HTTPError from urllib2, but it has not worked. (It seems redundant to do that since I already have the module imported.)
The symbols.txt file contains stock symbols. Here is the code that I am using:
import urllib2
from BeautifulSoup import BeautifulSoup
import csv
import re
import urllib
from urllib2 import HTTPError
# import modules

symbolfile = open("symbols.txt")
symbolslist = symbolfile.read()
newsymbolslist = symbolslist.split("\n")

i = 0

f = csv.writer(open("pe_ratio.csv", "wb"))
# shortcut to write
f.writerow(["Name", "PE", "Revenue % Quarterly", "ROA% YOY", "Operating Cashflow", "Debt to Equity"])
# first write row statement

# define name_company as the following
while i < len(newsymbolslist):
    page = urllib2.urlopen("http://finance.yahoo.com/q/ks?s=" + newsymbolslist[i] + "%20Key%20Statistics").read()
    soup = BeautifulSoup(page)
    name_company = soup.findAll("div", {"class": "title"})
    for name in name_company:  # add multiple iterations?
        all_data = soup.findAll('td', "yfnc_tabledata1")
        stock_name = name.find('h2').string  # find company's name in name_company with h2 tag
        try:
            f.writerow([stock_name, all_data[2].getText(), all_data[17].getText(), all_data[13].getText(), all_data[29].getText(), all_data[26].getText()])  # write down PE data
        except (IndexError, urllib2.HTTPError) as e:
            pass
    i += 1
Do I need to define the error more specifically? Thanks for your help.

You are catching the exception in the wrong location. The urlopen() call throws the exception, as shown by the first lines of your traceback:
Traceback (most recent call last):
  File "company_data_v3.py", line 23, in <module>
    page = urllib2.urlopen("http://finance.yahoo.com/q/ks?s="+newsymbolslist[i] +"%20Key%20Statistics").read()
Catch it there:
while i < len(newsymbolslist):
    try:
        page = urllib2.urlopen("http://finance.yahoo.com/q/ks?s=" + newsymbolslist[i] + "%20Key%20Statistics").read()
    except urllib2.HTTPError:
        i += 1  # still advance the index, or the loop would retry the same symbol forever
        continue
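For what it's worth, iterating over the list directly sidesteps the index bookkeeping entirely, so there is no counter to forget when you continue. A sketch of the same scraper in that style (same files and column indices as in the question; untested against today's Yahoo pages):

import csv
import urllib2
from BeautifulSoup import BeautifulSoup

with open("symbols.txt") as symbolfile:
    symbols = symbolfile.read().splitlines()

f = csv.writer(open("pe_ratio.csv", "wb"))
f.writerow(["Name", "PE", "Revenue % Quarterly", "ROA% YOY",
            "Operating Cashflow", "Debt to Equity"])

for symbol in symbols:
    try:
        page = urllib2.urlopen("http://finance.yahoo.com/q/ks?s="
                               + symbol + "%20Key%20Statistics").read()
    except urllib2.HTTPError:
        continue  # a rejected symbol now only skips this iteration
    soup = BeautifulSoup(page)
    for name in soup.findAll("div", {"class": "title"}):
        all_data = soup.findAll("td", "yfnc_tabledata1")
        stock_name = name.find("h2").string
        try:
            f.writerow([stock_name, all_data[2].getText(), all_data[17].getText(),
                        all_data[13].getText(), all_data[29].getText(),
                        all_data[26].getText()])
        except IndexError:
            pass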

Related

How can I download multiple PDF files with Python?

I am trying to download the publications on every page of https://occ.ca/our-publications
My end goal is to parse through the text in the PDF files and locate certain keywords.
Thus far, I have been able to scrape the links to the PDF files on all the pages. I have saved these links into a list. Now, I want to go through the list and download all the PDF files with Python. Once the files have been downloaded, I want to parse through them.
This is the code that I have used thus far:
import requests
from bs4 import BeautifulSoup
import lxml
import csv

# This code adds all PDF links into a list called "publications".
publications = []
for i in range(19):
    response = requests.get('https://occ.ca/our-publications/page/{}/'.format(i),
                            headers={'User-Agent': 'Mozilla'})
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'lxml')
        pdfs = soup.findAll('div', {"class": "publicationoverlay"})
        links = [pdf.find('a').attrs['href'] for pdf in pdfs]
        publications.append(links)
Next, I want to go through that list and download the PDF files.
import urllib.request

for x in publications:
    urllib.request.urlretrieve(x, 'Publication_{}'.format(range(213)))
This is the error I get when I run the code:
Traceback (most recent call last):
  File "C:\Users\plumm\AppData\Local\Programs\Python\Python37\m.py", line 23, in <module>
    urllib.request.urlretrieve(x, 'Publication_ {}.pdf'.format(range(213)))
  File "C:\Users\plumm\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 247, in urlretrieve
    with contextlib.closing(urlopen(url, data)) as fp:
  File "C:\Users\plumm\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 222, in urlopen
    return opener.open(url, data, timeout)
  File "C:\Users\plumm\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 531, in open
    response = meth(req, response)
  File "C:\Users\plumm\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 641, in http_response
    'http', request, response, code, msg, hdrs)
  File "C:\Users\plumm\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 569, in error
    return self._call_chain(*args)
  File "C:\Users\plumm\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 503, in _call_chain
    result = func(*args)
  File "C:\Users\plumm\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 649, in http_error_default
    raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 403: Forbidden
Please try:
import requests
from bs4 import BeautifulSoup
import lxml
import csv

# This code adds all PDF links into a list called "publications".
publications = []
for i in range(19):
    response = requests.get('https://occ.ca/our-publications/page/{}/'.format(i),
                            headers={'User-Agent': 'Mozilla'})
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'lxml')
        pdfs = soup.findAll('div', {"class": "publicationoverlay"})
        links = [pdf.find('a').attrs['href'] for pdf in pdfs]
        publications.extend(links)

for cntr, link in enumerate(publications):
    print("try to get link", link)
    rslt = requests.get(link)
    print("Got", rslt)
    fname = "temporarypdf_%d.pdf" % cntr
    with open(fname, "wb") as fout:
        fout.write(rslt.content)  # rslt.content holds the downloaded body as bytes
    print("saved pdf data into ", fname)
    # Call here the code that reads and parses the pdf.
Could you also tell us the line number where the error occurs?
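If you would rather keep urllib.request for the download step, note that the 403 is most likely the server rejecting urllib's default User-Agent; the requests calls above succeed because they send a Mozilla header. A sketch under that assumption, with a file name crudely derived from the URL (both the header value and the naming scheme are illustrative):

import os
import urllib.request

for link in publications:
    # Send an explicit User-Agent instead of the default "Python-urllib/x.y".
    req = urllib.request.Request(link, headers={'User-Agent': 'Mozilla'})
    fname = os.path.basename(link) or 'publication.pdf'  # fall back if the URL ends in '/'
    with urllib.request.urlopen(req) as resp, open(fname, 'wb') as fout:
        fout.write(resp.read())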

urllib2 does not handle http NO_CONTENT (204) as a successful HTTPS response

I am using an embedded Python (2.4.3) in a Digi ConnectPort X4. Here is the code to post to an Azure IoT Hub using HTTPS:
import urllib, sys, datetime, time
import urllib2

iot_device_id = 'HTTP_Device'
iot_endpoint = 'https://winiothub.azure-devices.net/devices/' + iot_device_id + '/messages/events?api-version=2016-02-03'
sas_header = {'Authorization': 'SharedAccessSignature sr=winiothub.azure-devices.net%2Fdevices%2FHTTP_Device&sig=o7dndsA%2FJOnkzYRUhqAwMrQXVhOTpIJqJqILyGDdQAc%3D&se=1522414643'}

while True:
    #try:
    body_data = {'gateway_serial': '123', 'readingvalue': '66.00', 'date': str(datetime.datetime.now())}
    iot_request = urllib2.Request(iot_endpoint, str(body_data), sas_header)
    resp = urllib2.urlopen(iot_request)
    contents = resp.read()
    resp.close()
    time.sleep(1)
    #except:
    #    print 'error'
    #    time.sleep(1)
The code actually posts the message to the hub, but it throws the following error:
Traceback (most recent call last):
  File "C:\Users\JeffreyBiesecker\documents\visual studio 2017\Projects\NewGateAzure\NewGateAzure\NewGateAzure2.py", line 14, in ?
    urllib2.urlopen(iot_request)
  File "C:\Python24\lib\urllib2.py", line 130, in urlopen
    return _opener.open(url, data)
  File "C:\Python24\lib\urllib2.py", line 364, in open
    response = meth(req, response)
  File "C:\Python24\lib\urllib2.py", line 471, in http_response
    response = self.parent.error(
  File "C:\Python24\lib\urllib2.py", line 402, in error
    return self._call_chain(*args)
  File "C:\Python24\lib\urllib2.py", line 337, in _call_chain
    result = func(*args)
  File "C:\Python24\lib\urllib2.py", line 480, in http_error_default
    raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 204: No Content
Press any key to continue . . .
I get the error whether I run the code on the embedded Digi gateway or in Visual Studio with Python 2.4.3.
urllib2.HTTPError includes the response code that was received. Because of this, you can catch the exception, test for 204, and continue safely if that is the case. Otherwise, you can handle (or re-raise) the exception.
try:
    resp = urllib2.urlopen(iot_request)
except urllib2.HTTPError, e:  # on Python 2.6+ this would be 'except urllib2.HTTPError as e:'
    if e.code == 204:
        pass
    else:
        raise
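If other success statuses could come back as well, the same idea generalizes. A sketch (a hypothetical helper, written in the 2.4 syntax used above) that treats the whole 2xx range as success and returns the body either way:

def post_message(request):
    # Return the response body, treating any 2xx status as success.
    try:
        resp = urllib2.urlopen(request)
    except urllib2.HTTPError, e:  # 'except urllib2.HTTPError as e:' on 2.6+
        if 200 <= e.code < 300:
            return e.read()  # HTTPError doubles as a file-like response
        raise
    try:
        return resp.read()
    finally:
        resp.close()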

urlopen only working for certain URLs in Python3

So I'm trying to fetch the contents of a page in Python 3...
If I do the following,
from urllib.request import urlopen
html = urlopen("http://google.com/")
html.read()
I get the html as desired.
However, if I were to choose a different url, as in the following,
from urllib.request import urlopen
html = urlopen("http://www.stackoverflow.com/")
html.read()
I get the following error after the second line:
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/urllib/request.py", line 153, in urlopen
    return opener.open(url, data, timeout)
  File "/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/urllib/request.py", line 461, in open
    response = meth(req, response)
  File "/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/urllib/request.py", line 574, in http_response
    'http', request, response, code, msg, hdrs)
  File "/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/urllib/request.py", line 499, in error
    return self._call_chain(*args)
  File "/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/urllib/request.py", line 433, in _call_chain
    result = func(*args)
  File "/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/urllib/request.py", line 582, in http_error_default
    raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 403: Forbidden
Any ideas why this would be happening and how to fix it?
If you look closely at the error message, you'll see that it is an HTTP error, and a specific one:
HTTP Error 403: Forbidden
So you talked to the server and got your response back but you don't know why you were denied.
You can get a more detailed message in an HTML returned by the server with something like this:
from urllib.request import urlopen
from urllib.error import HTTPError

try:
    html = urlopen("http://www.stackoverflow.com/")
except HTTPError as e:
    print(e.read().decode('utf-8'))
else:
    html.read()
For me it says:
<h2 data-translate="what_happened">What happened?</h2>
<p>The owner of this website (www.stackoverflow.com) has banned your access based on your browser's signature (213702c58d2116a6-ua48).</p>
You can treat HTTPError as a file object (https://docs.python.org/3/library/urllib.error.html#urllib.error.HTTPError):
Though being an exception (a subclass of URLError), an HTTPError can also function as a non-exceptional file-like return value (the same thing that urlopen() returns). This is useful when handling exotic HTTP errors, such as requests for authentication.
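Since the ban here is keyed to the browser's signature, a common workaround (no guarantee a given site will accept it) is to send a browser-like User-Agent header instead of urllib's default:

from urllib.request import Request, urlopen

# urllib identifies itself as "Python-urllib/3.x" by default, which some sites block.
req = Request("http://www.stackoverflow.com/",
              headers={"User-Agent": "Mozilla/5.0"})
html = urlopen(req).read()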

urllib.request error 500 python 3.3.3

I created an algorithm which consumes the web service of the local bank using urllib.request in Python 3.3.3, but when I run it, it gives out an error.
The error is the following:
Traceback (most recent call last):
  File "<pyshell#20>", line 1, in <module>
    tipo_de_cambio()
  File "/Users/admin/Documents/TEC/Taller Programación/tdc.py", line 13, in tipo_de_cambio
    f = urllib.request.urlopen(request,data)
  File "/Library/Frameworks/Python.framework/Versions/3.2/lib/python3.2/urllib/request.py", line 138, in urlopen
    return opener.open(url, data, timeout)
  File "/Library/Frameworks/Python.framework/Versions/3.2/lib/python3.2/urllib/request.py", line 375, in open
    response = meth(req, response)
  File "/Library/Frameworks/Python.framework/Versions/3.2/lib/python3.2/urllib/request.py", line 487, in http_response
    'http', request, response, code, msg, hdrs)
  File "/Library/Frameworks/Python.framework/Versions/3.2/lib/python3.2/urllib/request.py", line 413, in error
    return self._call_chain(*args)
  File "/Library/Frameworks/Python.framework/Versions/3.2/lib/python3.2/urllib/request.py", line 347, in _call_chain
    result = func(*args)
  File "/Library/Frameworks/Python.framework/Versions/3.2/lib/python3.2/urllib/request.py", line 495, in http_error_default
    raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 500: Internal Server Error
And the program I coded is the following:
import datetime
import urllib.request
from urllib.error import HTTPError
from xml.dom.minidom import parse, parseString

def tipo_de_cambio():
    a = 0
    fecha = datetime.date.today()
    data = urllib.parse.urlencode({'tcIndicador': '318', 'tcFechaInicio': fecha.strftime("%d/%m/%Y"), 'tcFechaFinal': fecha.strftime("%d/%m/%Y"), 'tcNombre': "Usuario", 'tnSubNiveles': 'N'})
    data = data.encode('utf-8')
    request = urllib.request.Request('http://indicadoreseconomicos.bccr.fi.cr/indicadoreseconomicos/WebServices/wsIndicadoresEconomicos.asmx?op=ObtenerIndicadoresEconomicosXML')
    request.add_header("POST", "application/x-www-form-urlencoded;charset=utf-8")
    f = urllib.request.urlopen(request, data)
    data = f.read().decode('utf-8')
    dom = parseString(data)
    xmlTag = dom.getElementsByTagName('NUM_VALOR')[0].toxml()
    xmlData = xmlTag.replace('<NUM_VALOR>', '').replace('</NUM_VALOR>', '')
    a = float(xmlData)
    return(a)
The bank's web service url is: http://indicadoreseconomicos.bccr.fi.cr/indicadoreseconomicos/WebServices/wsIndicadoresEconomicos.asmx?op=ObtenerIndicadoresEconomicosXML
Can someone tell me why I get this error?
import datetime
import urllib.request
from xml.dom.minidom import parse, parseString

def tipo_de_cambio():
    fecha = datetime.date.today().strftime("%d/%m/%Y")
    base_url = ('http://indicadoreseconomicos.bccr.fi.cr/indicadoreseconomicos/WebServices/'
                'wsIndicadoresEconomicos.asmx/ObtenerIndicadoresEconomicosXML'
                '?tcIndicador={tcIndicador}&tcFechaInicio={tcFechaInicio}'
                '&tcFechaFinal={tcFechaFinal}&tcNombre={tcNombre}&tnSubNiveles={tnSubNiveles}')
    data = {
        'tcIndicador': '318',
        'tcFechaInicio': fecha,
        'tcFechaFinal': fecha,
        'tcNombre': "Usuario",
        'tnSubNiveles': 'N'
    }
    result = urllib.request.urlopen(base_url.format(**data)).read()
    dom = parseString(result)
    xmlTag = dom.getElementsByTagName('NUM_VALOR')[0].toxml()
    xmlData = xmlTag.replace('<NUM_VALOR>', '').replace('</NUM_VALOR>', '')
    return float(xmlData)
From here you can use your approach to parse the XML.
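A quick usage check (a sketch; the value printed will vary with the day's data):

print(tipo_de_cambio())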

Script to download stock price data from yahoo finance, randomly 404s

The following script reads a .txt file of company ticker symbols and downloads the corresponding financial information into .csv files. The data is pulled from Yahoo Finance and saved in a local directory.
import urllib.request
import requests
import time

# Define the URL to download the .csv from.
url_begin = "http://real-chart.finance.yahoo.com/table.csv?s="
url_end = "&a=00&b=1&c=1950&d=11&e=31&f=2050&g=d&ignore=.csv"

# Function that reads all available ticker symbols from ticker_daten.txt.
# This file should be in the same directory as the program.
def readTickers(file):
    read_ticker = []
    ins = open(file, "r")
    for line in ins:
        if line.endswith('\n'):
            line = line[:-1]
        read_ticker.append(line)
    ins.close()
    return read_ticker

# File location for ticker symbols to download.
tickers = readTickers("C:/Users/Win7ADM/Desktop/Jonas/stock-price-leecher/ticker_daten.txt")

# Loop through list of ticker symbols and download .csv's.
for i in tickers:
    # Forge downloadable link.
    link_created = url_begin + i + url_end
    # Make sure that the link actually leads to a file.
    try:
        r = requests.head(link_created)
        if r.status_code == 404:
            print(str(r.status_code) + ": No page found!")
            time.sleep(0.5)
        else:
            print(link_created)
            # Finally download the file, if it does exist.
            urllib.request.urlretrieve(link_created, "C:/Users/Win7ADM/Desktop/Jonas/stock-price-leecher/data/" + i + ".csv")
            time.sleep(0.5)
    except requests.ConnectionError:
        # A Connection error occurred.
        print("ConnectionError: 404 No page found!")
    except requests.HTTPError:
        # An HTTP error occurred.
        print("HTTPError!")
    except requests.Timeout:
        # Connection timed out.
        print("Timeout!")
The problem: The script crashes randomly after loading between 20 and 1750 .csv files. The crash produces the following output.
Process started >>>
http://real-chart.finance.yahoo.com/table.csv?s=0055.HK&a=00&b=1&c=1950&d=11&e=31&f=2050&g=d&ignore=.csv
http://real-chart.finance.yahoo.com/table.csv?s=0056.HK&a=00&b=1&c=1950&d=11&e=31&f=2050&g=d&ignore=.csv
http://real-chart.finance.yahoo.com/table.csv?s=0057.HK&a=00&b=1&c=1950&d=11&e=31&f=2050&g=d&ignore=.csv
http://real-chart.finance.yahoo.com/table.csv?s=0058.HK&a=00&b=1&c=1950&d=11&e=31&f=2050&g=d&ignore=.csv
Traceback (most recent call last):
  File "Stock-Price Leecher.py", line 40, in <module>
    urllib.request.urlretrieve(link_created, "C:/Users/Win7ADM/Desktop/Jonas/stock-price-leecher/data/"+i+".csv")
  File "c:\Python34\lib\urllib\request.py", line 178, in urlretrieve
    with contextlib.closing(urlopen(url, data)) as fp:
  File "c:\Python34\lib\urllib\request.py", line 153, in urlopen
    return opener.open(url, data, timeout)
  File "c:\Python34\lib\urllib\request.py", line 461, in open
    response = meth(req, response)
  File "c:\Python34\lib\urllib\request.py", line 571, in http_response
    'http', request, response, code, msg, hdrs)
  File "c:\Python34\lib\urllib\request.py", line 499, in error
    return self._call_chain(*args)
  File "c:\Python34\lib\urllib\request.py", line 433, in _call_chain
    result = func(*args)
  File "c:\Python34\lib\urllib\request.py", line 579, in http_error_default
    raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 404: Not Found
<<< Process finished. (Exit code 1)
================ READY ================
Does anyone of you have any Idea why this might happen?
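One detail the traceback does show: urllib.request.urlretrieve raises urllib.error.HTTPError, and none of the requests.* except clauses in the script will catch that, so a 404 that slips past the HEAD check (a HEAD can succeed while the subsequent GET still fails) ends the run. A minimal guard, assuming the same loop variables as above:

import urllib.error
import urllib.request

try:
    urllib.request.urlretrieve(link_created,
                               "C:/Users/Win7ADM/Desktop/Jonas/stock-price-leecher/data/" + i + ".csv")
except urllib.error.HTTPError as e:
    # Log and move on instead of crashing the whole download loop.
    print("HTTPError {} for {}".format(e.code, link_created))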
