the following script reads a .txt of ticker symbols of companies to download the corresponding financial informations in a .csv . The data is pulled from Yahoo Finance and saved in a local directory.
import urllib.request
import requests
import time
#Define the URL to download the .csv from.
url_begin = "http://real-chart.finance.yahoo.com/table.csv?s="
url_end = "&a=00&b=1&c=1950&d=11&e=31&f=2050&g=d&ignore=.csv"
#Function that reads all available ticker symbols from ticker_daten.txt. This file should be in the same directory as the program.
def readTickers(file):
read_ticker = []
ins = open( file, "r" )
for line in ins:
if line.endswith('\n'):
line=line[:-1]
read_ticker.append(line)
ins.close()
return read_ticker
#File location for tickersymbols to download
tickers = readTickers("C:/Users/Win7ADM/Desktop/Jonas/stock-price-leecher/ticker_daten.txt")
#Loop through list of ticker symbols and download .csv's .
for i in tickers:
#Forge downloadable link.
link_created = url_begin + i + url_end
#Make sure that the link actually leads to a file.
try:
r = requests.head(link_created)
if r.status_code==404:
print(str(r.status_code)+": No page found!")
time.sleep(0.5)
else:
print(link_created)
#Finally download the file, if it does exist.
urllib.request.urlretrieve(link_created, "C:/Users/Win7ADM/Desktop/Jonas/stock-price-leecher/data/"+i+".csv")
time.sleep(0.5)
except requests.ConnectionError:
#A Connection error occurred.
print ("ConnectionError: 404 No page found!")
except requests.HTTPError:
#An HTTP error occurred.
print ("HTTPError!")
except requests.Timeout:
#Connection timed out.
print ("Timeout!")
The Problem: The script crashes randomlyafter loading between 20-1750 .csv's. The crash produces the following output.
Process started >>>
http://real-chart.finance.yahoo.com/table.csv?s=0055.HK&a=00&b=1&c=1950&d=11&e=31&f=2050&g=d&ignore=.csv
http://real-chart.finance.yahoo.com/table.csv?s=0056.HK&a=00&b=1&c=1950&d=11&e=31&f=2050&g=d&ignore=.csv
http://real-chart.finance.yahoo.com/table.csv?s=0057.HK&a=00&b=1&c=1950&d=11&e=31&f=2050&g=d&ignore=.csv
http://real-chart.finance.yahoo.com/table.csv?s=0058.HK&a=00&b=1&c=1950&d=11&e=31&f=2050&g=d&ignore=.csv
Traceback (most recent call last):
File "Stock-Price Leecher.py", line 40, in <module>
urllib.request.urlretrieve(link_created, "C:/Users/Win7ADM/Desktop/Jonas/stock-price-leecher/data/"+i+".csv")
File "c:\Python34\lib\urllib\request.py", line 178, in urlretrieve
with contextlib.closing(urlopen(url, data)) as fp:
File "c:\Python34\lib\urllib\request.py", line 153, in urlopen
return opener.open(url, data, timeout)
File "c:\Python34\lib\urllib\request.py", line 461, in open
response = meth(req, response)
File "c:\Python34\lib\urllib\request.py", line 571, in http_response
'http', request, response, code, msg, hdrs)
File "c:\Python34\lib\urllib\request.py", line 499, in error
return self._call_chain(*args)
File "c:\Python34\lib\urllib\request.py", line 433, in _call_chain
result = func(*args)
File "c:\Python34\lib\urllib\request.py", line 579, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 404: Not Found
<<< Process finished. (Exit code 1)
================ READY ================
Does anyone of you have any Idea why this might happen?
Related
import urllib.request
import time
import json
import random
QUERY = "http://localhost:8080/query?id={}"
N = 500
def getDataPoint(quote):
stock = quote['stock']
bid_price = float(quote['top_bid']['price'])
ask_price = float(quote['top_ask']['price'])
price = (bid_price + ask_price)/2
return stock, bid_price, ask_price, price
def getRatio(price_a, price_b):
if(price_b==0):
return
return price_a/price_b
if __name__ == "__main__":
for _ in range(N):
quotes = json.loads(urllib.request.urlopen(
QUERY.format(random.random())).read())
prices = {}
for quote in quotes:
stock, bid_price, ask_price, price = getDataPoint(quote)
prices[stock] = price
print ("Quoted %s at (bid:%s, ask:%s, price:%s)" % (stock,
bid_price, ask_price, price))
print ("Ratio %s" % getRatio(prices['ABC'], prices['DEF']))
Traceback (most recent call last):
File "C:/Users/AppData/Local/Programs/Python/Python37/ client.py", line 54, in
quotes= json.loads(urllib.request.urlopen(QUERY.format(random.random())).read())
File "C:\Users\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 531, in open
response = meth(req, response)
File "C:\Users\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 641, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Users\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 569, in error
result = self._call_chain(*args)
File "C:\Users\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 503, in _call_chain
result = func(*args)
File "C:\Users\AppData\Local\Programs\Python\Python37\lib\urllib\request.py", line 649, in in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 404: Not Found
I've error with the URL.Did some research and tried to clear still not so sure why the client part throws error while server part works fine.
This is caused due to the firewall of your computer blocking port 8080. Change the port from 8080 to say 8085 in both the client and server files.
In the above code,
Change QUERY = "http://localhost:8080/query?id={}"
To QUERY = "http://localhost:8085/query?id={}"
Similarly, there should be 8080 as the port number in the server file, change it to 8085.
Another solution would be to disable your firewall, which is not recommended.
Simply say, your server application is not running in the said location: http://localhost:8080/query
I'm trying to make a terminal app to crawl a website and return the time of the entered city name. this is my code so far:
import re
import urllib.request
city = input('Enter city name: ')
url = 'https://time.is/'
rawData = urllib.request.urlopen(url).read()
decodedData = rawData.decode('utf-8')
print(decodedData)
after the last line i get this error:
Traceback (most recent call last):
File "<pyshell#13>", line 1, in <module>
rawData = urllib.request.urlopen(url).read()
File "~/Python\Python35-32\lib\urllib\request.py", line 163, in urlopen
return opener.open(url, data, timeout)
File "~/Python\Python35-32\lib\urllib\request.py", line 472, in open
response = meth(req, response)
File "~/Python\Python35-32\lib\urllib\request.py", line 582, in http_response
'http', request, response, code, msg, hdrs)
File "~/Python\Python35-32\lib\urllib\request.py", line 510, in error
return self._call_chain(*args)
File "~/Python\Python35-32\lib\urllib\request.py", line 444, in _call_chain
result = func(*args)
File "~/Python\Python35-32\lib\urllib\request.py", line 590, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 403: Forbidden
why do i get this error? what's wrong?
[EDIT]
the reason is time.is banns requests. Always remember to read terms and conditions when doing web scraping. free APIs can be found to do the same job too.
When this happens, I usually open the debugger and try to find out whats being called when I access the website. It seems like time.is doesn't like having scripts call their website.
A quick search yielded this:
1532027279136 0 161_(UTC,_UTC+00:00) 1532027279104
Time.is is for humans. To use from scripts and apps, please ask about our API. Thank you!
Here are some APIs you could use to build your project. https://www.programmableweb.com/category/time/api
I am using an embedded python (2.4.3) in digi Connectport X4 here is the code to post to an Azure IoT Hub using HTTPS:
import urllib, sys, datetime, time
import urllib2
iot_device_id = 'HTTP_Device'
iot_endpoint = 'https://winiothub.azure-devices.net/devices/' + iot_device_id + '/messages/events?api-version=2016-02-03'
sas_header = {'Authorization': 'SharedAccessSignature sr=winiothub.azure-devices.net%2Fdevices%2FHTTP_Device&sig=o7dndsA%2FJOnkzYRUhqAwMrQXVhOTpIJqJqILyGDdQAc%3D&se=1522414643'}
while True:
#try:
body_data = { 'gateway_serial': '123', 'readingvalue':'66.00', 'date': str(datetime.datetime.now())}
iot_request = urllib2.Request(iot_endpoint, str(body_data), sas_header)
resp = urllib2.urlopen(iot_request)
contents = resp.read()
resp.close()
time.sleep(1)
#except:
# print 'error'
# time.sleep(1)
The code actually post the message to the hub, but is throwing the following error:
Traceback (most recent call last):
File "C:\Users\JeffreyBiesecker\documents\visual studio 2017\Projects\NewGateAzure\NewGateAzure\NewGateAzure2.py", line 14, in ?
urllib2.urlopen(iot_request)
File "C:\Python24\lib\urllib2.py", line 130, in urlopen
return _opener.open(url, data)
File "C:\Python24\lib\urllib2.py", line 364, in open
response = meth(req, response)
File "C:\Python24\lib\urllib2.py", line 471, in http_response
response = self.parent.error(
File "C:\Python24\lib\urllib2.py", line 402, in error
return self._call_chain(*args)
File "C:\Python24\lib\urllib2.py", line 337, in _call_chain
result = func(*args)
File "C:\Python24\lib\urllib2.py", line 480, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 204: No Content
Press any key to continue . . .
I get an error if running the code in the embedded Digi gateway or if I run in Python in Visual Studio using version 2.4.3.
urllib2.HttpError includes the response code that's received. Because of this, you can catch the exception, test for 204, and continue safely if it is the case. Otherwise, you can handle (or rethrow) the exception.
try:
resp = urllib2.urlopen(iot_request)
except urllib2.HTTPError as e:
if e.code == 204: pass
else: raise
I learned how to download a picture from a certain URL with python as:
import urllib
imgurl="http://www.digimouth.com/news/media/2011/09/google-logo.jpg"
resource = urllib.urlopen(imgurl)
output = open("test.jpg","wb")
output.write(resource.read())
output.close()
and it worked well, but when i changed the URL to
imgurl="http://farm1.static.flickr.com/96/242125326_607a826afe_o.jpg"
it did not work, and gave the information
File "face_down.py", line 3, in <module>
resource = urllib2.urlopen(imgurl)
File "D:\Python27\another\Lib\urllib2.py", line 154, in urlopen
return opener.open(url, data, timeout)
File "D:\Python27\another\Lib\urllib2.py", line 431, in open
response = self._open(req, data)
File "D:\Python27\another\Lib\urllib2.py", line 449, in _open
'_open', req)
File "D:\Python27\another\Lib\urllib2.py", line 409, in _call_chain
result = func(*args)
File "D:\Python27\another\Lib\urllib2.py", line 1227, in http_open
return self.do_open(httplib.HTTPConnection, req)
File "D:\Python27\another\Lib\urllib2.py", line 1197, in do_open
raise URLError(err)
urllib2.URLError: <urlopen error [Errno 10060] >
and I tried to open the latter image URL, and it could be shown as the former, I have no idea to solve it~~ help~~~~
You can try using requests module. The response will be some bytes. So, you can iterate over those byte chunks and write to the file.
import requests
url = "http://farm1.static.flickr.com/96/242125326_607a826afe_o.jpg"
r = requests.get(url)
path = "filename.jpg"
with open(path, 'wb') as f:
for chunk in r:
f.write(chunk)
I looked up both of the addresses and the second one does not lead anywhere. That is probably the problem.
import urllib
imgurl="webpage url"
openimg = urllib.urlopen(imgurl) #opens image (prepares it)
img = open("test.jpg","wb") #opens the img to read it
img.write(openimg.read()) #prints it to the console
img.close() #closes img
Try the link again in your webpage and if it turns up with "webpage not available" that is probably the problem.
I'm trying to write code so that it scrap stock symbols data into a csv file. However, I get the following error.
Traceback (most recent call last):
File "company_data_v3.py", line 23, in <module>
page = urllib2.urlopen("http://finance.yahoo.com/q/ks?s="+newsymbolslist[i] +"%20Key%20Statistics").read()
File "C:\Python27\lib\urllib2.py", line 127, in urlopen
return _opener.open(url, data, timeout)
File "C:\Python27\lib\urllib2.py", line 410, in open
response = meth(req, response)
File "C:\Python27\lib\urllib2.py", line 523, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Python27\lib\urllib2.py", line 448, in error
return self._call_chain(*args)
File "C:\Python27\lib\urllib2.py", line 382, in _call_chain
result = func(*args)
File "C:\Python27\lib\urllib2.py", line 531, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 400: Bad Request
I have tried this suggestion but it has not worked which imports urlib2 HTTPError into the program. (It seems redundant to do that since I already have the module imported.
The symbols.txt file has stock symbols. Here is the code that I am using:
import urllib2
from BeautifulSoup import BeautifulSoup
import csv
import re
import urllib
from urllib2 import HTTPError
# import modules
symbolfile = open("symbols.txt")
symbolslist = symbolfile.read()
newsymbolslist = symbolslist.split("\n")
i = 0
f = csv.writer(open("pe_ratio.csv","wb"))
# short cut to write
f.writerow(["Name","PE","Revenue % Quarterly","ROA% YOY","Operating Cashflow","Debt to Equity"])
#first write row statement
# define name_company as the following
while i<len(newsymbolslist):
page = urllib2.urlopen("http://finance.yahoo.com/q/ks?s="+newsymbolslist[i] +"%20Key%20Statistics").read()
soup = BeautifulSoup(page)
name_company = soup.findAll("div", {"class" : "title"})
for name in name_company: #add multiple iterations?
all_data = soup.findAll('td', "yfnc_tabledata1")
stock_name = name.find('h2').string #find company's name in name_company with h2 tag
try:
f.writerow([stock_name, all_data[2].getText(),all_data[17].getText(),all_data[13].getText(), all_data[29].getText(),all_data[26].getText()]) #write down PE data
except (IndexError, urllib2.HTTPError) as e:
pass
i+=1
Do I need to define the error more specifically? Thanks for your help.
You are catching the exception in the wrong location. The urlopen() call throws the exception, as shown by the first lines of your traceback:
Traceback (most recent call last):
File "company_data_v3.py", line 23, in <module>
page = urllib2.urlopen("http://finance.yahoo.com/q/ks?s="+newsymbolslist[i] +"%20Key%20Statistics").read()
Catch it there:
while i<len(newsymbolslist):
try:
page = urllib2.urlopen("http://finance.yahoo.com/q/ks?s="+newsymbolslist[i] +"%20Key%20Statistics").read()
except urllib2.HTTPError:
continue