HTTP ERROR :403, Python with Beautiful soup - python

Here is the code:
# Build the page URL from a month-name lookup (`months`, indexed by date.month) and the day of the month.
# `months` and `date` are defined outside this snippet.
quote_page = "https://www.timeanddate.com/holidays/fun/" + months [date.month].lower() + "/" + str(date.day)
# Fetch the page; urllib2.urlopen raises urllib2.HTTPError on an HTTP error status such as the 403 in the traceback below.
page = urllib2.urlopen(quote_page)
soup = BeautifulSoup(page, 'html.parser')
# Narrow down to the first <article class="fixed">, then to the first <h3> inside it.
# find() returns None when nothing matches, so the chained .find()/.text would raise AttributeError in that case.
event_box = soup.find('article', attrs={"class" : "fixed"})
event_box = event_box.find('h3')
event = event_box.text.strip()
print event
When I check my variable:
quote_page = https://www.timeanddate.com/holidays/fun/june/8
I've tried printing quote_page and the link works fine in my browser,
but when I run the code and print "event" I get this:
Traceback (most recent call last):
File "main.py", line 252, in <module>
page = urllib2.urlopen(req)
File "/usr/local/lib/python2.7/urllib2.py", line 154, in urlopen
return opener.open(url, data, timeout)
File "/usr/local/lib/python2.7/urllib2.py", line 435, in open
response = meth(req, response)
File "/usr/local/lib/python2.7/urllib2.py", line 548, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/local/lib/python2.7/urllib2.py", line 473, in error
return self._call_chain(*args)
File "/usr/local/lib/python2.7/urllib2.py", line 407, in _call_chain
result = func(*args)
File "/usr/local/lib/python2.7/urllib2.py", line 556, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 403: ok
exit status 1
I think I remember that it worked earlier today but stopped when I tried again later.
Can anyone help?
(I'm fairly new to coding)

Related

Keep receiving "HTTP Error 429: Too Many Requests" with any delay

A simple web scraping code I wrote few weeks back keeps coming up with the error of:
HTTP Error 429: Too Many Requests
The code is designed to get the input from an excel file and find and download pdfs online.
I'm not too familiar with requests, but I've slowed down the rate of requests to see how many it can handle. It seems to be an unrelated issue somehow: the code gets through a similar number of inputs (around 30) whether the delay I set is 5 seconds or 20 seconds. Here is the error message that keeps coming up:
Traceback (most recent call last):
File "D:\Python\New folder\Web Scraper.py", line 17, in <module>
for url in search(searchquery, stop=1, pause=2):
File "D:\Python\lib\site-packages\google-2.0.2-py3.7.egg\googlesearch\__init__.py", line 288, in search
html = get_page(url, user_agent)
File "D:\Python\lib\site-packages\google-2.0.2-py3.7.egg\googlesearch\__init__.py", line 154, in get_page
response = urlopen(request)
File "D:\Python\lib\urllib\request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "D:\Python\lib\urllib\request.py", line 531, in open
response = meth(req, response)
File "D:\Python\lib\urllib\request.py", line 641, in http_response
'http', request, response, code, msg, hdrs)
File "D:\Python\lib\urllib\request.py", line 563, in error
result = self._call_chain(*args)
File "D:\Python\lib\urllib\request.py", line 503, in _call_chain
result = func(*args)
File "D:\Python\lib\urllib\request.py", line 755, in http_error_302
return self.parent.open(new, timeout=req.timeout)
File "D:\Python\lib\urllib\request.py", line 531, in open
response = meth(req, response)
File "D:\Python\lib\urllib\request.py", line 641, in http_response
'http', request, response, code, msg, hdrs)
File "D:\Python\lib\urllib\request.py", line 569, in error
return self._call_chain(*args)
File "D:\Python\lib\urllib\request.py", line 503, in _call_chain
result = func(*args)
File "D:\Python\lib\urllib\request.py", line 649, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 429: Too Many Requests
And here is the code that I wrote:
import xlrd, requests
from googlesearch import search
from time import sleep
# Excel location
xlloc = "D:/VesselBase.xlsx"
ws = xlrd.open_workbook(xlloc)
# Sheet name/index
sheet = ws.sheet_by_index(0)

# For every row: search for a PDF spec sheet of the vessel and download
# whatever URL(s) the search yields.
for i in range(sheet.nrows):
    # Which column/row to choose, 2nd column for vessels. 0=A/1.
    vesselname = sheet.cell_value(i, 1)
    vesselimo = sheet.cell_value(i, 0)
    searchquery = 'Vessel specification information "%s" OR "%s" filetype:pdf' % (vesselname, vesselimo)
    print('Searching "%s"' % searchquery)
    for url in search(searchquery, stop=1, pause=20):
        print('Searched for %s' % vesselname)
        print('Found %s' % url)
        # Where to save. The context manager closes the file even if the
        # download or the write raises, so no handle is leaked per row.
        with open('D:/Newfolder/%s.pdf' % vesselname, 'wb') as pdf_file:
            pdf_file.write(requests.get(url).content)
        print('Saved %s' % vesselname)

Python - Scraping with BeautifulSoup and Urllib

I am trying to read a website, but unfortunately something is going wrong.
import bs4 as bs
import urllib.request
# urlopen() raises urllib.error.HTTPError when the server answers with an error status
# (here 403 Forbidden, per the traceback below); nothing after this line runs in that case.
# NOTE(review): urllib sends a default "Python-urllib/x.y" User-Agent; the site may be rejecting that — TODO confirm.
sauce = urllib.request.urlopen('https://csgoempire.com/withdraw').read()
soup = bs.BeautifulSoup(sauce,'lxml')
# Print every <p> element found in the parsed document.
print(soup.find_all('p'))
Error:
Traceback (most recent call last):
File "F:/Informatika/Python3X/GamblinSitesBot/GamblingSitesBot.py", line 4, in <module>
sauce = urllib.request.urlopen('https://csgoempire.com/').read()
File "c:\users\edgaras\appdata\local\programs\python\python36\Lib\urllib\request.py", line 223, in urlopen
return opener.open(url, data, timeout)
File "c:\users\edgaras\appdata\local\programs\python\python36\Lib\urllib\request.py", line 532, in open
response = meth(req, response)
File "c:\users\edgaras\appdata\local\programs\python\python36\Lib\urllib\request.py", line 642, in http_response
'http', request, response, code, msg, hdrs)
File "c:\users\edgaras\appdata\local\programs\python\python36\Lib\urllib\request.py", line 570, in error
return self._call_chain(*args)
File "c:\users\edgaras\appdata\local\programs\python\python36\Lib\urllib\request.py", line 504, in _call_chain
result = func(*args)
File "c:\users\edgaras\appdata\local\programs\python\python36\Lib\urllib\request.py", line 650, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 403: Forbidden
Process finished with exit code 1
Moreover, this code works with other sites such as google.com
You can achieve the same thing using the requests library. This works fine:
import bs4 as bs
import requests
# Fetch the page with requests, parse the raw response body, and print every <p> element.
response = requests.get('https://csgoempire.com/withdraw')
soup = bs.BeautifulSoup(response.content, 'html.parser')
paragraphs = soup.find_all('p')
print(paragraphs)

HTTP Error 404: Not Found python urllib

My code is this:
import urllib.request
import re
# NOTE(review): the next line is a bare URL, not valid Python — presumably a comment that lost its '#'; TODO confirm.
http://www.weather-forecast.com/locations/Paris/forcasts/latest
# Ask the user for the place name that is spliced into the URL path.
city = input('Please enter a place: ')
# NOTE: there is no '/' between the city and the next path segment, so "Paris" yields ".../locations/Parisforcasts/latest".
url = 'http://www.weather-forecast.com/locations/'+city+'forcasts/latest'
# Download the page as bytes, then decode it as UTF-8 text.
data = urllib.request.urlopen(url).read()
data1 = data.decode('utf-8')
I'm having trouble with the URL; this is my output:
Traceback (most recent call last):
File "C:/Users/alext/AppData/Local/Programs/Python/Python36/Weather forecast.py", line 9, in
data = urllib.request.urlopen(url).read()
File "C:\Users\alext\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 223, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\alext\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 532, in open
response = meth(req, response)
File "C:\Users\alext\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 642, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Users\alext\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 564, in error
result = self._call_chain(*args)
File "C:\Users\alext\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 504, in _call_chain
result = func(*args)
File "C:\Users\alext\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 756, in http_error_302
return self.parent.open(new, timeout=req.timeout)
File "C:\Users\alext\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 532, in open
response = meth(req, response)
File "C:\Users\alext\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 642, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Users\alext\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 570, in error
return self._call_chain(*args)
File "C:\Users\alext\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 504, in _call_chain
result = func(*args)
File "C:\Users\alext\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 650, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 404: Not Found
I have checked the url and it is definitely correct. I have seen others with problems like this but am still unsure as to the solution.
You are missing a / after the city and an e in "forecasts". It should be:
url = 'http://www.weather-forecast.com/locations/'+city+'/forecasts/latest'

HTTP ERROR in Python

I keep getting this error with urllib.request; it gives me an HTTP error for this URL that I can't seem to fix.
Traceback (most recent call last):
File "C:\Users\Jarvis\Documents\Python Scripts\MultiCheck by Koala.py", line 133, in <module>
Migration()
File "C:\Users\Jarvis\Documents\Python Scripts\MultiCheck by Koala.py", line 116, in Migration
rawdata_uuid = urllib.request.urlopen(url)
File "C:\Python34\lib\urllib\request.py", line 161, in urlopen
return opener.open(url, data, timeout)
File "C:\Python34\lib\urllib\request.py", line 469, in open
response = meth(req, response)
File "C:\Python34\lib\urllib\request.py", line 579, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Python34\lib\urllib\request.py", line 507, in error
return self._call_chain(*args)
File "C:\Python34\lib\urllib\request.py", line 441, in _call_chain
result = func(*args)
File "C:\Python34\lib\urllib\request.py", line 587, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 429: 42
The code I'm using is for a migration checker for a game:
def Migration():
    """Print whether the account named by ``einfos`` is migrated.

    Requests the profile for ``einfos`` (a module-level name defined
    outside this function) from api.mojang.com, takes the ``id`` field of
    the JSON reply, requests the session profile for that id, and prints
    "Unmigrated" when that reply contains a ``legacy`` key, otherwise
    "Migrated".

    Errors from the HTTP requests (for example the HTTP 429 shown in the
    traceback above) and from JSON decoding are not handled here and
    propagate to the caller.
    """
    url = "https://api.mojang.com/users/profiles/minecraft/" + einfos
    rawdata = urllib.request.urlopen(url)
    newrawdata = rawdata.read()
    jsondata = json.loads(newrawdata.decode('utf-8'))
    results = jsondata['id']
    url = "https://sessionserver.mojang.com/session/minecraft/profile/" + results
    rawdata_uuid = urllib.request.urlopen(url)
    newrawdata_uuid = rawdata_uuid.read()
    jsondata_uuid = json.loads(newrawdata_uuid.decode('utf-8'))
    # Only the absence of the key means "migrated". A bare `except:` here
    # would also report "Migrated" for any unrelated failure and would
    # swallow KeyboardInterrupt.
    if 'legacy' in jsondata_uuid:
        print ("Unmigrated")
    else:
        print("Migrated")
Error 429 means "Too Many Requests": you seem to have hit a rate limit. The additional number is the number of seconds you have to wait for the limitation to be dropped. So, try again in 42 seconds, or later.

urllib2 retrieve an arbitrary file based on URL and save it into a named file

I am writing a Python script that uses the urllib2 module as an equivalent to the command-line utility wget. The only function I need is retrieving an arbitrary file based on its URL and saving it into a named file. I also only need to worry about two command-line arguments: the URL from which the file is to be downloaded and the name of the file into which the contents are to be saved.
Example:
python Prog7.py www.python.org pythonHomePage.html
This is my code:
import urllib
import urllib2
#import requests
url = 'http://www.python.org/pythonHomePage.html'
print "downloading with urllib"
urllib.urlretrieve(url, "code.txt")
print "downloading with urllib2"
f = urllib2.urlopen(url)
data = f.read()
with open("code2.txt", "wb") as code:
code.write(data)
urllib seems to work but urllib2 does not seem to work.
Errors received:
File "Problem7.py", line 11, in <module>
f = urllib2.urlopen(url)
File "/usr/lib64/python2.6/urllib2.py", line 126, in urlopen
return _opener.open(url, data, timeout)
File "/usr/lib64/python2.6/urllib2.py", line 397, in open
response = meth(req, response)
File "/usr/lib64/python2.6/urllib2.py", line 510, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib64/python2.6/urllib2.py", line 429, in error
result = self._call_chain(*args)
File "/usr/lib64/python2.6/urllib2.py", line 369, in _call_chain
result = func(*args)
File "/usr/lib64/python2.6/urllib2.py", line 616, in http_error_302
return self.parent.open(new, timeout=req.timeout)
File "/usr/lib64/python2.6/urllib2.py", line 397, in open
response = meth(req, response)
File "/usr/lib64/python2.6/urllib2.py", line 510, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib64/python2.6/urllib2.py", line 435, in error
return self._call_chain(*args)
File "/usr/lib64/python2.6/urllib2.py", line 369, in _call_chain
result = func(*args)
File "/usr/lib64/python2.6/urllib2.py", line 518, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 404: NOT FOUND
And the URL doesn't exist at all; https://www.python.org/pythonHomePage.html is indeed a 404 Not Found page.
The difference between urllib and urllib2 then is that the latter automatically raises an exception when a 404 page is returned, while urllib.urlretrieve() just saves the error page for you:
>>> import urllib
>>> urllib.urlopen('https://www.python.org/pythonHomePage.html').getcode()
404
>>> import urllib2
>>> urllib2.urlopen('https://www.python.org/pythonHomePage.html')
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/Users/mj/Development/Library/buildout.python/parts/opt/lib/python2.7/urllib2.py", line 127, in urlopen
return _opener.open(url, data, timeout)
File "/Users/mj/Development/Library/buildout.python/parts/opt/lib/python2.7/urllib2.py", line 410, in open
response = meth(req, response)
File "/Users/mj/Development/Library/buildout.python/parts/opt/lib/python2.7/urllib2.py", line 523, in http_response
'http', request, response, code, msg, hdrs)
File "/Users/mj/Development/Library/buildout.python/parts/opt/lib/python2.7/urllib2.py", line 448, in error
return self._call_chain(*args)
File "/Users/mj/Development/Library/buildout.python/parts/opt/lib/python2.7/urllib2.py", line 382, in _call_chain
result = func(*args)
File "/Users/mj/Development/Library/buildout.python/parts/opt/lib/python2.7/urllib2.py", line 531, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 404: NOT FOUND
If you wanted to save the error page, you can catch the urllib2.HTTPError exception:
try:
    f = urllib2.urlopen(url)
    data = f.read()
except urllib2.HTTPError as err:
    # The HTTPError doubles as the response object: read() returns the
    # body of the error page, so `data` is set on both paths.
    data = err.read()
It is due to the different behavior by urllib and urllib2.
Since the web page returns a 404 error (webpage not found) urllib2 "catches" it while urllib downloads the html of the returned page regardless of the error.
If you want to print the html to the text file you can print the error:
import urllib2
try:
data = urllib2.urlopen('http://www.python.org/pythonHomePage.html').read()
except urllib2.HTTPError, e:
print e.code
print e.msg
print e.headers
print e.fp.read()
with open("code2.txt", "wb") as code:
code.write(e.fp.read())
req will be a Request object, fp will be a file-like object with the
HTTP error body, code will be the three-digit code of the error, msg
will be the user-visible explanation of the code and hdrs will be a
mapping object with the headers of the error.
More data about HTTP error: urllib2 documentation

Categories

Resources