I am getting the following error while trying to parse a link using beautiful soup,looks like the authentication is failing....how to authenticate the link?
from bs4 import BeautifulSoup
import argparse
import urllib
import urllib2
import getpass
import re
def update (searchurl):
page = urllib2.urlopen(searchurl)
soup = BeautifulSoup(page)
#print(soup.textarea.string)
print(soup.get_text())
def main ():
#For logging
print "test"
parser = argparse.ArgumentParser(description='This is the update.py script created by test')
parser.add_argument('-u','--url',action='store',dest='url',default=None,help='<Required> url link',required=True)
results = parser.parse_args()# collect cmd line args
url = results.url
#print url
update(url)
if __name__ == '__main__':
main()
Following is the error while trying to open the link
Traceback (most recent call last):
File "announce_update2.py", line 24, in <module>
main()
File "announce_update2.py", line 22, in main
update(url)
File "announce_update2.py", line 9, in update
page = urllib2.urlopen(searchurl)
File "C:\Python27\lib\urllib2.py", line 126, in urlopen
return _opener.open(url, data, timeout)
File "C:\Python27\lib\urllib2.py", line 406, in open
response = meth(req, response)
File "C:\Python27\lib\urllib2.py", line 519, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Python27\lib\urllib2.py", line 438, in error
result = self._call_chain(*args)
File "C:\Python27\lib\urllib2.py", line 378, in _call_chain
result = func(*args)
File "C:\Python27\lib\urllib2.py", line 625, in http_error_302
return self.parent.open(new, timeout=req.timeout)
File "C:\Python27\lib\urllib2.py", line 406, in open
response = meth(req, response)
File "C:\Python27\lib\urllib2.py", line 519, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Python27\lib\urllib2.py", line 444, in error
return self._call_chain(*args)
File "C:\Python27\lib\urllib2.py", line 378, in _call_chain
result = func(*args)
File "C:\Python27\lib\urllib2.py", line 527, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
Related
I am trying to parse some data from 'https://datausa.io/profile/geo/jacksonville-fl/#intro', but I am not sure how to access it from python. My code is:
adress, headers = urllib.request.urlretrieve(' https://datausa.io/profile/geo/jacksonville-fl/#intro')
handle = open(adress)
and it returns the error:
Traceback (most recent call last):
File "C:/Users/Jared/AppData/Local/Programs/Python/Python36-32/capstone1.py", line 16, in <module>
adress, headers = urllib.request.urlretrieve(' https://datausa.io/profile/geo/jacksonville-fl/#intro')
File "C:\Users\Jared\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 248, in urlretrieve
with contextlib.closing(urlopen(url, data)) as fp:
File "C:\Users\Jared\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 223, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\Jared\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 532, in open
response = meth(req, response)
File "C:\Users\Jared\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 642, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Users\Jared\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 570, in error
return self._call_chain(*args)
File "C:\Users\Jared\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 504, in _call_chain
result = func(*args)
File "C:\Users\Jared\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 650, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 403: Forbidden
Please explain what is wrong or tell me a better way to access the page. Also, does the ' .io ' suffix affecthow python handles it?
Thanks.
This worked for me:
import requests
url = "https://datausa.io/profile/geo/jacksonville-fl/#intro"
req = requests.request("GET",url)
My code is this:
import urllib.request
import re
http://www.weather-forecast.com/locations/Paris/forcasts/latest
city = input('Please enter a place: ')
url = 'http://www.weather-forecast.com/locations/'+city+'forcasts/latest'
data = urllib.request.urlopen(url).read()
data1 = data.decode('utf-8')
I'm having trouble with the url this is my output:
Traceback (most recent call last):
File "C:/Users/alext/AppData/Local/Programs/Python/Python36/Weather forecast.py", line 9, in
data = urllib.request.urlopen(url).read()
File "C:\Users\alext\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 223, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\alext\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 532, in open
response = meth(req, response)
File "C:\Users\alext\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 642, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Users\alext\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 564, in error
result = self._call_chain(*args)
File "C:\Users\alext\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 504, in _call_chain
result = func(*args)
File "C:\Users\alext\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 756, in http_error_302
return self.parent.open(new, timeout=req.timeout)
File "C:\Users\alext\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 532, in open
response = meth(req, response)
File "C:\Users\alext\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 642, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Users\alext\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 570, in error
return self._call_chain(*args)
File "C:\Users\alext\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 504, in _call_chain
result = func(*args)
File "C:\Users\alext\AppData\Local\Programs\Python\Python36\lib\urllib \request.py", line 650, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 404: Not Found
I have checked the url and it is definitely correct. I have seen others with problems like this but am still unsure as to the solution.
you are missing a / after the city and a e in forecast. It should be
url = 'http://www.weather-forecast.com/locations/'+city+'/forecasts/latest'
I want to read the value of different stocks from websites. Therefore I wrote this tiny script, which reads the page source and then parses out the value:
stock_reader.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from re import search
from urllib import request
def main():
links = [
[
'CSG',
'UBS',
],
[
'http://www.tradegate.de/orderbuch.php?isin=CH0012138530',
'http://www.tradegate.de/orderbuch.php?isin=CH0244767585',
],
]
for i in in range(len(links[0])):
url = links[1][i]
htmltext = request.urlopen(url).read().decode('utf-8')
source = htmltext.splitlines()
for line in source:
if 'id="bid"' in line:
m = search('\d+.\d+', line)
print('{}'.format(m.string[m.start():m.end()]))
if __name__ == '__main__':
main()
sometimes it works but sometimes this error gets raised:
error message
Traceback (most recent call last):
File "./aktien_reader.py", line 39, in <module>
main()
File "./aktien_reader.py", line 30, in main
htmltext = request.urlopen(url).read().decode('utf-8')
File "/usr/lib/python3.3/urllib/request.py", line 160, in urlopen
return opener.open(url, data, timeout)
File "/usr/lib/python3.3/urllib/request.py", line 479, in open
response = meth(req, response)
File "/usr/lib/python3.3/urllib/request.py", line 591, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib/python3.3/urllib/request.py", line 511, in error
result = self._call_chain(*args)
File "/usr/lib/python3.3/urllib/request.py", line 451, in _call_chain
result = func(*args)
File "/usr/lib/python3.3/urllib/request.py", line 696, in http_error_302
return self.parent.open(new, timeout=req.timeout)
File "/usr/lib/python3.3/urllib/request.py", line 479, in open
response = meth(req, response)
File "/usr/lib/python3.3/urllib/request.py", line 591, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib/python3.3/urllib/request.py", line 511, in error
result = self._call_chain(*args)
File "/usr/lib/python3.3/urllib/request.py", line 451, in _call_chain
result = func(*args)
File "/usr/lib/python3.3/urllib/request.py", line 696, in http_error_302
return self.parent.open(new, timeout=req.timeout)
File "/usr/lib/python3.3/urllib/request.py", line 479, in open
response = meth(req, response)
File "/usr/lib/python3.3/urllib/request.py", line 591, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib/python3.3/urllib/request.py", line 511, in error
result = self._call_chain(*args)
File "/usr/lib/python3.3/urllib/request.py", line 451, in _call_chain
result = func(*args)
File "/usr/lib/python3.3/urllib/request.py", line 696, in http_error_302
return self.parent.open(new, timeout=req.timeout)
File "/usr/lib/python3.3/urllib/request.py", line 479, in open
response = meth(req, response)
File "/usr/lib/python3.3/urllib/request.py", line 591, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib/python3.3/urllib/request.py", line 511, in error
result = self._call_chain(*args)
File "/usr/lib/python3.3/urllib/request.py", line 451, in _call_chain
result = func(*args)
File "/usr/lib/python3.3/urllib/request.py", line 696, in http_error_302
return self.parent.open(new, timeout=req.timeout)
File "/usr/lib/python3.3/urllib/request.py", line 479, in open
response = meth(req, response)
File "/usr/lib/python3.3/urllib/request.py", line 591, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib/python3.3/urllib/request.py", line 511, in error
result = self._call_chain(*args)
File "/usr/lib/python3.3/urllib/request.py", line 451, in _call_chain
result = func(*args)
File "/usr/lib/python3.3/urllib/request.py", line 686, in http_error_302
self.inf_msg + msg, headers, fp)
urllib.error.HTTPError: HTTP Error 302: The HTTP server returned a redirect error that would lead to an infinite loop.
The last 30x error message was:
Found
My question is: why is it happening and how can I avoid it?
This happens probably because the destination site uses cookies and redirect you in case you don't send cookies.
What you can use is something like that :
from http.cookiejar import CookieJar
url = "http://www.tradegate.de/orderbuch.php?isin=CH0012138530"
req = urllib.request.Request(url, None, {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8','Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3','Accept-Encoding': 'gzip, deflate, sdch','Accept-Language': 'en-US,en;q=0.8','Connection': 'keep-alive'})
cj = CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
response = opener.open(req)
response.read()
This way, you support Cookies and website will allow you to get the page :-)
Another way would be to use the requests package which is really simplest to use. In your case, it would lead to :
import requests
url = "http://www.tradegate.de/orderbuch.php?isin=CH0012138530"
r = requests.get(url, headers={'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'}, timeout=15)
print(r.content)
This answer is a simplification of the one by Cédric J. You don't really need to import CookieJar or set various Accept headers if you don't want to. You should however generally set a timeout. It is tested with Python 3.7. I would typically remember to use a new opener for each random URL that I want cookies for.
from urllib.request import build_opener, HTTPCookieProcessor, Request
url = 'https://www.cell.com/cell-metabolism/fulltext/S1550-4131(18)30630-2'
opener = build_opener(HTTPCookieProcessor())
Without a Request object:
response = opener.open(url, timeout=30)
content = response.read()
With a Request object:
request = Request(url)
response = opener.open(request, timeout=30)
content = response.read()
HTTP Status code 302 it's a kind of a redirect, it will have a header with a new URL for access (Not necessary a working URL..)
Location: http://www.example.com/x/y/
This is quite often used to block bots who make to many requests in too of a short time frame. So not an coding problem.
without importing urllib2_file my code works fine .
import urllib2
import urllib
import random
import mimetypes
import string
import urllib2_file
proxy = urllib2.ProxyHandler({'http': '10.200.1.26'})
opener = urllib2.build_opener(proxy)
urllib2.install_opener(opener)
u = urllib2.urlopen("http://127.0.0.1:3333/command/core/create-importing-job",data=urllib.urlencode({"test":""}))
print u.read()
After importing urllib2_file library its complaining :
Traceback (most recent call last):
File "C:/hari/latest refine code/trialrefine.py", line 11, in <module>
u = urllib2.urlopen("http://127.0.0.1:3333/command/core/create-importing-job",data=urllib.urlencode({"test":""}))
File "C:\Python27\lib\urllib2.py", line 126, in urlopen
return _opener.open(url, data, timeout)
File "C:\Python27\lib\urllib2.py", line 391, in open
response = self._open(req, data)
File "C:\Python27\lib\urllib2.py", line 409, in _open
'_open', req)
File "C:\Python27\lib\urllib2.py", line 369, in _call_chain
result = func(*args)
File "C:\Python27\urllib2_file.py", line 207, in http_open
return self.do_open(httplib.HTTP, req)
File "C:\Python27\urllib2_file.py", line 298, in do_open
return self.parent.error('http', req, fp, code, msg, hdrs)
File "C:\Python27\lib\urllib2.py", line 435, in error
return self._call_chain(*args)
File "C:\Python27\lib\urllib2.py", line 369, in _call_chain
result = func(*args)
File "C:\Python27\lib\urllib2.py", line 518, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
HTTPError: HTTP Error 404: Not Found
you are getting a 404 error. it means the url was wrong/server was down. note that urllib2_file overwrites the default HTTP handler of urllib2 :
urllib2._old_HTTPHandler = urllib2.HTTPHandler
urllib2.HTTPHandler = newHTTPHandler
one thing you could do is explicitly pass the urllib2._old_HTTPHandler to the opener. Other than that you really should go into the urllib2_file with a debugger to understand whats going wrong.
I know we can handle redirection with requests and urllib2 packages. I do not want to use requests as it is not a prebuilt package. Please help me with urllib2 to handle a URL which takes a 302 and then moves to 404. I am not concerned about the 404 and I would like to track whether it is a 301 or 302.
I referred this doc but it still throws the 404.
Here is my code
import urllib2
class My_HTTPRedirectHandler(urllib2.HTTPRedirectHandler):
def http_error_302(self, req, fp, code, msg, headers):
return urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
my_opener = urllib2.build_opener(My_HTTPRedirectHandler)
urllib2.install_opener(my_opener)
response =urllib2.urlopen("MY URL")
print response.read()
Here is my response
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/usr/lib/python2.7/urllib2.py", line 126, in urlopen
return _opener.open(url, data, timeout)
File "/usr/lib/python2.7/urllib2.py", line 406, in open
response = meth(req, response)
File "/usr/lib/python2.7/urllib2.py", line 519, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib/python2.7/urllib2.py", line 438, in error
result = self._call_chain(*args)
File "/usr/lib/python2.7/urllib2.py", line 378, in _call_chain
result = func(*args)
File "<stdin>", line 3, in http_error_302
File "/usr/lib/python2.7/urllib2.py", line 625, in http_error_302
return self.parent.open(new, timeout=req.timeout)
File "/usr/lib/python2.7/urllib2.py", line 406, in open
response = meth(req, response)
File "/usr/lib/python2.7/urllib2.py", line 519, in http_response
'http', request, response, code, msg, hdrs)
File "/usr/lib/python2.7/urllib2.py", line 444, in error
return self._call_chain(*args)
File "/usr/lib/python2.7/urllib2.py", line 378, in _call_chain
result = func(*args)
File "/usr/lib/python2.7/urllib2.py", line 527, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 404: Not Found
It's better to use httplib and HTTP/1.1 HEAD method. That way no response body is received.
What’s the best way to get an HTTP response code from a URL?