I was trying to get a web page, but got into this problem. I've looked up some references, and this is what I've done so far:
import sys
import urllib2
from bs4 import BeautifulSoup
user = 'myuserID'
password = "mypassword"
ip = sys.argv[1]
url = "http://www.websites.com/" + ip
passman = urllib2.HTTPPasswordMgrWithDefaultRealm()
passman.add_password(None, url, user, password)
handler = urllib2.HTTPBasicAuthHandler(passman)
opener = urllib2.build_opener(handler)
urllib2.install_opener(opener)
header = {
'Connection' : 'keep-alive',
'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0',
'Accept-Language' : 'en-US,en;q=0.5',
'Accept-Encoding' : 'gzip, deflate'
}
html = urllib2.urlopen(urllib2.Request(url, None, header))
soup = BeautifulSoup(html, 'html.parser')
# some if else function afterwards #
When I try to run the script, it shows this kind of error:
python checker.py 8.8.8.8
Traceback (most recent call last):
File "checker.py", line 34, in <module>
html = urllib2.urlopen(urllib2.Request(url, None, header))
File "C:\Python27\lib\urllib2.py", line 154, in urlopen
return opener.open(url, data, timeout)
File "C:\Python27\lib\urllib2.py", line 437, in open
response = meth(req, response)
File "C:\Python27\lib\urllib2.py", line 550, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Python27\lib\urllib2.py", line 469, in error
result = self._call_chain(*args)
File "C:\Python27\lib\urllib2.py", line 409, in _call_chain
result = func(*args)
File "C:\Python27\lib\urllib2.py", line 656, in http_error_302
return self.parent.open(new, timeout=req.timeout)
File "C:\Python27\lib\urllib2.py", line 437, in open
response = meth(req, response)
File "C:\Python27\lib\urllib2.py", line 550, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Python27\lib\urllib2.py", line 475, in error
return self._call_chain(*args)
File "C:\Python27\lib\urllib2.py", line 409, in _call_chain
result = func(*args)
File "C:\Python27\lib\urllib2.py", line 558, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 401: authenticationrequired
But if I opened the page or other web page, and manually enter my credential, this script works fine after that. Am I missing something?
Just to add, my current network are using McAfee web gateway device. So sometimes we need to enter our credential to proceed browsing the net. Our user/pass are integrated with Active Directory. Is that may cause the issue?
This seems to work really well (taken from another thread)
import urllib2
import base64
import sys
user = 'myuserID'
password = "mypassword"
ip = sys.argv[1]
url = "http://www.websites.com/" + ip
request = urllib2.Request(url)
base64string = base64.encodestring('%s:%s' % (user, password)).replace('\n', '')
request.add_header("Authorization", "Basic %s" % base64string)
result = urllib2.urlopen(request)
Or you may use requests:
from requests.auth import HTTPBasicAuth
user = 'myuserID'
password = "mypassword"
ip = sys.argv[1]
url = "http://www.websites.com/" + ip
res=requests.get(url , auth=HTTPBasicAuth(user, password))
print res.text
Related
I using python want to get the raw HTML of a webpage that requires authentication.
Similar to this question but the answers here do not work.
Code I am trying:
import urllib, urllib2, cookielib
username = 'redacted'
password = 'redacted'
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
login_data = urllib.urlencode({'username' : username, 'j_password' : password})
opener.open('https://redacted.net', login_data)#http://www.example.com/login.php
resp = opener.open('https://redacted.net')#http://www.example.com/hiddenpage.php
print resp.read() #print strait HTML of the page can use opener to view any page using your session cookie.
Error:
Traceback (most recent call last):
File "C:/Users/Jacob/Desktop/School/Python_Scripts/session refresher/session_refresher.py", line 9, in <module>
opener.open('Redacted', login_data)#http://www.example.com/login.php
File "C:\Python27\lib\urllib2.py", line 437, in open
response = meth(req, response)
File "C:\Python27\lib\urllib2.py", line 550, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Python27\lib\urllib2.py", line 475, in error
return self._call_chain(*args)
File "C:\Python27\lib\urllib2.py", line 409, in _call_chain
result = func(*args)
File "C:\Python27\lib\urllib2.py", line 558, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
HTTPError: HTTP Error 401: Unauthorized
Here is what the window that popups to ask for authentication when I go to the webpage with a browser.
I'd use requests for this as it is simpler than what urllib provides for authentication.
import requests
r = requests.get("https://redacted.net", auth=('username', 'password'))
print(r.text)
use requests and supply your user/pass pair in the request:
import requests
requests.get('https://redacted.net', auth=('user', 'pass'))
I looked at this answer: Again urllib.error.HTTPError: HTTP Error 400: Bad Request because it was very similar to my question, but that solution did not work. I'm using Python 3.3.2. The lines that are like ..something.. are just values I replaced to protect my privacy. They should be strings
I'm getting an Error 400: Bad request from this code:
import urllib.parse
import urllib.request
url = '..url..'
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
values = {'email':'..my email address..',
'github':'..my github account..'}
headers = {'User-Agent':user_agent}
data = urllib.parse.urlencode(values)
data = data.encode('utf-8')
req = urllib.request.Request(url, data, headers)
response = urllib.request.urlopen(req) #this line causes the errors
page = response.read()
This is the particular error message:
Traceback (most recent call last):
File "/Users/.../Documents/Code 2040/File.py", line 23, in <module>
response = urllib.request.urlopen(req)
File "/Library/Frameworks/Python.framework/Versions/3.3/lib/python3.3/urllib/request.py", line 156, in urlopen
return opener.open(url, data, timeout)
File "/Library/Frameworks/Python.framework/Versions/3.3/lib/python3.3/urllib/request.py", line 475, in open
response = meth(req, response)
File "/Library/Frameworks/Python.framework/Versions/3.3/lib/python3.3/urllib/request.py", line 587, in http_response
'http', request, response, code, msg, hdrs)
File "/Library/Frameworks/Python.framework/Versions/3.3/lib/python3.3/urllib/request.py", line 513, in error
return self._call_chain(*args)
File "/Library/Frameworks/Python.framework/Versions/3.3/lib/python3.3/urllib/request.py", line 447, in _call_chain
result = func(*args)
File "/Library/Frameworks/Python.framework/Versions/3.3/lib/python3.3/urllib/request.py", line 595, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 400: Bad Request
Hy!
I tried to open web-page, that is normally opening in browser, but python just swears and does not want to work.
import urllib.request, urllib.error
f = urllib.request.urlopen('http://www.booking.com/reviewlist.html?cc1=tr;pagename=sapphire')
And another way
import urllib.request, urllib.error
opener=urllib.request.build_opener()
f=opener.open('http://www.booking.com/reviewlist.html?cc1=tr;pagename=sapphi
re')
Both options give one type of error:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "C:\Python34\lib\urllib\request.py", line 461, in open
response = meth(req, response)
File "C:\Python34\lib\urllib\request.py", line 571, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Python34\lib\urllib\request.py", line 493, in error
result = self._call_chain(*args)
File "C:\Python34\lib\urllib\request.py", line 433, in _call_chain
result = func(*args)
File "C:\Python34\lib\urllib\request.py", line 676, in http_error_302
return self.parent.open(new, timeout=req.timeout)
File "C:\Python34\lib\urllib\request.py", line 461, in open
response = meth(req, response)
File "C:\Python34\lib\urllib\request.py", line 571, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Python34\lib\urllib\request.py", line 499, in error
return self._call_chain(*args)
File "C:\Python34\lib\urllib\request.py", line 433, in _call_chain
result = func(*args)
File "C:\Python34\lib\urllib\request.py", line 579, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 400: Bad Request
Any ideas?
They are probably blocking the fact that it isn't coming from a browser. You probably need a valid User-Agent header or something.
Using requests, this works:
import requests
headers =
{
'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36'
}
r = requests.get('http://www.booking.com/reviewlist.html?cc1=tr;pagename=sapphire', headers=headers)
print r
print r.headers
This URL seems to be doing user agent string checking. If I adjust my user agent string in Firefox to Python-urllib/2.7, it fails with the Bad Request you are seeing.
As you are using urllib, you can adjust the User Agent following this tutorial
from urllib.request import FancyURLopener
class MyOpener(FancyURLopener):
version = 'My new User-Agent' # Set this to a string you want for your user agent
myopener = MyOpener()
page = myopener.open('http://www.booking.com/reviewlist.html?cc1=tr;pagename=sapphire')
I have this code at the top of my Google App Engine program:
from google.appengine.api import urlfetch
urlfetch.set_default_fetch_deadline(60)
I am using an opener to load stuff:
cj = CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor( cj ) )
opener.addheaders = [ ( 'User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64)' ) ]
resp = opener.open( 'http://www.example.com/' )
an exception is being thrown after 5 seconds:
File "/base/data/home/runtimes/python27/python27_dist/lib/python2.7/urllib2.py", line 404, in open
response = self._open(req, data)
File "/base/data/home/runtimes/python27/python27_dist/lib/python2.7/urllib2.py", line 422, in _open
'_open', req)
File "/base/data/home/runtimes/python27/python27_dist/lib/python2.7/urllib2.py", line 382, in _call_chain
result = func(*args)
File "/base/data/home/runtimes/python27/python27_dist/lib/python2.7/urllib2.py", line 1222, in https_open
return self.do_open(httplib.HTTPSConnection, req)
File "/base/data/home/runtimes/python27/python27_dist/lib/python2.7/urllib2.py", line 1187, in do_open
r = h.getresponse(buffering=True)
File "/base/data/home/runtimes/python27/python27_dist/lib/python2.7/gae_override/httplib.py", line 524, in getresponse
raise HTTPException(str(e))
HTTPException: Deadline exceeded while waiting for HTTP response from URL: http://www.example.com
How can I avoid the error?
Did you try setting the timeout on the .open() call?
resp = opener.open('http://example.com', None, 60)
If you reach the timeout as specified by set_default_fetch_deadline, Python will throw a DownloadError or DeadlineExceededErrors exception: https://cloud.google.com/appengine/docs/python/urlfetch/exceptions
You can also patch the httplib2 library and set the deadline to 60 seconds
httplib2/__init__.py:
def fixed_fetch(url, payload=None, method="GET", headers={},
allow_truncated=False, follow_redirects=True,
deadline=60):
return fetch(url, payload=payload, method=method, headers=headers,
allow_truncated=allow_truncated,
follow_redirects=follow_redirects, deadline=60,
validate_certificate=validate_certificate)
return fixed_fetch
It is a work around.
I'm trying to use urllib2 and python-ntlm to connect to an NT authenticated server, but I'm getting an error. Here's the code I'm using, from the python-ntlm site:
user = 'DOMAIN\user.name'
password = 'Password123'
url = 'http://corporate.domain.com/page.aspx?id=foobar'
passman = urllib2.HTTPPasswordMgrWithDefaultRealm()
passman.add_password(None, url, user, password)
# create the NTLM authentication handler
auth_NTLM = HTTPNtlmAuthHandler.HTTPNtlmAuthHandler(passman)
# create and install the opener
opener = urllib2.build_opener(auth_NTLM)
urllib2.install_opener(opener)
# retrieve the result
response = urllib2.urlopen(url)
return response.read()
And here's the error I get:
Traceback (most recent call last):
File "C:\Python27\test.py", line 112, in get_ntlm_data
response = urllib2.urlopen(url)
File "C:\Python27\lib\urllib2.py", line 126, in urlopen
return _opener.open(url, data, timeout)
File "C:\Python27\lib\urllib2.py", line 398, in open
response = meth(req, response)
File "C:\Python27\lib\urllib2.py", line 511, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Python27\lib\urllib2.py", line 430, in error
result = self._call_chain(*args)
File "C:\Python27\lib\urllib2.py", line 370, in _call_chain
result = func(*args)
File "C:\Python27\lib\site-packages\python_ntlm-1.0.1-py2.7.egg\ntlm\HTTPNtlmAuthHandler.py", line 99, in http_error_401
return self.http_error_authentication_required('www-authenticate', req, fp, headers)
File "C:\Python27\lib\site-packages\python_ntlm-1.0.1-py2.7.egg\ntlm\HTTPNtlmAuthHandler.py", line 35, in http_error_authentication_required
return self.retry_using_http_NTLM_auth(req, auth_header_field, None, headers)
File "C:\Python27\lib\site-packages\python_ntlm-1.0.1-py2.7.egg\ntlm\HTTPNtlmAuthHandler.py", line 72, in retry_using_http_NTLM_auth
UserName = user_parts[1]
IndexError: list index out of range
Any idea what I'm doing wrong?
Try:
user = r'DOMAIN\user.name'