Cookiejar use in an opener - python

I have the following code at the moment:
tw_jar = cookielib.CookieJar()
tw_jar.set_cookie(c1)
tw_jar.set_cookie(c2)
o = urllib2.build_opener( urllib2.HTTPCookieProcessor(tw_jar) )
urllib2.install_opener( o )
Now, later in my code, I don't want to use any of the cookies (including new cookies created in the meantime).
Can I do a simple tw_jar.clear() or do I need to build and install the opener again to get rid of all cookies used in the requests?

This is how HTTPCookieProcessor is defined in my Python installation:
class HTTPCookieProcessor(BaseHandler):
    def __init__(self, cookiejar=None):
        import cookielib
        if cookiejar is None:
            cookiejar = cookielib.CookieJar()
        self.cookiejar = cookiejar

    def http_request(self, request):
        self.cookiejar.add_cookie_header(request)
        return request

    def http_response(self, request, response):
        self.cookiejar.extract_cookies(response, request)
        return response

    https_request = http_request
    https_response = http_response
As only a reference is saved, you can just manipulate the original tw_jar instance and it will affect all future requests.
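So to answer the question directly: yes, a plain tw_jar.clear() is enough. A minimal sketch, assuming the opener installed above is still in effect (example.com is a placeholder):

tw_jar.clear()  # empties the jar in place; the installed opener still references it
response = urllib2.urlopen("http://example.com/")  # sent without any Cookie header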

If you don't want any cookies, I'd recommend creating a new opener. However, if for some reason you want to keep the old one, removing the cookie processor from its list of handlers should work:
o.handlers = [h for h in o.handlers
              if not isinstance(h, urllib2.HTTPCookieProcessor)]
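For completeness, the recommended fresh-opener route is just this (a sketch; build_opener with no arguments installs only the default handlers, none of which manage cookies):

o = urllib2.build_opener()  # default handlers only, no HTTPCookieProcessor
urllib2.install_opener(o)   # from now on urllib2.urlopen sends no cookies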


How to pass proxy-authentication (requires digest auth) by using python requests module

I was using the Mechanize module a while ago, and now I'm trying to use the Requests module.
(Python mechanize doesn't work when HTTPS and Proxy Authentication required)
I have to go through proxy-server when I access the Internet.
The proxy-server requires authentication. I wrote the following code:
import requests
from requests.auth import HTTPProxyAuth
proxies = {"http":"192.168.20.130:8080"}
auth = HTTPProxyAuth("username", "password")
r = requests.get("http://www.google.co.jp/", proxies=proxies, auth=auth)
The above code works well when the proxy-server requires basic authentication.
Now I want to know what I have to do when the proxy-server requires digest authentication.
HTTPProxyAuth seems not to be effective for digest authentication (r.status_code returns 407).
In most cases there's no need to implement your own!
Requests has built-in support for proxies; for basic authentication:
proxies = { 'https' : 'https://user:password@proxyip:port' }
r = requests.get('https://url', proxies=proxies)
see more in the docs
Or in case you need digest authentication, HTTPDigestAuth may help.
Or you might need to try to extend it like yutaka2487 did below.
Note: you must use the IP of the proxy server, not its name!
I wrote a class that can be used for proxy authentication (based on digest auth).
I borrowed almost all of the code from requests.auth.HTTPDigestAuth.
import requests
import requests.auth

class HTTPProxyDigestAuth(requests.auth.HTTPDigestAuth):
    def handle_407(self, r):
        """Takes the given response and tries digest-auth, if needed."""
        num_407_calls = r.request.hooks['response'].count(self.handle_407)
        s_auth = r.headers.get('Proxy-authenticate', '')
        if 'digest' in s_auth.lower() and num_407_calls < 2:
            self.chal = requests.auth.parse_dict_header(s_auth.replace('Digest ', ''))
            # Consume content and release the original connection
            # to allow our new request to reuse the same one.
            r.content
            r.raw.release_conn()
            r.request.headers['Authorization'] = self.build_digest_header(r.request.method, r.request.url)
            r.request.send(anyway=True)
            _r = r.request.response
            _r.history.append(r)
            return _r
        return r

    def __call__(self, r):
        if self.last_nonce:
            r.headers['Proxy-Authorization'] = self.build_digest_header(r.method, r.url)
        r.register_hook('response', self.handle_407)
        return r
Usage:
proxies = {
    "http" : "192.168.20.130:8080",
    "https": "192.168.20.130:8080",
}
auth = HTTPProxyDigestAuth("username", "password")

# HTTP
r = requests.get("http://www.google.co.jp/", proxies=proxies, auth=auth)
r.status_code  # 200 OK

# HTTPS
r = requests.get("https://www.google.co.jp/", proxies=proxies, auth=auth)
r.status_code  # 200 OK
I've written a Python module (available here) which makes it possible to authenticate with an HTTP proxy using the digest scheme. It works when connecting to HTTPS websites (through monkey patching) and allows you to authenticate with the website as well. It should work with the latest requests library for both Python 2 and 3.
The following example fetches the webpage https://httpbin.org/ip through the HTTP proxy 1.2.3.4:8080, which requires HTTP digest authentication with user name user1 and password password1:
import requests
from requests_digest_proxy import HTTPProxyDigestAuth
s = requests.Session()
s.proxies = {
    'http': 'http://1.2.3.4:8080/',
    'https': 'http://1.2.3.4:8080/'
}
s.auth = HTTPProxyDigestAuth('user1', 'password1')
print(s.get('https://httpbin.org/ip').text)
Should the website require some kind of HTTP authentication, this can be specified to the HTTPProxyDigestAuth constructor this way:
# HTTP Basic authentication for website
s.auth = HTTPProxyDigestAuth(('user1', 'password1'),
                             auth=requests.auth.HTTPBasicAuth('user1', 'password0'))
print(s.get('https://httpbin.org/basic-auth/user1/password0').text)

# HTTP Digest authentication for website
s.auth = HTTPProxyDigestAuth(('user1', 'password1'),
                             auth=requests.auth.HTTPDigestAuth('user1', 'password0'))
print(s.get('https://httpbin.org/digest-auth/auth/user1/password0').text)
This snippet works for both types of requests (http and https). Tested on the current version of requests (2.23.0).
import re

import requests
from requests.utils import get_auth_from_url
from requests.auth import HTTPDigestAuth
from requests.utils import parse_dict_header
from urllib3.util import parse_url

def get_proxy_autorization_header(proxy, method):
    username, password = get_auth_from_url(proxy)
    auth = HTTPProxyDigestAuth(username, password)
    proxy_url = parse_url(proxy)
    proxy_response = requests.request(method, proxy_url, auth=auth)
    return proxy_response.request.headers['Proxy-Authorization']


class HTTPSAdapterWithProxyDigestAuth(requests.adapters.HTTPAdapter):
    def proxy_headers(self, proxy):
        headers = {}
        proxy_auth_header = get_proxy_autorization_header(proxy, 'CONNECT')
        headers['Proxy-Authorization'] = proxy_auth_header
        return headers


class HTTPAdapterWithProxyDigestAuth(requests.adapters.HTTPAdapter):
    def proxy_headers(self, proxy):
        return {}

    def add_headers(self, request, **kwargs):
        proxy = kwargs['proxies'].get('http', '')
        if proxy:
            proxy_auth_header = get_proxy_autorization_header(proxy, request.method)
            request.headers['Proxy-Authorization'] = proxy_auth_header


class HTTPProxyDigestAuth(requests.auth.HTTPDigestAuth):
    def init_per_thread_state(self):
        # Ensure state is initialized just once per-thread
        if not hasattr(self._thread_local, 'init'):
            self._thread_local.init = True
            self._thread_local.last_nonce = ''
            self._thread_local.nonce_count = 0
            self._thread_local.chal = {}
            self._thread_local.pos = None
            self._thread_local.num_407_calls = None

    def handle_407(self, r, **kwargs):
        """
        Takes the given response and tries digest-auth, if needed.
        :rtype: requests.Response
        """
        # If response is not 407, do not auth
        if r.status_code != 407:
            self._thread_local.num_407_calls = 1
            return r
        s_auth = r.headers.get('proxy-authenticate', '')
        if 'digest' in s_auth.lower() and self._thread_local.num_407_calls < 2:
            self._thread_local.num_407_calls += 1
            pat = re.compile(r'digest ', flags=re.IGNORECASE)
            self._thread_local.chal = requests.utils.parse_dict_header(
                pat.sub('', s_auth, count=1))
            # Consume content and release the original connection
            # to allow our new request to reuse the same one.
            r.content
            r.close()
            prep = r.request.copy()
            requests.cookies.extract_cookies_to_jar(prep._cookies, r.request, r.raw)
            prep.prepare_cookies(prep._cookies)
            prep.headers['Proxy-Authorization'] = self.build_digest_header(prep.method, prep.url)
            _r = r.connection.send(prep, **kwargs)
            _r.history.append(r)
            _r.request = prep
            return _r
        self._thread_local.num_407_calls = 1
        return r

    def __call__(self, r):
        # Initialize per-thread state, if needed
        self.init_per_thread_state()
        # If we have a saved nonce, skip the 407
        if self._thread_local.last_nonce:
            r.headers['Proxy-Authorization'] = self.build_digest_header(r.method, r.url)
        r.register_hook('response', self.handle_407)
        self._thread_local.num_407_calls = 1
        return r
session = requests.Session()
session.proxies = {
    'http': 'http://username:password@proxyhost:proxyport',
    'https': 'http://username:password@proxyhost:proxyport'
}
session.trust_env = False
session.mount('http://', HTTPAdapterWithProxyDigestAuth())
session.mount('https://', HTTPSAdapterWithProxyDigestAuth())

response_http = session.get("http://ww3.safestyle-windows.co.uk/the-secret-door/")
print(response_http.status_code)

response_https = session.get("https://stackoverflow.com/questions/13506455/how-to-pass-proxy-authentication-requires-digest-auth-by-using-python-requests")
print(response_https.status_code)
Generally, the problem of proxy authorization is also relevant for other types of authentication (NTLM, Kerberos) when connecting over HTTPS. And despite the large number of issues (since 2013, and maybe there are earlier ones that I did not find):
in requests: Digest Proxy Auth, NTLM Proxy Auth, Kerberos Proxy Auth
in urllib3: NTLM Proxy Auth, NTLM Proxy Auth
and many, many others, the problem is still not resolved.
The root of the problem is in the function _tunnel of the module httplib (Python 2) / http.client (Python 3). In case of an unsuccessful connection attempt, it raises an OSError without returning the response code (407 in our case) or the additional data needed to build the authorization header. Lukasa gave an explanation here.
As long as there is no solution from the maintainers of urllib3 (or requests), we can only use various workarounds (for example, use the approach of @Tey' or do something like this). In my version of the workaround, we pre-prepare the necessary authorization data by sending a request to the proxy server and processing the received response.
You can use digest authentication by using requests.auth.HTTPDigestAuth instead of requests.auth.HTTPProxyAuth
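A sketch of what that swap looks like, based on the question's own snippet (proxy address and credentials are the question's placeholders; note that HTTPDigestAuth answers 401 challenges from the end server, so whether it satisfies a 407 from the proxy depends on your environment):

import requests
from requests.auth import HTTPDigestAuth

proxies = {"http": "http://192.168.20.130:8080"}
# Same call as before, with HTTPDigestAuth in place of HTTPProxyAuth
r = requests.get("http://www.google.co.jp/",
                 proxies=proxies,
                 auth=HTTPDigestAuth("username", "password"))
print(r.status_code)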
For those of you that still end up here, there appears to be a project called requests-toolbelt that has this plus other common but not built in functionality of requests.
https://toolbelt.readthedocs.org/en/latest/authentication.html#httpproxydigestauth
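A minimal usage sketch (the module path is as documented by requests-toolbelt; the proxy address is a placeholder):

import requests
from requests_toolbelt.auth.http_proxy_digest import HTTPProxyDigestAuth

proxies = {"http": "http://192.168.20.130:8080"}
# The toolbelt class plugs into the same auth= slot as the built-in helpers
auth = HTTPProxyDigestAuth("username", "password")
r = requests.get("http://www.google.co.jp/", proxies=proxies, auth=auth)
print(r.status_code)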
This works for me. Actually, I don't know about the security of user:password in this solution:
import requests
import os
http_proxyf = 'http://user:password@proxyip:port'
os.environ["http_proxy"] = http_proxyf
os.environ["https_proxy"] = http_proxyf
sess = requests.Session()
# maybe need sess.trust_env = True
print(sess.get('https://some.org').text)
import requests
import os
# in my case I had to add my local domain
proxies = {
    'http': 'proxy.myagency.com:8080',
    'https': 'user@localdomain:password@proxy.myagency.com:8080',
}
r=requests.get('https://api.github.com/events', proxies=proxies)
print(r.text)
Here is an answer that is not for HTTP Basic Authentication - for example, for a transparent proxy within an organization.
import requests
url = 'https://someaddress-behindproxy.com'
params = {'apikey': '123456789'} #if you need params
proxies = {'https': 'https://proxyaddress.com:3128'} #or some other port
response = requests.get(url, proxies=proxies, params=params)
I hope this helps someone.

Set useragent on Client __init__ (Python suds)

I would like to know how to set the user agent on all SOAP requests made with suds in Python, including the WSDL GET.
Indeed, with the following code:
Client('http://...')
the WSDL is fetched with the default Python user agent.
The WSDL is available on the server only for a specific user agent.
Thank you
I don't know whether that's the easiest way to do it, but it is certainly possible using httplib2 (this trick also gives you keep-alive connections):
from suds.transport import Transport
import httplib2, StringIO

class Httplib2Response:
    pass

class Httplib2Transport(Transport):
    def __init__(self, **kwargs):
        Transport.__init__(self)
        self.http = httplib2.Http()

    def send(self, request):
        url = request.url
        message = request.message
        headers = request.headers
        headers['User-Agent'] = 'XYZ'
        response = Httplib2Response()
        response.headers, response.message = self.http.request(url,
            "PUT", body=message, headers=headers)
        return response

    def open(self, request):
        response = Httplib2Response()
        request.headers['User-Agent'] = 'XYZ'
        response.headers, response.message = self.http.request(request.url, "GET",
            body=request.message, headers=request.headers)
        return StringIO.StringIO(response.message)
And then you need to pass the transport class to the suds.client:
http = Httplib2Transport()
client = Client(url,transport=http)
You can override the u2opener method of the HttpTransport class to set your own addheaders attribute:
class HttpTransportCustomUserAgent(HttpTransport):
    def __init__(self, **kwargs):
        self.user_agent = kwargs.get('user_agent', 'Python-urllib/%s' % urllib2.__version__)
        if 'user_agent' in kwargs:
            del(kwargs['user_agent'])
        HttpTransport.__init__(self, **kwargs)

    def u2opener(self):
        """
        Create a urllib opener.
        @return: An opener.
        @rtype: I{OpenerDirector}
        """
        if self.urlopener is None:
            result = urllib2.build_opener(*self.u2handlers())
            result.addheaders = [('User-agent', self.user_agent)]
            return result
        else:
            return self.urlopener
Now you can use this new transporter class for suds.client:
http = HttpTransportCustomUserAgent(user_agent='My custom User Agent')
client = Client(url, transport=http)

count cookies in python

I am struggling with Python. I want to write a Python script that creates a cookie and counts how many times the cookie is called during a session.
This is what I have tried so far:
import Cookie
import os

def getCookie(initialvalues):
    if os.environ.has_key('HTTP_COOKIE'):
        cookie = Cookie.SimpleCookie(os.environ['HTTP_COOKIE'])
    else:
        cookie = Cookie.SimpleCookie()
    for key in initialvalues.keys():
        if not cookie.has_key(key):
            cookie[key] = initialvalues[key]
    return cookie

if __name__ == '__main__':
    c = getCookie({'counter': 0})
    c['counter'] = int(c['counter'].value) + 1
    print c
But I know it is wrong; can someone help me write the script?
Any help would be appreciated
I'm confused by your question. What I believe you want to do is request some webpage and count how many times your cookie was found? You can gather cookies using a CookieJar:
import urllib, urllib2, cookielib

url = "http://example.com/cookies"
form_data = {'username': '', 'password': ''}

jar = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(jar))
form_data = urllib.urlencode(form_data)
# data returned from this pages contains redirection
resp = opener.open(url, form_data)
print resp.read()
for cookie in jar:
    # Look for your cookie
    print cookie.name, cookie.value
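If the goal is the counter itself, a minimal CGI-style sketch with Python 2's Cookie module might look like this (the 'counter' key and header layout are assumptions, not from the original answer):

import os
import Cookie

# Load the cookie sent by the browser, or start fresh
cookie = Cookie.SimpleCookie(os.environ.get('HTTP_COOKIE', ''))
if 'counter' not in cookie:
    cookie['counter'] = 0

# Increment on every hit
cookie['counter'] = int(cookie['counter'].value) + 1

# Emit the Set-Cookie and Content-Type headers, then the body
print cookie.output()
print 'Content-Type: text/plain'
print
print 'This cookie has been seen %s times' % cookie['counter'].value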

Python form POST using urllib2 (also question on saving/using cookies)

I am trying to write a function to post form data and save returned cookie info in a file so that the next time the page is visited, the cookie information is sent to the server (i.e. normal browser behavior).
I wrote this relatively easily in C++ using libcurl, but have spent almost an entire day trying to write it in Python using urllib2 - and still no success.
This is what I have so far:
import urllib, urllib2
import logging

# the path and filename to save your cookies in
COOKIEFILE = 'cookies.lwp'

cj = None
ClientCookie = None
cookielib = None
logger = logging.getLogger(__name__)

# Let's see if cookielib is available
try:
    import cookielib
except ImportError:
    logger.debug('importing cookielib failed. Trying ClientCookie')
    try:
        import ClientCookie
    except ImportError:
        logger.debug('ClientCookie isn\'t available either')
        urlopen = urllib2.urlopen
        Request = urllib2.Request
    else:
        logger.debug('imported ClientCookie succesfully')
        urlopen = ClientCookie.urlopen
        Request = ClientCookie.Request
        cj = ClientCookie.LWPCookieJar()
else:
    logger.debug('Successfully imported cookielib')
    urlopen = urllib2.urlopen
    Request = urllib2.Request
    # This is a subclass of FileCookieJar
    # that has useful load and save methods
    cj = cookielib.LWPCookieJar()

login_params = {'name': 'anon', 'password': 'pass'}

def login(theurl, login_params):
    init_cookies();
    data = urllib.urlencode(login_params)
    txheaders = {'User-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    try:
        # create a request object
        req = Request(theurl, data, txheaders)
        # and open it to return a handle on the url
        handle = urlopen(req)
    except IOError, e:
        log.debug('Failed to open "%s".' % theurl)
        if hasattr(e, 'code'):
            log.debug('Failed with error code - %s.' % e.code)
        elif hasattr(e, 'reason'):
            log.debug("The error object has the following 'reason' attribute :" + e.reason)
        sys.exit()
    else:
        if cj is None:
            log.debug('We don\'t have a cookie library available - sorry.')
        else:
            print 'These are the cookies we have received so far :'
            for index, cookie in enumerate(cj):
                print index, ' : ', cookie
            # save the cookies again
            cj.save(COOKIEFILE)
        # return the data
        return handle.read()

# FIXME: I need to fix this so that it takes into account any cookie data we may have stored
def get_page(*args, **query):
    if len(args) != 1:
        raise ValueError(
            "post_page() takes exactly 1 argument (%d given)" % len(args)
        )
    url = args[0]
    query = urllib.urlencode(list(query.iteritems()))
    if not url.endswith('/') and query:
        url += '/'
    if query:
        url += "?" + query
    resource = urllib.urlopen(url)
    logger.debug('GET url "%s" => "%s", code %d' % (url,
                                                    resource.url,
                                                    resource.code))
    return resource.read()
When I attempt to log in, I pass the correct username and password, yet the login fails and no cookie data is saved.
My two questions are:
can anyone see whats wrong with the login() function, and how may I fix it?
how may I modify the get_page() function to make use of any cookie info I have saved ?
There are quite a few problems with the code that you've posted. Typically you'll want to build a custom opener which can handle redirects, https, etc., otherwise you'll run into trouble. As far as the cookies themselves go, you need to call the load and save methods on your cookiejar, and use one of the subclasses, such as MozillaCookieJar or LWPCookieJar.
Here's a class I wrote to login to Facebook, back when I was playing silly web games. I just modified it to use a file based cookiejar, rather than an in-memory one.
import cookielib
import os
import urllib
import urllib2

# set these to whatever your fb account is
fb_username = "your@facebook.login"
fb_password = "secretpassword"
cookie_filename = "facebook.cookies"

class WebGamePlayer(object):
    def __init__(self, login, password):
        """ Start up... """
        self.login = login
        self.password = password
        self.cj = cookielib.MozillaCookieJar(cookie_filename)
        if os.access(cookie_filename, os.F_OK):
            self.cj.load()
        self.opener = urllib2.build_opener(
            urllib2.HTTPRedirectHandler(),
            urllib2.HTTPHandler(debuglevel=0),
            urllib2.HTTPSHandler(debuglevel=0),
            urllib2.HTTPCookieProcessor(self.cj)
        )
        self.opener.addheaders = [
            ('User-agent', ('Mozilla/4.0 (compatible; MSIE 6.0; '
                            'Windows NT 5.2; .NET CLR 1.1.4322)'))
        ]
        # need this twice - once to set cookies, once to log in...
        self.loginToFacebook()
        self.loginToFacebook()
        self.cj.save()

    def loginToFacebook(self):
        """
        Handle login. This should populate our cookie jar.
        """
        login_data = urllib.urlencode({
            'email' : self.login,
            'pass' : self.password,
        })
        response = self.opener.open("https://login.facebook.com/login.php", login_data)
        return ''.join(response.readlines())

test = WebGamePlayer(fb_username, fb_password)
After you've set your username and password, you should see a file, facebook.cookies, with your cookies in it. In practice you'll probably want to modify it to check whether you have an active cookie and use that, then log in again if access is denied.
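A rough sketch of that check (the cookie name 'c_user' is a made-up placeholder; use whatever session cookie your site actually sets):

import time

def has_live_session(cj, name='c_user'):
    # True if the jar holds a cookie with this name that hasn't expired
    for cookie in cj:
        if cookie.name == name and not cookie.is_expired(time.time()):
            return True
    return False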
If you are having a hard time getting your POST requests to work (like I had with a login form), it definitely pays to quickly install the Live HTTP headers extension for Firefox (http://livehttpheaders.mozdev.org/index.html). This small extension can, among other things, show you the exact POST data that is sent when you manually log in.
In my case, I had banged my head against the wall for hours because the site insisted on an extra field with 'action=login' (doh!).
Please use ignore_discard and ignore_expires when saving the cookie; in my case it saved OK:
self.cj.save(cookie_file, ignore_discard=True, ignore_expires=True)
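The matching load call accepts the same flags (a small sketch; cookie_file is the same path used above):

# Reload the cookies later, keeping session cookies and expired ones too
self.cj.load(cookie_file, ignore_discard=True, ignore_expires=True)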

How do you get default headers in a urllib2 Request?

I have a Python web client that uses urllib2. It is easy enough to add HTTP headers to my outgoing requests. I just create a dictionary of the headers I want to add, and pass it to the Request initializer.
However, other "standard" HTTP headers get added to the request as well as the custom ones I explicitly add. When I sniff the request using Wireshark, I see headers besides the ones I add myself. My question is how do a I get access to these headers? I want to log every request (including the full set of HTTP headers), and can't figure out how.
any pointers?
in a nutshell: How do I get all the outgoing headers from an HTTP request created by urllib2?
If you want to see the literal HTTP request that is sent out, and therefore see every last header exactly as it is represented on the wire, then you can tell urllib2 to use your own version of an HTTPHandler that prints out (or saves, or whatever) the outgoing HTTP request.
import httplib, urllib2

class MyHTTPConnection(httplib.HTTPConnection):
    def send(self, s):
        print s  # or save them, or whatever!
        httplib.HTTPConnection.send(self, s)

class MyHTTPHandler(urllib2.HTTPHandler):
    def http_open(self, req):
        return self.do_open(MyHTTPConnection, req)

opener = urllib2.build_opener(MyHTTPHandler)
response = opener.open('http://www.google.com/')
The result of running this code is:
GET / HTTP/1.1
Accept-Encoding: identity
Host: www.google.com
Connection: close
User-Agent: Python-urllib/2.6
The urllib2 library uses OpenerDirector objects to handle the actual opening. Fortunately, the python library provides defaults so you don't have to. It is, however, these OpenerDirector objects that are adding the extra headers.
To see what they are after the request has been sent (so that you can log it, for example):
req = urllib2.Request(url='http://google.com')
response = urllib2.urlopen(req)
print req.unredirected_hdrs
(produces {'Host': 'google.com', 'User-agent': 'Python-urllib/2.5'} etc)
The unredirected_hdrs is where the OpenerDirectors dump their extra headers. Simply looking at req.headers will show only your own headers - the library leaves those unmolested for you.
If you need to see the headers before you send the request, you'll need to subclass the OpenerDirector in order to intercept the transmission.
Hope that helps.
EDIT: I forgot to mention that, once the request has been sent, req.header_items() will give you a list of tuples of ALL the headers, with both your own and the ones added by the OpenerDirector. I should have mentioned this first since it's the most straightforward :-) Sorry.
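For instance, a minimal sketch of the header_items() route:

import urllib2

req = urllib2.Request(url='http://www.google.com/')
response = urllib2.urlopen(req)
# After the request has been sent, this includes the headers the
# OpenerDirector added as well as any you set yourself.
print req.header_items()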
EDIT 2: After your question about an example for defining your own handler, here's the sample I came up with. The concern in any monkeying with the request chain is that we need to be sure that the handler is safe for multiple requests, which is why I'm uncomfortable just replacing the definition of putheader on the HTTPConnection class directly.
Sadly, because the internals of HTTPConnection and the AbstractHTTPHandler are very internal, we have to reproduce much of the code from the python library to inject our custom behaviour. Assuming I've not goofed below and this works as well as it did in my 5 minutes of testing, please be careful to revisit this override if you update your Python version (i.e. 2.5.x to 2.5.y, or 2.5 to 2.6, etc.).
I should therefore mention that I am on Python 2.5.1. If you have 2.6 or, particularly, 3.0, you may need to adjust this accordingly.
Please let me know if this doesn't work. I'm having waaaayyyy too much fun with this question:
import urllib2
import httplib
import socket

class CustomHTTPConnection(httplib.HTTPConnection):
    def __init__(self, *args, **kwargs):
        httplib.HTTPConnection.__init__(self, *args, **kwargs)
        self.stored_headers = []

    def putheader(self, header, value):
        self.stored_headers.append((header, value))
        httplib.HTTPConnection.putheader(self, header, value)

class HTTPCaptureHeaderHandler(urllib2.AbstractHTTPHandler):
    def http_open(self, req):
        return self.do_open(CustomHTTPConnection, req)

    http_request = urllib2.AbstractHTTPHandler.do_request_

    def do_open(self, http_class, req):
        # All code here lifted directly from the python library
        host = req.get_host()
        if not host:
            raise urllib2.URLError('no host given')
        h = http_class(host)  # will parse host:port
        h.set_debuglevel(self._debuglevel)
        headers = dict(req.headers)
        headers.update(req.unredirected_hdrs)
        headers["Connection"] = "close"
        headers = dict(
            (name.title(), val) for name, val in headers.items())
        try:
            h.request(req.get_method(), req.get_selector(), req.data, headers)
            r = h.getresponse()
        except socket.error, err:  # XXX what error?
            raise urllib2.URLError(err)
        r.recv = r.read
        fp = socket._fileobject(r, close=True)
        resp = urllib2.addinfourl(fp, r.msg, req.get_full_url())
        resp.code = r.status
        resp.msg = r.reason
        # This is the line we're adding
        req.all_sent_headers = h.stored_headers
        return resp

my_handler = HTTPCaptureHeaderHandler()
opener = urllib2.OpenerDirector()
opener.add_handler(my_handler)
req = urllib2.Request(url='http://www.google.com')
resp = opener.open(req)
print req.all_sent_headers
How about something like this:
import urllib2
import httplib
old_putheader = httplib.HTTPConnection.putheader
def putheader(self, header, value):
    print header, value
    old_putheader(self, header, value)
httplib.HTTPConnection.putheader = putheader
urllib2.urlopen('http://www.google.com')
A low-level solution:
import httplib

class HTTPConnection2(httplib.HTTPConnection):
    def __init__(self, *args, **kwargs):
        httplib.HTTPConnection.__init__(self, *args, **kwargs)
        self._request_headers = []
        self._request_header = None

    def putheader(self, header, value):
        self._request_headers.append((header, value))
        httplib.HTTPConnection.putheader(self, header, value)

    def send(self, s):
        self._request_header = s
        httplib.HTTPConnection.send(self, s)

    def getresponse(self, *args, **kwargs):
        response = httplib.HTTPConnection.getresponse(self, *args, **kwargs)
        response.request_headers = self._request_headers
        response.request_header = self._request_header
        return response
Example:
conn = HTTPConnection2("www.python.org")
conn.request("GET", "/index.html", headers={
    "User-agent": "test",
    "Referer": "/",
})
response = conn.getresponse()
response.status, response.reason:
1: 200 OK
response.request_headers:
[('Host', 'www.python.org'), ('Accept-Encoding', 'identity'), ('Referer', '/'), ('User-agent', 'test')]
response.request_header:
GET /index.html HTTP/1.1
Host: www.python.org
Accept-Encoding: identity
Referer: /
User-agent: test
Another solution, which uses the idea from How do you get default headers in a urllib2 Request? but doesn't copy code from the std-lib:
class HTTPConnection2(httplib.HTTPConnection):
    """
    Like httplib.HTTPConnection but stores the request headers.
    Used in HTTPConnection3(), see below.
    """
    def __init__(self, *args, **kwargs):
        httplib.HTTPConnection.__init__(self, *args, **kwargs)
        self.request_headers = []
        self.request_header = ""

    def putheader(self, header, value):
        self.request_headers.append((header, value))
        httplib.HTTPConnection.putheader(self, header, value)

    def send(self, s):
        self.request_header = s
        httplib.HTTPConnection.send(self, s)


class HTTPConnection3(object):
    """
    Wrapper around HTTPConnection2
    Used in HTTPHandler2(), see below.
    """
    def __call__(self, *args, **kwargs):
        """
        instance made in urllib2.HTTPHandler.do_open()
        """
        self._conn = HTTPConnection2(*args, **kwargs)
        self.request_headers = self._conn.request_headers
        self.request_header = self._conn.request_header
        return self

    def __getattribute__(self, name):
        """
        Redirect attribute access to the local HTTPConnection() instance.
        """
        if name == "_conn":
            return object.__getattribute__(self, name)
        else:
            return getattr(self._conn, name)


class HTTPHandler2(urllib2.HTTPHandler):
    """
    A HTTPHandler which stores the request headers.
    Used HTTPConnection3, see above.

    >>> opener = urllib2.build_opener(HTTPHandler2)
    >>> opener.addheaders = [("User-agent", "Python test")]
    >>> response = opener.open('http://www.python.org/')

    Get the request headers as a list build with HTTPConnection.putheader():
    >>> response.request_headers
    [('Accept-Encoding', 'identity'), ('Host', 'www.python.org'), ('Connection', 'close'), ('User-Agent', 'Python test')]

    >>> response.request_header
    'GET / HTTP/1.1\\r\\nAccept-Encoding: identity\\r\\nHost: www.python.org\\r\\nConnection: close\\r\\nUser-Agent: Python test\\r\\n\\r\\n'
    """
    def http_open(self, req):
        conn_instance = HTTPConnection3()
        response = self.do_open(conn_instance, req)
        response.request_headers = conn_instance.request_headers
        response.request_header = conn_instance.request_header
        return response
EDIT: Updated the source.
see urllib2.py:do_request (line 1044 (1067)) and urllib2.py:do_open (line 1073)
(line 293) self.addheaders = [('User-agent', client_version)] (only 'User-agent' added)
It sounds to me like you're looking for the headers of the response object, which include Connection: close, etc. These headers live in the object returned by urlopen. Getting at them is easy enough:
from urllib2 import urlopen
req = urlopen("http://www.google.com")
print req.headers.headers
req.headers is an instance of httplib.HTTPMessage.
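For example, to pull out a single header (a small sketch):

from urllib2 import urlopen

resp = urlopen("http://www.google.com")
# HTTPMessage allows case-insensitive header lookup
print resp.headers.getheader('content-type')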
It should send the default HTTP headers (as specified by w3.org) alongside the ones you specify. You can use a tool like Wireshark if you would like to see them in their entirety.
Edit:
If you would like to log them, you can use WinPcap to capture packets sent by specific applications (in your case, python). You can also specify the type of packets and many other details.
-John
