I am trying to build a proxy server with Tornado in Python. The plain HTTP proxy works well, but the HTTPS proxy has a problem.
The parts of my program that might be causing the problem are shown below.
import tornado.ioloop
from tornado.web import RequestHandler, Application
from tornado.httpclient import AsyncHTTPClient, HTTPRequest
from tornado.httpserver import HTTPServer
class HTTPSHandler(RequestHandler):
    """Relay an incoming proxy request upstream and send back the response body.

    Every supported verb is routed through ``get``, which re-issues the
    request with ``AsyncHTTPClient`` and finishes the response from the fetch
    callback.

    NOTE: ``CONNECT`` is not handled by this class, so such requests are
    still rejected with 405.
    """

    # The response is finished from the fetch callback, so the handler must be
    # marked asynchronous; otherwise it is auto-finished as soon as get()
    # returns and the later write()/finish() calls fail.
    @tornado.web.asynchronous
    def get(self):
        """Forward the current request and write the upstream body back."""
        request = self.request
        print("%s %s" % (request.host, request.method))

        def handle_request(response):
            """Copy the upstream body (if any) to the client and finish."""
            error = response.error
            if error and not isinstance(error, tornado.httpclient.HTTPError):
                print("Error: %s" % (error,))
            elif response.body:
                # The body may be None/empty; write() must not be given None.
                self.write(response.body)
            # Always finish, and do not append filler bytes to the payload.
            self.finish()

        req = HTTPRequest(url=request.uri, method=request.method,
                          headers=request.headers, body=request.body,
                          allow_nonstandard_methods=True,
                          follow_redirects=False,
                          validate_cert=True)
        http_client = AsyncHTTPClient()
        try:
            http_client.fetch(req, handle_request)
        except Exception as e:
            print(e)
            # The callback will never run in this case, so close the request
            # explicitly instead of leaving the client waiting.
            self.send_error(500)

    # All other verbs are proxied exactly like GET.
    post = head = delete = patch = put = options = get
if __name__ == "__main__":
    # Match every request: the handler is meant to proxy whatever URI the
    # client sends, so the route must not be restricted to "https:..." paths.
    app2 = Application([(r".*", HTTPSHandler)])
    # Application.listen() starts its own server, so the separately built
    # HTTPServer with ssl_options was never used and is not needed.
    app2.listen(444)
    tornado.ioloop.IOLoop.instance().start()
It logs warnings like the ones below when I access https://www.google.com and https://github.com:
WARNING:tornado.access:405 CONNECT www.google.co.jp:443 (127.0.0.1) 0.69ms
WARNING:tornado.access:405 CONNECT github.com:443 (127.0.0.1) 0.58ms
As a result, web pages served over HTTPS cannot be displayed, and the browser reports this error:
ERR_TUNNEL_CONNECTION_FAILED
I guess this is caused by Tornado's RequestHandler, because it does not support the CONNECT method by default.
My question is how can I use the CONNECT method?
I have found a way to fix this problem.
First, SUPPORTED_METHODS must be overridden in the HTTPSHandler class.
This resolves the 405 warning and the browser error.
class HTTPSHandler(RequestHandler):
SUPPORTED_METHODS = ("CONNECT", "GET", "HEAD", "POST", "DELETE", "PATCH", "PUT", "OPTIONS")
#tornado.web.asynchronous
def get(self):
This is written in official document as follows.
If you want to support more methods than the standard GET/HEAD/POST, you
should override the class variable ``SUPPORTED_METHODS`` in your
`RequestHandler` subclass.
Moreover, to handle and process CONNECT requests, an additional method is needed in the HTTPSHandler class.
#tornado.web.asynchronous
def connect(self):
    # Placeholder for CONNECT handling; for now it only prints a marker.
    print "some specific processings here"
Finally, I had made careless mistakes in the regular expression and in ssl_options.
app2 = Application([(r"https:.*", HTTPSHandler),]) # not correct
app2 = Application([(r".*", HTTPSHandler),]) # correct
httpServer = HTTPServer(app2) # ssl_options is not needed
In theory you should be able to implement the CONNECT method in the same way that WebSocketHandler works, by hijacking the underlying connection's IOStream. But be warned that this is uncharted territory; the HTTP proxy protocol has some differences from plain HTTP and I don't know how well it will work to implement a proxy on top of a normal application-level HTTP service.
Related
To give you an idea of what I am trying to accomplish with Twisted Web and Autobahn websockets: my UI currently sends an initial HTTP GET request with an upgrade to a websocket in the header. Upon reading that in Twisted Web, the connection needs to switch from HTTP to a websocket protocol to pass data back and forth. Note that this websocket upgrade happens on the same port, port 8000.
Does anyone know how I can implement what I am trying to do? Thank you so much.
EDIT: updated code for working example. You can find it here: Payload from POST Request is Cutoff (Twisted Web)
Here is my code using Twisted Web:
class HttpResource(resource.Resource):
    """Leaf resource that answers plain GET/POST requests and replies to
    WebSocket upgrade requests with a hand-built 101 response."""

    isLeaf = 1

    def __init__(self):
        # Let the base class set up its own state (including ``children``)
        # instead of re-creating it by hand.
        resource.Resource.__init__(self)
        self.ws_port = None
        print('resource invoked')

    def render_GET(self, request):
        """Return the HTML page, or an empty 101 response for an upgrade."""
        print('render invoked')
        key = request.getHeader('Sec-WebSocket-Key')
        if key:
            # Processing the Key as per RFC 6455
            h = hashlib.sha1(key + "258EAFA5-E914-47DA-95CA-C5AB0DC85B11")
            request.setHeader('Sec-WebSocket-Accept', base64.b64encode(h.digest()))
            # setting response headers
            request.setHeader('Upgrade', 'websocket')
            request.setHeader('Connection', 'Upgrade')
            request.setResponseCode(101)
            return ''
        log("Regular HTTP GET request.")
        return "<html><body style='margin: 0; overflow: hidden;'><iframe style='width: 100%; height: 100%; border: none;' src='http://tsa-graphiql.herokuapp.com/'></iframe></body></html>"

    def render_POST(self, request):
        """Acknowledge a POST with an empty 200 response."""
        log("POST request")
        request.setResponseCode(200)
        # A render method has to return a body; returning None makes the
        # request fail, so send an explicit empty one.
        return ''

    def handle_single_query(self, queryData):
        """Log that a single query arrived; no processing is done yet."""
        log("Handle single query data.")
        return
class HttpWsChannel(http.HTTPChannel):
    """HTTP channel that hands GET/POST data to the normal HTTP parser and
    treats everything else as WebSocket traffic."""

    def dataReceived(self, data):
        """Route one chunk of received bytes.

        NOTE: no framing is applied at this level, so a chunk is not
        guaranteed to start at a request boundary; the prefix test below is
        only a heuristic.
        """
        log('Data received:\n{}'.format(data))
        if data.startswith(('GET', 'POST')):
            # This will invoke the render method of resource provided.
            # Both verbs share one branch so that a GET no longer falls
            # through into the WebSocket branch as well.
            http.HTTPChannel.dataReceived(self, data)
        else:
            # Pass binary data to websocket class.
            ws_protocol = self.site.ws_factory.protocol(self.site.ws_factory.connection_subscriptions)
            log(ws_protocol)
            #just echo for now
            # self.transport.write(data)
class HttpFactory(Site):
    """
    Factory which takes care of tracking which protocol
    instances or request instances are responsible for which
    named response channels, so incoming messages can be
    routed appropriately.
    """

    def __init__(self, resource):
        # Initialise through Site (the actual base class) so that its own
        # state is set up too, not only the HTTPFactory part.
        Site.__init__(self, resource)
        self.ws_factory = WsProtocolFactory("ws://127.0.0.1:8000")
        self.ws_factory.protocol = WsProtocol

    def buildProtocol(self, addr):
        """Build an ``HttpWsChannel`` bound to this site.

        Returns None (after logging) when the channel cannot be built.
        """
        try:
            channel = HttpWsChannel()
            channel.requestFactory = self.requestFactory
            channel.site = self
            return channel
        except Exception as e:
            log("Could not build protocol: {}".format(e))
            return None
# Module-level factory instance wired to the HTTP resource.
site = HttpFactory(HttpResource())
if __name__ == '__main__':
    # Listen on TCP port 8000 and hand control to the reactor.
    reactor.listenTCP(8000, site)
    reactor.run()
EDIT 7/8/2017: Here is the new code I am trying below. The websocket messages are received successfully via the onMessage method. However the HTTP requests are not working. The current error I am getting on a GET request is:
<html>
<head><title>404 - No Such Resource</title></head>
<body>
<h1>No Such Resource</h1>
<p>No such child resource.</p>
</body>
</html>
Python code
from twisted.web.server import (
Site,
)
from twisted.internet import reactor
from twisted.web.resource import (
Resource,
)
from autobahn.twisted.websocket import (
WebSocketServerProtocol,
WebSocketServerFactory,
)
from autobahn.twisted.resource import (
WebSocketResource,
)
class WebSocketProtocol(WebSocketServerProtocol):
    """Server protocol that simply prints what it receives."""

    def onConnect(self, request):
        """Print the incoming connection request."""
        line = "WebSocket connection request: {}".format(request)
        print(line)

    def onMessage(self, payload, isBinary):
        """Print the payload of each received message."""
        line = "onMessage: {}".format(payload)
        print(line)
if __name__ == '__main__':
    # Wrap the WebSocket factory in a resource so it can live inside a Site.
    ws_factory = WebSocketServerFactory()
    ws_factory.protocol = WebSocketProtocol

    # NOTE: "ws" is the only child registered on the root resource here.
    root = Resource()
    root.putChild(b"ws", WebSocketResource(ws_factory))

    reactor.listenTCP(8000, Site(root))
    reactor.run()
Use WebSocketResource to expose a WebSocketServerFactory as part of a Site.
from twisted.web.server import (
Site,
)
from twisted.web.resource import (
Resource,
)
from autobahn.twisted.websocket import (
WebSocketServerProtocol,
WebSocketServerFactory,
)
from autobahn.twisted.resource import (
WebSocketResource,
)
class YourAppProtocol(WebSocketServerProtocol):
    # Application-specific WebSocket protocol; the bodies are elided in this
    # example and must be filled in.
    def onConnect(self, request):
        ...
    ...
def main():
    """Serve ``YourAppProtocol`` over WebSocket as one child of a Twisted Site."""
    # Imported here because the surrounding import list does not provide it.
    from twisted.internet import reactor

    # Use the factory class that is actually imported above.
    factory = WebSocketServerFactory()
    factory.protocol = YourAppProtocol
    resource = WebSocketResource(factory)
    root = Resource()
    root.putChild(b"some-path-segment", resource)
    # Register any further (plain HTTP) children on ``root`` here with
    # additional root.putChild(b"<segment>", <resource>) calls.
    site = Site(root)
    reactor.listenTCP(8080, site)
    reactor.run()
The problem with truncated request bodies is probably caused by your buggy upgrade-protocol implementation. No framing is applied at the level of dataReceived, so you cannot expect checks like startswith("GET") to be reliable.
Using WebSocketResource and Site gives you the correct HTTP parsing code from Twisted Web and Autobahn while also allowing you to speak WebSocket to a particular URL and regular HTTP to others.
So after reading a little bit on Google, I found this website that explains how to upgrade the HTTP connection to a websocket connection via Autobahn Twisted: Read and Set request headers via Autobahn Twisted.
The code that I was able to get to work is shown below!
from twisted.web.server import (
Site,
)
from twisted.internet import reactor
from twisted.web.resource import (
Resource,
)
from autobahn.twisted.websocket import (
WebSocketServerProtocol,
WebSocketServerFactory,
)
from autobahn.twisted.resource import (
WebSocketResource,
)
class HttpResource(Resource):
    """Leaf resource that serves a single static HTML page on GET."""

    isLeaf = True

    def render_GET(self, request):
        """Return the page body.

        The body is a bytes literal so the same code is valid on Python 3,
        where a render method has to return bytes (on Python 2 the value is
        unchanged).
        """
        return b"<html><body style='margin: 0; overflow: hidden;'><iframe style='width: 100%; height: 100%; border: none;' src='http://tsa-graphiql.herokuapp.com/'></iframe></body></html>"
class WebSocketProtocol(WebSocketServerProtocol):
    """Server protocol that adds a sub-protocol header to the handshake reply
    and prints every message it receives."""

    def onConnect(self, request):
        """Return ``(None, headers)`` with the extra handshake headers."""
        extra_headers = {}
        has_key = request.headers['sec-websocket-key']
        if has_key:
            extra_headers['sec-websocket-protocol'] = 'graphql-ws'
        return (None, extra_headers)

    def onMessage(self, payload, isBinary):
        """Print the payload of each received message."""
        line = "onMessage: {}".format(payload)
        print(line)
if __name__ == '__main__':
    factory = WebSocketServerFactory()
    factory.protocol = WebSocketProtocol
    # Bind the resource to the name that is registered below (the original
    # registered an undefined ``ws_resource``).
    ws_resource = WebSocketResource(factory)
    root = Resource()
    # Child path segments are given as bytes, matching the b"ws" entry.
    root.putChild(b"", HttpResource())
    root.putChild(b"ws", ws_resource)
    site = Site(root)
    reactor.listenTCP(8000, site)
    # Without starting the reactor nothing is ever served.
    reactor.run()
I have a custom HTTP request handler that can be simplified to something like this:
# Python 3:
from http import server
class MyHandler(server.BaseHTTPRequestHandler):
    """Request handler that answers every GET with a small HTML page."""

    def do_GET(self):
        """Send a 200 response whose body is a fixed HTML document."""
        # Status line and headers go out first.
        self.send_response(200)
        self.send_header("Content-type", "text/html")
        self.end_headers()

        # Stand-in for the real (complicated) page-generation logic.
        page = "<html><p>hello world</p></html>"
        payload = page.encode()
        self.wfile.write(payload)
I'd like to unit-test this handler (i.e. make sure that my do_GET executes without an exception) without actually starting a web server. Is there any lightweight way to mock the SimpleHTTPServer so that I can test this code?
Expanding on the answer from jakevdp, I managed to be able to check the output, too:
try:
import unittest2 as unittest
except ImportError:
import unittest
try:
from io import BytesIO as IO
except ImportError:
from StringIO import StringIO as IO
from server import MyHandlerSSL # My BaseHTTPRequestHandler child
class TestableHandler(MyHandlerSSL):
    """Handler variant whose output stays readable after the request is done
    and whose variable headers are replaced by fixed strings."""

    # A non-zero write buffer size makes Python 3's
    # socketserver.StreamRequestHandler obtain the output stream from
    # makefile(); with the default it would wrap the socket in
    # socketserver._SocketWriter and the written data could not be retrieved.
    wbufsize = 1

    def version_string(self):
        """Return a fixed server identification string."""
        return 'BaseHTTP/x.x Python/x.x.x'

    def date_time_string(self, timestamp=None):
        """Return a fixed placeholder instead of the current date."""
        return 'DATETIME'

    def finish(self):
        """Flush the output but keep ``wfile`` open so its value can be read."""
        self.wfile.flush()
        self.rfile.close()
class MockSocket(object):
    """Minimal socket stand-in that only knows its own (fake) name."""

    def getsockname(self):
        """Return a one-element tuple holding a dummy socket name."""
        name = 'sockname'
        return (name,)
class MockRequest(object):
    """Fake connection object that serves a canned GET request for a path."""

    _sock = MockSocket()

    def __init__(self, path):
        # ``path`` is interpolated into a bytes request line, so it is
        # expected to be bytes itself.
        self._path = path

    def makefile(self, *args, **kwargs):
        """Return the input stream for mode 'rb' or an empty output stream
        for mode 'wb'; any other mode raises ValueError."""
        mode = args[0]
        if mode == 'rb':
            return IO(b"GET %s HTTP/1.0" % self._path)
        if mode == 'wb':
            return IO(b'')
        raise ValueError("Unknown file type to make", args, kwargs)
class HTTPRequestHandlerTestCase(unittest.TestCase):
    """Checks the raw bytes the handler writes for a given request."""

    # Do not truncate the diff when the long expected response mismatches.
    maxDiff = None

    def _test(self, request):
        """Run ``request`` through the handler and return everything written."""
        output = TestableHandler(request, (0, 0), None).wfile
        return output.getvalue()

    def test_unauthenticated(self):
        """A request for "/" is answered with the 401 response below."""
        expected = b"""HTTP/1.0 401 Unauthorized\r
Server: BaseHTTP/x.x Python/x.x.x\r
Date: DATETIME\r
WWW-Authenticate: Basic realm="MyRealm", charset="UTF-8"\r
Content-type: text/html\r
\r
<html><head><title>Authentication Failed</title></html><body><h1>Authentication Failed</h1><p>Authentication Failed. Authorised Personnel Only.</p></body></html>"""
        actual = self._test(MockRequest(b'/'))
        self.assertEqual(actual, expected)
def main():
    """Run the test cases of this module via ``unittest.main()``."""
    unittest.main()
if __name__ == "__main__":
    main()
The code I am testing returns a 401 Unauthorized for "/". Change the expected response as appropriate for your test case.
Here's one approach I came up with to mock the server. Note that this should be compatible with both Python 2 and python 3. The only issue is that I can't find a way to access the result of the GET request, but at least the test will catch any exceptions it comes across!
try:
# Python 2.x
import BaseHTTPServer as server
from StringIO import StringIO as IO
except ImportError:
# Python 3.x
from http import server
from io import BytesIO as IO
class MyHandler(server.BaseHTTPRequestHandler):
    """Handler under test: replies to GET with a fixed HTML document."""

    def do_GET(self):
        """Announce the call on stdout, then send a 200 response with HTML."""
        # Printed only to confirm that this method is being called.
        print("executing do_GET")

        self.send_response(200)
        self.send_header("Content-type", "text/html")
        self.end_headers()

        # Stand-in for the real (complicated) page-generation logic.
        page = "<html><p>hello world</p></html>"
        payload = page.encode()
        self.wfile.write(payload)
def test_handler():
    """Test the custom HTTP request handler by mocking a server.

    Constructing the handler processes the canned request immediately, so
    any exception raised by ``do_GET`` propagates out of this function.
    """

    class MockRequest(object):
        """Fake connection that yields a canned request line."""

        def makefile(self, *args, **kwargs):
            return IO(b"GET /")

        def sendall(self, data):
            # On Python 3 the handler's default (unbuffered) output stream
            # writes through the socket's sendall() rather than through
            # makefile(); accept and discard those bytes so the test does
            # not fail with AttributeError.
            pass

    class MockServer(object):
        """Fake server whose construction runs one request through Handler."""

        def __init__(self, ip_port, Handler):
            handler = Handler(MockRequest(), ip_port, self)

    # The GET request will be sent here
    # and any exceptions will be propagated through.
    # (Bound to a name that does not shadow the imported ``server`` module.)
    mock_server = MockServer(('0.0.0.0', 8888), MyHandler)
test_handler()
So this is a little tricky depending on how "deep" you want to go into the BaseHTTPRequestHandler behavior to define your unit test. At the most basic level I think you can use this example from the mock library:
>>> from mock import MagicMock
>>> thing = ProductionClass()
>>> thing.method = MagicMock(return_value=3)
>>> thing.method(3, 4, 5, key='value')
3
>>> thing.method.assert_called_with(3, 4, 5, key='value')
So if you know which methods in the BaseHTTPRequestHandler your class is going to call you could mock the results of those methods to be something acceptable. This can of course get pretty complex depending on how many different types of server responses you want to test.
I'm trying to use scrapy over Tor. I've been trying to get my head around how to write a DownloadHandler for scrapy that uses socksipy connections.
Scrapy's HTTP11DownloadHandler is here: https://github.com/scrapy/scrapy/blob/master/scrapy/core/downloader/handlers/http11.py
Here is an example for creating a custom download handler:
https://github.com/scrapinghub/scrapyjs/blob/master/scrapyjs/dhandler.py
Here's the code for creating a SocksiPyConnection class: http://blog.databigbang.com/distributed-scraping-with-multiple-tor-circuits/
class SocksiPyConnection(httplib.HTTPConnection):
    """``HTTPConnection`` whose socket is opened through a SOCKS proxy.

    The first six arguments describe the proxy and are handed to
    ``socksocket.setproxy``; all remaining arguments go to
    ``httplib.HTTPConnection``.
    """

    def __init__(self, proxytype, proxyaddr, proxyport = None, rdns = True, username = None, password = None, *args, **kwargs):
        self.proxyargs = (proxytype, proxyaddr, proxyport, rdns, username, password)
        httplib.HTTPConnection.__init__(self, *args, **kwargs)

    def connect(self):
        """Create the SOCKS socket, apply the timeout and connect to the host."""
        self.sock = socks.socksocket()
        self.sock.setproxy(*self.proxyargs)
        # Accept integer timeouts as well; previously only floats were
        # applied, so e.g. timeout=10 was silently ignored.
        if isinstance(self.timeout, (int, float)):
            self.sock.settimeout(self.timeout)
        self.sock.connect((self.host, self.port))
With the complexity of twisted reactors in the scrapy code, I can't figure out how plug socksipy into it. Any thoughts?
Please do not answer with privoxy-like alternatives or post answers saying "scrapy doesn't work with socks proxies" - I know that, which is why I'm trying to write a custom Downloader that makes requests using socksipy.
I was able to make this work with https://github.com/habnabit/txsocksx.
After doing a pip install txsocksx, I needed to replace scrapy's ScrapyAgent with txsocksx.http.SOCKS5Agent.
I simply copied the code for HTTP11DownloadHandler and ScrapyAgent from scrapy/core/downloader/handlers/http.py, subclassed them and wrote this code:
class TorProxyDownloadHandler(HTTP11DownloadHandler):
    """Download handler that delegates every request to ``ScrapyTorAgent``."""

    def download_request(self, request, spider):
        """Return a deferred for the HTTP download"""
        tor_agent = ScrapyTorAgent(
            contextFactory=self._contextFactory,
            pool=self._pool,
        )
        return tor_agent.download_request(request)
class ScrapyTorAgent(ScrapyAgent):
    """Agent selector that sends proxied, non-tunnelled requests through a
    SOCKS5 agent."""

    def _get_agent(self, request, timeout):
        """Pick the agent for ``request`` based on its ``proxy`` meta key."""
        bindaddress = request.meta.get('bindaddress') or self._bindAddress
        proxy = request.meta.get('proxy')

        # No proxy configured: use the regular agent.
        if not proxy:
            return self._Agent(reactor, contextFactory=self._contextFactory,
                connectTimeout=timeout, bindAddress=bindaddress, pool=self._pool)

        _, _, proxyHost, proxyPort, proxyParams = _parse(proxy)
        scheme = _parse(request.url)[0]
        wantsTunnel = proxyParams.find('noconnect') < 0

        if scheme == 'https' and wantsTunnel:
            # HTTPS through a CONNECT tunnel, forwarding any proxy credentials.
            proxyAuth = request.headers.get('Proxy-Authorization', None)
            return self._TunnelingAgent(reactor, (proxyHost, proxyPort, proxyAuth),
                contextFactory=self._contextFactory, connectTimeout=timeout,
                bindAddress=bindaddress, pool=self._pool)

        # Everything else goes through the SOCKS5 endpoint.
        _, _, host, port, proxyParams = _parse(request.url)
        socksEndpoint = TCP4ClientEndpoint(reactor, proxyHost, proxyPort,
            timeout=timeout, bindAddress=bindaddress)
        return SOCKS5Agent(reactor, proxyEndpoint=socksEndpoint)
In settings.py, something like this is needed:
DOWNLOAD_HANDLERS = {
'http': 'crawler.http.TorProxyDownloadHandler'
}
Now proxying with Scrapy will work through a SOCKS proxy like Tor.
Try txsocksx that comes from one of the Twisted developers.
Thanks to the greatness of Twisted and Scrapy, you can easily use SOCKS as a proxy:
In downloader.py:
import scrapy.core.downloader.handlers.http11 as handler
from twisted.internet import reactor
from txsocksx.http import SOCKS5Agent
from twisted.internet.endpoints import TCP4ClientEndpoint
from scrapy.core.downloader.webclient import _parse
class TorScrapyAgent(handler.ScrapyAgent):
    """Scrapy agent that uses a SOCKS5 agent for ``socks5://`` proxies and
    defers to the stock behaviour otherwise."""

    _Agent = SOCKS5Agent

    def _get_agent(self, request, timeout):
        """Return a SOCKS5 agent for socks5 proxies, else the parent's agent."""
        proxy_url = request.meta.get('proxy')
        parsed = _parse(proxy_url) if proxy_url else None
        if parsed is not None and parsed[0] == 'socks5':
            socks_endpoint = TCP4ClientEndpoint(reactor, parsed[2], parsed[3])
            return self._Agent(reactor, proxyEndpoint=socks_endpoint)
        return super(TorScrapyAgent, self)._get_agent(request, timeout)
class TorHTTPDownloadHandler(handler.HTTP11DownloadHandler):
    """Download handler that performs requests with ``TorScrapyAgent``."""

    def download_request(self, request, spider):
        """Download ``request`` using the spider's size limits when it
        defines them, falling back to the handler defaults."""
        max_bytes = getattr(spider, 'download_maxsize', self._default_maxsize)
        warn_bytes = getattr(spider, 'download_warnsize', self._default_warnsize)
        tor_agent = TorScrapyAgent(
            contextFactory=self._contextFactory,
            pool=self._pool,
            maxsize=max_bytes,
            warnsize=warn_bytes,
        )
        return tor_agent.download_request(request)
Register new handler in settings.py:
DOWNLOAD_HANDLERS = {
'http': 'crawler.downloader.TorHTTPDownloadHandler',
'https': 'crawler.downloader.TorHTTPDownloadHandler'
}
Now you only have to tell the crawlers to use the proxy. I recommend doing it via a middleware:
class ProxyDownloaderMiddleware(object):
    """Downloader middleware that points every request at one SOCKS5 proxy."""

    # NOTE(review): Tor's SOCKS port is commonly 9050 - confirm that 950 is
    # really the intended port.
    _PROXY_URL = 'socks5://127.0.0.1:950'

    def process_request(self, request, spider):
        """Store the proxy URL in the request's ``meta`` under ``proxy``."""
        request.meta['proxy'] = self._PROXY_URL
I adapted this sample code in order to get webapp2 sessions to work on Google App Engine.
What do I need to do to be able to return webapp2.Response objects from a handler that's inheriting from a BaseHandler that overrides the dispatch method?
Here's a demonstration of the kind of handler I want to write:
import webapp2
import logging
from webapp2_extras import sessions
class BaseHandler(webapp2.RequestHandler):
    """Base handler that attaches a session store to every request."""

    def dispatch(self):
        """Dispatch the request and persist the sessions afterwards.

        Whatever the concrete handler returns is passed on, so handlers may
        either write to ``self.response`` or return their own Response.
        The sessions are saved on the response that is actually used.
        """
        # Get a session store for this request.
        self.session_store = sessions.get_store(request=self.request)
        response = None
        try:
            # Dispatch the request and keep the handler's return value,
            # which was previously discarded.
            response = webapp2.RequestHandler.dispatch(self)
            return response
        finally:
            # Save all sessions on the returned response if there is one,
            # otherwise on the default ``self.response``.
            target = self.response if response is None else response
            self.session_store.save_sessions(target)
class HomeHandler(BaseHandler):
    """Example handler that answers GET by returning its own Response."""

    def get(self):
        """Log the call and return a fresh Response containing ``Foo``."""
        logging.debug('In homehandler')
        reply = webapp2.Response()
        reply.write('Foo')
        return reply
# Application configuration: the sessions extra is given its secret key.
config = {
    'webapp2_extras.sessions': {
        'secret_key': 'some-secret-key',
    },
}

# WSGI application with a single route, running in debug mode.
app = webapp2.WSGIApplication(
    [('/test', HomeHandler)],
    debug=True,
    config=config,
)
This code is obviously not working, since BaseHandler always calls dispatch with self. I've looked through the code of webapp2.RequestHandler, but it seriously eludes me how to modify my BaseHandler (or perhaps set a custom dispatcher) such that I can simply return response objects from inheriting handlers.
Curiously, the shortcut of assigning self.response = copy.deepcopy(response) does not work either.
You're mixing the two responses in one method. Use either
return webapp2.Response('Foo')
or
self.response.write('Foo')
...not both.
I took a look at webapp2.RequestHandler and noticed that returned values are just passed up the stack.
A solution which works for me is to use the returned Response when one is returned from the handler, or self.response when nothing is returned.
class BaseHandler(webapp2.RequestHandler):
    """Base handler that saves sessions on whichever response is in use."""

    def dispatch(self):
        """Dispatch the request, pass on its result and save the sessions.

        The sessions go to the Response returned by the handler, or to
        ``self.response`` when the handler returned nothing (or raised).
        """
        # Get a session store for this request.
        self.session_store = sessions.get_store(request=self.request)
        returned = None
        try:
            # Dispatch the request.
            returned = webapp2.RequestHandler.dispatch(self)
        finally:
            # Save all sessions.
            target = self.response if returned is None else returned
            self.session_store.save_sessions(target)
        return returned
While I was playing I noticed that my session stored as a secure cookie was not getting updated when exceptions were raised in the handler.
I'm trying to make a web client application using twisted but having some trouble with cookies. Does anyone have an example I can look at?
While it's true that getPage doesn't easily allow direct access to the request or response headers (just one example of how getPage isn't a super awesome API), cookies are actually supported.
cookies = {cookies: tosend}
d = getPage(url, cookies=cookies)
def cbPage(result):
print 'Look at my cookies:', cookies
d.addCallback(cbPage)
Any cookies in the dictionary when it is passed to getPage will be sent. Any new cookies the server sets in response to the request will be added to the dictionary.
You might have missed this feature when looking at getPage because the getPage signature doesn't have a cookies parameter anywhere in it! However, it does take **kwargs, and this is how cookies is supported: any extra arguments passed to getPage that it doesn't know about itself, it passes on to HTTPClientFactory.__init__. Take a look at that method's signature to see all of the things you can pass to getPage.
It turns out there is no easy way, as far as I can tell.
The headers are stored in twisted.web.client.HTTPClientFactory but not available from twisted.web.client.getPage() which is the function designed for pulling back a web page. I ended up rewriting the function:
from twisted.web import client
def getPage(url, contextFactory=None, *args, **kwargs):
    """Fetch ``url`` and return a Deferred firing with ``(body, headers)``.

    Extra positional and keyword arguments are forwarded to the client
    factory. ``headers`` is the factory's ``response_headers`` at the time
    the page callback runs.
    """
    # The factory class lives in twisted.web.client; only that module is
    # imported, so it has to be referenced through it.
    fact = client._makeGetterFactory(
        url,
        client.HTTPClientFactory,
        contextFactory=contextFactory,
        *args, **kwargs)

    def withHeaders(data):
        # Pair the page body with the headers collected by the factory.
        return (data, fact.response_headers)

    return fact.deferred.addCallback(withHeaders)
from twisted.internet import reactor
from twisted.web import client
def getPage(url, contextFactory=None, *args, **kwargs):
    """Fetch ``url`` with ``CustomHTTPClientFactory`` and return its Deferred.

    Extra positional and keyword arguments are forwarded to the factory.
    """
    getter = client._makeGetterFactory(
        url,
        CustomHTTPClientFactory,
        contextFactory=contextFactory,
        *args, **kwargs)
    return getter.deferred
class CustomHTTPClientFactory(client.HTTPClientFactory):
    """Client factory whose Deferred fires with a dict holding the page, the
    response headers and the cookies instead of the page alone."""

    def __init__(self,url, method='GET', postdata=None, headers=None,
                 agent="Twisted PageGetter", timeout=0, cookies=None,
                 followRedirect=1, redirectLimit=20):
        # Same signature as the parent; everything is passed straight through.
        client.HTTPClientFactory.__init__(
            self, url, method, postdata, headers, agent, timeout, cookies,
            followRedirect, redirectLimit)

    def page(self, page):
        """Fire the Deferred once with page, headers and cookies."""
        if not self.waiting:
            return
        self.waiting = 0
        bundle = {
            'page': page,
            'headers': self.response_headers,
            'cookies': self.cookies,
        }
        self.deferred.callback(bundle)
if __name__ == '__main__':
def cback(result):
for k in result:
print k, '==>', result[k]
reactor.stop()
def eback(error):
print error.getTraceback()
reactor.stop()
d = getPage('http://example.com', agent='example web client',
cookies={ 'some' : 'cookie' } )
d.addCallback(cback)
d.addErrback(eback)
reactor.run()