How to deal with deflated response by urllib2? [duplicate]

How to deal with deflated response by urllib2? [duplicate] - python

This question already has answers here:
Python: Inflate and Deflate implementations
(2 answers)
Closed 4 years ago.
I currently use following code to decompress gzipped response by urllib2:
# Fetch the response body. `req` is defined outside this snippet.
opener = urllib2.build_opener()
response = opener.open(req)
data = response.read()
# Only the 'gzip' content-encoding is handled; for any other encoding
# `html` is never assigned by this snippet.
if response.headers.get('content-encoding', '') == 'gzip':
    # GzipFile reads from a file-like object, so wrap the raw body first.
    data = StringIO.StringIO(data)
    gzipper = gzip.GzipFile(fileobj=data)
    html = gzipper.read()
Does it handle deflated responses too, or do I need to write separate code to handle deflated responses?

You can try
# One-shot decompression with zlib's default settings (expects a zlib header).
if response.headers.get('content-encoding', '') == 'deflate':
    html = zlib.decompress(response.read())
If that fails, here is another way, which I found in the requests source code:
# A negative wbits value makes zlib treat the body as a raw deflate stream
# (no zlib header or trailer).
if response.headers.get('content-encoding', '') == 'deflate':
    html = zlib.decompressobj(-zlib.MAX_WBITS).decompress(response.read())

There is a better way outlined at:
http://rationalpie.wordpress.com/2010/06/02/python-streaming-gzip-decompression/
The author explains how to decompress chunk by chunk, rather than all at once in memory. This is the preferred method when larger files are involved.
Also found this helpful site for testing:
http://carsten.codimi.de/gzip.yaws/

To answer from above comment, the HTTP spec (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.3) says:
If no Accept-Encoding field is present in a request, the server MAY assume that the client will accept any content coding. In this case, if "identity" is one of the available content-codings, then the server SHOULD use the "identity" content-coding, unless it has additional information that a different content-coding is meaningful to the client.
I take that to mean it should use identity. I've never seen a server that doesn't.

You can see how this is handled in the urllib3 source code:
class DeflateDecoder(object):
    """Incrementally decode a ``deflate`` body, zlib-wrapped or raw.

    Input is first fed to a zlib-format decompressor.  If that raises
    ``zlib.error`` before any output was produced, everything buffered so
    far is replayed through a raw-deflate decompressor (negative ``wbits``),
    which is then used for the rest of the stream.
    """

    def __init__(self):
        self._first_try = True
        # Input seen so far; kept only until the stream format is settled.
        self._data = b""
        self._obj = zlib.decompressobj()

    def __getattr__(self, name):
        # Anything not defined here is looked up on the wrapped zlib object.
        return getattr(self._obj, name)

    def decompress(self, data):
        """Return the decompressed bytes for *data*.

        Empty input is returned unchanged.  ``zlib.error`` propagates once
        the stream format has been settled.
        """
        if not data:
            return data

        if not self._first_try:
            return self._obj.decompress(data)

        self._data += data
        try:
            decompressed = self._obj.decompress(data)
        except zlib.error:
            # Not a zlib-wrapped stream: retry everything as raw deflate.
            self._first_try = False
            self._obj = zlib.decompressobj(-zlib.MAX_WBITS)
            try:
                return self.decompress(self._data)
            finally:
                self._data = None

        if decompressed:
            # The zlib header was accepted and output was produced, so the
            # format is known: stop buffering the input and never fall back.
            self._first_try = False
            self._data = None
        return decompressed
class GzipDecoder(object):
    """Incrementally decode a ``gzip`` body using a zlib decompression object."""

    def __init__(self):
        # 16 + MAX_WBITS tells zlib to expect a gzip header and trailer.
        self._obj = zlib.decompressobj(16 + zlib.MAX_WBITS)

    def __getattr__(self, name):
        # Anything not defined here is looked up on the wrapped zlib object.
        return getattr(self._obj, name)

    def decompress(self, data):
        """Return the decompressed bytes for *data*; empty input is returned unchanged."""
        if not data:
            return data
        return self._obj.decompress(data)

Related

In the controllers.py file of odoo, how to get the json string in the post request body?

I have defined a class in the controllers.py file to receive HTTP requests. The remote server sends a POST request and the data in the request body is a JSON string.
I can get the data in the request body directly by converting the JSON string to a dictionary via method http.request.jsonrequest, but for now, I need to get the original JSON string in the request body directly instead of a dictionary to verify a signature.
The method(json.dumps()) of directly converting to JSON strings cannot be used, as the string obtained in this way is not the same as the JSON string in the original request body, which can lead to a failure when verifying the signature.
What should I do about it? Please help me. Thank you.
this is my controllers.py
# -*- coding: utf-8 -*-
from odoo import http
class CallbackNotification(http.Controller):
    """Receives signature-flow callbacks posted to /signature/process/my_odoo."""

    def safety_judgement(self):
        """Read the signature headers and the caller address of the request.

        NOTE(review): every path returns ``None`` and the caller ignores the
        result, so the address comparison currently rejects nothing, and
        ``signature`` / ``time_stamp`` are read but not verified yet.

        :return: None
        """
        header = http.request.httprequest.headers.environ
        # Plain indexing: a missing header raises KeyError.
        signature = header['HTTP_X_TSIGN_OPEN_SIGNATURE']
        time_stamp = header['HTTP_X_TSIGN_OPEN_TIMESTAMP']
        remote_addr = http.request.httprequest.remote_addr
        if remote_addr != '47.99.80.224':
            return

    @http.route('/signature/process/my_odoo', type='json', auth='none')
    def receive_institution_auth(self, **kw):
        """Apply a signature-flow callback to the matching records.

        The JSON body must contain ``action`` and ``flowId``.  Only the
        ``SIGN_FLOW_UPDATE`` action is processed; its ``thirdOrderNo`` field
        is split on commas into model name, record id and approving user id.

        :param kw: route keyword arguments (unused)
        :return: None
        """
        self.safety_judgement()
        request_body = http.request.jsonrequest
        action = request_body['action']
        flow_num = request_body['flowId']
        http_env = http.request.env
        sign_process_id = http_env['sign.process'].sudo().search([('flow_num', '=', flow_num)]).id
        if action == 'SIGN_FLOW_UPDATE':
            third_order = request_body['thirdOrderNo']
            name_id_user_list = third_order.split(',')
            model = name_id_user_list[0]
            record_id = name_id_user_list[1]
            approve_user_id = name_id_user_list[2]
            # 'p' marks callbacks with no approving user (presumably the
            # partner side -- TODO confirm); record_obj exists only otherwise.
            if approve_user_id != 'p':
                record_obj = http_env[model].sudo(user=int(approve_user_id)).browse(int(record_id))
            sign_result = request_body['signResult']
            result_description = request_body['resultDescription']
            account_num = request_body['accountId']
            org_or_account_num = request_body['authorizedAccountId']
            sign_user_id = http_env['sign.users'].sudo().search([('account_num','=',account_num)]).id
            http_manual_env = http_env['manual.sign'].sudo()
            if account_num == org_or_account_num:
                manual_id = http_manual_env.search([('sign_process_id','=',sign_process_id),
                                                    ('sign_user_id','=',sign_user_id)]).id
            else:
                # Authorised account differs from the signer: look it up as
                # an institution and include it in the search.
                institution_id = http_env['institution.account'].sudo().search([('org_num','=',org_or_account_num)]).id
                manual_id = http_manual_env.search([('sign_process_id', '=', sign_process_id),
                                                    ('sign_user_id', '=', sign_user_id),
                                                    ('institution_id','=',institution_id)]).id
            if sign_result == 2:
                http_manual_env.browse(manual_id).write({'sign_result':'success'})
                # Commit before the follow-up action so the result is stored
                # even if that action fails.
                http.request._cr.commit()
                if approve_user_id != 'p':
                    record_obj.approve_action('approved','')
                else:
                    http_env[model].sudo().browse(int(record_id)).write({'partner_sign_state':'success'})
            elif sign_result == 3:
                http_manual_env.browse(manual_id).write({'sign_result':'failed'})
                if approve_user_id == 'p':
                    http_env[model].sudo().browse(int(record_id)).write({'partner_sign_state':'failed'})
            elif sign_result == 4:
                http_manual_env.browse(manual_id).write({'sign_result':'reject'})
                http.request._cr.commit()
                if approve_user_id != 'p':
                    record_obj.approve_action('reject', result_description)
                else:
                    http_env[model].sudo().browse(int(record_id)).write({'partner_sign_state':'reject','partner_reject_reason':result_description})

To obtain a raw body data, you can use the following code:
# Unparsed request body taken from the underlying HTTP request object.
raw_body_data = http.request.httprequest.data

pychong
You can send the value from JSON-RPC into your Json controller.
Js File:
Where pass the value before calling the controller like this.
// Call the JSON controller at /custom/url; the object literal carries the
// values to send ('Your Key' / Your Value are placeholders to replace).
var ajax = require('web.ajax');
ajax.jsonRpc("/custom/url", 'call', {'Your Key': Your Value}).then(function(data) {
    if (data) {
        // Code
    } else {
        // Code
    }});
Py File :
Get the data from the post like this,
@http.route(['/custom/url'], type='json', auth="public", website=True)
def custom_cotroller(self, **post):
    """Read the value sent by the JSON-RPC call from the posted parameters."""
    # 'Your Key' is a placeholder; use the key sent by the JavaScript caller.
    get_data = post.get('Your Key')
    # Your Customise Code
Thanks

@Dipen Shah @CoolFlash95 @Charif DZ
Hello everyone, I've found a solution to this problem. But before I lay out the solution, I hope we can understand the root cause of the problem, so let's examine Odoo's source code.
from odoo.http import JsonRequest--odoo version 10.0--line598
from odoo.http import JsonRequest--odoo version 11.0--line609
In Odoo10
request = self.httprequest.stream.read(), then self.jsonrequest = json.loads(request)
In Odoo11
request = self.httprequest.get_data().decode(self.httprequest.charset), then self.jsonrequest = json.loads(request)
We find that the self object of JsonRequest has the attribute jsonrequest, whose type is dict. Unfortunately, the source code does not give self 'another' attribute containing the original string from the request body. However, it is very easy to add that attribute, so why not?
We can use setattr to dynamically replace methods from the source code. Let's replace the __init__ method of JsonRequest and add another attribute named stream_str.
eg.Odoo version is 10,python version is 2.7
# -*- coding: utf-8 -*-
import logging
from odoo.http import JsonRequest
import werkzeug
import json
_logger = logging.getLogger(__name__)
def __init__(self, *args):
    """Replacement for ``JsonRequest.__init__`` that also keeps the raw body.

    This is the method body copied from the Odoo 10 source with one
    addition: the text that gets parsed as JSON is stored on the request as
    ``stream_str``.  The attribute is always defined; it stays ``None`` on
    the jsonp POST path, which returns before any request text is read.

    :param args: positional arguments forwarded to the parent constructor
    :raises werkzeug.exceptions.BadRequest: when the request text is not valid JSON
    """
    super(JsonRequest, self).__init__(*args)

    self.jsonp_handler = None
    # Defined up front so handlers can read it on every code path.
    self.stream_str = None

    args = self.httprequest.args
    jsonp = args.get('jsonp')
    self.jsonp = jsonp
    request = None
    request_id = args.get('id')

    if jsonp and self.httprequest.method == 'POST':
        # jsonp 2 steps step1 POST: save call
        def handler():
            self.session['jsonp_request_%s' % (request_id,)] = self.httprequest.form['r']
            self.session.modified = True
            headers = [('Content-Type', 'text/plain; charset=utf-8')]
            r = werkzeug.wrappers.Response(request_id, headers=headers)
            return r
        self.jsonp_handler = handler
        return
    elif jsonp and args.get('r'):
        # jsonp method GET
        request = args.get('r')
    elif jsonp and request_id:
        # jsonp 2 steps step2 GET: run and return result
        request = self.session.pop('jsonp_request_%s' % (request_id,), '{}')
    else:
        # regular jsonrpc2
        request = self.httprequest.stream.read()

    # The added line: keep the original string exactly as it is about to be
    # parsed, so callers can verify signatures against it.
    self.stream_str = request

    # Read POST content or POST Form Data named "request"
    try:
        self.jsonrequest = json.loads(request)
    except ValueError:
        msg = 'Invalid JSON data: %r' % (request,)
        _logger.info('%s: %s', self.httprequest.path, msg)
        raise werkzeug.exceptions.BadRequest(msg)

    self.params = dict(self.jsonrequest.get("params", {}))
    self.context = self.params.pop('context', dict(self.session.context))


# Replace the __init__ method of JsonRequest with the new one at import time,
# without editing the Odoo source code.
setattr(JsonRequest, '__init__', __init__)
In the definition of the routing function, we can do this.
# -*- coding: utf-8 -*-
from odoo.http import Controller,route,request
class CallbackNotification(Controller):
    """Callback controller that reads the raw JSON body of the request."""

    @route('/signature/process/my_odoo', type='json', auth='none')
    def receive_institution_auth(self, **kw):
        # With type='json' the request is a JsonRequest object, so the
        # stream_str attribute added by the patched __init__ is available.
        stream_str = request.stream_str
Now the problem has been solved.

OpenSubtitle API, returning blank data with status 200

(A same question is available in Stackoverflow. But that didn't help me because it used other function)
API Documentation
Hello, I am trying to implement Opensubtitle API with Python. I prefer trying to search subtitle file with hash, because it's accurate.
As I have never used xmlrpc before and am quite new to using APIs, I had to study to make it work. But I am stuck at the final point. My program is returning Status 200 (OK), but the 'data' array is blank. I think I am doing something wrong with the parameter passing. The code is here:
from xmlrpclib import ServerProxy
import hashCheck, os
server = 'http://api.opensubtitles.org/xml-rpc'
class MainEngine(object):
    """Log in to the OpenSubtitles XML-RPC server and search subtitles by file hash.

    NOTE(review): the ``language`` argument and the local ``user_agent`` are
    never used.
    """

    def __init__(self, language="None"):
        self.rpc = ServerProxy(server, allow_none=True)
        user_agent = 'OSTestUserAgentTemp'
        # Constructing the object immediately runs the search.
        self.Start()

    def getToken(self):
        """Call LogIn without a username or password and return the ``token`` field."""
        self.logindata = self.rpc.LogIn(None, None, "en", "OSTestUserAgentTemp")
        self.Token = self.logindata["token"]
        return self.Token

    def subSearch(self, path):
        """Hash the file, log in, and print the SearchSubtitles response.

        NOTE(review): the hash is computed from ``self.path`` rather than the
        ``path`` argument, and the token is sent twice -- once as the first
        call argument and again as the first element of ``self.param``.
        """
        self.hash = hashCheck.hash(self.path)
        token = self.getToken()
        self.param = [
            token, # token
            [
                'eng', # sublanguageid
                self.hash, #hash
                os.path.getsize(path), # byte size
            ]
        ]
        Obj = self.rpc.SearchSubtitles(token, self.param)
        print Obj

    def Start(self):
        """Set the hard-coded movie path and search subtitles for it."""
        # print server
        self.path = "E:\Movies\English\Captain Phillips\Captain Phillips.mp4"
        self.subSearch(self.path)
def main():
    """Create a MainEngine; its constructor performs the search."""
    MainEngine()


if __name__ == '__main__':
    main()

what's wrong with this python code

This is the code for a web crawler.
I'm a beginner in learning Python, so I don't know how to solve it.
Something seems to be wrong with search().
# -*- coding:utf-8 -*-
import urllib,urllib2,re
class BDTB:
    """Fetch pages of one thread and extract the thread title."""

    def __init__(self,baseUrl,seeLZ):
        self.baseUrl = baseUrl
        # Query-string prefix built from the seeLZ flag.
        self.seeLZ = '?see_lz' + str(seeLZ)

    def getPage(self,pageNum):
        """Open page ``pageNum`` of the thread.

        Returns the object produced by ``urllib2.urlopen`` (not the page
        text), or ``None`` when a URLError is raised.
        """
        try:
            url = self.baseUrl + self.seeLZ + '&pn=' + str(pageNum)
            request = urllib2.Request(url)
            response = urllib2.urlopen(request)
            #print response.read().decode('utf-8')
            return response
        except urllib2.URLError,e:
            if hasattr(e,'reason'):
                print u'连接百度贴吧失败，错误原因',e.reason
            return None

    def getTitle(self):
        """Search page 1 for the <h3> title; return it stripped, or None if absent."""
        page = self.getPage(1)
        pattern = re.compile('<h3 class.*?px">(.*?)</h3>',re.S)
        # NOTE(review): `page` is whatever getPage returned, i.e. the response
        # object rather than a string.
        result = re.search(pattern,page)
        if result:
            print result.group(1)
            return result.group(1).strip()
        else:
            return None
# Build a crawler for one thread (seeLZ flag set to 1) and fetch its title.
baseURL = 'http://tieba.baidu.com/p/4095047339'
bdtb = BDTB(baseURL,1)
bdtb.getTitle()

This will raise a TypeError: expected string or buffer because you are passing the object returned from urllib2.urlopen(request) to re.search() when it requires an str.
If you change the return value from:
return response # returns the object
to one that returns the text contained in the response:
return response.read() # returns the text contained in the response
Your script works and after executing it returns:
广告兼职及二手物品交易集中贴
Additionally, since you're working with Python 2.x, you might want to change your class definition from class BDTB: to class BDTB(object): in order to use new-style classes.

How to abstract from pycurl the correct way

I would like to write a pseudo module that makes a GET request that keeps on going (much like the one used to consume the Twitter Streaming API), while making it unnecessary to pass in all the parameters every time someone wants to call a function to make that same GET request.
In my module.py I have
class viewResults():
    """Run a long-lived pycurl GET and pass each decoded JSON message to a callback."""

    def __init__(self,username,password,keyname,consume):
        """Configure the curl handle and start the request.

        ``consume`` is the callable invoked with each decoded message.
        ``perform()`` is called here, inside the constructor.
        """
        self.buffer = ""
        self.consume = consume
        self.conn = pycurl.Curl()
        self.conn.setopt(pycurl.USERPWD, "%s:%s" % (username, password))
        self.conn.setopt(pycurl.URL, "http://crowdprocess.no.de/"+keyname+"/results")
        self.conn.setopt(pycurl.WRITEFUNCTION, self.on_receive)
        # self.conn.setopt(pycurl.VERBOSE, 1)
        # self.conn.setopt(pycurl.DEBUGFUNCTION, self.debug)
        self.conn.perform()

    # def debug(self,debug_type,debug_message):
    #     print 'type: '+str(debug_type)+' message'+str(debug_message)

    def on_receive(self, data):
        """Accumulate ``data``; once a chunk ends with CRLF, decode and deliver the buffer."""
        self.buffer += data
        # Only parse when the chunk ends a line and the buffer is not blank.
        if data.endswith("\r\n") and self.buffer.strip():
            content = json.loads(self.buffer)
            self.consume(content)
            self.buffer = ""
And on index.py I have
from module import viewResults
def consume(content):
    """Print one decoded message."""
    print content


# Start the streaming request; `consume` is passed as the callback.
viewResults('username','password','keyname',consume)
So I wanted to pass only the parameters username, password, keyname and the "consume" function that should be called when the buffer is full of valid JSON data...
What's happening is that the request is actually made (if VERBOSE is on I can see all the data arriving), but that higher-level "consume" function gets nothing...
How can I achieve this ?
Thanks.

As I understand it, you want to archive the debug data?
Create a custom debug function to store your data: custom_debug(debug_type, debug_msg)
>>> import human_curl as hurl
>>> import json
>>> r = hurl.get("http://crowdprocess.no.de/"+keyname+"/results",
... debug=custom_debug, auth=('username', 'password'),)
>>> consume(json.loads(r.content))

I could not see anything in your posted code where on_receive(self, data) prints something.
Add sys.stderr.write("%s\n" % data) to it:
def on_receive(self, data):
    """Echo a received chunk to stderr, then buffer it and deliver complete messages.

    The chunk is appended to ``self.buffer``.  When the chunk ends with CRLF
    and the buffer is not blank, the buffer is decoded as JSON, passed to
    ``self.consume`` and reset to the empty string.
    """
    # -- print data to stderr --
    import sys
    sys.stderr.write("%s\n" % data)
    # -- end --
    self.buffer += data
    if data.endswith("\r\n") and self.buffer.strip():
        content = json.loads(self.buffer)
        self.consume(content)
        self.buffer = ""

Python - seek in http response stream

Using urllibs (or urllibs2) and wanting what I want is hopeless.
Any solution?

I'm not sure how the C# implementation works, but, as internet streams are generally not seekable, my guess would be it downloads all the data to a local file or in-memory object and seeks within it from there. The Python equivalent of this would be to do as Abafei suggested and write the data to a file or StringIO and seek from there.
However, if, as your comment on Abafei's answer suggests, you want to retrieve only a particular part of the file (rather than seeking backwards and forwards through the returned data), there is another possibility. urllib2 can be used to retrieve a certain section (or 'range' in HTTP parlance) of a webpage, provided that the server supports this behaviour.
The range header
When you send a request to a server, the parameters of the request are given in various headers. One of these is the Range header, defined in section 14.35 of RFC2616 (the specification defining HTTP/1.1). This header allows you to do things such as retrieve all data starting from the 10,000th byte, or the data between bytes 1,000 and 1,500.
Server support
There is no requirement for a server to support range retrieval. Some servers will return the Accept-Ranges header (section 14.5 of RFC2616) along with a response to report if they support ranges or not. This could be checked using a HEAD request. However, there is no particular need to do this; if a server does not support ranges, it will return the entire page and we can then extract the desired portion of data in Python as before.
Checking if a range is returned
If a server returns a range, it must send the Content-Range header (section 14.16 of RFC2616) along with the response. If this is present in the headers of the response, we know a range was returned; if it is not present, the entire page was returned.
Implementation with urllib2
urllib2 allows us to add headers to a request, thus allowing us to ask the server for a range rather than the entire page. The following script takes a URL, a start position, and (optionally) a length on the command line, and tries to retrieve the given section of the page.
import sys
import urllib2

# Check command line arguments.
if len(sys.argv) < 3:
    sys.stderr.write("Usage: %s url start [length]\n" % sys.argv[0])
    sys.exit(1)

# Create a request for the given URL.
request = urllib2.Request(sys.argv[1])

# Add the header to specify the range to download.
if len(sys.argv) > 3:
    # Only the start and length arguments are used; extra ones are ignored.
    start, length = map(int, sys.argv[2:4])
    request.add_header("range", "bytes=%d-%d" % (start, start + length - 1))
else:
    request.add_header("range", "bytes=%s-" % sys.argv[2])

# Try to get the response. This will raise a urllib2.URLError if there is a
# problem (e.g., invalid URL).
response = urllib2.urlopen(request)

# If a content-range header is present, partial retrieval worked.
if "content-range" in response.headers:
    print("Partial retrieval successful.")

    # The header contains the string 'bytes', followed by a space, then the
    # range in the format 'start-end', followed by a slash and then the total
    # size of the page (or an asterisk if the total size is unknown). Let's
    # get the range and total size from this.
    byte_range, total = response.headers['content-range'].split(' ')[-1].split('/')

    # Print a message giving the range information.
    if total == '*':
        print("Bytes %s of an unknown total were retrieved." % byte_range)
    else:
        print("Bytes %s of a total of %s were retrieved." % (byte_range, total))

# No header, so partial retrieval was unsuccessful.
else:
    print("Unable to use partial retrieval.")

# And for good measure, let's check how much data we downloaded.
data = response.read()
print("Retrieved data size: %d bytes" % len(data))
Using this, I can retrieve the final 2,000 bytes of the Python homepage:
blair@blair-eeepc:~$ python retrieverange.py http://www.python.org/ 17387
Partial retrieval successful.
Bytes 17387-19386 of a total of 19387 were retrieved.
Retrieved data size: 2000 bytes
Or 400 bytes from the middle of the homepage:
blair@blair-eeepc:~$ python retrieverange.py http://www.python.org/ 6000 400
Partial retrieval successful.
Bytes 6000-6399 of a total of 19387 were retrieved.
Retrieved data size: 400 bytes
However, the Google homepage does not support ranges:
blair@blair-eeepc:~$ python retrieverange.py http://www.google.com/ 1000 500
Unable to use partial retrieval.
Retrieved data size: 9621 bytes
In this case, it would be necessary to extract the data of interest in Python prior to any further processing.

It may work best just to write the data to a file (or even to a string, using StringIO), and to seek in that file (or string).

I did not find any existing implementations of a file-like interface with seek() to HTTP URLs, so I rolled my own simple version: https://github.com/valgur/pyhttpio. It depends on urllib.request but could probably easily be modified to use requests, if necessary.
The full code:
import cgi
import time
import urllib.request
from io import IOBase
from sys import stderr
class SeekableHTTPFile(IOBase):
    """Read-only file object that fetches byte ranges of a URL on demand."""

    def __init__(self, url, name=None, repeat_time=-1, debug=False):
        """Allow a file accessible via HTTP to be used like a local file by utilities
        that use `seek()` to read arbitrary parts of the file, such as `ZipFile`.
        Seeking is done via the 'range: bytes=xx-yy' HTTP header.

        Parameters
        ----------
        url : str
            A HTTP or HTTPS URL
        name : str, optional
            The filename of the file.
            Will be filled from the Content-Disposition header if not provided.
        repeat_time : int, optional
            In case of HTTP errors wait `repeat_time` seconds before trying again.
            Negative value or `None` disables retrying and simply passes on the exception (the default).
        debug : bool, optional
            When true, print the response headers and every method call made
            on this object.
        """
        super().__init__()
        self.url = url
        self.name = name
        self.repeat_time = repeat_time
        self.debug = debug
        self._pos = 0
        self._seekable = True
        with self._urlopen() as f:
            if self.debug:
                print(f.getheaders())
            self.content_length = int(f.getheader("Content-Length", -1))
            # Seeking needs both a known length and byte-range support.
            if self.content_length < 0:
                self._seekable = False
            if f.getheader("Accept-Ranges", "none").lower() != "bytes":
                self._seekable = False
            if name is None:
                header = f.getheader("Content-Disposition")
                if header:
                    # Parsed with the email package so this does not depend
                    # on the `cgi` module; a header without a filename
                    # parameter simply leaves `name` as None.
                    from email.message import Message
                    parsed = Message()
                    parsed["Content-Disposition"] = header
                    filename = parsed.get_filename()
                    if filename:
                        self.name = filename

    def seek(self, offset, whence=0):
        """Move the read position and return it.

        Raises OSError when the resource is not seekable or the resulting
        position would be negative, and ValueError for an unknown `whence`.
        The position is left unchanged when an error is raised.
        """
        if not self.seekable():
            raise OSError("resource is not seekable")
        if whence == 0:
            base = 0
        elif whence == 1:
            base = self._pos
        elif whence == 2:
            base = self.content_length
        else:
            raise ValueError("invalid whence ({!r}, should be 0, 1 or 2)".format(whence))
        target = base + offset
        if target < 0:
            raise OSError("negative seek position {}".format(target))
        self._pos = target
        return self._pos

    def seekable(self, *args, **kwargs):
        return self._seekable

    def readable(self, *args, **kwargs):
        return not self.closed

    def writable(self, *args, **kwargs):
        return False

    def read(self, amt=-1):
        """Return up to `amt` bytes from the current position.

        A negative `amt` or `None` reads to the end of the file.  Returns
        b"" without contacting the server when `amt` is 0 or the position is
        at or past `content_length`.
        """
        if amt is None:
            amt = -1
        if amt == 0 or self._pos >= self.content_length:
            return b""
        last = self.content_length - 1
        if amt > 0:
            last = min(self._pos + amt - 1, last)
        byte_range = (self._pos, last)
        with self._urlopen(byte_range) as f:
            data = f.read()
        # Advance only after the request succeeded.
        self._pos = last + 1
        return data

    def readall(self):
        return self.read(-1)

    def tell(self):
        return self._pos

    def __getattribute__(self, item):
        # In debug mode every callable attribute is wrapped so that each
        # call is printed with its arguments before being executed.
        attr = object.__getattribute__(self, item)
        if not object.__getattribute__(self, "debug"):
            return attr
        if callable(attr):
            def trace(*args, **kwargs):
                parts = [str(a) for a in args]
                parts.extend("{}={}".format(k, v) for k, v in kwargs.items())
                print("Calling: {}({})".format(item, ", ".join(parts)))
                return attr(*args, **kwargs)
            return trace
        return attr

    def _urlopen(self, byte_range=None):
        """Open the URL, optionally for an inclusive (first, last) byte range.

        On HTTPError the request is retried every `repeat_time` seconds; if
        `repeat_time` is None or negative the error is re-raised instead.
        """
        header = {}
        if byte_range:
            header = {"range": "bytes={}-{}".format(*byte_range)}
        while True:
            try:
                r = urllib.request.Request(self.url, headers=header)
                return urllib.request.urlopen(r)
            except urllib.error.HTTPError as e:
                if self.repeat_time is None or self.repeat_time < 0:
                    raise
                print("Server responded with " + str(e), file=stderr)
                print("Sleeping for {} seconds before trying again".format(self.repeat_time), file=stderr)
                time.sleep(self.repeat_time)
A potential usage example:
from zipfile import ZipFile

# Open a remote zip archive and extract one member from it.
url = "https://www.python.org/ftp/python/3.5.0/python-3.5.0-embed-amd64.zip"
f = SeekableHTTPFile(url, debug=True)
zf = ZipFile(f)
zf.printdir()
zf.extract("python.exe")
Edit: There is actually a mostly identical, if slightly more minimal, implementation in this answer: https://stackoverflow.com/a/7852229/2997179

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

How to deal with deflated response by urllib2? [duplicate] - python

Related

In the controllers.py file of odoo, how to get the json string in the post request body?

OpenSubtitle API, returning blank data with status 200

what's wrong with this python code

How to abstract from pycurl the correct way

Python - seek in http response stream

Categories

Resources