Parsing a string from request data - Python

I am using python requests to obtain a file's source code, and then parse a string from the source. The string I am trying to parse looks like magic: 8susjdhdyrhsisj3864jsud (the value is not always the same). If I print the source to the screen, the string shows up just fine. But when I parse it, sometimes I get a result and other times I get nothing. Please see the following screenshots: http://i.imgur.com/NW1zFZK.png, http://i.imgur.com/cb9e2cb.png. The string I want always appears in the source, so it must be a regex issue? I've tried findall and search, but both methods give me the same outcome: a result sometimes, nothing other times. What seems to be my issue?
import re
import requests

class Solvemedia():
    def __init__(self, key):
        self.key = key

    def timestamp(self, source):
        timestamp_regex = re.compile(ur'chalstamp:\s+(\d+),')
        print re.findall(timestamp_regex, source)

    def magic(self, source):
        magic_regex = re.compile(ur'magic:\s+\'(\w+)\',')
        print re.findall(magic_regex, source)

    def source(self):
        solvemedia = requests.Session()
        solvemedia.headers.update({
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
        })
        source = solvemedia.get('http://api.solvemedia.com/papi/challenge.script?k={}'.format(self.key)).text
        return source

    def test(self):
        js_source = self.source()
        print js_source
        self.magic(js_source)
        self.timestamp(js_source)

solvemedia = Solvemedia('HUaZ-6d2wtQT3-LkLVDPJB5C.E99j9ZK')
solvemedia.test()

There is a . in one of the values, but \w doesn't match dots. Compare:
magic: 'AZJEXYx.ZsExcTHvjH9mwQ',
// ^
with:
magic: 'xfF9i4YBAQP1EgoNhgEBAw',
A better bet is to allow all characters except a quote:
magic_regex = re.compile(ur"magic:\s+'([^']+)',")
Demo:
>>> import re
>>> samples = [
... u"magic: 'xfF9i4YBAQP1EgoNhgEBAw',",
... u"magic: 'AZJEXYx.ZsExcTHvjH9mwQ',",
... ]
>>> magic_regex = re.compile(ur"magic:\s+'([^']+)',")
>>> for sample in samples:
... print magic_regex.search(sample).group(1)
...
xfF9i4YBAQP1EgoNhgEBAw
AZJEXYx.ZsExcTHvjH9mwQ
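As a side note, re.search returns None when nothing matches, so if the page ever omits the field it is safer to guard the call before taking .group(1) - a small sketch reusing the pattern above (source stands for the fetched page text):

import re

magic_regex = re.compile(ur"magic:\s+'([^']+)',")
match = magic_regex.search(source)  # source is the fetched page text
if match:
    print match.group(1)
else:
    print 'no magic value in this response'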

Related

How can I improve my multithreading speed and efficiency in Python?

How can I improve the multithreading speed in my code?
My code takes 130 seconds to make 700 requests with 100 threads, which is really slow and frustrating given that I am using 100 threads.
My code edits the parameter values of a URL and makes a request to each variant, including the original (unedited) URL. The URLs are read from a file (urls.txt).
Let me show you an example:
Let's consider the following url:
https://www.test.com/index.php?parameter=value1&parameter2=value2
The url contains 2 parameters so my code will make 3 requests.
1 request to the original url:
https://www.test.com/index.php?parameter=value1&parameter2=value2
1 request to the first modified value:
https://www.test.com/index.php?parameter=replaced_value&parameter2=value2
1 request to the second modified value:
https://www.test.com/index.php?parameter=value1&parameter2=replaced_value
I have tried using asyncio for this, but I had more success with concurrent.futures.
I even tried increasing the number of threads, which I thought was the issue at first, but it wasn't: increasing the thread count considerably made the script freeze at the start for 30-50 seconds, and it didn't increase the speed as I expected.
I assume this is a code issue in how I build up the multithreading, because I have seen other people achieve incredible speeds with concurrent.futures.
import requests
import uuid
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

start = time.time()

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}

def make_request(url2):
    try:
        if '?' in url2 and '=' in url2:
            request_1 = requests.get(url2, headers=headers, timeout=10)
            url2_modified = url2.split("?")[1]
            times = url2_modified.count("&") + 1
            for x in range(0, times):
                split1 = url2_modified.split("&")[x]
                value = split1.split("=")[1]
                parameter = split1.split("=")[0]
                url = url2.replace('='+value, '=1')
                request_2 = requests.get(url, stream=True, headers=headers, timeout=10)
                html_1 = request_1.text
                html_2 = request_2.text
                print(request_1.status_code, '-', url2)
                print(request_2.status_code, '-', url)
    except requests.exceptions.RequestException as e:
        return e

def runner():
    threads = []
    with ThreadPoolExecutor(max_workers=100) as executor:
        file1 = open('urls.txt', 'r', errors='ignore')
        Lines = file1.readlines()
        count = 0
        for line in Lines:
            count += 1
            threads.append(executor.submit(make_request, line.strip()))

runner()
end = time.time()
print(end - start)
Inside the loop in make_request you run a plain requests.get, and it doesn't use a thread (or any other method) to make it faster - so it has to wait for the previous request to finish before running the next one.
In make_request I use another ThreadPoolExecutor to run every requests.get (created in the loop) in a separate thread:
executor.submit(make_modified_request, modified_url)
and it gives me a time of ~1.2s.
If I use a normal call
make_modified_request(modified_url)
then it gives me a time of ~3.2s.
Minimal working example:
I use the real URL https://httpbin.org/get so everyone can simply copy and run it.
from concurrent.futures import ThreadPoolExecutor
import requests
import time
#import urllib.parse

# --- constants --- (PEP8: UPPER_CASE_NAMES)

HEADERS = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}

# --- functions ---

def make_modified_request(url):
    """Send modified url."""
    print('send:', url)
    response = requests.get(url, stream=True, headers=HEADERS)
    print(response.status_code, '-', url)
    html = response.text  # ???
    # ... code to process HTML ...

def make_request(url):
    """Send normal url and create threads with modified urls."""
    threads = []
    with ThreadPoolExecutor(max_workers=10) as executor:
        print('send:', url)
        # send base url
        response = requests.get(url, headers=HEADERS)
        print(response.status_code, '-', url)
        html = response.text  # ???

        #parts = urllib.parse.urlparse(url)
        #print('query:', parts.query)
        #arguments = urllib.parse.parse_qs(parts.query)
        #print('arguments:', arguments)  # dict {'a': ['A'], 'b': ['B'], 'c': ['C'], 'd': ['D'], 'e': ['E']}

        arguments = url.split("?")[1]
        arguments = arguments.split("&")
        arguments = [arg.split("=") for arg in arguments]
        print('arguments:', arguments)  # list [['a', 'A'], ['b', 'B'], ['c', 'C'], ['d', 'D'], ['e', 'E']]

        for name, value in arguments:
            modified_url = url.replace('='+value, '=1')
            print('modified_url:', modified_url)
            # run thread with modified url
            threads.append(executor.submit(make_modified_request, modified_url))
            # run normal function with modified url
            #make_modified_request(modified_url)

    print('[make_request] len(threads):', len(threads))

def runner():
    threads = []
    with ThreadPoolExecutor(max_workers=10) as executor:
        #fh = open('urls.txt', errors='ignore')
        fh = [
            'https://httpbin.org/get?a=A&b=B&c=C&d=D&e=E',
            'https://httpbin.org/get?f=F&g=G&h=H&i=I&j=J',
            'https://httpbin.org/get?k=K&l=L&m=M&n=N&o=O',
            'https://httpbin.org/get?a=A&b=B&c=C&d=D&e=E',
            'https://httpbin.org/get?f=F&g=G&h=H&i=I&j=J',
            'https://httpbin.org/get?k=K&l=L&m=M&n=N&o=O',
        ]
        for line in fh:
            url = line.strip()
            # create thread with url
            threads.append(executor.submit(make_request, url))

    print('[runner] len(threads):', len(threads))

# --- main ---

start = time.time()
runner()
end = time.time()
print('time:', end - start)
BTW: I was thinking of using a single
executor = ThreadPoolExecutor(max_workers=10)
and later using the same executor in all functions - maybe it would run a little faster - but at the moment I don't have working code.
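A minimal sketch of that single-executor idea, reusing the httpbin URLs from the example above (an untested variant, not the answer's tested code): the outer tasks submit their inner tasks to the same pool and return the inner futures, so no worker blocks while the inner requests run.

from concurrent.futures import ThreadPoolExecutor, wait
import requests
import time

HEADERS = {'User-Agent': 'Mozilla/5.0'}

# one shared executor instead of a nested executor per call
executor = ThreadPoolExecutor(max_workers=20)

def make_modified_request(url):
    # send a single modified url
    response = requests.get(url, headers=HEADERS)
    print(response.status_code, '-', url)

def make_request(url):
    # send the base url, then submit every modified url to the same executor
    response = requests.get(url, headers=HEADERS)
    print(response.status_code, '-', url)
    futures = []
    for pair in url.split('?')[1].split('&'):
        name, value = pair.split('=')
        futures.append(executor.submit(make_modified_request, url.replace('=' + value, '=1')))
    return futures

urls = [
    'https://httpbin.org/get?a=A&b=B&c=C',
    'https://httpbin.org/get?d=D&e=E&f=F',
]

start = time.time()
# submit the base requests; each returns the futures for its modified urls
outer = [executor.submit(make_request, u) for u in urls]
inner = [f for fut in outer for f in fut.result()]
wait(inner)
executor.shutdown()
print('time:', time.time() - start)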

Recreating python mechanize script in R

I'd like to recreate the python script below, which uses mechanize and http.cookiejar, in R. I thought it would be straightforward using rvest, but I was unable to do so. Any insight on which packages to use and how to apply them would be extremely helpful. I realize reticulate may be a possibility, but I figure there has to be a straightforward way to do this in R.
import mechanize
import http.cookiejar

b = mechanize.Browser()
b.set_handle_refresh(True)
b.set_debug_redirects(True)
b.set_handle_redirect(True)
b.set_debug_http(True)

cj = http.cookiejar.CookieJar()
b.set_cookiejar(cj)

b.addheaders = [
    ('User-agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 Safari/537.36'),
    ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'),
    ('Host', 'www.fangraphs.com'),
    ('Referer', 'https://www.fangraphs.com/auctiontool.aspx?type=pit&proj=atc&pos=1,1,1,1,5,1,1,0,0,1,5,5,0,18,0&dollars=400&teams=12&mp=5&msp=5&mrp=5&mb=1&split=&points=c|0,1,2,3,4,5|0,1,2,3,4,5&lg=MLB&rep=0&drp=0&pp=C,SS,2B,3B,OF,1B&players=')
]

b.open("https://www.fangraphs.com/auctiontool.aspx?type=pit&proj=atc&pos=1,1,1,1,5,1,1,0,0,1,5,5,0,18,0&dollars=400&teams=12&mp=5&msp=5&mrp=5&mb=1&split=&points=c|0,1,2,3,4,5|0,1,2,3,4,5&lg=MLB&rep=0&drp=0&pp=C,SS,2B,3B,OF,1B&players=")

def is_form1_form(form):
    return "id" in form.attrs and form.attrs['id'] == "form1"

b.select_form(predicate=is_form1_form)
b.form.find_control(name='__EVENTTARGET').readonly = False
b.form.find_control(name='__EVENTARGUMENT').readonly = False
b.form['__EVENTTARGET'] = 'AuctionBoard1$cmdCSV'
b.form['__EVENTARGUMENT'] = ''
print(b.submit().read())
The R code I was using to attempt to recreate this with rvest is below. The comments indicate the main source of my confusion. In particular, the fields the python code grabs were not showing up when I grabbed the form with rvest, and when I tried to insert them manually I got a Connection Refused error upon submitting.
library(rvest)
atc.pitcher.link = "https://www.fangraphs.com/auctiontool.aspx?type=pit&proj=atc&pos=1,1,1,1,5,1,1,0,0,1,5,5,0,18,0&dollars=400&teams=12&mp=5&msp=5&mrp=5&mb=1&split=&points=c|0,1,2,3,4,5|0,1,2,3,4,5&lg=MLB&rep=0&drp=0&pp=C,SS,2B,3B,OF,1B&players="
proj.data = html_session(atc.pitcher.link)
form.unfilled = proj.data %>% html_node("form") %>% html_form()
# note: I am surprised "__EVENTTARGET" and "__EVENTARGUMENT" are not included as attributes of the unfilled form. I can select them in the posted python script.
# If I try to create them with the appropriate values I get a Connection Refused error.
form.unfilled[[5]]$`__EVENTTARGET` = form.unfilled[[5]]$`__VIEWSTATE`
form.unfilled[[5]]$`__EVENTARGUMENT`= form.unfilled[[5]]$`__VIEWSTATE`
form.unfilled[[5]]$`__EVENTTARGET`$readonly = FALSE
form.unfilled[[5]]$`__EVENTTARGET`$value = "AuctionBoard1$cmdCSV"
form.unfilled[[5]]$`__EVENTARGUMENT`$value = ""
form.unfilled[[5]]$`__EVENTARGUMENT`$readonly = FALSE
form.filled = form.unfilled
session = submit_form(proj.data, form.filled)
Here is a way to do it using RSelenium, setting Chrome to be headless and enabling remote download to your working directory. It automatically brings up a headless browser and then lets the code drive it.
I believe that to do the equivalent in rvest you would need to write some native phantomjs.
library(RSelenium)
library(wdman)

eCaps <- list(
  chromeOptions = list(
    args = c('--headless', '--disable-gpu', '--window-size=1280,800'),
    prefs = list(
      "profile.default_content_settings.popups" = 0L,
      "download.prompt_for_download" = FALSE,
      "download.default_directory" = getwd()
    )
  )
)

cDrv <- wdman::chrome()
rD <- RSelenium::rsDriver(extraCapabilities = eCaps)
remDr <- rD$client

remDr$queryRD(
  ipAddr = paste0(remDr$serverURL, "/session/", remDr$sessionInfo[["id"]], "/chromium/send_command"),
  method = "POST",
  qdata = list(
    cmd = "Page.setDownloadBehavior",
    params = list(
      behavior = "allow",
      downloadPath = getwd()
    )
  )
)

atc.pitcher.link = "http://www.fangraphs.com/auctiontool.aspx?type=pit&proj=atc&pos=1,1,1,1,5,1,1,0,0,1,5,5,0,18,0&dollars=400&teams=12&mp=5&msp=5&mrp=5&mb=1&split=&points=c|0,1,2,3,4,5|0,1,2,3,4,5&lg=MLB&rep=0&drp=0&pp=C,SS,2B,3B,OF,1B&players="
remDr$navigate(atc.pitcher.link)
# sleep to be nice and give things time to load
Sys.sleep(8)
# find the button the page we want to click
option <- remDr$findElement('id', 'AuctionBoard1_cmdCSV')
#click it
option$clickElement()
list.files(getwd(),pattern = 'sysdata')
remDr$closeall()
cDrv$stop()

Converting a string representation of a json/dict to something usable with python request

I finally had to give up and ask for help. I am retrieving a document (with requests) that has a JSON-like format (but is not well formed - no double quotes) and trying to extract the data as a normal dict. Here is what I have; this works and will get you the output from which I am trying to extract the data.
import requests

def test():
    url = "http://www.sgx.com/JsonRead/JsonstData"
    payload = {}
    payload['qryId'] = 'RSTIc'
    payload['timeout'] = 60
    header = {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Linux i686; Trident/2.0)', 'Content-Type': 'text/html; charset=utf-8'}
    req = requests.get(url, headers=header, params=payload)
    print(req.url)
    prelim = req.content.decode('utf-8')
    print(type(prelim))
    print(prelim)

test()
What I would like to have after that is: (assuming a properly functioning dict)
for stock in prelim['items']:
    print(stock['N'])
Which should give me a list of all the stocks names.
I have tried most of the json functions: prelim.json(), loads, load, dump, dumps, parse. None seems to work because the data is not formatted properly. I also tried ast.literal_eval() without success. I tried some examples from Stack Overflow for converting that string into a proper dict, but no luck. I don't seem to be able to make that string behave as a proper dictionary. If you can point me in the right direction, that would be much appreciated.
Good samaritans have asked for an example of the data. The data coming from the above request is a bit longer, but I removed a few 'items' so people can see the general look of what is retrieved.
{}&& {identifier:'ID', label:'As at 19-03-2018 8:38 AM',items:[{ID:0,N:'AscendasReit',SIP:'',NC:'A17U',R:'',I:'',M:'',LT:0,C:0,VL:97.600,BV:485.300,B:'2.670',S:'2.670',SV:1009.100,O:0,H:0,L:0,V:259811.200,SC:'9',PV:2.660,P:0,P_:'X',V_:''},
{ID:1,N:'CapitaComTrust',SIP:'',NC:'C61U',R:'',I:'',M:'',LT:0,C:0,VL:126.349,BV:1467.300,B:'1.800',S:'1.800',SV:620.900,O:0,H:0,L:0,V:228691.690,SC:'9',PV:1.810,P:0,P_:'X',V_:''},
{ID:2,N:'CapitaLand',SIP:'',NC:'C31',R:'',I:'',M:'',LT:0,C:0,VL:78.000,BV:184.900,B:'3.670',S:'3.670',SV:372.900,O:0,H:0,L:0,V:286026.000,SC:'9',PV:3.660,P:0,P_:'X',V_:''},
{ID:28,N:'Wilmar Intl',SIP:'',NC:'F34',R:'CD',I:'',M:'',LT:0,C:0,VL:0.000,BV:32.000,B:'3.210',S:'3.210',SV:73.100,O:0,H:0,L:0,V:0.000,SC:'2',PV:3.220,P:0,P_:'',V_:''},
{ID:29,N:'YZJ Shipbldg SGD',SIP:'',NC:'BS6',R:'',I:'',M:'',LT:0,C:0,VL:0.000,BV:349.500,B:'1.330',S:'1.330',SV:417.700,O:0,H:0,L:0,V:0.000,SC:'2',PV:1.340,P:0,P_:'',V_:''}]}
Following the recent comments, I know I could do this:
def test2():
    my_text = "{}&& {identifier:'ID', label:'As at 19-03-2018 8:38 AM',items:[{ID:0,N:'AscendasReit',SIP:'',NC:'A17U',R:'',I:'',M:'',LT:0,C:0,VL:97.600,BV:485.300,B:'2.670',S:'2.670',SV:1009.100,O:0,H:0,L:0,V:259811.200,SC:'9',PV:2.660,P:0,P_:'X',V_:''}, {ID:1,N:'CapitaComTrust',SIP:'',NC:'C61U',R:'',I:'',M:'',LT:0,C:0,VL:126.349,BV:1467.300,B:'1.800',S:'1.800',SV:620.900,O:0,H:0,L:0,V:228691.690,SC:'9',PV:1.810,P:0,P_:'X',V_:''}, {ID:2,N:'CapitaLand',SIP:'',NC:'C31',R:'',I:'',M:'',LT:0,C:0,VL:78.000,BV:184.900,B:'3.670',S:'3.670',SV:372.900,O:0,H:0,L:0,V:286026.000,SC:'9',PV:3.660,P:0,P_:'X',V_:''}, {ID:28,N:'Wilmar Intl',SIP:'',NC:'F34',R:'CD',I:'',M:'',LT:0,C:0,VL:0.000,BV:32.000,B:'3.210',S:'3.210',SV:73.100,O:0,H:0,L:0,V:0.000,SC:'2',PV:3.220,P:0,P_:'',V_:''}, {ID:29,N:'YZJ Shipbldg SGD',SIP:'',NC:'BS6',R:'',I:'',M:'',LT:0,C:0,VL:0.000,BV:349.500,B:'1.330',S:'1.330',SV:417.700,O:0,H:0,L:0,V:0.000,SC:'2',PV:1.340,P:0,P_:'',V_:''}]}"
    prelim = my_text.split("items:[")[1].replace("}]}", "}")
    temp_list = prelim.split(", ")
    end_list = []
    main_dict = {}
    for tok1 in temp_list:
        temp_dict = {}
        temp = tok1.replace("{", "").replace("}", "").split(",")
        for tok2 in temp:
            my_key = tok2.split(":")[0]
            my_value = tok2.split(":")[1].replace("'", "")
            temp_dict[my_key] = my_value
        end_list.append(temp_dict)
    main_dict['items'] = end_list
    for stock in main_dict['items']:
        print(stock['N'])

test2()
That gives the desired result. I am just asking if there is an easier (more elegant/pythonic) way of doing this.
You need to convert the string to JSON-convertible text first, then use json.loads to get a dictionary.
prelim is not in JSON format and its values are not surrounded by ". You need to:
Remove '{}&& '
Surround the property names with "
Apply json.loads(new_text) to get the dictionary representation
i.e.
import requests, json
import functools as fn

#replace tuples
reps = (('identifier:', '"identifier":'),
('label:', '"label":'),
('items:', '"items":'),
('NC:', '"NC":'),
('ID:', '"ID":'),
('N:', '"N":'),
('SIP:', '"SIP":'),
('SC:', '"SC":'),
('R:', '"R":'),
('I:', '"I":'),
('M:', '"M":'),
('LT:', '"LT":'),
('C:', '"C":'),
('VL:', '"VL":'),
('BV:', '"BV":'),
('BL:', '"BL":'),
('B:', '"B":'),
('S:', '"S":'),
('SV:', '"SV":'),
('O:', '"O":'),
('H:', '"H":'),
('L:', '"L":'),
('PV:', '"PV":'),
('V:', '"V":'),
('P_:', '"P_":'),
('P:', '"P":'),
('V_:', '"V_":'))
#getting rid of invalid json text
prelim = prelim.replace('{}&& ', '')
#replacing single quotes with double quotes
prelim = prelim.replace("'", "\"")
print(prelim)
#reduce to get all replacements
dict_text = fn.reduce(lambda a, kv: a.replace(*kv), reps, prelim)
dic = json.loads(dict_text)
print(dic)
Get the items:
for x in dic['items']:
    print(x['N'])
Output:
2ndChance W200123
3Cnergy
3Cnergy W200528
800 Super
8Telecom^
A-Smart
A-Sonic Aero^
AA
....
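For what it's worth, a more compact alternative (my own sketch, not part of the answer above) is to quote the bare keys with one regular expression instead of listing every key by hand. It assumes keys are word characters followed by a colon, and that values never contain quotes or braces:

import json
import re

def parse_loose_json(text):
    # drop the '{}&& ' prefix
    text = text.replace('{}&& ', '')
    # turn single-quoted values into double-quoted ones
    text = text.replace("'", '"')
    # quote bare keys: a word right after '{' or ',' and before ':'
    text = re.sub(r'([{,])\s*(\w+):', r'\1"\2":', text)
    return json.loads(text)

# usage with the sample text from the question:
# data = parse_loose_json(my_text)
# for stock in data['items']:
#     print(stock['N'])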

Changing text variable from another imported script

So we have two scripts, the first being AdidasStock.py and the second being StockWindow.py. I am trying to replace the base url in getVarientStock from StockWindow.py. Once again my apologies, I am really new to Python.
I am getting an error:
aulocale1() takes exactly 2 arguments (1 given)
class AdidasStock:
    def __init__(self, clientId, sku):
        self.session = requests.session()
        self.headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36",
                        "Accept-Language": "REPLACETHISPLIZZZ"}
        self.locale = ''
        self.clientId = clientId
        self.sku = sku
        self.skus = []

    def getVarientStock(self, sku, base):
        base = "http://www.adidas.com.au/on/demandware.store/Sites-adidas-AU-Site/en_AU"
        urlVariantStock = base + '/Product-GetVariants?pid=' + sku
        r = requests.get(urlVariantStock, headers=self.headers)
Here is how I am trying to change the above base, self.locale, and a portion of self.headers. I am using a Tkinter Checkbutton to trigger this function.
Checkbutton
aulocale = IntVar()
aucheck = Checkbutton(self.master, variable=aulocale, onvalue=1, offvalue=0, text="AU", command=self.aulocale1)
This is the Function
def aulocale1(self, base):
    base.replace = "http://www.adidas.com.au/on/demandware.store/Sites-adidas-AU-Site/en_AU"
    self.locale.replace = ('', 'AU')
    self.headers.replace = ('REPLACETHISPLIZZZ', 'en-AU,en;q=0.8')

def uklocale1(self, base):
    base.replace = "www.adidas.co.uk/on/demandware.store/Sites-adidas-GB-Site/en_GB"
    self.locale.replace = ('', 'GB')
    self.headers.replace = ('REPLACETHISPLIZZZ', 'en-GB,en;q=0.8')
The function def aulocale1(self, base): expects one argument, base, but when you assign this function to the Checkbutton using command=self.aulocale1, the Checkbutton will execute the function without arguments - it will run self.aulocale1().
You can assign a function with arguments to command using lambda:
command=lambda: self.aulocale1("argument")
(BTW: if you use lambda in a for loop you will run into other problems ;) )
base is a local variable so you can't change it from outside ... but you can run this function with the argument base, which means you can give that argument a default value:
def getVarientStock(self, sku, base="http://www.adidas.com.au/ ..."):
    urlVariantStock = base + '/Product-GetVariants?pid=' + sku
    r = requests.get(urlVariantStock, headers=self.headers)
If you run it without base
getVarientStock("XX")
then it uses "http://www.adidas.com.au/ ..." as base
but if you run it with a second argument
getVarientStock("XX", "http://stackoverflow.com")
then it uses "http://stackoverflow.com" as base
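Putting the two points together - a lambda on the Checkbutton plus storing the chosen values on self instead of trying to mutate locals - a minimal sketch (set_locale and the surrounding class are hypothetical names, not the asker's real code):

import Tkinter as tk

class StockWindow:
    def __init__(self, master):
        self.master = master
        self.base = ''      # chosen locale's base url
        self.locale = ''
        self.language = ''
        aulocale = tk.IntVar()
        # lambda lets the command pass arguments to the callback
        aucheck = tk.Checkbutton(
            master, variable=aulocale, onvalue=1, offvalue=0, text="AU",
            command=lambda: self.set_locale(
                "http://www.adidas.com.au/on/demandware.store/Sites-adidas-AU-Site/en_AU",
                'AU', 'en-AU,en;q=0.8'))
        aucheck.pack()

    def set_locale(self, base, locale, language):
        # store the values; getVarientStock can read self.base later
        self.base = base
        self.locale = locale
        self.language = language

root = tk.Tk()
StockWindow(root)
root.mainloop()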

Error compress/decompress spdy name/value block with zlib+dictionary

I was trying to write a spdy proxy server, but I have problems compressing/decompressing spdy name/value blocks.
I was using the python 3.3 zlib library to compress/decompress with a dictionary.
When receiving spdy frames from chrome 31, the frames can be parsed most of the time, but some name/value blocks cannot be decompressed correctly.
I have 3 test cases:
import zlib

dictionary = (
    b"optionsgetheadpostputdeletetraceacceptaccept-charsetaccept-encodingaccept-"
    b"languageauthorizationexpectfromhostif-modified-sinceif-matchif-none-matchi"
    b"f-rangeif-unmodifiedsincemax-forwardsproxy-authorizationrangerefererteuser"
    b"-agent10010120020120220320420520630030130230330430530630740040140240340440"
    b"5406407408409410411412413414415416417500501502503504505accept-rangesageeta"
    b"glocationproxy-authenticatepublicretry-afterservervarywarningwww-authentic"
    b"ateallowcontent-basecontent-encodingcache-controlconnectiondatetrailertran"
    b"sfer-encodingupgradeviawarningcontent-languagecontent-lengthcontent-locati"
    b"oncontent-md5content-rangecontent-typeetagexpireslast-modifiedset-cookieMo"
    b"ndayTuesdayWednesdayThursdayFridaySaturdaySundayJanFebMarAprMayJunJulAugSe"
    b"pOctNovDecchunkedtext/htmlimage/pngimage/jpgimage/gifapplication/xmlapplic"
    b"ation/xhtmltext/plainpublicmax-agecharset=iso-8859-1utf-8gzipdeflateHTTP/1"
    b".1statusversionurl\0")

def decompress(buf):
    decompressor = zlib.decompressobj(zdict=dictionary)
    return decompressor.decompress(buf)

def compress(buf):
    compressor = zlib.compressobj(zdict=dictionary)
    return compressor.compress(buf)

if __name__ == '__main__':
    # Test 1: buf -(compress)-> cb -(decompress)-> buf2, buf2 becomes ''
    buf = b'\x00\x01\x00\x06status\x00\x1a200 Connection established'
    print(buf)
    cb = compress(buf)
    print(cb)  # b'x\xbb\xdf\xa2Q\xb2'
    buf = decompress(cb)
    print(buf)  # b''

    # Test 2: This name/value block data was sent by chrome, which decompressed correctly
    print(decompress(b'8\xea\xdf\xa2Q\xb2b`e`\x01\xe5\x12\x06\x9e4`\xc6K\x02\x06\x83^r~.\x03[.0o\xe6\xa70\xb0;\xfb\xfb\xf9\xb9:\x8700\x83\x14\x0b\x00\x04PZbrjR~~\xb6^r~\xae\x95\x89\x891#\x001p!\x12<C\x8eo~UfNN\xa2\xbe\xa9\x9e\x81\x82Fxf^J~y\xb1\x82_\x88\x82\x99\x9e\xa1\xb5B\xb8\x7f\xb8\x99\x89\xa6\x82#\xd0K\xa9\xe1\xa9I\xde\x99%\xfa\xa6\xc6\xe6z\xc6f\n\x1a\xde\x1e!\xbe>:\n9\x99\xd9\xa9\n\xee\xa9\xc9\xd9\xf9\x9a\n\xce\x19\xc0\xdc\x9b\xaaol\xa8g\xa0ghfj\xa0gf\xac\x10\x9c\x98\x96X\x94\t\xd5\xc5\xc0\x0e\xf5\x04\x03\x07\xcco\x00\x00\x00\x00\xff\xff'))
    # b'\x00\x05\x00\x04host\x00\x0cfacebook.com\x00\x06method\x00\x07CONNECT\x00\x03url\x00\x10facebook.com:443\x00\nuser-agent\x00lMozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36\x00\x07version\x00\x08HTTP/1.1'

    # Test 3: This was another name/value block data sent by chrome, which can not be decompressed
    print(decompress(b'"\xcd+\x00\x01\x94\x96\x98\x9c\x9a\x94\x9f\x9f\xad\x97\x9c\x9fkebb\x0c\x10#\x83\xca+\x00\x00\x00\x00\xff\xff'))
    # Error -3 while decompressing data: incorrect header check
I'm new to python3+zlib (I used python 2.7 before this project) and to spdy.
I really appreciate your help.
You need to flush for both compression and decompression. Otherwise some or all of the data remains in the object. I.e.:
def decompress(buf):
    # keep the SPDY dictionary from the question so the stream headers match
    decompressor = zlib.decompressobj(zdict=dictionary)
    result = decompressor.decompress(buf)
    return result + decompressor.flush()

def compress(buf):
    compressor = zlib.compressobj(zdict=dictionary)
    result = compressor.compress(buf)
    return result + compressor.flush()
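With the flush in place, the round trip from Test 1 should come back intact - a quick check, reusing the dictionary and the two fixed functions above:

buf = b'\x00\x01\x00\x06status\x00\x1a200 Connection established'
cb = compress(buf)
print(decompress(cb) == buf)  # expected: True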
