How to change text in TextEdit actively [duplicate] - python

This question already has answers here:
Pyqt Gui Freezes while in loop
(2 answers)
Closed 7 years ago.
I'm having a problem with actively updating my TextEdit box in PyQt. I want to make an app that downloads a file in parts (a new thread for each part, downloading in parallel) and updates the current status of each part in a text box. However, my app "freezes" for the duration of the download and only sets the text box once the download is complete, even though printing the same result works fine, with no freeze on the console.
I know that this code is "a mess" right now, but I was changing many things and experimenting with different approaches. I marked the "print" that works fine; just below it is the setText call, which freezes my app for the duration of the download. If the problem is with "TextEdit" from PyQt, please let me know and I'll change it, but I haven't found any information suggesting that so far. Thanks!
# Downloads self.__url in N byte-range parts using a process pool, then polls
# the part files on disk until their combined size reaches the full file size.
# NOTE(review): indentation was lost in this listing; the nesting described in
# the comments below is the presumed structure - confirm against the original.
def supervi(self):
import os
import urllib2
# Number of parts the download is split into.
N=2
url = self.__url
dir = self.path
# The last path segment of the URL is used as the file name.
f_name = url.split("/")[len(url.split("/")) - 1]
# Temporary directory that holds one file per part.
dir_tmp=dir + "\\TMP." + f_name
if os.path.isdir(dir_tmp) == False:
os.mkdir(dir_tmp)
# Opening each part file in "w+b" mode creates it empty (or truncates it).
for n in range(0,N):
with open(dir_tmp+"\\file"+str(n), "w+b") as f:
#f.write("")
pass
# This request is only used to read the total size from Content-Length.
data = urllib2.urlopen(url)
file_size = int(data.headers["Content-Length"].strip())
import multiprocessing as mp
# Python 2 integer division (the code uses urllib2 and the print statement).
data_block = file_size/N
p=mp.Pool(N)
# Build one Range request per part and hand it to the pool.
for i in range(0, N):
start = i * data_block
stop = 0
# Every part except the last covers exactly data_block bytes;
# the last part runs up to file_size.
if not i == N - 1:
stop = i * data_block + data_block - 1
else:
stop = file_size
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:20.0) Gecko/20100101 Firefox/20.0",
"Accept-Encoding": "gzip, deflate, sdch",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Language": "pl-PL,pl;q=0.8,en-US;q=0.6,en;q=0.4",
"Connection": "keep-alive",
"Range": "bytes=" + str(start) + "-" + str(stop)
}
req = urllib2.Request(url, headers=headers)
# dziecko is defined in main (not shown); presumably it downloads the
# requested range into file<i> inside dir_tmp - verify.
from main import dziecko
p.apply_async(dziecko,[i,req,dir_tmp])
# Poll the part files until their sizes add up to at least file_size.
# NOTE(review): nothing in this loop returns control to the Qt event loop.
while True:
sum=0
for n in range(0,N):
sum=sum+os.path.getsize(dir_tmp + "\\file" + str(n))
# All bytes have arrived: let del_and_combine (from main, not shown) finish
# the job and stop polling.
if not sum < file_size:
from main import del_and_combine
del_and_combine(dir,dir_tmp,f_name,N)
break
# Progress report: print each part's size, then show the size of part 0
# in the self.url widget.
for n in range(0,N):
size=os.path.getsize(dir_tmp + "\\file" + str(n))
print size ##################THIS ONE
self.url.setText(str(os.path.getsize(dir_tmp + "\\file0")))

Add `QtCore.QCoreApplication.processEvents()` inside your loop. This lets Qt process pending events, so the text is updated on every iteration.
Without it, the GUI stays frozen for as long as a long-running loop executes in the GUI thread.
For more information :
< pyqt-gui-freezes-while-in-loop >

Related

Request post method doesn't return valid response

I am trying to work on a website that has simple captcha. Here's the link.
Steps:
One is supposed to type a case number e.g. 200078510, then type the numbers in the captcha, then click on Search button.
Progress:
I could solve the captcha part, but when I use the POST method of the requests library, I don't get a valid response. I get the string حدث خطأ ما, which means "Something went wrong". A successful response would include the case number, e.g. 200078510.
Question:
90% of the time myCaptcha is correct so the problem, I think, is with the POST request. Can anyone see what is wrong with my POST request?
I provide a working VBA example at the end, as additional info, in case that helps.
Here's the code that I could do till now:
import requests
import cv2
import numpy as np
import pytesseract
from PIL import Image
sNumber = 'Number.png'
sTemp = 'Temp.png'
pytesseract.pytesseract.tesseract_cmd=r'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'
def getCaptcha():
    """Download the captcha image and return the text Tesseract reads from it.

    The image is fetched from the captcha endpoint and saved to ``sNumber``.
    A thresholded, inverted copy is written to ``sTemp`` and passed to
    Tesseract, which is restricted to the characters 0-9.

    Returns:
        The string produced by ``pytesseract.image_to_string``.
    """
    response = requests.get("https://eservices.moj.gov.kw/captcha/imgCaptcha.jsp")
    # The ``with`` block closes the file on exit; no explicit close() is needed.
    with open(sNumber, "wb") as f:
        f.write(response.content)

    img = cv2.imread(sNumber)
    # Mask the pixels that fall inside [lower, upper], then invert the mask.
    lower = np.array([0, 0, 0])
    upper = np.array([46, 46, 255])
    thresh = cv2.inRange(img, lower, upper)
    thresh = 255 - thresh
    cv2.imwrite(sTemp, thresh)

    img = Image.open(sTemp)
    text = pytesseract.image_to_string(img, lang='eng', config='--psm 10 --oem 3 -c tessedit_char_whitelist=0123456789')
    return text
myCaptcha = getCaptcha()
print(myCaptcha)
payload = {'txtCaseNo': '200078510', 'txtCaptcha2': myCaptcha, 'searchType': '0'}
r = requests.post("https://eservices.moj.gov.kw/viewResults/validateCase.jsp", data=payload)
print(r.url)
print(r.text)
I even tried sending headers like this, and got the same problem:
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36',
'Content-Type':'application/x-www-form-urlencoded'}
payload = {'txtCaseNo': '200078510', 'txtCaptcha2': myCaptcha, 'searchType': '0'}
r = requests.post("https://eservices.moj.gov.kw/viewResults/validateCase.jsp",headers = headers, data=payload)
Simply I need to be able to use the POST method of the requests package so as to be able to send the suitable arguments and then navigate to multiple sections that are related to the searched number.
Supplementary Information (Working example reference in VBA):
I have a working code in VBA for the entire process. The code navigates to a URL and enter a number and enter the numbers on captcha. Here's the code:
Public vCaptcha
' For every case number in column A of the "Index" sheet, solve the captcha,
' POST the case number to validateCase.jsp and record failures on the "Data" sheet.
Sub Test()
Dim wsIndex As Worksheet, wsData As Worksheet, http As New XMLHTTP60, html As New HTMLDocument, htmlData As New HTMLDocument, postCasePane As Object, oTables As Object, postTable As Object, postWrongSec As Object, strArg As String, xTemp As String, sTemp As String, r As Long, lr As Long, i As Long, ii As Long, vMAX As Long, cnt As Long
Set wsIndex = ThisWorkbook.Worksheets("Index")
Set wsData = ThisWorkbook.Worksheets("Data")
' Clear everything below the first row of the data region.
wsData.Range("A1").CurrentRegion.Offset(1).ClearContents
' Walk the Index sheet from row 2 to the last used row in column A.
For r = 2 To wsIndex.Cells(Rows.Count, 1).End(xlUp).Row
' Save the workbook every 10th row.
If r Mod 10 = 0 Then ThisWorkbook.Save
' Next free row on the Data sheet.
lr = wsData.Cells(Rows.Count, 1).End(xlUp).Row + 1
If wsIndex.Cells(r, 1).Value = "" Then GoTo Skipper
' Retry point: execution jumps back here when the captcha was rejected.
sPoint:
Application.StatusBar = "Case Number: " & wsIndex.Cells(r, 1).Value & " ------- Row " & r
' DecryptCaptcha stores the recognised digits in the module-level vCaptcha.
DecryptCaptcha
strArg = "txtCaseNo=" & wsIndex.Cells(r, 1).Value & "&txtCaptcha2=" & vCaptcha & "&searchType=0"
With http
' Synchronous form-encoded POST of the case number and captcha.
.Open "POST", "https://eservices.moj.gov.kw/viewResults/validateCase.jsp", False
.setRequestHeader "Content-type", "application/x-www-form-urlencoded"
.send strArg
html.body.innerHTML = .responseText
' If the page shows this fixed message, treat it as a wrong captcha and retry.
Set postWrongSec = html.querySelector("span[lang='AR-KW']")
If Not postWrongSec Is Nothing Then
If postWrongSec.innerText = "ÚÝæÇ: ÑãÒ ÇáÍãÇíÉ ÛíÑ ÕÍíÍ !!!" Then
cnt = cnt + 1
Debug.Print "Wrong Captcha " & cnt: GoTo sPoint
End If
End If
' No case pane in the response: log the case number with a fixed message
' in column C and move on to the next row.
Set postCasePane = html.querySelector("#caseViewPane span h4")
If postCasePane Is Nothing Then wsData.Range("A" & lr).Value = wsIndex.Cells(r, 1).Value: wsData.Range("C" & lr).Value = "ÑÞã ÇáÞÖíÉ ÛíÑ ÕÍíÍ": GoTo Skipper
' Follow-up POST (no body) on the same http object to fetch the last events page.
.Open "POST", "https://eservices.moj.gov.kw/viewResults/viewLastEvents.jsp", False
.setRequestHeader "Content-type", "application/x-www-form-urlencoded"
.send
html.body.innerHTML = .responseText
End With
Set html = Nothing: Set htmlData = Nothing
Skipper:
' Pause five seconds between rows.
Application.Wait Now + TimeValue("00:00:05")
Next r
Application.StatusBar = Empty
MsgBox "Done...", 64
End Sub
And this is the part that is responsible for the captcha:
Private Sub DecryptCaptcha()
    ' Download the captcha image into the workbook's folder, run it through
    ' ScriptFile, and store the digits-only result in the module-level vCaptcha.
    Dim imageBytes, targetDir As String, fileName As String, captchaUrl As String
    Dim request As Object, stream As Object

    targetDir = ThisWorkbook.Path & "\"
    fileName = "Number.png"
    captchaUrl = "https://eservices.moj.gov.kw/captcha/imgCaptcha.jsp"

    ' Synchronous GET (third argument False) for the captcha image bytes.
    Set request = CreateObject("MSXML2.XMLHTTP")
    request.Open "GET", captchaUrl, False
    request.send
    imageBytes = request.responseBody
    Set request = Nothing

    ' Type 1 is a binary stream; SaveToFile option 2 overwrites an existing file.
    Set stream = CreateObject("ADODB.Stream")
    stream.Type = 1
    stream.Open
    stream.write imageBytes
    stream.SaveToFile targetDir & fileName, 2
    Set stream = Nothing

    vCaptcha = CleanNumber(ScriptFile(targetDir & fileName))
End Sub
Function ScriptFile(strImage As String) As String
    ' Run ConvertImage.ps1 from the current user's Desktop on strImage, wait for
    ' it to finish, then return the contents of OutputNumber.txt in the
    ' workbook's folder (presumably the file the script produced).
    Dim runner As Object, resultPath As String, commandLine As String

    resultPath = ThisWorkbook.Path & "\OutputNumber.txt"
    commandLine = "Powershell.exe -File ""C:\Users\" & Environ("USERNAME") & "\Desktop\ConvertImage.ps1"" " & strImage

    Set runner = CreateObject("WScript.Shell")
    ' Window style 0 hides the window; True blocks until the command exits.
    runner.Run commandLine, 0, True

    ScriptFile = CreateObject("Scripting.FileSystemObject").OpenTextFile(resultPath).ReadAll
End Function
Function CleanNumber(ByVal strText As String) As String
    ' Return strText with every character other than 0-9 removed.
    Dim nonDigits As Object
    Set nonDigits = CreateObject("VBScript.RegExp")
    nonDigits.IgnoreCase = True
    nonDigits.Global = True
    nonDigits.Pattern = "[^0-9]"

    ' Nothing to strip: hand the input back unchanged.
    If Not nonDigits.Test(strText) Then
        CleanNumber = strText
        Exit Function
    End If

    CleanNumber = WorksheetFunction.Trim(nonDigits.Replace(strText, vbNullString))
End Function
And as for the powershell file these are the contents
# Pre-process the captcha image given as the first argument with ImageMagick,
# then OCR the result with Tesseract.
$image=$args[0]
# The intermediate image and the OCR output are both placed on the user's Desktop.
$desktop= (Join-Path $env:USERPROFILE 'Desktop')
$imagefile=(Join-Path $desktop 'NumberNew.png')
# Output base name without extension (tesseract presumably appends ".txt" - verify).
$textfile=(Join-Path $desktop 'OutputNumber')
cd (Join-Path $desktop '\')
# Step 1: resize the source image and write it to $imagefile.
magick convert $image -resize 300x160 -density 300 -quality 100 $imagefile
# Step 2: negate, apply a local adaptive threshold (-lat), negate back; overwrite in place.
magick convert $imagefile -negate -lat 300x160+40% -negate $imagefile
# Step 3: OCR the processed image using the English language data.
tesseract.exe $imagefile $textfile -l eng
Of course, the code requires Tesseract to be installed, as well as ImageMagick to process and manipulate the image. The code works in VBA, but I would like to do this in Python to improve my skills. Now I am stuck and have run out of ideas. Thanks in advance for any help.

Recreating python mechanize script in R

I'd like to recreate in R the Python script below, which uses mechanize and http.cookiejar. I thought it would be straightforward using rvest, but I was unable to do so. Any insight on which packages to use and how to apply them would be extremely helpful. I realize reticulate may be a possibility, but I figure there has to be a straightforward way to do this in R.
import mechanize
import http.cookiejar
b = mechanize.Browser()
b.set_handle_refresh(True)
b.set_debug_redirects(True)
b.set_handle_redirect(True)
b.set_debug_http(True)
cj = http.cookiejar.CookieJar()
b.set_cookiejar(cj)
b.addheaders = [
('User-agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 Safari/537.36'),
('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'),
('Host', 'www.fangraphs.com'),
('Referer', 'https://www.fangraphs.com/auctiontool.aspx?type=pit&proj=atc&pos=1,1,1,1,5,1,1,0,0,1,5,5,0,18,0&dollars=400&teams=12&mp=5&msp=5&mrp=5&mb=1&split=&points=c|0,1,2,3,4,5|0,1,2,3,4,5&lg=MLB&rep=0&drp=0&pp=C,SS,2B,3B,OF,1B&players=')
]
b.open("https://www.fangraphs.com/auctiontool.aspx?type=pit&proj=atc&pos=1,1,1,1,5,1,1,0,0,1,5,5,0,18,0&dollars=400&teams=12&mp=5&msp=5&mrp=5&mb=1&split=&points=c|0,1,2,3,4,5|0,1,2,3,4,5&lg=MLB&rep=0&drp=0&pp=C,SS,2B,3B,OF,1B&players=")
def is_form1_form(form):
    """Return True when *form* has an ``id`` attribute equal to ``"form1"``.

    Args:
        form: an object exposing its HTML attributes as the mapping ``attrs``.
    """
    # .get() returns None for a missing id, which never equals "form1".
    return form.attrs.get("id") == "form1"
# Select the form for which is_form1_form returns True.
b.select_form(predicate=is_form1_form)
# Clear the readonly flag on both postback controls so they can be assigned below.
b.form.find_control(name='__EVENTTARGET').readonly = False
b.form.find_control(name='__EVENTARGUMENT').readonly = False
# Set the event target to 'AuctionBoard1$cmdCSV' (presumably the CSV export
# control - verify) with an empty argument.
b.form['__EVENTTARGET'] = 'AuctionBoard1$cmdCSV'
b.form['__EVENTARGUMENT'] = ''
# Submit the form and print the raw response body.
print(b.submit().read())
The R code I was using to attempt to recreate this with rvest is below. The comments indicate the main source of my confusion. In particular the needed fields grabbed by the python code were not showing up when I grabbed the form with rvest and when I tried to manually insert them I got a Connection Refused upon submitting.
library(rvest)
atc.pitcher.link = "https://www.fangraphs.com/auctiontool.aspx?type=pit&proj=atc&pos=1,1,1,1,5,1,1,0,0,1,5,5,0,18,0&dollars=400&teams=12&mp=5&msp=5&mrp=5&mb=1&split=&points=c|0,1,2,3,4,5|0,1,2,3,4,5&lg=MLB&rep=0&drp=0&pp=C,SS,2B,3B,OF,1B&players="
# Open a session on the auction tool page and parse the form matched by "form".
proj.data = html_session(atc.pitcher.link)
form.unfilled = proj.data %>% html_node("form") %>% html_form()
# note: I am surprised "__EVENTTARGET" and "__EVENTARGUMENT" are not included as attributes of the unfilled form. I can select them in the posted python script.
# If I try and create them with the appropriate values I get a Connection Refused Error.
# Create the two missing fields by copying the existing __VIEWSTATE field object,
# then overwrite the copied value and readonly flag on each.
form.unfilled[[5]]$`__EVENTTARGET` = form.unfilled[[5]]$`__VIEWSTATE`
form.unfilled[[5]]$`__EVENTARGUMENT`= form.unfilled[[5]]$`__VIEWSTATE`
form.unfilled[[5]]$`__EVENTTARGET`$readonly = FALSE
form.unfilled[[5]]$`__EVENTTARGET`$value = "AuctionBoard1$cmdCSV"
form.unfilled[[5]]$`__EVENTARGUMENT`$value = ""
form.unfilled[[5]]$`__EVENTARGUMENT`$readonly = FALSE
# Submit the modified form within the same session.
form.filled = form.unfilled
session = submit_form(proj.data, form.filled)
Here is a way to do it using RSelenium, setting Chrome to be headless and enabling remote download to your working directory. It automatically brings up a headless browser and then lets the code drive it.
I believe to do the equivalent in rvest you need to write some native phantomjs.
library(RSelenium)
library(wdman)
# Chrome capabilities: run headless with a fixed window size, suppress the
# download prompt, and use the working directory as the download directory.
eCaps <- list(
chromeOptions = list(
args = c('--headless','--disable-gpu', '--window-size=1280,800'),
prefs = list(
"profile.default_content_settings.popups" = 0L,
"download.prompt_for_download" = FALSE,
"download.default_directory" = getwd()
)
)
)
# Start the driver processes and take the client from the rsDriver result.
cDrv <- wdman::chrome()
rD <- RSelenium::rsDriver(extraCapabilities = eCaps)
remDr <- rD$client
# POST the Page.setDownloadBehavior command to the session's
# /chromium/send_command endpoint so downloads into getwd() are allowed.
remDr$queryRD(
ipAddr = paste0(remDr$serverURL, "/session/", remDr$sessionInfo[["id"]], "/chromium/send_command"),
method = "POST",
qdata = list(
cmd = "Page.setDownloadBehavior",
params = list(
behavior = "allow",
downloadPath = getwd()
)
)
)
atc.pitcher.link= "http://www.fangraphs.com/auctiontool.aspx?type=pit&proj=atc&pos=1,1,1,1,5,1,1,0,0,1,5,5,0,18,0&dollars=400&teams=12&mp=5&msp=5&mrp=5&mb=1&split=&points=c|0,1,2,3,4,5|0,1,2,3,4,5&lg=MLB&rep=0&drp=0&pp=C,SS,2B,3B,OF,1B&players="
# Load the auction tool page in the headless browser.
remDr$navigate(atc.pitcher.link)
# sleep to be nice and give things time to load
Sys.sleep(8)
# find the button on the page we want to click (element id 'AuctionBoard1_cmdCSV')
option <- remDr$findElement('id', 'AuctionBoard1_cmdCSV')
# click it
option$clickElement()
# List files in the working directory whose names match 'sysdata'
# (presumably the name of the downloaded file - verify).
list.files(getwd(),pattern = 'sysdata')
# Close the browser sessions and stop the driver started by wdman.
remDr$closeall()
cDrv$stop()

python requests enable cookies/javascript

I am trying to download an Excel file from a specific website. On my local computer it works perfectly:
>>> r = requests.get('http://www.health.gov.il/PublicationsFiles/IWER01_2004.xls')
>>> r.status_code
200
>>> r.content
b'\xd0\xcf\x11\xe0\xa1\xb1...\x00\x00' # Long binary string
But when I connect to a remote ubuntu server, I get a message related to enabling cookies/javascript.
r = requests.get('http://www.health.gov.il/PublicationsFiles/IWER01_2004.xls')
>>> r.status_code
200
>>> r.content
b'<HTML>\n<head>\n<script>\nChallenge=141020;\nChallengeId=120854618;\nGenericErrorMessageCookies="Cookies must be enabled in order to view this page.";\n</script>\n<script>\nfunction test(var1)\n{\n\tvar var_str=""+Challenge;\n\tvar var_arr=var_str.split("");\n\tvar LastDig=var_arr.reverse()[0];\n\tvar minDig=var_arr.sort()[0];\n\tvar subvar1 = (2 * (var_arr[2]))+(var_arr[1]*1);\n\tvar subvar2 = (2 * var_arr[2])+var_arr[1];\n\tvar my_pow=Math.pow(((var_arr[0]*1)+2),var_arr[1]);\n\tvar x=(var1*3+subvar1)*1;\n\tvar y=Math.cos(Math.PI*subvar2);\n\tvar answer=x*y;\n\tanswer-=my_pow*1;\n\tanswer+=(minDig*1)-(LastDig*1);\n\tanswer=answer+subvar2;\n\treturn answer;\n}\n</script>\n<script>\nclient = null;\nif (window.XMLHttpRequest)\n{\n\tvar client=new XMLHttpRequest();\n}\nelse\n{\n\tif (window.ActiveXObject)\n\t{\n\t\tclient = new ActiveXObject(\'MSXML2.XMLHTTP.3.0\');\n\t};\n}\nif (!((!!client)&&(!!Math.pow)&&(!!Math.cos)&&(!![].sort)&&(!![].reverse)))\n{\n\tdocument.write("Not all needed JavaScript methods are supported.<BR>");\n\n}\nelse\n{\n\tclient.onreadystatechange = function()\n\t{\n\t\tif(client.readyState == 4)\n\t\t{\n\t\t\tvar MyCookie=client.getResponseHeader("X-AA-Cookie-Value");\n\t\t\tif ((MyCookie == null) || (MyCookie==""))\n\t\t\t{\n\t\t\t\tdocument.write(client.responseText);\n\t\t\t\treturn;\n\t\t\t}\n\t\t\t\n\t\t\tvar cookieName = MyCookie.split(\'=\')[0];\n\t\t\tif (document.cookie.indexOf(cookieName)==-1)\n\t\t\t{\n\t\t\t\tdocument.write(GenericErrorMessageCookies);\n\t\t\t\treturn;\n\t\t\t}\n\t\t\twindow.location.reload(true);\n\t\t}\n\t};\n\ty=test(Challenge);\n\tclient.open("POST",window.location,true);\n\tclient.setRequestHeader(\'X-AA-Challenge-ID\', ChallengeId);\n\tclient.setRequestHeader(\'X-AA-Challenge-Result\',y);\n\tclient.setRequestHeader(\'X-AA-Challenge\',Challenge);\n\tclient.setRequestHeader(\'Content-Type\' , \'text/plain\');\n\tclient.send();\n}\n</script>\n</head>\n<body>\n<noscript>JavaScript must be enabled in order to view 
this page.</noscript>\n</body>\n</HTML>'
Locally I run macOS with Chrome installed (I'm not actively using it for the script, but maybe it's related?); remotely I run Ubuntu on DigitalOcean without any GUI browser installed.
The behavior of requests has nothing to do with what browsers are installed on the system, it does not depend on or interact with them in any way.
The problem here is that the resource you are requesting has some kind of "bot mitigation" mechanism enabled to prevent just this kind of access. It returns some javascript with logic that needs to be evaluated, and the results of that logic are then used for an additional request to "prove" you're not a bot.
Luckily, it appears that this specific mitigation mechanism has been solved before, and I was able to quickly get this request working utilizing the challenge-solving functions from that code:
from math import cos, pi, floor
import requests
URL = 'http://www.health.gov.il/PublicationsFiles/IWER01_2004.xls'
def parse_challenge(page):
    """
    Parse a challenge given by mmi and mavat's web servers, forcing us to solve
    some math stuff and send the result as a header to actually get the page.
    This logic is pretty much copied from https://github.com/R3dy/jigsaw-rails/blob/master/lib/breakbot.rb

    Args:
        page: HTML text of the challenge page.

    Returns:
        A dict with the string values 'challenge', 'challenge_id' and
        'challenge_result'.
    """
    # The first <script> block opens with the lines
    # "Challenge=<number>;" and "ChallengeId=<number>;".
    top = page.split('<script>')[1].split('\n')
    challenge = top[1].split(';')[0].split('=')[1]
    challenge_id = top[2].split(';')[0].split('=')[1]
    return {
        'challenge': challenge,
        'challenge_id': challenge_id,
        'challenge_result': get_challenge_answer(challenge),
    }
def get_challenge_answer(challenge):
    """
    Solve the math part of the challenge and get the result.

    Args:
        challenge: the challenge number as a string of at least three digits.

    Returns:
        The answer as a string: the floored numeric result followed by the
        two-part digit string ``subvar2``.
    """
    # The last digit is taken before sorting; everything else uses sorted digits.
    last_digit = int(challenge[-1])
    arr = sorted(challenge)
    min_digit = int(arr[0])

    subvar1 = 2 * int(arr[2]) + int(arr[1])
    # String concatenation, not addition: e.g. "6" + "1" -> "61".
    subvar2 = str(2 * int(arr[2])) + arr[1]
    power = (int(arr[0]) + 2) ** int(arr[1])

    x = int(challenge) * 3 + subvar1
    y = cos(pi * subvar1)

    answer = x * y
    answer -= power
    answer += min_digit - last_digit
    return str(int(floor(answer))) + subvar2
def main():
    """Fetch URL and print its content, answering the challenge page if one is served."""
    s = requests.Session()
    r = s.get(URL)

    # The challenge page mentions the X-AA-Challenge header in its script.
    if 'X-AA-Challenge' in r.text:
        challenge = parse_challenge(r.text)
        # Send the solved challenge back as request headers.
        r = s.get(URL, headers={
            'X-AA-Challenge': challenge['challenge'],
            'X-AA-Challenge-ID': challenge['challenge_id'],
            'X-AA-Challenge-Result': challenge['challenge_result']
        })

        # Repeat the request with the cookies returned for the solved challenge.
        yum = r.cookies
        r = s.get(URL, cookies=yum)

    print(r.content)


if __name__ == '__main__':
    main()
You can use this code to avoid being blocked (`HTMLSession` comes from the `requests_html` package):
url = 'your url come here'
s = HTMLSession()
s.headers['user-agent'] = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'
r = s.get(url)
r.html.render(timeout=8000)
print(r.status_code)
print(r.content)

Python requests - download cut short

I've recently been trying to scrape a site that contains chemistry exam tests in pdf using Python. I used requests for python and everything was going well, until some of the downloads were cut short at a very small size i.e. 2KB. What's curious though - it happens completely at random with every run of the script the files cut are different. I've been scratching my head for a while now and decided to ask here. Downloading them manually probably would have proved faster by now, but I want to know why the script isn't working, for future reference.
I've written the script to be asynchronous, thus it occurred to me that I could have been DoSing the server. However, I've replaced every Pool with a synchronous for loop, even adding time.sleep() here and there - it didn't help. Using this approach none of the files were fully downloaded - practically every single one stopping at 2KB.
Please forgive me if the question is naive or my mistake is foolish as I am only a hobby programmer. I'll be grateful for any help.
P.S. I've intercepted the headers using Postman from Chrome, without them the response was 500, however I won't include them as they contain session ids that would enable you to login into my account.
The script is as follows:
from shutil import copyfileobj
from multiprocessing.dummy import Pool as ThreadPool
from requests import get
from time import sleep
titles = {
"95": "Budowa atomu. Układ okresowy pierwiastków chemicznych",
"96": "Wiązania chemiczne",
"97": "Systematyka związków nieorganicznych",
"98": "Stechiometria",
"99": "Reakcje utleniania-redukcji. Elektrochemia",
"100": "Roztwory",
"101": "Kinetyka chemiczna",
"102": "Reakcje w wodnych roztworach elektrolitów",
"103": "Charakterystyka pierwiastków i związków chemicznych",
"104": "Chemia organiczna jako chemia związków węgla",
"105": "Węglowodory",
"106": "Jednofunkcyjne pochodne węglowodorów",
"107": "Wielofunkcyjne pochodne węglowodorów",
"108": "Arkusz maturalny"
}
#collection = {"120235": "Chemia nieorganiczna", "120586": "Chemia organiczna"}
url = "https://e-testy.terazmatura.pl/print/%s/quiz_%s/%s"
def downloadTest(id):
    """Download both variants ("blank" and "key") of test *id* on two threads."""
    with ThreadPool(2) as tp:
        tp.starmap(downloadActualTest, [(id, "blank"), (id, "key")])
def downloadActualTest(id, dataType):
    """Download one PDF variant of a test into the "Pulled Data" folder.

    Relies on the module-level ``titles``, ``url`` and ``headers``
    (``headers`` is not part of this listing).

    Args:
        id: numeric test id; ``str(id)`` must be a key of ``titles``.
        dataType: "blank" for the empty test or "key" for the answer key;
            any other value downloads nothing.
    """
    # File-name suffix for each supported variant.
    suffixes = {"blank": "pusty", "key": "klucz"}

    name = titles[str(id)]
    # Ids 95-103 belong to one collection, everything else to the other.
    collectionId = 120235 if id in range(95, 104) else 120586

    if dataType not in suffixes:
        return

    target = url % (collectionId, id, dataType)
    with open("Pulled Data/%s - %s.pdf" % (name, suffixes[dataType]), "wb") as test:
        print("Downloading: " + target + '\n')
        r = get(target,
                stream=True,
                headers=headers)
        # Have the raw stream undo any transfer encoding while it is copied.
        r.raw.decode_content = True
        copyfileobj(r.raw, test)
# Download tests 95..108, three tests at a time.
with ThreadPool(3) as p:
    p.map(downloadTest, range(95, 109))

Header Check in Python (GAE)

I was wondering how I would go about checking HTTP headers to determine whether the request is valid or malformed. How can I do this in Python, more specifically, how can I do this in GAE?
For some debugging and viewing the request with the headers I use the following DDTHandler class.
import cgi
import wsgiref.handlers
import webapp2
class DDTHandler(webapp2.RequestHandler):
    """Request handler that writes debugging output wrapped in an HTML comment."""

    def __start_display(self):
        """Open the HTML comment that wraps the debug output."""
        self.response.out.write("<!--\n")

    def __end_display(self):
        """Close the HTML comment opened by ``__start_display``."""
        self.response.out.write("-->\n")

    def __show_dictionary_items(self, dictionary, title):
        """Write *title* followed by one ``key = value`` line per item.

        Nothing is written when *dictionary* is empty.
        """
        if len(dictionary) > 0:
            out = self.response.out
            out.write("\n" + title + ":\n")
            for key, value in dictionary.iteritems():
                out.write(key + " = " + value + "\n")

    def __show_request_members(self):
        """Write the request URL, query, remote address, path, payload, headers and cookies."""
        request = self.request
        out = self.response.out
        out.write(request.url + "\n")
        out.write("Query = " + request.query_string + "\n")
        out.write("Remote = " + request.remote_addr + "\n")
        out.write("Path = " + request.path + "\n\n")
        out.write("Request payload:\n")
        if len(request.arguments()) > 0:
            for argument in request.arguments():
                # Escape the value before echoing it into the response.
                value = cgi.escape(request.get(argument))
                out.write(argument + " = " + value + "\n")
        else:
            out.write("Empty\n")
        self.__show_dictionary_items(request.headers, "Headers")
        self.__show_dictionary_items(request.cookies, "Cookies")

    def view_request(self):
        """Dump the current request inside an HTML comment."""
        self.__start_display()
        self.__show_request_members()
        self.__end_display()

    def view(self, aString):
        """Write *aString* on its own line inside an HTML comment."""
        self.__start_display()
        self.response.out.write(aString + "\n")
        self.__end_display()
Example:
class RootPage(DDTHandler):
    """Example handler: a GET request dumps the request details."""

    def get(self):
        self.view_request()
This will output the request, including the headers.
So check the code and take what you need. Though, as said, a malformed "invalid" request probably won't reach your app at all.
<!--
http://localhost:8081/
Query =
Remote = 127.0.0.1
Path = /
Request payload:
Empty
Headers:
Referer = http://localhost:8081/_ah/login?continue=http%3A//localhost%3A8081/
Accept-Charset = ISO-8859-7,utf-8;q=0.7,*;q=0.3
Cookie = hl=en_US; dev_appserver_login="test#example.com:False:185804764220139124118"
User-Agent = Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.52 Safari/537.17
Host = localhost:8081
Accept = text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
Accept-Language = en-US,en;q=0.8,el;q=0.6
Cookies:
dev_appserver_login = test#example.com:False:185804764220139124118
hl = en_US
-->

Categories

Resources