I'm using Python 3.7 and the requests-html library.
I'm trying to send a GET request in a session to a site with a form. First I use the response to get the CAPTCHA image and download it, and then I send a POST request in the same session including the decoded CAPTCHA code.
The first part, sending the GET request and getting a "ProcessKey" and the CAPTCHA image, works great.
For some reason, in the second part, where I send the POST request, it keeps redirecting me to the previous page and doesn't work properly.
I tried to change the user agent and the request headers to be similar to what I got from the Chrome dev panel, as you can see in my code.
I previously got this working with the Selenium library, but it is not suitable for my use case.
from requests_html import HTMLSession
import time
url = 'https://www.misim.gov.il/svinfonadlan2010/'
url2 = 'https://www.misim.gov.il/svinfonadlan2010/startpageNadlanNewDesign.aspx?ProcessKey='
url3 = 'https://www.misim.gov.il/svinfonadlan2010/InfoNadlanPerutWithMap.aspx?ProcessKey='
session = HTMLSession()
request = session.get(url)
process_key = request.url.split('ProcessKey=')[1]
# Get the captcha image code:
image_url = request.html.find('#ContentUsersPage_RadCaptcha1_CaptchaImageUP', first=True)
image_url = url + image_url.attrs['src']
image_file_name = process_key + '.png'
with open('captcha_temp_files/' + image_file_name, 'wb') as f:
    f.write(session.get(image_url).content)
print(request.url)
ans = input('Enter the captcha: ')
all_inputs = request.html.find('input')
data = {}
for i in all_inputs:
    if 'value' in i.attrs.keys():
        data[i.attrs['name']] = i.attrs['value']
    else:
        data[i.attrs['name']] = None
data["ctl00$ContentUsersPage$rbYeshuvOrGush"] = "rbMegush"
data['ctl00$ContentUsersPage$txtmegusha'] = 30010
data['ctl00$ContentUsersPage$txthelka'] = 129
data['ctl00$ContentUsersPage$txtadGush'] = 30010
data['ctl00$ContentUsersPage$txtadHelka'] = 129
data['ctl00$ContentUsersPage$DDLTypeNehes'] = 1
data['ctl00$ContentUsersPage$DDLMahutIska'] = 999
data['ctl00$ContentUsersPage$RadCaptcha1$CaptchaTextBox'] = ans
post_request_header = {
"Accept": 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "en-US,en;q=0.9,he-IL;q=0.8,he;q=0.7",
"Cache-Control": "max-age=0",
"Connection": "keep-alive",
#"Content-Length": "10663",
"Content-Type": "application/x-www-form-urlencoded",
"DNT": "1",
"Host": "www.misim.gov.il",
"Origin": "https://www.misim.gov.il",
"Referer": url2 + process_key,
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "same-origin",
"Sec-Fetch-User": "?1",
"Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Mobile Safari/537.36",
}
session.headers = post_request_header
request2 = session.post(url=url2 + process_key, data=data)
print(request2.url)
time.sleep(2)
request3 = session.get(url=url3 + process_key)
print(request3.url)
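A minimal debugging sketch, reusing the session, url2, process_key and data from above: disabling automatic redirects shows the server's immediate answer, and note that requests silently drops data keys whose value is None, so the inputs collected without a value attribute are never actually posted.

debug_resp = session.post(url2 + process_key, data=data, allow_redirects=False)
print(debug_resp.status_code)              # 302 means the server redirected us
print(debug_resp.headers.get('Location'))  # where it redirected to

# requests omits data keys whose value is None; ASP.NET forms often expect
# those fields as empty strings instead, so this may be worth trying:
data = {k: ('' if v is None else v) for k, v in data.items()}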
Please help me understand what is wrong here, or point me to another library besides Selenium that can do this.
Thank you in advance!
Related
I'm trying to use the requests module in Python to handle a cgi-bin login on a website, but I have to log in using a digital certificate, a .pfx file.
I'm using this code that I found: https://gist.github.com/erikbern/756b1d8df2d1487497d29b90e81f8068
import contextlib
import tempfile

import OpenSSL.crypto
import requests

@contextlib.contextmanager
def pfx_to_pem(pfx_path, pfx_password):
    ''' Decrypts the .pfx file to be used with requests. '''
    with tempfile.NamedTemporaryFile(suffix='.pem') as t_pem:
        f_pem = open(t_pem.name, 'wb')
        pfx = open(pfx_path, 'rb').read()
        p12 = OpenSSL.crypto.load_pkcs12(pfx, pfx_password)
        f_pem.write(OpenSSL.crypto.dump_privatekey(OpenSSL.crypto.FILETYPE_PEM, p12.get_privatekey()))
        f_pem.write(OpenSSL.crypto.dump_certificate(OpenSSL.crypto.FILETYPE_PEM, p12.get_certificate()))
        ca = p12.get_ca_certificates()
        if ca is not None:
            for cert in ca:
                f_pem.write(OpenSSL.crypto.dump_certificate(OpenSSL.crypto.FILETYPE_PEM, cert))
        f_pem.close()
        yield t_pem.name
with pfx_to_pem('path/file.pfx', 'password') as cert:
    login_page = "https://zeusr.sii.cl/AUT2000/InicioAutenticacion/IngresoCertificado.html?https://misiir.sii.cl/cgi_misii/siihome.cgi"
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "es-ES,es;q=0.9",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "Cookie": "s_cc=true",
        "Host": "herculesr.sii.cl",
        "Origin": "https://zeusr.sii.cl",
        "Referer": "https://misiir.sii.cl/cgi_misii/siihome.cgi",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "same-site",
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
    }
    s = requests.Session()
    s.cert = cert
    print(s.cert)
    r = s.get(login_page, cert=s.cert, headers=headers)
    print(r.content)
When I sent the request with headers I received
b'Error 400'
and without headers I received the HTML document with a message that I am not logged in.
Well, finally I converted the .pfx file to a cert and key, and it works easily; I just had to add verify=False to the request:
s = requests.Session()
s.cert = cert
r = s.get(login_page, cert=("certificate.cert", "certkey.key"), verify=False)
print(r.content)
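For reference, a minimal sketch of that conversion done with pyOpenSSL, mirroring the pfx_to_pem helper above (file names and password are placeholders):

import OpenSSL.crypto

# Split a .pfx into the cert/key pair used above (placeholder paths).
p12 = OpenSSL.crypto.load_pkcs12(open('path/file.pfx', 'rb').read(), 'password')
with open('certkey.key', 'wb') as f:
    f.write(OpenSSL.crypto.dump_privatekey(OpenSSL.crypto.FILETYPE_PEM, p12.get_privatekey()))
with open('certificate.cert', 'wb') as f:
    f.write(OpenSSL.crypto.dump_certificate(OpenSSL.crypto.FILETYPE_PEM, p12.get_certificate()))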
I am trying to scrape a table from https://www.domeinquarantaine.nl/; however, for some reason it does not return the table.
import requests

# The parameters
baseURL = "https://www.domeinquarantaine.nl/tabel.php"
PARAMS = {"qdate": "2019-04-21", "pagina": "2", "order": "karakter"}
DATA = {"qdate=2019-04-21&pagina=3&order="}
HEADERS = {"Host": "www.domeinquarantaine.nl",
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0",
"Accept": "*/*",
"Accept-Language": "en-US,en;q=0.5",
"Accept-Encoding": "gzip, deflate, br",
"Referer": "https://www.domeinquarantaine.nl/",
"Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
"X-Requested-With": "XMLHttpRequest",
"Content-Length": "41",
"Connection": "keep-alive",
"Cookie": "_ga=GA1.2.1612813080.1548179877; PHPSESSID=5694f8e2e4f0b10e53ec2b54310c02cb; _gid=GA1.2.1715527396.1555747200"}
#POST request
r = requests.post(baseURL, headers = HEADERS, data = PARAMS)
#Checking the response
r.text
The response consists of strange tokens and question marks.
So my question is: why is it returning this response? And how do I fix it to eventually end up with the scraped table?
Open a web browser, turn off JavaScript, and you will see what requests can get.
But using DevTools in Chrome/Firefox (Network tab, filtering XHR requests) you should see a POST request to the URL https://www.domeinquarantaine.nl/tabel.php which sends back HTML with the table.
If you open this URL in the browser you see the table, so you can get it even with GET, but using POST you can probably filter the data.
After writing this explanation I saw you already have this URL in your code; you didn't mention it in the description.
You have a different problem: you set
"Accept-Encoding": "gzip, deflate, br"
so the server may answer with a Brotli-compressed response, which requests does not decompress for you, and you have to decompress it yourself.
Or use
"Accept-Encoding": "deflate"
and the server will send data that requests decompresses automatically, so you will see the HTML with the table.
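If you do keep "br" in Accept-Encoding, here is a sketch of decompressing by hand (this assumes the third-party brotli package; requests itself only decodes gzip and deflate unless Brotli support is installed):

import brotli    # third-party package (assumption): pip install brotli
import requests

r = requests.post(baseURL, headers=HEADERS, data=PARAMS)
if r.headers.get('Content-Encoding') == 'br':
    html = brotli.decompress(r.content).decode('utf-8')  # undo Brotli manually
else:
    html = r.text  # gzip/deflate are decoded by requests automatically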
So there are a few reasons why you're getting what you're getting:
Your headers don't look correct.
The data that you are sending contains some extra variables.
The website requires cookies in order to display the table.
This can be easily fixed by changing the data and headers variables and adding requests.session() to your code (which will automatically collect and inject cookies).
All in all, your code should look like this:
import requests
session = requests.session()
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:66.0) Gecko/20100101 Firefox/66.0", "Accept": "*/*", "Accept-Language": "en-US,en;q=0.5", "Accept-Encoding": "gzip, deflate", "Referer": "https://www.domeinquarantaine.nl/", "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8", "X-Requested-With": "XMLHttpRequest", "DNT": "1", "Connection": "close"}
data={"qdate": "2019-04-20"}
session.get("https://www.domeinquarantaine.nl", headers=headers)
r = session.post("https://www.domeinquarantaine.nl/tabel.php", headers=headers, data=data)
r.text
Hope this helps!
I'm completely stuck trying to log into this page:
https://login.wyborcza.pl/
I have tried this solution:
import requests
# Fill in your details here to be posted to the login form.
payload = {
    'name': 'username',
    'pass': 'password'
}
# Use 'with' to ensure the session context is closed after use.
with requests.Session() as s:
    login_url = "https://login.wyborcza.pl/"
    p = s.post(login_url, data=payload)
    # print the html returned or something more intelligent to see if it's a successful login page.
    print(p.text)

    # the authorised request.
    r = s.get('A protected web page url')
    print(r.text)
    # etc...
that I found here, but I only get a 400 status.
Thanks for reading.
UPDATE:
Another problem occurs:
When I try to read this page with requests.get(), I get a message that an ad blocker is on, and the content of the page isn't loaded. But if I access the page in a browser, there's no problem: all content is loaded.
import requests
# Fill in your details here to be posted to the login form.
payload = {
    'username': 'username',
    'password': 'password'
}
# Use 'with' to ensure the session context is closed after use.
with requests.Session() as s:
    login_url = "https://login.wyborcza.pl/services/ajax/wyborcza/login"
    p = s.post(login_url, data=payload)
    # print the html returned or something more intelligent to see if it's a successful login page.
    cookiesALL = s.cookies.get_dict()
    s.headers.update({
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9,nb;q=0.8",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "Content-Length": "101",
        "Content-Type": "application/x-www-form-urlencoded",
        "Cookie": "SsoSessionPermanent=da7c41fb3ce67a9c36068c8752ecb6f6c595261ec033bef85f5a00a09b992491; _gcl_au=1.1.1603784452.1550874547; __utmc=150475568; __utmz=150475568.1550874547.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _ga=GA1.2.624566896.1550874547; _gid=GA1.2.1373698316.1550874547; _fbp=fb.1.1550874547334.2017607101; __gfp_64b=MOGLe6FatMqipvP6ZL6AdioAq5LZyXL4TZ4CaKZlx8H.U7; customDataLayer_customer=%7B%22state%22%3A%22anonymous%22%2C%22validPeriod%22%3A%22%22%7D; __gads=ID=6024a627e7962b38:T=1550874563:S=ALNI_MY5DVzG-IY0cLZRQFFrv-45kvL9AQ; GazetaPlUser=213A32A242A37k1550874561871; SquidLocalUID=f1b0394447af42427a2985c4; __utma=150475568.624566896.1550874547.1550874547.1550913993.2; __utmb=150475568.0.10.1550913993",
        "Host": "login.wyborcza.pl",
        "Origin": "http://wyborcza.pl",
        "Referer": "http://wyborcza.pl/1,76842,3360710.html",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"
    })
    # the authorised request.
    r = s.get('https://wyborcza.pl/1,76842,3360710.html')
    print(r.text)
    # etc...
This script should solve your issue:
import requests
# Fill in your details here to be posted to the login form.
payload = {
    'username': 'username',
    'password': 'password'
}
# Use 'with' to ensure the session context is closed after use.
with requests.Session() as s:
    login_url = "https://login.wyborcza.pl/services/ajax/wyborcza/login"
    p = s.post(login_url, data=payload)
    # print the html returned or something more intelligent to see if it's a successful login page.
    cookiesALL = s.cookies.get_dict()
    s.headers.update({
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate",
        "DNT": "1",
        "Connection": "close",
        "Upgrade-Insecure-Requests": "1",
        "Cookie": "SsoSessionPermanent={}; GW_SID=220D44FAAD9071FAC49796195720D348.tomwybo17; ag-rd-params=; __gfp_64b=-TURNEDOFF; customDataLayer_customer=%7B%22state%22%3A%22anonymous%22%2C%22validPeriod%22%3A%22%22%7D; bwGuidv2=b952c7409c97c249520c9e8a; SquidLocalUID=3c9df34214b0a589cf4863b7; wyborczaXYZ=test; test=131A251A253A104k1550911251283; bwVisitId=f5a2c74d1ba13dfde8d36c40".format(cookiesALL['SsoSessionPermanent'])
    })
    # the authorised request.
    r = s.get('https://wyborcza.pl/1,76842,3360710.html')
    print(r.text)
    # etc...
The problem you were dealing with was linked to the faulty parameter names (in the payload) and the login_url you were sending the POST request to.
Hope this helps
I'm trying to automate the recovery of data from this website (the one I want is "BVBG.086.01 PriceReport"). Checking with Firefox, I found out that the request URL the POST is made to is "http://www.bmf.com.br/arquivos1/lum-download_ipn.asp", and the parameters are:
hdnStatus: "ativo"
chkArquivoDownload_ativo: "28"
txtDataDownload_ativo: "09/02/2018"
imgSubmeter: "Download"
txtDataDownload_externo_ativo: ["25/08/2017", "25/08/2017", "25/08/2017"]
So, if I use hurl.it to make the request, the response is the correct 302 redirect, pointing to an FTP URL where the requested files are (something like "Location: /FTP/Temp/10981738/Download.ex_"). (Example of the request here.)
So I've tried doing the same with the following code (using Python's "requests" library; I have tried both versions of request_body, passing it to the "data" parameter of the post method):
from bs4 import BeautifulSoup
from requests import post

request_url = "http://www.bmf.com.br/arquivos1/lum-download_ipn.asp"
request_headers = {
"Host": "www.bmf.com.br",
"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:54.0) Gecko/20100101 Firefox/54.0",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.5",
"Accept-Encoding": "gzip, deflate",
"Referer": "http://www.bmf.com.br/arquivos1/lum-arquivos_ipn.asp?idioma=pt-BR&status=ativo",
"Content-Type": "application/x-www-form-urlencoded",
"Content-Length": "236",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1"
}
# request_body = "hdnStatus=ativo&chkArquivoDownload_ativo=28&txtDataDownload_ativo=09/02/2018&imgSubmeter=Download&txtDataDownload_externo_ativo=25/08/2017&txtDataDownload_externo_ativo=25/08/2017&txtDataDownload_externo_ativo=25/08/2017"
request_body = {
"hdnStatus" : "ativo",
"chkArquivoDownload_ativo": "28",
"txtDataDownload_ativo": "09/02/2018",
"imgSubmeter": "Download",
"txtDataDownload_externo_ativo": ["25/08/2017", "25/08/2017", "25/08/2017"]
}
result_query = post(request_url, request_body, headers=request_headers)
# result_query = post(request_url, data=request_body, headers=request_headers)
for red in result_query.history:
    print(BeautifulSoup(red.content, "lxml"))
    print()
print(result_query.url)
And what I get is the following response:
<html><head><title>Object moved</title></head>
<body><h1>Object Moved</h1>This object may be found here.</body>
</html>
<html><head><title>Object moved</title></head>
<body><h1>Object Moved</h1>This object may be found here.</body>
</html>
<html><head><title>Object moved</title></head>
<body><h1>Object Moved</h1>This object may be found here.</body>
</html>
http://www.bmf.com.br/arquivos1/lum-arquivos_ipn.asp?idioma=pt-BR&status=ativo
And not the one I wanted (which should point to the location of the file). What am I doing wrong here?
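One way to read the Location header of that first 302 directly is to stop requests from following redirects; a minimal sketch reusing request_url, request_body and request_headers from above:

# Disable automatic redirects so the original Location header (the FTP path)
# can be inspected directly instead of following the chain to the form page.
resp = post(request_url, data=request_body, headers=request_headers,
            allow_redirects=False)
print(resp.status_code)               # expected: 302
print(resp.headers.get('Location'))   # the redirect target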
How could I send two consecutive requests, including redirects?
I tried to use Python requests to mimic the search function in the browser.
However, it's not as simple as other requests.
I opened developer mode in the Chrome browser, copied the two requests in cURL form, and converted them into Python requests form.
I can only get a 500 error via Python, but I get the correct response in the browser.
Current code (it only returns a 500 error):
import requests

cookies = {
'optimizelyEndUserId': 'oeu1454030467608r0.5841516454238445',
~~~
'_gat': '1',
}
headers = {
'Origin': 'https://m.flyscoot.com',
~~~~
}
data = 'origin=KHH&destination=KIX&departureDate=20160309&returnDate=&roundTrip=false&adults=1&children=0&infants=0&promoCode='
req = requests.session()
resp_1 = req.post('https://m.flyscoot.com/search', headers=headers, cookies=cookies, data=data)
headers = {
'Accept-Encoding': 'gzip, deflate, sdch',
~~~~
}
# because the first request is redirected to an unknown status, I copied the first response's Set-Cookie for the 2nd request to use
resp_2 = req.get('https://m.flyscoot.com/select', headers=headers, cookies=resp_1.history[0].cookies)
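As an aside, a requests.Session already keeps cookies set anywhere along a redirect chain, so copying them out of resp_1.history should not be necessary; a minimal sketch:

# The session's cookie jar already holds cookies picked up during the
# redirects of resp_1, so the follow-up GET can simply rely on it.
resp_2 = req.get('https://m.flyscoot.com/select', headers=headers)
print(req.cookies.get_dict())  # cookies accumulated across both requests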
It seems that's the mobile URL. In most cases you should set a proper user agent. Try this (Python 3):
import urllib.parse
import requests
FF_USER_AGENT = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:21.0.0) '
'Gecko/20121011 Firefox/21.0.0',
"Origin": "http://makeabooking.flyscoot.com",
"Referer": "http://makeabooking.flyscoot.com",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Encoding": "gzip,deflate,sdch",
"Accept-Language": "fr-FR,fr;q=0.8,en-US;q=0.6,en;q=0.4",
"Cache-Control": "max-age=0",
"Connection": "keep-alive",
}
req = requests.session()
resp_1 = req.get('http://makeabooking.flyscoot.com/', headers=FF_USER_AGENT)
# form urlencoded data
raw_data = (
"availabilitySearch.SearchInfo.SearchStations%5B0%5D.DepartureStationCode"
"=ADL"
"&availabilitySearch.SearchInfo.SearchStations%5B0%5D.ArrivalStationCode"
"=SIN"
"&availabilitySearch.SearchInfo.SearchStations%5B0%5D.DepartureDate=2%2F17"
"%2F2016&availabilitySearch.SearchInfo.SearchStations%5B1%5D"
".DepartureStationCode=SIN&availabilitySearch.SearchInfo.SearchStations%5B1"
"%5D.ArrivalStationCode=ADL&availabilitySearch.SearchInfo.SearchStations"
"%5B1"
"%5D.DepartureDate=3%2F17%2F2016&availabilitySearch.SearchInfo.Direction"
"=Return&Singapore+%28SIN%29=Singapore+%28SIN%29&availabilitySearch"
".SearchInfo.AdultCount=1&availabilitySearch.SearchInfo.ChildrenCount=0"
"&availabilitySearch.SearchInfo.InfantCount=0&availabilitySearch.SearchInfo"
".PromoCode=")
dict_data = dict(urllib.parse.parse_qsl(raw_data))
final = req.post('http://makeabooking.flyscoot.com/',
headers=FF_USER_AGENT,
data=dict_data)
print(final.status_code)
print(final.url)
[MOBILE Version]
import urllib.parse
import requests
# debug request
import http.client
http.client.HTTPConnection.debuglevel = 1
import logging
logging.basicConfig()
logging.getLogger().setLevel(logging.DEBUG)
requests_log = logging.getLogger("requests.packages.urllib3")
requests_log.setLevel(logging.DEBUG)
requests_log.propagate = True
FF_USER_AGENT = {
'User-Agent': "Mozilla/5.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/600.1.3 (KHTML, like Gecko) Version/8.0 Mobile/12A4345d Safari/600.1.4",
"Origin": "https://m.flyscoot.com",
"Referer": "https://m.flyscoot.com/search",
"Host": "m.flyscoot.com",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Encoding": "gzip,deflate",
"Accept-Language": "fr-FR,fr;q=0.8,en-US;q=0.6,en;q=0.4",
"Cache-Control": "max-age=0",
"Connection": "keep-alive",
"X-Requested-With": "XMLHttpRequest",
"Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
}
req = requests.session()
resp_1 = req.get('https://m.flyscoot.com', headers=FF_USER_AGENT)
# form urlencoded data
raw_data = (
"origin=MEL&destination=CAN&departureDate=20160220&returnDate=20160227&roundTrip=true&adults=1&children=0&infants=0&promoCode=")
dict_data = dict(urllib.parse.parse_qsl(raw_data))
final = req.post('https://m.flyscoot.com/search',
headers=FF_USER_AGENT,
data=dict_data)