Python: POST to a URL with requests.Session

I am trying to download a file with Python using requests.Session in order to take cookies into account.
The following code returns the source code of the second webpage but does not download the file, even though I have included the headers and the parameters.
I am running out of ideas to find the problem here.
import requests

s = requests.Session()
url_euronex = "https://www.euronext.com/equities/directory"
response = s.get(url_euronex)
response_code = response.status_code
if response_code == 200:
    content = response.text
    # Slice the form-key token out of the page source.
    token = content[content.index('formKey=nyx_pd_filter_values:') + 29 : content.index('dataTableInitCallback') - 3]
    url = "https://www.euronext.com/fr/popup/data/download?ml=nyx_pd_stocks&cmd=default&formKey=nyx_pd_filter_values%3A" + str(token)
    print(url)
    headers = {
        'Host': 'www.euronext.com',
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:56.0) Gecko/20100101 Firefox/56.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'Referer': url,
        'Content-Type': 'application/x-www-form-urlencoded',
        'Content-Length': '135',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1'}
    Params = {
        'Query string': {'ml': 'nyx_pd_stocks', 'cmd': 'default',
                         'formKey': 'nyx_pd_filter_values:' + str(token)},
        'Form data': {'format': '3', 'layout': '2', 'decimal_separator': '1',
                      'date_format': '1', 'op': 'Go',
                      'form_build_id': 'form-64080cce1044e288464d174290cb40e9',
                      'form_id': 'nyx_download_form'}}
    data = [{'url': url, 'params': Params, 'method': 'post'}]
    r = s.post(url, json=data, headers=headers)
    if r.status_code == 200:
        print('coucou')
        # resultat = (r.text).encode('utf-8')
        print(r.text)
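The download endpoint likely expects the 'Form data' fields as a URL-encoded body, while json=data sends a JSON document instead (and the hard-coded Content-Length will not match it). A minimal sketch of the likely fix, reusing s, url and url_euronex from above; the output filename and the Referer choice are assumptions:

form_data = {
    'format': '3', 'layout': '2', 'decimal_separator': '1', 'date_format': '1',
    'op': 'Go',
    'form_build_id': 'form-64080cce1044e288464d174290cb40e9',  # value from the question; may be page-specific
    'form_id': 'nyx_download_form',
}
# data= lets requests build the urlencoded body and compute Content-Type/Content-Length itself.
r = s.post(url, data=form_data, headers={'Referer': url_euronex})
if r.status_code == 200:
    with open('euronext_export.csv', 'wb') as f:  # hypothetical filename
        f.write(r.content)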

Related

Python requests cookie setting does not work

I want to crawl data from this website: 'http://www.stcn.com/article/search.html?search_type=all&page_time=1'. The website needs cookies from the homepage first, so I first get the cookies it needs from 'http://www.stcn.com/article/search.html' and set them in the request, but it doesn't work after many attempts.
My code looks like this:
import requests

headers = {
    'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36',
    'Host': 'www.stcn.com'}

def _getStcnCookie(keyWords='all'):
    url = "http://www.stcn.com/article/search.html"
    data = {'keyword': keyWords}
    r = requests.get(url, data, headers=headers, timeout=10)
    if r.status_code != 200:
        return None
    return requests.utils.dict_from_cookiejar(r.cookies)

def searchStcnData(url, keyWords):
    myHeader = dict.copy(headers)
    myHeader['X-Requested-With'] = 'XMLHttpRequest'
    cookies = _getStcnCookie(keyWords=keyWords)
    print(cookies)
    jar = requests.cookies.cookiejar_from_dict(cookies)
    data = {'keyword': 'Paxlovid', 'page_time': 1, 'search_type': 'all'}
    # Option one
    s = requests.Session()
    response = s.post(url, data, headers=myHeader, timeout=5, cookies=cookies)
    print(response.text)
    # Option two
    # myHeader['Cookie'] = 'advanced-stcn_web=potef1789mm5nqgmd6jc1rcih3; path=/; HttpOnly;' + cookiesStr
    # Option three
    r = requests.post(url, data, headers=myHeader, timeout=5, cookies=cookies)
    print(r.json())
    return r.json()

searchStcnData('http://www.stcn.com/article/search.html?search_type=all&page_time=1', 'Paxlovid')
I've tried options 1, 2 and 3 to no avail.
I set cookies in Postman, and only setting 'advanced-stcn_web=5sdfitvu42qggmnjvop4dearj4' gets the data, like this:
{
  "state": 1,
  "msg": "操作成功",
  "data": "<li class=\"\">\n <div class=\"content\">\n <div class=\"tt\">\n <a href=\"/article/detail/769123.html\" target=\"_blank\">\n ......",
  "page_time": 2
}
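If the needed cookie really is issued on the first GET, a single Session avoids copying cookie jars around by hand, since the jar persists across requests. A minimal sketch under that assumption (same endpoints and headers as in the question):

import requests

def search_stcn(keyword, page_time=1):
    s = requests.Session()
    s.headers.update({
        'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36',
        'Host': 'www.stcn.com'})
    # Prime the session: whatever cookies the homepage sets are kept in s automatically.
    s.get('http://www.stcn.com/article/search.html', timeout=10)
    s.headers['X-Requested-With'] = 'XMLHttpRequest'
    data = {'keyword': keyword, 'page_time': page_time, 'search_type': 'all'}
    r = s.post('http://www.stcn.com/article/search.html?search_type=all&page_time=1',
               data=data, timeout=5)
    r.raise_for_status()
    return r.json()

If the advanced-stcn_web cookie observed in Postman is set by JavaScript rather than by a Set-Cookie header, no amount of jar handling will reproduce it, and a browser-driven tool would be needed instead.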

Python requests: login succeeds, then logged out on a different URL

Here is my code:

import requests
from bs4 import BeautifulSoup

session = requests.Session()
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86.0) Gecko/20100101 Firefox/86.0',
           'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
           'Accept-Language': 'en-US,en;q=0.5',
           'Connection': 'keep-alive',
           'Upgrade-Insecure-Requests': '1',
           'Cache-Control': 'max-age=0'}

def generating_data():
    main_url = 'https://opencorporates.com/users/sign_in'
    r1 = session.get(main_url, headers=headers)
    soup = BeautifulSoup(r1.text, 'html.parser')
    tokens = soup.find('meta', attrs={'name': 'csrf-token'})
    token = tokens.get('content')
    print(f'token is : {token}')
    print('Login!')
    datas = {
        'utf8': '✓',
        'authenticity_token': token,
        'user[email]': 'user',
        'user[password]': 'pass',
        'submit': ''
    }
    r2 = session.post('https://opencorporates.com/users/sign_in', headers=headers, data=datas, cookies=r1.cookies)
    r3 = session.get('https://opencorporates.com/companies?utf8=%E2%9C%93&q=above+and+beyond&commit=Go&jurisdiction_code=&utf8=%E2%9C%93&commit=Go&nbsp=&controller=searches&action=search_companies&inactive=false&mode=best_fields&search_fields[]=name&branch=false&nonprofit=&order=score', headers=headers, cookies=r1.cookies)
    f = open('./res.html', 'w+')
    f.write(r3.text)
    f.close()

generating_data()
I already get the result of the login if I print r2, but on the next request, r3, the page looks like we are not logged in yet. Can anyone help? Thanks.
You need to remove the portion cookies=r1.cookies, since you are already using a session. What this does is overwrite the cookies collected from the response to r2 that would otherwise have been sent along with the request, and which might have been important for staying logged in. The same goes for r2. In general, you do not need to deal with cookies yourself when you are using a session with requests. Your code for generating_data() then becomes:
def generating_data():
    main_url = 'https://opencorporates.com/users/sign_in'
    r1 = session.get(main_url, headers=headers)
    soup = BeautifulSoup(r1.text, 'html.parser')
    tokens = soup.find('meta', attrs={'name': 'csrf-token'})
    token = tokens.get('content')
    print(f'token is : {token}')
    print('Login!')
    datas = {
        'utf8': '✓',
        'authenticity_token': token,
        'user[email]': 'user',
        'user[password]': 'pass',
        'submit': ''
    }
    r2 = session.post('https://opencorporates.com/users/sign_in', headers=headers, data=datas)
    r3 = session.get('https://opencorporates.com/companies?utf8=%E2%9C%93&q=above+and+beyond&commit=Go&jurisdiction_code=&utf8=%E2%9C%93&commit=Go&nbsp=&controller=searches&action=search_companies&inactive=false&mode=best_fields&search_fields[]=name&branch=false&nonprofit=&order=score', headers=headers)
    f = open('./res.html', 'w+')
    f.write(r3.text)
    f.close()
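If you want to confirm that the session really picked up the login cookies, you can inspect its jar after the POST; a small optional check:

# The session's jar accumulates every Set-Cookie seen so far.
print(session.cookies.get_dict())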

How to get a correct session_id? (Scrapy, Python)

There is a URL: https://maps.leicester.gov.uk/map/Aurora.svc/run?inspect_query=QPPRN&inspect_value=ROH9385&script=%5CAurora%5Cw3%5CPLANNING%5Cw3PlanApp_MG.AuroraScript%24&nocache=f73eee56-45da-f708-87e7-42e82982370f&resize=always
It returns the coordinates. To get the coordinates it makes 3 requests (I suppose):
1. the URL mentioned above,
2. requesting a session_id,
3. getting the coordinates using the previously obtained session_id.
I am getting the session_id in the 2nd step, but it is wrong: I can't get the coordinates in step 3 using it. How do I know the problem is in the session_id? When I insert a session_id taken from the browser, my code works fine and the coordinates are received.
(Screenshots omitted: the requests in the browser, the correct response from the browser, and the response my code gets.)
Here is my code (it is for the Scrapy framework):

import re
import time

import scrapy
import inline_requests

@inline_requests.inline_requests
def get_map_data(self, response):
    """ Getting map data. """
    map_referer = ("https://maps.leicester.gov.uk/map/Aurora.svc/run?inspect_query=QPPRN&"
                   "inspect_value=ROH9385&script=%5CAurora%5Cw3%5CPLANNING%5Cw3PlanApp_MG.AuroraScript"
                   "%24&nocache=f73eee56-45da-f708-87e7-42e82982370f&resize=always")
    response = yield scrapy.Request(
        url=map_referer,
        meta=response.meta,
        method='GET',
        dont_filter=True,
    )
    time_str = str(int(time.time() * 1000))
    headers = {
        'Referer': response.url,
        'Accept': 'application/javascript, */*; q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
        'Host': 'maps.leicester.gov.uk',
        'Sec-Fetch-Dest': 'script',
        'Sec-Fetch-Mode': 'no-cors',
        'Sec-Fetch-Site': 'same-origin',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'
    }
    response.meta['handle_httpstatus_all'] = True
    url = ('https://maps.leicester.gov.uk/map/Aurora.svc/RequestSession?userName=inguest'
           '&password=&script=%5CAurora%5Cw3%5CPLANNING%5Cw3PlanApp_MG.AuroraScript%24&'
           f'callback=_jqjsp&_{time_str}=')
    reqest_session_response = yield scrapy.Request(
        url=url,
        meta=response.meta,
        method='GET',
        headers=headers,
        dont_filter=True,
    )
    session_id = re.search(r'"SessionId":"([^"]+)', reqest_session_response.text)
    session_id = session_id.group(1) if session_id else None
    print(8888888888888)
    print(session_id)
    # session_id = '954f04e2-e52c-4dd9-9046-f3f013d3f633'
    # pprn = item.get('other', {}).get('PPRN')
    pprn = 'ROH9385'  # hard coded for the current page
    if session_id and pprn:
        time_str = str(int(time.time() * 1000))
        url = ('https://maps.leicester.gov.uk/map/Aurora.svc/FindValue'
               f'Location?sessionId={session_id}&value={pprn}&query=QPPRN&callback=_jqjsp'
               f'&_{time_str}=')
        coords_response = yield scrapy.Request(
            url=url,
            method='GET',
            meta=reqest_session_response.meta,
            dont_filter=True,
        )
        print(coords_response.text)
        breakpoint()
Could you please correct my code so that it can get the coordinates?
The website creates a sessionId first, then uses that sessionId to create a map layer on the server (I guess). Only after that can you start querying; otherwise it can't find the map layer under that sessionId.
import requests

# Step 1: request a session id.
url = "https://maps.leicester.gov.uk/map/Aurora.svc/RequestSession?userName=inguest&password=&script=%5CAurora%5Cw3%5CPLANNING%5Cw3PlanApp_MG.AuroraScript%24"
res = requests.get(url, verify=False).json()
sid = res["Session"]["SessionId"]

# Step 2: open the script map under that session id (this creates the layer on the server).
url = f"https://maps.leicester.gov.uk/map/Aurora.svc/OpenScriptMap?sessionId={sid}"
res = requests.get(url, verify=False)

# Step 3: only now can the value be looked up against the session's map layer.
url = f"https://maps.leicester.gov.uk/map/Aurora.svc/FindValueLocation?sessionId={sid}&value=ROH9385&query=QPPRN"
res = requests.get(url, verify=False).json()
print(res)
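One side note: verify=False makes urllib3 emit an InsecureRequestWarning on every call. If the certificate genuinely cannot be validated, the warning can be silenced explicitly:

import urllib3

# Acknowledge that certificate verification is intentionally disabled.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)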

Logging in to a tricky site with Python

I am trying to scrape the server list from https://www.astrill.com/member-zone/tools/vpn-servers, which is for members only. Username, password and captcha are required. Everything works if I log in with a browser and copy the 'PHPSESSID' cookie, but I want to log in with Python. I am downloading the captcha and entering it manually, but I am still not able to log in. Login URL: https://www.astrill.com/member-zone/log-in
Could anybody help me, please?
import os
import bs4
import requests
from urllib.request import urlretrieve

BASE_FOLDER = '.'  # not shown in the original snippet; any writable folder works

SERVERS_URL = 'https://www.astrill.com/member-zone/tools/vpn-servers'
LOGIN_URL = 'https://www.astrill.com/member-zone/log-in'

def get_capcha(url):
    print(f'Scraping url: {url}')
    try:
        response = requests.get(url)
        response.raise_for_status()
    except Exception as e:
        print(type(e), e)
    if response.status_code == 200:
        print('Success!')
        page = response.content
        soup = bs4.BeautifulSoup(page, 'html.parser')
        captcha_url = soup.find('img', alt='captcha')['src']
        captcha_file = os.path.join(BASE_FOLDER, 'captcha.jpg')
        id = soup.find(id='csrf_token')
        print(id['value'])
        print(f'Captcha: {captcha_url}')
        print(response.headers)
        urlretrieve(captcha_url, captcha_file)
        return id['value']

def login(url, id):
    captcha_text = input('Captcha: ')
    print(id)
    payload = {
        'action': 'log-in',
        'username': 'myusername#a.com',
        'password': '1111111',
        'captcha': captcha_text,
        '_random': 'l4r1b7hf4g',
        'csrf_token': id
    }
    session = requests.session()
    post = session.post(url, data=payload)
    r = session.get(SERVERS_URL)
    print(r.text)
    print(r.cookies)

if __name__ == '__main__':
    id = get_capcha(LOGIN_URL)
    login(LOGIN_URL, id)
First of all, I was not sure about the payload fields to POST. They can easily be discovered with Firefox Developer Tools (Network tab): you can see what your browser actually posts there. The second thing I discovered was that I need to request the captcha file within the same session, with my headers and cookies. My code now looks like the following, and it works! (Probably some header fields can be removed.)
import os
import bs4
import requests

# SERVERS_URL, LOGIN_URL and BASE_FOLDER as defined in the question.
session = requests.session()
cookies = {}
headers = {
    'Host': 'www.astrill.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Language': 'ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3',
    'Accept-Encoding': 'gzip, deflate, br',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'X-Requested-With': 'XMLHttpRequest',
    'Content-Length': '169',
    'Origin': 'https://www.astrill.com',
    'Connection': 'keep-alive',
    'Referer': 'https://www.astrill.com/member-zone/log-in',
}
payload = {
    'action': 'log-in',
    'username': 'myusername#a.com',
    'password': '1111111',
    'remember_me': 0,
    'captcha': '',
    '_random': 'somerandom1',
    'csrf_token': ''
}

def get_capcha(url):
    print(f'Scraping url: {url}')
    try:
        response = session.get(url)
        response.raise_for_status()
    except Exception as e:
        print(type(e), e)
    if response.status_code == 200:
        print('Success!')
        page = response.content
        soup = bs4.BeautifulSoup(page, 'html.parser')
        captcha_url = soup.find('img', alt='captcha')['src']
        captcha_file = os.path.join(BASE_FOLDER, 'captcha.jpg')
        payload['csrf_token'] = soup.find(id='csrf_token')['value']
        print(f'csrf_token: {payload["csrf_token"]}')
        print(f'Captcha: {captcha_url}')
        cookies.update(response.cookies)
        # The captcha image must be fetched within the same session and cookies.
        captcha_img = session.get(captcha_url, headers=headers, cookies=cookies)
        file = open(captcha_file, "wb")
        file.write(captcha_img.content)
        file.close()
        payload['captcha'] = input('Captcha: ')

def login(url):
    post = session.post(url, data=payload, headers=headers, cookies=cookies)
    print(post.text)
    r = session.get(SERVERS_URL, cookies=cookies)
    print(r.text)
    print(r.cookies)

def main():
    get_capcha(LOGIN_URL)
    login(LOGIN_URL)

if __name__ == '__main__':
    main()
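The decisive change is that the captcha image is downloaded through the same session (and with the same cookies) that later performs the POST, so the server can tie the captcha solution to the PHPSESSID it issued; a captcha fetched outside the session belongs to a different session and will never validate.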

Is it possible to log in with Python requests by passing XML parameters?

I did some research on logging in with Python requests by passing XML parameters, but without much success on this specific page: I cannot log in. If someone has had a similar experience, any direction or help is welcome. My code is as follows:
import requests
from bs4 import BeautifulSoup

def main():
    # 'https://plataformafinanceira.xxxxxxxxbr.corp/xxxxxxxxcdc/login/login.html?timestamp=1478706683443?redirect=true'
    # 'LOGIN:Login'
    s = requests.Session()
    headers = {
        'Accept': 'application/xml, text/xml, */*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9',
        'Connection': 'Keep-Alive',
        'Content-Type': 'text/xml',
        'Host': 'plataformafinanceira.xxxxxxxxbr.corp',
        'Referer': 'https://plataformafinanceira.xxxxxxxxbr.corp/xxxxxxxxcdc/login/login.html?redirect=true',
        'X-Requested-With': 'XMLHttpRequest',
        'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E)'
    }
    url = 'https://plataformafinanceira.xxxxxxxxbr.corp/xxxxxxxxcdc/login/login.html?timestamp=1478706683443?redirect=true'
    r = s.get(url, headers=headers, verify=False)
    print('************************************')
    print(r.status_code)
    print(r.cookies.get_dict())
    print('************************************')
    cookies = r.cookies.get_dict()
    xml = '''
<?xml version="1.0" encoding="ISO-8859-1"?><request>
<login type="group">
<row>
<id_user><![CDATA[x050432]]></id_user>
<ds_password><![CDATA[NDY0NnBvcnQ=]]></ds_password>
<version><![CDATA[2]]></version>
</row>
</login>
</request>
'''
    payload = {
        'id_user': 'x050432',
        'txtcd_Pwd': '4646port',
        'version': '2'
    }
    url = 'https://plataformafinanceira.xxxxxxxxbr.corp/xxxxxxxxcdc/common/callService.do?name=LOGIN:Login-%3Elogin'
    r = s.post(url, headers=headers, auth=('x050432', '4646port'), cookies=cookies, verify=False)
    print('++++++++++++++++++++++++++++++++++++')
    print(r.status_code)
    print(r.cookies.get_dict())
    print('++++++++++++++++++++++++++++++++++++')
    # r = s.post(url, headers=headers, auth=('x050432', '4646port'), data=payload, cookies=cookies)
    # r = s.post(url, headers=headers, data=payload, cookies=cookies, verify=False)
    # url = 'https://plataformafinanceira.xxxxxxxxbr.corp/xxxxxxxxcdc/login/iframePrincipal.html?funcao=index&timestamp=1562604252980'
    # r = s.post(url, headers=headers, cookies=cookies, verify=False)
    # r = s.post(url, headers=headers, auth=('x050432', '4646port'), data=payload, cookies=cookies)
    # print(r.status_code)
    # print(r.cookies.get_dict())
    # # print(r.text)
    with open('portal.html', 'w') as f:
        f.write(r.text)
    # print(r.text)
    # InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings
    # url = 'https://plataformafinanceira.xxxxxxxxbr.corp/xxxxxxxxcdc/login/login.html'

if __name__ == '__main__':
    main()
