How to get a correct session_id? (Scrapy, Python) - python

There is an url: https://maps.leicester.gov.uk/map/Aurora.svc/run?inspect_query=QPPRN&inspect_value=ROH9385&script=%5CAurora%5Cw3%5CPLANNING%5Cw3PlanApp_MG.AuroraScript%24&nocache=f73eee56-45da-f708-87e7-42e82982370f&resize=always
It returns the coordinates. To get the coordinates - it does 3 requests(I SUPPOSE):
the url mentioned above
requesting session_id
getting coordinates using previousely mentioned session_id.
I am getting session_id in the 2nd step, but it is wrong. I can't get coordinates in step 3 using it. How can I know that the problem is in session_id? When I insert the session_id taken from the browser - my code works fine and coordinates are received.
Here are the requests in browser:
Here is the correct response from browser:
And this is what I'm getting with my code:
Here is my code (it is for Scrapy framework):
'''
import inline_requests
#inline_requests.inline_requests
def get_map_data(self, response):
""" Getting map data. """
map_referer = ("https://maps.leicester.gov.uk/map/Aurora.svc/run?inspect_query=QPPRN&"
"inspect_value=ROH9385&script=%5CAurora%5Cw3%5CPLANNING%5Cw3PlanApp_MG.AuroraScript"
"%24&nocache=f73eee56-45da-f708-87e7-42e82982370f&resize=always")
response = yield scrapy.Request(
url=map_referer,
meta=response.meta,
method='GET',
dont_filter=True,
)
time_str = str(int(time.time()*1000))
headers = {
'Referer': response.url,
'Accept': 'application/javascript, */*; q=0.8',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
'Host': 'maps.leicester.gov.uk',
'Sec-Fetch-Dest': 'script',
'Sec-Fetch-Mode': 'no-cors',
'Sec-Fetch-Site': 'same-origin',
'Connection': 'keep-alive',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'
}
response.meta['handle_httpstatus_all'] = True
url = ( 'https://maps.leicester.gov.uk/map/Aurora.svc/RequestSession?userName=inguest'
'&password=&script=%5CAurora%5Cw3%5CPLANNING%5Cw3PlanApp_MG.AuroraScript%24&'
f'callback=_jqjsp&_{time_str}=' )
reqest_session_response = yield scrapy.Request(
url=url,
meta=response.meta,
method='GET',
headers=headers,
dont_filter=True,
)
session_id = re.search(r'"SessionId":"([^"]+)', reqest_session_response.text)
session_id = session_id.group(1) if session_id else None
print(8888888888888)
print(session_id)
# session_id = '954f04e2-e52c-4dd9-9046-f3f013d3f633'
# pprn = item.get('other', {}).get('PPRN')
pprn = 'ROH9385' # hard coded for the current page
if session_id and pprn:
time_str = str(int(time.time()*1000))
url = ('https://maps.leicester.gov.uk/map/Aurora.svc/FindValue'
f'Location?sessionId={session_id}&value={pprn}&query=QPPRN&callback=_jqjsp'
f'&_{time_str}=')
coords_response = yield scrapy.Request(
url = url,
method='GET',
meta=reqest_session_response.meta,
dont_filter = True,
)
print(coords_response.text)
breakpoint()'''
Could you please correct my code so that it could get coordinates?

The website creates a sessionId first, then use the sessionId creates a layer on server (I guess). Then you can start requesting, otherwise it can't find the map layer under that sessionId.
import requests
url = "https://maps.leicester.gov.uk/map/Aurora.svc/RequestSession?userName=inguest&password=&script=%5CAurora%5Cw3%5CPLANNING%5Cw3PlanApp_MG.AuroraScript%24"
res = requests.get(url, verify=False).json()
sid = res["Session"]["SessionId"]
url = f"https://maps.leicester.gov.uk/map/Aurora.svc/OpenScriptMap?sessionId={sid}"
res = requests.get(url, verify=False)
url = f"https://maps.leicester.gov.uk/map/Aurora.svc/FindValueLocation?sessionId={sid}&value=ROH9385&query=QPPRN"
res = requests.get(url, verify=False).json()
print(res)

Related

scrapy_splash not rendering for list of urls

I created a spider with scrapy_splash,
I hardcoded 3 urls in start_requests.
When I run with any one url it is working fine for all the urls.
when I put all the urls in a list and run one by one, it is not working, and splash not returning complete rendered html in response.body.
kindly help.
code:
import re
import time
import json
import scrapy
import w3lib
from scrapy_splash import SplashRequest
class SpeSpider(scrapy.Spider):
name = 'spe'
# allowed_domains = ['s']
# start_urls = ['http://s/']
without_wait_script = """
function main(splash, args)
splash.private_mode_enabled = false
assert(splash:go(args.url))
assert(splash:wait(2))
return {
html = splash:html(),
}
end
"""
wait_script = """
function main(splash, args)
assert(splash:go(args.url))
assert(splash:wait(10))
return {
html = splash:html(),
}
end
"""
splash_headers = {
'authority': 'www.avivainvestors.com',
'sec-ch-ua': '"Google Chrome";v="95", "Chromium";v="95", ";Not A Brand";v="99"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'sec-fetch-site': 'same-origin',
'sec-fetch-mode': 'navigate',
'sec-fetch-user': '?1',
'sec-fetch-dest': 'document',
'referer': 'https://www.avivainvestors.com/fr-fr/nos-expertises/nos-fonds/',
'accept-language': 'en-US,en;q=0.9,lb;q=0.8',
}
def start_requests(self):
url1="https://www.avivainvestors.com/fr-fr/nos-expertises/equities/uk-listed-equity-high-alpha-fund/lu0160960752-gbp/"
url2 = "https://www.avivainvestors.com/fr-fr/nos-expertises/equities/japon-isr/fr0013340841-eur/"
url3 = "https://www.avivainvestors.com/fr-fr/nos-expertises/fixed-income/emerging-markets-corporate-bond-fund/lu1550133976-usd/"
urls = [url1, url2, url3]
for url in urls:
time.sleep(10)
yield SplashRequest(
url=url,
endpoint="execute",
callback=self.scrape_document_id,
args={"lua_source":self.wait_script},
splash_headers= self.splash_headers
)
def scrape_document_id(self, response):
value = response.xpath('//div[#class="ec-table__cell-content ng-binding ng-scope" and text() = "Rapport annuel"]/../..//td/ec-button/#mstar-component-id').get()
print("VALUE", value)
v = re.search(r"\[([^]]+)\]", value).group().strip("[]")
yield {
"url": response.url,
"id" : v
}
This is because you are using a yield statement which is a generator.
My guess is that you are just doing this,
x = SpeSpider()
x.start_requests()
which only creates a generator from your yield statement.
Try this,
x = SpeSpider()
list(x.start_requests())
It will run your function and produce a list though I am not sure if this is the behaviour you want because I don't any code on how you instantiate the class objects or what the results should look like.

python handling incoming from url

i'm sending below request to URL and get the response from it
import requests
url = "http://localhost/dat.txt"
payload = {}
headers = {
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
'Sec-Fetch-Dest': 'document',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
}
response = requests.request("GET", url, headers=headers, data = payload)
print(response.text.encode('utf8'))
Below is the response data that I get -
mohame4|nameon#example.com|passsd!##$4|head,customer|manager,devlop
mohame3|nameon3#example.com|passsd!##$4|head,customer|manager,devlop
I do this with the data
for i in response.text:
try:
i = i.strip().split('|')
userna = i[0]
emaill = i[1]
passd = i[2]
rol1= i[3]
rol2= i[4]
except:
pass
How can I make rol1 as
this head,customer
to
rol1=['head','customer']
Simply split the string you're getting:
rol1 = i[3].split(',')
You could do this more... gracefully, though, using iterable unpacking:
username, email, password, rol1, rol2 = i.strip().split('|')
rol1 = rol1.split(',')
thanks for all helper special #ForceBru
import requests
url = "http://localhost/dat.txt"
response = requests.request("GET", url)
print(response.text)
dat = str(response.text).split('\n')
for i in dat:
i = i.strip().split('|')
print(i[3].split(","))
# TODO: write code...

Python requests appending ambersand (&) to URL when adding results from variable as parameter

Below is my code which basically retrieves data from the database, puts it into a variable in CSV format which I then am trying to append on to a GET request URL. However, the get request results in null as the GET Request URL has an ampersand (&) sign in it.
Question is how do I get rid of it?
This is the URL, note the ampersand (&):
https://demo-api.ig.com/gateway/deal/clientsentiment?marketIds=&JGB,BCHUSD,AT20,
import requests
import json
import time
import datetime
import csv
import pandas as pd
import psycopg2
conn_string = "host=' dbname='' user='' password=''"
conn = psycopg2.connect(conn_string)
cursor=conn.cursor()
# Query to source marketIds
postgreSQL_select_Query = "SELECT DISTINCT () FROM static WHERE TYPE!='' AND marketId!='None'"
cursor.execute(postgreSQL_select_Query)
#print("Selecting marketId from table using cursor.fetchall")
instrument_static_marketId = cursor.fetchall()
cursor.execute(postgreSQL_select_Query )
#This puts the sql result into nice CSV format
y=','.join([y[0] for y in cursor.fetchall() ])
print(y)
# closing database connection.
conn.close ()
def main():
headers = {
'Connection': 'keep-alive',
'Origin': 'https://.com',
'X-IG-API-KEY': '',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
'Content-Type': 'application/json; charset=UTF-8',
'Accept': 'application/json; charset=UTF-8',
'X-SECURITY-TOKEN': '',
'CST': '',
'Sec-Fetch-Site': 'same-site',
'Sec-Fetch-Mode': 'cors',
'Referer': 'https://',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8',
}
response = requests.get('https://demo-api.ig.com/gateway/deal/clientsentiment?marketIds=',params=y, headers=headers)
print(response.url)
result = response.json()
print(result)
if __name__ == '__main__':
main()
You've included part of a parameter in your URL which is incorrect and confused requests.
Leave that off, and pass a dictionary for params, just like you're already doing with headers:
y = 'JGB,BCHUSD,AT20'
params = {
'marketIDs': y,
}
url = 'https://demo-api.ig.com/gateway/deal/clientsentiment'
response = requests.get(url, params=params, headers=headers)

Scrapy - Simulating AJAX requests with headers and request payload

https://www.kralilan.com/liste/kiralik-bina
This is the website I am trying to scrape. When you open the website, the listings are generated with an ajax request. The same request keeps populating page whenever you scroll down. This is how they implemented infinite scrolling...
I found out this is the request sent to the server when I scroll down and I tried to simulate the same request with headers and request payload. This is my spider.
class MySpider(scrapy.Spider):
name = 'kralilanspider'
allowed_domains = ['kralilan.com']
start_urls = [
'https://www.kralilan.com/liste/satilik-bina'
]
def parse(self, response):
headers = {'Referer': 'https://www.kralilan.com/liste/kiralik-bina',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0',
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Language': 'en-US,en;q=0.5',
'Accept-Encoding': 'gzip, deflate, br',
#'Content-Type': 'application/json; charset=utf-8',
#'X-Requested-With': 'XMLHttpRequest',
#'Content-Length': 246,
#'Connection': 'keep-alive',
}
yield scrapy.Request(
url='https://www.kralilan.com/services/ki_operation.asmx/getFilter',
method='POST',
headers=headers,
callback=self.parse_ajax
)
def parse_ajax(self, response):
yield {'data': response.text}
If I uncomment the commented headers, request fails with status code 400 or 500.
I tried to send request payload as a body in the parse method. That didn't work either.
If I try to yield response.body, I get TypeError: Object of type bytes is not JSON serializable.
What am I missing here?
The following implementation will fetch you the response you would like to grab. You missed the most important part data to pass as a parameter in your post requests.
import json
import scrapy
class MySpider(scrapy.Spider):
name = 'kralilanspider'
data = {'incomestr':'["Bina","1",-1,-1,-1,-1,-1,5]', 'intextstr':'{"isCoordinates":false,"ListDrop":[],"ListText":[{"id":"78","Min":"","Max":""},{"id":"107","Min":"","Max":""}],"FiyatData":{"Max":"","Min":""}}', 'index':0 , 'count':'10' , 'opt':'1' , 'type':'3'}
def start_requests(self):
yield scrapy.Request(
url='https://www.kralilan.com/services/ki_operation.asmx/getFilter',
method='POST',
body=json.dumps(self.data),
headers={"content-type": "application/json"}
)
def parse(self, response):
items = json.loads(response.text)['d']
yield {"data":items}
In case you wanna parse data from multiple pages (new page index is recorded when you scroll downward), the following will do the trick. The pagination is within index key in your data.
import json
import scrapy
class MySpider(scrapy.Spider):
name = 'kralilanspider'
data = {'incomestr':'["Bina","1",-1,-1,-1,-1,-1,5]', 'intextstr':'{"isCoordinates":false,"ListDrop":[],"ListText":[{"id":"78","Min":"","Max":""},{"id":"107","Min":"","Max":""}],"FiyatData":{"Max":"","Min":""}}', 'index':0 , 'count':'10' , 'opt':'1' , 'type':'3'}
headers = {"content-type": "application/json"}
url = 'https://www.kralilan.com/services/ki_operation.asmx/getFilter'
def start_requests(self):
yield scrapy.Request(
url=self.url,
method='POST',
body=json.dumps(self.data),
headers=self.headers,
meta={'index': 0}
)
def parse(self, response):
items = json.loads(response.text)['d']
res = scrapy.Selector(text=items)
for item in res.css(".list-r-b-div"):
title = item.css(".add-title strong::text").get()
price = item.css(".item-price::text").get()
yield {"title":title,"price":price}
page = response.meta['index'] + 1
self.data['index'] = page
yield scrapy.Request(self.url, headers=self.headers, method='POST', body=json.dumps(self.data), meta={'index': page})
Why do you ignore POST body? You need to submit it too:
def parse(self, response):
headers = {'Referer': 'https://www.kralilan.com/liste/kiralik-bina',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0',
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Language': 'en-US,en;q=0.5',
'Accept-Encoding': 'gzip, deflate, br',
'Content-Type': 'application/json; charset=utf-8',
'X-Requested-With': 'XMLHttpRequest',
#'Content-Length': 246,
#'Connection': 'keep-alive',
}
payload = """
{ incomestr:'["Bina","2",-1,-1,-1,-1,-1,5]', intextstr:'{"isCoordinates":false,"ListDrop":[],"ListText":[{"id":"78","Min":"","Max":""},{"id":"107","Min":"","Max":""}],"FiyatData":{"Max":"","Min":""}}', index:'0' , count:'10' , opt:'1' , type:'3'}
"""
yield scrapy.Request(
url='https://www.kralilan.com/services/ki_operation.asmx/getFilter',
method='POST',
body=payload,
headers=headers,
callback=self.parse_ajax
)

Reading data from a website passing parameters

import requests
from lxml import html
from bs4 import BeautifulSoup
session_requests = requests.session()
sw_url = "https://www.southwest.com"
sw_url2 = "https://www.southwest.com/flight/select-flight.html?displayOnly=&int=HOMEQBOMAIR"
#result = session_requests.get(sw_url)
#tree = html.fromstring(result.text)
payload = {"name":"AirFormModel","origin":"MCI","destination":"DAL","departDate":"2018-02-28T06:00:00.000Z","returnDate":"2018-03-03T06:00:00.000Z","tripType":"true","priceType":"DOLLARS","adult":1,"senior":0,"promoCode":""}
#{
# 'origin': 'MCI',
# 'destination': 'DAL',
# 'departDate':'2018-02-28T06:00:00.000Z',
# 'returnDate':'2018-03-01T06:00:00.000Z',
# 'adult':'1'
#}
p = requests.post(sw_url,params=payload)
#print(p.text)
print(p.content)
p1 = requests.get(sw_url2)
soup = BeautifulSoup(p.text,'html.parser')
print(soup.find("div",{"class":"productPricing"}))
pr = soup.find_all("span",{"class":"currency_symbol"})
for tag in pr:
print(tag)
print('++++')
print(tag.next_sibling)
print(soup.find("div",{"class":"twoSegments"}))
soup = BeautifulSoup(p1.text,'html.parser')
print(soup.find("div",{"class":"productPricing"}))
pr = soup.find_all("span",{"class":"currency_symbol"})
for tag in pr:
print(tag)
print('++++')
print(tag.next_sibling)
print(soup.find("div",{"class":"twoSegments"}))
I need to retrieve prices for flights between 2 locations on specific dates. I identified the parameters by looking at the session info from inspector of the browser and included them in the post request.
I am not sure what I'm doing wrong here, but I am unable to read the data from the tags correctly. It's printing none.
Edit : 4/25/2018
I'm using the following code now, but it doesn't seem to help. Please advise.
import threading
from lxml import html
from bs4 import BeautifulSoup
import time
import datetime
import requests
def worker(oa,da,ods):
"""thread worker function"""
print (oa + ' ' + da + ' ' + ods + ' ' + str(datetime.datetime.now()))
url = "https://www.southwest.com/api/air-booking/v1/air-booking/page/air/booking/shopping"
rh = {
'accept': 'application/json,text/javascript,*/*;q=0.01',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'en-US,en;q=0.5',
'cache-control': 'max-age=0',
'content-length': '454',
'content-type': 'application/json',
'referer': 'https://www.southwest.com/air/booking/select.html?originationAirportCode=MCI&destinationAirportCode=LAS&returnAirportCode=&departureDate=2018-05-29&departureTimeOfDay=ALL_DAY&returnDate=&returnTimeOfDay=ALL_DAY&adultPassengersCount=1&seniorPassengersCount=0&fareType=USD&passengerType=ADULT&tripType=oneway&promoCode=&reset=true&redirectToVision=true&int=HOMEQBOMAIR&leapfrogRequest=true',
'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
}
fd = {
'returnAirport':'',
'twoWayTrip':'false',
'fareType':'DOLLARS',
'originAirport':oa,
'destinationAirport':da,
'outboundDateString':ods,
'returnDateString':'',
'adultPassengerCount':'1',
'seniorPassengerCount':'0',
'promoCode':'',
'submitButton':'true'
}
with requests.Session() as s:
r = s.post(url,headers = rh )
# soup = BeautifulSoup(r.content,'html.parser')
# soup = BeautifulSoup(r.content,'lxml')
print(r)
print(r.content)
print (oa + ' ' + da + ' ' + ods + ' ' + str(datetime.datetime.now()))
return
#db = MySQLdb.connect(host="localhost",user="root",passwd="vikram",db="garmin")
rcount = 0
tdelta = 55
#print(strt_date)
threads = []
count = 1
thr_max = 2
r = ["MCI","DEN","MCI","MDW","MCI","DAL"]
strt_date = (datetime.date.today() + datetime.timedelta(days=tdelta)).strftime("%m/%d/%Y")
while count < 2:
t = threading.Thread(name=r[count-1]+r[count],target=worker,args=(r[count-1],r[count],strt_date))
threads.append(t)
t.start()
count = count + 2
When you say looked at the session info from inspector of the browser, I'm assuming you meant the network tab. If that's the case, are you sure you noted the data being sent properly?
Here's the URL that gets sent by the browser, following which the page you required is fetched:
url = 'https://www.southwest.com/flight/search-flight.html'
You didn't use headers in your request, which, in my opinion, should be passed compulsorily in some cases. Here are the headers that the browser passes:
:authority:www.southwest.com
:method:POST
:path:/flight/search-flight.html
:scheme:https
accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
accept-encoding:gzip, deflate, br
accept-language:en-US,en;q=0.9
cache-control:max-age=0
content-length:564
content-type:application/x-www-form-urlencoded
origin:https://www.southwest.com
referer:https://www.southwest.com/flight/search-flight.html?int=HOMEQBOMAIR
upgrade-insecure-requests:1
user-agent:Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36
Note:
I removed the cookie header, because that would be taken care of by requests if you're using session.
The first four headers (those that begin with a colon (':')) cannot be passed in Python's requests; so, I skipped them.
Here's the dict that I used to pass the headers:
rh = {
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'en-US,en;q=0.9',
'cache-control': 'max-age=0',
'content-length': '564',
'content-type': 'application/x-www-form-urlencoded',
'origin': 'https://www.southwest.com',
'referer': 'https://www.southwest.com/flight/search-flight.html?int=HOMEQBOMAIR',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'
}
And here is the form data sent by browser:
fd = {
'toggle_selfltnew': '',
'toggle_AggressiveDrawers': '',
'transitionalAwardSelected': 'false',
'twoWayTrip': 'true',
'originAirport': 'MCI',
# 'originAirport_displayed': 'Kansas City, MO - MCI',
'destinationAirport': 'DAL',
# 'destinationAirport_displayed': 'Dallas (Love Field), TX - DAL',
'airTranRedirect': '',
'returnAirport': 'RoundTrip',
'returnAirport_displayed': '',
'outboundDateString': '02/28/2018',
'outboundTimeOfDay': 'ANYTIME',
'returnDateString': '03/01/2018',
'returnTimeOfDay': 'ANYTIME',
'adultPassengerCount': '1',
'seniorPassengerCount': '0',
'promoCode': '',
'fareType': 'DOLLARS',
'awardCertificateToggleSelected': 'false',
'awardCertificateProductId': ''
}
Note that I commented out two of the items above, but it didn't make any difference. I assumed you'd be having only the location codes and not the full name. If you do have them or if you can extract them from the page, you can send those as well along with other data.
I don't know if it makes any difference, but I used data instead of params:
with requests.Session() as s:
r = s.post(url, headers = rh, data = fd)
soup = BeautifulSoup(r.content, 'lxml')
Finally, here is the result:
>>> soup.find('span', {'class': 'currency_symbol'}).text
'$'

Categories

Resources