Ebay scraper stops after 5 pages - python

I am a total noob using scrapy for the first time. I have set it up to get some information, but it always stops after 5 pages. I want it to scrape a lot more pages since at least 20 are available.
import scrapy

from myproject.items import EbaySold


class EbaySpider(scrapy.Spider):
    """Scrape sold Apple smartphone listings from eBay, following pagination."""

    name = 'EbaySold'
    allowed_domains = ['www.ebay.com']
    start_urls = ['https://www.ebay.com/b/Apple-Unlocked-Smartphones/9355/bn_599372?LH_Sold=1&mag=1&rt=nc&_dmd=1&_pgn=1&_sop=13']

    def parse(self, response):
        """Yield one EbaySold item per listing, then follow the next page link."""
        for product in response.css('li.s-item'):
            # Bug fix: create a fresh item for each listing. The original
            # built a single EbaySold() before the loop, so every yielded
            # item shared (and overwrote) the same underlying data.
            product_item = EbaySold()
            product_item['name'] = product.css('h3.s-item__title::text').get()
            if product_item['name'] is None:
                product_item['name'] = product.css('span.BOLD::text').get()
            product_item['sold_price'] = product.css('span.POSITIVE::text').get()
            date_sold = product.css('div.s-item__title-tag::text').get()
            # Bug fix: guard against listings without a "SOLD <date>" tag;
            # calling .replace() on None raised AttributeError and aborted
            # the whole page's parse.
            product_item['date_sold'] = (
                date_sold.replace('SOLD ', '') if date_sold is not None else None
            )
            yield product_item

        # Bug fix: .attrib['href'] raises KeyError on the last page, which
        # silently killed pagination and made the None-check below dead
        # code; .attrib.get('href') returns None so the check works.
        next_page = response.css('a[type=next]').attrib.get('href')
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)

In your scrapy project settings.py file. Make sure you have the following settings configured.
# Don't fetch/obey robots.txt -- eBay's robots.txt disallows these listing pages.
ROBOTSTXT_OBEY = False
# Keep cookies so eBay treats the crawl as a single browsing session.
COOKIES_ENABLED = True
# Browser-like default headers; per the answer, this (with the settings
# above) is what stops eBay from cutting the crawl off after a few pages.
DEFAULT_REQUEST_HEADERS = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-language': 'en-US,en;q=0.9',
    'accept-encoding': 'gzip, deflate, br',
    'sec-fetch-site': 'same-origin',
    'upgrade-insecure-requests': 1,
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
}
CONCURRENT_REQUESTS = 2 # small number: keeps the request rate low to avoid blocking
Then try running the spider again.

Related

Can someone tell me why this spider method won't run

I am learning scrapy and am trying to scrape this realtor site in Quebec. I am using their API to collect homes and print the URLs to the screen. But my last function print_urls() won't run. I am really stuck here. I tried debugging it, and it just skips right over my whole function block.
class CentrishomesSpider(scrapy.Spider):
    """POST to centris.ca's JSON API: first update the saved search query,
    then fetch the inscription (listing) HTML and pull property URLs from it.

    NOTE(review): this snippet is incomplete as posted -- the `...` lines
    and the elided query payload are placeholders, so it cannot run as-is.
    Indentation below is reconstructed; the original was flattened.
    """
    name = 'centrisHomes'
    # allowed_domains = ['www.centris.ca']
    # start_urls = ['http://www.centris.ca/']
    def start_requests(self):
        # NOTE(review): the real query payload was elided in the post;
        # `{...}` here is literally a set containing Ellipsis.
        query = {...
        }
        yield scrapy.Request(
            url='https://www.centris.ca/property/UpdateQuery',
            method='POST',
            body=json.dumps(query),
            headers={
                'Content-Type': 'application/json',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
            },
            callback=self.get_inscriptions
        )
        ...
    def get_inscriptions(self, response):
        # Confirm the query update succeeded, then request the listings.
        resp, success = self.success(response)
        if success == True:
            print(Fore.GREEN + 'Query Updated' + Style.RESET_ALL)
        else:
            print(Fore.RED + 'Query Not Updated' + Style.RESET_ALL)
        yield scrapy.Request(
            url='https://www.centris.ca/Property/GetInscriptions',
            method='POST',
            body=json.dumps({"startPosition": 0}),
            headers={
                'Content-Type': 'application/json',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
            },
            callback=self.handle_inscriptions
        )
    def handle_inscriptions(self, response):
        homes, success = self.success(response)
        if success == True:
            print(Fore.GREEN + 'Count ' + str(homes['d']['Result']['count']) + Style.RESET_ALL)
            # self.test()
            self.html = Selector(text=homes['d']['Result']['html'])
            # NOTE(review): print_urls() is a generator (it contains yield),
            # so calling it like this runs nothing -- the asker's own fix
            # below is to `yield from` / iterate it instead.
            self.print_urls()
            # print(response.body)
            ...
    def success(self, response):
        # NOTE(review): fragile JSON handling -- literal_eval on a string
        # with ':true}' patched to ':True}' (json.loads would be safer).
        my_dict = literal_eval(response.body.decode(
            'utf-8').replace(':true}', ':True}'))
        if my_dict['d']['Succeeded'] == True:
            return my_dict, True
        else:
            # NOTE(review): returns a single False here, but both callers
            # unpack two values -- the failure path raises
            # "cannot unpack non-iterable bool" instead of being handled.
            return False
    def print_urls(self):
        print('try')
        # page_html = Selector(resp['d']['Result']['html'])
        page_html = self.html
        # NOTE(review): '#class' / '#href' are invalid XPath -- almost
        # certainly '@class' / '@href', mangled by the site's markdown.
        homes = page_html.xpath('//div[contains(#class, "property-thumbnail-item")]')
        for home in homes:
            yield{
                'home_url':home.xpath('.//a[#class="property-thumbnail-summary-link"]/#href').get()
            }
        ...
I figured out my own problem: it was because I turned my print_urls function into a generator, and calling self.print_urls() doesn't make my generator do anything. S/o @AbdealiJK — I figured it out because of his answer.
https://stackoverflow.com/a/34609397/19966841

scrapy_splash not rendering for list of urls

I created a spider with scrapy_splash,
I hardcoded 3 urls in start_requests.
When I run with any one url it is working fine for all the urls.
when I put all the urls in a list and run one by one, it is not working, and splash not returning complete rendered html in response.body.
kindly help.
code:
import re
import time
import json
import scrapy
import w3lib
from scrapy_splash import SplashRequest
class SpeSpider(scrapy.Spider):
    """Render Aviva Investors fund pages through Splash and extract the
    Morningstar component id of each page's "Rapport annuel" (annual
    report) row.
    """

    name = 'spe'
    # allowed_domains = ['s']
    # start_urls = ['http://s/']

    # Lua script with a short (2 s) wait. Private mode is disabled because
    # some sites will not render under Splash's private browsing.
    without_wait_script = """
    function main(splash, args)
        splash.private_mode_enabled = false
        assert(splash:go(args.url))
        assert(splash:wait(2))
        return {
            html = splash:html(),
        }
    end
    """

    # Lua script with a longer (10 s) wait for slow, JS-heavy pages.
    wait_script = """
    function main(splash, args)
        assert(splash:go(args.url))
        assert(splash:wait(10))
        return {
            html = splash:html(),
        }
    end
    """

    # Browser-like headers forwarded by Splash on the outgoing request.
    splash_headers = {
        'authority': 'www.avivainvestors.com',
        'sec-ch-ua': '"Google Chrome";v="95", "Chromium";v="95", ";Not A Brand";v="99"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '?1',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.avivainvestors.com/fr-fr/nos-expertises/nos-fonds/',
        'accept-language': 'en-US,en;q=0.9,lb;q=0.8',
    }

    def start_requests(self):
        """Queue one Splash 'execute' request per fund page."""
        url1 = "https://www.avivainvestors.com/fr-fr/nos-expertises/equities/uk-listed-equity-high-alpha-fund/lu0160960752-gbp/"
        url2 = "https://www.avivainvestors.com/fr-fr/nos-expertises/equities/japon-isr/fr0013340841-eur/"
        url3 = "https://www.avivainvestors.com/fr-fr/nos-expertises/fixed-income/emerging-markets-corporate-bond-fund/lu1550133976-usd/"
        for url in (url1, url2, url3):
            # Bug fix: the original called time.sleep(10) here, which
            # blocks Twisted's reactor (i.e. all of scrapy, including any
            # in-flight Splash renders) instead of spacing out requests.
            # Throttle with the DOWNLOAD_DELAY setting instead.
            yield SplashRequest(
                url=url,
                endpoint="execute",
                callback=self.scrape_document_id,
                args={"lua_source": self.wait_script},
                splash_headers=self.splash_headers,
            )

    def scrape_document_id(self, response):
        """Yield {url, id} for the annual-report button on the page."""
        # Bug fix: XPath attribute tests use '@'; the '#' in the original
        # post is invalid XPath syntax (markdown mangling of '@') and
        # would raise a ValueError on every response.
        value = response.xpath('//div[@class="ec-table__cell-content ng-binding ng-scope" and text() = "Rapport annuel"]/../..//td/ec-button/@mstar-component-id').get()
        print("VALUE", value)
        if value is None:
            # Incomplete Splash render (the reported symptom): the table is
            # absent. Skip this page rather than crash on re.search(None).
            self.logger.warning("no mstar-component-id found on %s", response.url)
            return
        match = re.search(r"\[([^]]+)\]", value)
        if match is None:
            return
        yield {
            "url": response.url,
            "id": match.group().strip("[]"),
        }
This is because you are using a yield statement which is a generator.
My guess is that you are just doing this,
x = SpeSpider()
x.start_requests()
which only creates a generator from your yield statement.
Try this,
x = SpeSpider()
list(x.start_requests())
It will run your function and produce a list, though I am not sure if this is the behaviour you want, because I don't see any code showing how you instantiate the class objects or what the results should look like.

Multiple requests with Scrapy

I am trying to scrape the following URLs, however, I need to create two different requests, one for properties for sale and one for rent, as the URL differs.
When I run the code I have I am only able to parse the properties that are for sale ('propbay') and not for rent ('rentbay'). And I am not sure what I am doing wrong with the second request.
Does anyone have any suggestions? Here is my code:
import scrapy
import re
import requests
class ProbRentBaySpider(scrapy.Spider):
    """Scrape Western Cape property listings from propbay (for sale) and
    rentbay (to rent) via their ListByCityAjax JSON endpoints.

    NOTE(review): indentation reconstructed -- the original was flattened.
    The final `...` marks where the post truncated parse_property.
    """
    name = 'prob_rent_bay'
    start_urls = [
        'https://www.propbay.co.za/for-sale-property/residential/za/western_cape/7',
        'https://www.rentbay.co.za/to-rent-property/residential/za/western_cape/7'
    ]
    # Browser-captured headers for propbay. The hard-coded session cookie
    # will eventually expire -- presumably requests fail once it does.
    headers = {
        'authority': 'www.propbay.co.za',
        'pragma': 'no-cache',
        'cache-control': 'no-cache',
        'sec-ch-ua': '"Chromium";v="86", "\\"Not\\\\A;Brand";v="99", "Google Chrome";v="86"',
        'accept': '*/*',
        'x-requested-with': 'XMLHttpRequest',
        'sec-ch-ua-mobile': '?1',
        'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Mobile Safari/537.36',
        'content-type': 'application/json',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'cors',
        'sec-fetch-dest': 'empty',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
        'cookie': '_ga=GA1.3.1758254184.1598490742; ASP.NET_SessionId=v1muaskigrgnn40m42lsqzct; __RequestVerificationToken=AIEv13vh8ksXZeG6Tf_o-vLCscKt7sYKJjwB0kz0CfqmCe8ZpYRQQdGk2BnN095p2A6wlFf7o_lVYyxe1Jro-I5vHE01; _gid=GA1.3.900892753.1605696808',
    }
    # Same, for rentbay.
    headers2 = {
        'authority': 'www.rentbay.co.za',
        'pragma': 'no-cache',
        'cache-control': 'no-cache',
        'sec-ch-ua': '"Chromium";v="86", "\\"Not\\\\A;Brand";v="99", "Google Chrome";v="86"',
        'accept': '*/*',
        'x-requested-with': 'XMLHttpRequest',
        'sec-ch-ua-mobile': '?1',
        'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Mobile Safari/537.36',
        'content-type': 'application/json',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'cors',
        'sec-fetch-dest': 'empty',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
        'cookie': 'ASP.NET_SessionId=rexgmrrgju10aw4rirulzrmk; _ga=GA1.3.225479946.1606814269; __RequestVerificationToken=az6ZATA2H0dJfBQ6KuwDwz39XGSiSuIjc4iZwRT8BGSD2surYfA6iOmkIQk2p835G51hYqJd5FFoiSQYsvx-V3Ndx6s1; _gid=GA1.3.1154633027.1607081144; _gat_gtag_UA_7413963_2=1',
    }
    # AJAX endpoints (two cities each); the page number is appended.
    base_url_sale = ['https://www.propbay.co.za/Property/ListByCityAjax?city=Cape%20Town&province=western%20cape&cityCode=199041&category=3&page=',
        'https://www.propbay.co.za/Property/ListByCityAjax?city=Milnerton&province=western%20cape&cityCode=199014&category=3&page=' ]
    base_url_rent = ['https://www.rentbay.co.za/Property/ListByCityAjax?city=Cape%20Town&province=western%20cape&cityCode=199041&category=3&page=',
        'https://www.rentbay.co.za/Property/ListByCityAjax?city=Milnerton&province=western%20cape&cityCode=199014&category=3&page=' ]
    def parse(self,response):
        # NOTE(review): parse() runs once per start_urls response (twice),
        # so both loops below execute twice; the duplicate scrapy.Request
        # URLs on the second run are dropped by scrapy's dupefilter (see
        # the answer's dont_filter suggestion). Also, blocking
        # requests.get() inside a scrapy callback stalls the reactor --
        # yielding scrapy.Requests would be the idiomatic form. Note that
        # this rebinds `response`, shadowing the scrapy response argument.
        for page in range(2, 8):# specify page range you would like to scrape data for
            for link in self.base_url_sale:
                next_page = link + str(page)
                response = requests.get(url=next_page, headers=self.headers)
                for product in response.json()['Data']['Suburbs']:
                    area_url = 'https://www.propbay.co.za'+ product['FriendlyUrl']
                    yield scrapy.Request(area_url,callback=self.parse_sale)
        for page2 in range(2, 8):# specify page range you would like to scrape data for
            for link2 in self.base_url_rent:
                next_page2 = link2 + str(page2)
                response2 = requests.get(url=next_page2, headers=self.headers2)
                for product2 in response2.json()['Data']['Suburbs']:
                    area_url_2 = 'https://www.rentbay.co.za'+ product2['FriendlyUrl']
                    yield scrapy.Request(area_url_2,callback=self.parse_rent)
    def parse_sale(self, response):
        # NOTE(review): '#class' / '#href' below are invalid XPath --
        # almost certainly '@class' / '@href', mangled by markdown.
        # follow links to property pages
        for href in response.xpath('//a[#class="u-text-uppercase"]/#href').getall():
            follow_link = 'https://www.propbay.co.za'+ href
            yield response.follow(follow_link, self.parse_property)
        # follow pagination links
        for href in response.xpath('//*[#id="btnNext"]/#href'):
            yield response.follow(href, self.parse_sale)
    def parse_rent(self, response):
        # Same as parse_sale but against the rentbay domain.
        # follow links to property pages
        for href in response.xpath('//a[#class="u-text-uppercase"]/#href').getall():
            follow_link = 'https://www.rentbay.co.za'+ href
            yield response.follow(follow_link, self.parse_property)
        # follow pagination links
        for href in response.xpath('//*[#id="btnNext"]/#href').getall():
            yield response.follow(href, self.parse_rent)
    def parse_property(self, response):
        # Extract listing fields; bedrooms text looks like "3 Bedrooms",
        # so take the leading count when present.
        title = response.css('span.u-text-capitalize::text').get()
        bedrooms = response.xpath('//span[contains(text(), "Bedrooms")]/following-sibling::span/text()').get()
        bedrooms = bedrooms.split()[0] if bedrooms is not None else None
        ...
Edited code:
I have tried to make separate parsing functions, however, I am only getting rental properties, not sure how to get also the properties for sale.
def parse(self,response):
    # Edited attempt: queue for-sale suburb pages from propbay's AJAX
    # endpoint. NOTE(review): the callback chains into parse_rent, so
    # these *sale* URLs are handed to the rent pipeline -- likely why the
    # asker only ever sees rental properties. Blocking requests.get()
    # inside a scrapy callback also stalls the reactor.
    for page in range(2, 8):# specify page range you would like to scrape data for
        for link in self.base_url_sale:
            next_page = link + str(page)
            response = requests.get(url=next_page, headers=self.headers)
            for product in response.json()['Data']['Suburbs']:
                area_url = 'https://www.propbay.co.za'+ product['FriendlyUrl']
                yield scrapy.Request(area_url,callback=self.parse_rent)
def parse_rent(self, response):
    # Queue to-rent suburb pages from rentbay's AJAX endpoint.
    for page2 in range(2, 8):# specify page range you would like to scrape data for
        for link2 in self.base_url_rent:
            next_page2 = link2 + str(page2)
            response = requests.get(url=next_page2, headers=self.headers2)
            for product2 in response.json()['Data']['Suburbs']:
                # NOTE(review): `item` is created but never used.
                item = dict()
                area_url_2 = 'https://www.rentbay.co.za'+ product2['FriendlyUrl']
                yield scrapy.Request(area_url_2,callback=self.parse_all)
def parse_all(self, response):
    # NOTE(review): '#class'/'#id'/'#href' are mangled '@...' XPath, and
    # the propbay prefix is applied even to rentbay-relative links.
    # follow links to property pages
    for href in response.xpath('//a[#class="u-text-uppercase"]/#href').getall():
        follow_link = 'https://www.propbay.co.za'+ href
        yield response.follow(follow_link, self.parse_property)
    # follow pagination links
    for href in response.xpath('//*[#id="btnNext"]/#href'):
        yield response.follow(href, self.parse_all)
First things first, you need to override start_requests and specify different callbacks for each URL, and then split your parse() logic into those two methods. Failing that, you can at least check the response's URL before looping over either the propbay or the rentbay part. As it stands, the responses of both URLs are treated the same way in your parse(). So it may be that the first time around the request is incorrect (your response is for propbay), and the second time, when it would be correct, the duplicate request is dropped by the dupefilter.
For an instant fix you can try adding dont_filter=True to the requests you yield in your parse method.

Why getting empty list while try to get links which includes specific class in python using bs4?

I'm trying to get some links which include a specific class, so I wrote this code:
from bs4 import BeautifulSoup
import requests
def getPages(requestedURLS):
    """Fetch each URL and return the list of successful responses.

    Requests that raise a RequestException are reported to stdout and
    skipped; their URLs contribute nothing to the returned list.
    """
    responses = []
    for target in requestedURLS:
        try:
            resp = requests.get(target)
        except requests.exceptions.RequestException as err:
            print(err)
        else:
            responses.append(resp)
    return responses
def getProductList(pages):
    """Collect the result anchors carrying eBay's list-item class from each
    fetched page and return them as one flat list.

    pages -- iterable of requests.Response objects (from getPages).
    """
    links = []
    for page in pages:
        content = BeautifulSoup(page.content, 'html.parser')
        links.extend(content.find_all("a", class_="sresult lvresult clearfix li shic"))
    print(links)
    # Bug fix: return the collected links. The original fell off the end
    # and implicitly returned None, so main()'s `productList` assignment
    # could never be used.
    return links
def main():
    """Fetch three result pages of one eBay seller and extract item links."""
    # Three pages of the same seller's listings (pagination via _pgn/_skc).
    pageLinks = [
        "https://www.ebay.co.uk/sch/m.html?_nkw=&_armrs=1&_from=&_ssn=carabaeuro13&_clu=2&_fcid=3&_localstpos=&_stpos=&gbr=1&_pppn=r1&scp=ce0",
        "https://www.ebay.co.uk/sch/m.html?_nkw=&_armrs=1&_from=&_ssn=carabaeuro13&_clu=2&_fcid=3&_localstpos=&_stpos=&gbr=1&_pgn=2&_skc=200&rt=nc",
        "https://www.ebay.co.uk/sch/m.html?_nkw=&_armrs=1&_from=&_ssn=carabaeuro13&_clu=2&_fcid=3&_localstpos=&_stpos=&gbr=1&_pgn=3&_skc=400&rt=nc"
    ]
    #Results in : <div id="Results" class="results "> <ul id="ListViewInner">
    pages = getPages(pageLinks)
    # productList is only meaningful if getProductList returns its links.
    productList = getProductList(pages)

if __name__ == '__main__':
    main()
You can check the links — there are lots of links which include this class — but the output is empty, as you can see below:
C:\Users\projects\getMarketData\venv\Scripts\python.exe C:/Users/projects/getMarketData/getData.py
[]
[]
[]
Process finished with exit code 0
What is wrong?
Using appropriate headers should do the trick:
# Answer snippet: the same listing page fetched with browser-like headers;
# per the answer, without them eBay serves a variant of the page that does
# not contain the 'sresult ...' list items (hence the empty results).
url = 'https://www.ebay.co.uk/sch/m.html?_nkw=&_armrs=1&_from=&_ssn=carabaeuro13&_clu=2&_fcid=3&_localstpos=&_stpos=&gbr=1&_pppn=r1&scp=ce0'
request_headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Host': 'www.ebay.co.uk',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
}
r = requests.get(url, headers=request_headers)
soup = BeautifulSoup(r.content, 'lxml')
# The class list lives on the <li> elements, not on <a> tags as the
# question's code assumed.
items = soup.find_all('li', {'class': 'sresult lvresult clearfix li shic'})
Result:
print(len(items)) # 18
print(items[0]['listingid']) # 333200565336
Use the network tab of the developer tools to inspect the traffic. Check this out if you've never done this before.

Reading data from a website passing parameters

import requests
from lxml import html
from bs4 import BeautifulSoup

session_requests = requests.session()
sw_url = "https://www.southwest.com"
sw_url2 = "https://www.southwest.com/flight/select-flight.html?displayOnly=&int=HOMEQBOMAIR"
#result = session_requests.get(sw_url)
#tree = html.fromstring(result.text)
# Flight-search parameters captured from the browser's session info.
payload = {"name":"AirFormModel","origin":"MCI","destination":"DAL","departDate":"2018-02-28T06:00:00.000Z","returnDate":"2018-03-03T06:00:00.000Z","tripType":"true","priceType":"DOLLARS","adult":1,"senior":0,"promoCode":""}
#{
#    'origin': 'MCI',
#    'destination': 'DAL',
#    'departDate':'2018-02-28T06:00:00.000Z',
#    'returnDate':'2018-03-01T06:00:00.000Z',
#    'adult':'1'
#}
# NOTE(review): params= puts the payload in the query string of the home
# page URL; per the answer below, the search form expects a POSTed body
# (data=) to /flight/search-flight.html with headers -- which is why the
# soup.find(...) calls below all print None.
p = requests.post(sw_url,params=payload)
#print(p.text)
print(p.content)
p1 = requests.get(sw_url2)
# Parse the (unsuccessful) POST response and look for pricing markup.
soup = BeautifulSoup(p.text,'html.parser')
print(soup.find("div",{"class":"productPricing"}))
pr = soup.find_all("span",{"class":"currency_symbol"})
for tag in pr:
    print(tag)
    print('++++')
    print(tag.next_sibling)
print(soup.find("div",{"class":"twoSegments"}))
# Same extraction attempted on the GET'd select-flight page.
soup = BeautifulSoup(p1.text,'html.parser')
print(soup.find("div",{"class":"productPricing"}))
pr = soup.find_all("span",{"class":"currency_symbol"})
for tag in pr:
    print(tag)
    print('++++')
    print(tag.next_sibling)
print(soup.find("div",{"class":"twoSegments"}))
I need to retrieve prices for flights between 2 locations on specific dates. I identified the parameters by looking at the session info from inspector of the browser and included them in the post request.
I am not sure what I'm doing wrong here, but I am unable to read the data from the tags correctly. It's printing none.
Edit : 4/25/2018
I'm using the following code now, but it doesn't seem to help. Please advise.
import threading
from lxml import html
from bs4 import BeautifulSoup
import time
import datetime
import requests
def worker(oa, da, ods):
    """Thread worker: POST a one-way fare search to Southwest's shopping API.

    oa  -- origin airport code (e.g. 'MCI')
    da  -- destination airport code
    ods -- outbound date string (mm/dd/YYYY)

    Prints the raw API response; start/end lines are timestamped so
    concurrent workers can be told apart in the output.
    """
    print(oa + ' ' + da + ' ' + ods + ' ' + str(datetime.datetime.now()))
    url = "https://www.southwest.com/api/air-booking/v1/air-booking/page/air/booking/shopping"
    rh = {
        'accept': 'application/json,text/javascript,*/*;q=0.01',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.5',
        'cache-control': 'max-age=0',
        # Bug fix: dropped the hard-coded 'content-length': '454' --
        # requests computes the length from the actual body, and a stale
        # fixed value can corrupt or invalidate the request.
        'content-type': 'application/json',
        'referer': 'https://www.southwest.com/air/booking/select.html?originationAirportCode=MCI&destinationAirportCode=LAS&returnAirportCode=&departureDate=2018-05-29&departureTimeOfDay=ALL_DAY&returnDate=&returnTimeOfDay=ALL_DAY&adultPassengersCount=1&seniorPassengersCount=0&fareType=USD&passengerType=ADULT&tripType=oneway&promoCode=&reset=true&redirectToVision=true&int=HOMEQBOMAIR&leapfrogRequest=true',
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
    }
    fd = {
        'returnAirport': '',
        'twoWayTrip': 'false',
        'fareType': 'DOLLARS',
        'originAirport': oa,
        'destinationAirport': da,
        'outboundDateString': ods,
        'returnDateString': '',
        'adultPassengerCount': '1',
        'seniorPassengerCount': '0',
        'promoCode': '',
        'submitButton': 'true'
    }
    with requests.Session() as s:
        # Bug fix: the original called s.post(url, headers=rh) and never
        # sent the payload (fd) at all, so the API had nothing to search
        # on. The endpoint takes a JSON body (content-type above); json=
        # serializes fd and sets the length itself.
        r = s.post(url, headers=rh, json=fd)
        print(r)
        print(r.content)
    print(oa + ' ' + da + ' ' + ods + ' ' + str(datetime.datetime.now()))
    return
#db = MySQLdb.connect(host="localhost",user="root",passwd="vikram",db="garmin")
rcount = 0
# Days ahead of today for the outbound date.
tdelta = 55
#print(strt_date)
threads = []
count = 1
thr_max = 2
# Flat origin/destination pairs: (MCI,DEN), (MCI,MDW), (MCI,DAL).
r = ["MCI","DEN","MCI","MDW","MCI","DAL"]
strt_date = (datetime.date.today() + datetime.timedelta(days=tdelta)).strftime("%m/%d/%Y")
# NOTE(review): count starts at 1 and steps by 2, so `while count < 2`
# runs exactly once -- only the first (MCI, DEN) pair gets a worker.
# Threads are started but never join()ed; the process stays alive until
# the non-daemon threads finish on their own.
while count < 2:
    t = threading.Thread(name=r[count-1]+r[count],target=worker,args=(r[count-1],r[count],strt_date))
    threads.append(t)
    t.start()
    count = count + 2
When you say looked at the session info from inspector of the browser, I'm assuming you meant the network tab. If that's the case, are you sure you noted the data being sent properly?
Here's the URL that gets sent by the browser, following which the page you required is fetched:
url = 'https://www.southwest.com/flight/search-flight.html'
You didn't use headers in your request, which, in my opinion, should be passed compulsorily in some cases. Here are the headers that the browser passes:
:authority:www.southwest.com
:method:POST
:path:/flight/search-flight.html
:scheme:https
accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
accept-encoding:gzip, deflate, br
accept-language:en-US,en;q=0.9
cache-control:max-age=0
content-length:564
content-type:application/x-www-form-urlencoded
origin:https://www.southwest.com
referer:https://www.southwest.com/flight/search-flight.html?int=HOMEQBOMAIR
upgrade-insecure-requests:1
user-agent:Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36
Note:
I removed the cookie header, because that would be taken care of by requests if you're using session.
The first four headers (those that begin with a colon (':')) cannot be passed in Python's requests; so, I skipped them.
Here's the dict that I used to pass the headers:
# Answer snippet: browser headers for the search-flight POST, minus the
# cookie (handled by the requests session) and the ':'-prefixed HTTP/2
# pseudo-headers, which requests cannot send.
rh = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-US,en;q=0.9',
    'cache-control': 'max-age=0',
    'content-length': '564',
    'content-type': 'application/x-www-form-urlencoded',
    'origin': 'https://www.southwest.com',
    'referer': 'https://www.southwest.com/flight/search-flight.html?int=HOMEQBOMAIR',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'
}
And here is the form data sent by browser:
# Answer snippet: the form data the browser sends for a round-trip MCI->DAL
# search; the two '_displayed' entries were commented out without effect.
fd = {
    'toggle_selfltnew': '',
    'toggle_AggressiveDrawers': '',
    'transitionalAwardSelected': 'false',
    'twoWayTrip': 'true',
    'originAirport': 'MCI',
    # 'originAirport_displayed': 'Kansas City, MO - MCI',
    'destinationAirport': 'DAL',
    # 'destinationAirport_displayed': 'Dallas (Love Field), TX - DAL',
    'airTranRedirect': '',
    'returnAirport': 'RoundTrip',
    'returnAirport_displayed': '',
    'outboundDateString': '02/28/2018',
    'outboundTimeOfDay': 'ANYTIME',
    'returnDateString': '03/01/2018',
    'returnTimeOfDay': 'ANYTIME',
    'adultPassengerCount': '1',
    'seniorPassengerCount': '0',
    'promoCode': '',
    'fareType': 'DOLLARS',
    'awardCertificateToggleSelected': 'false',
    'awardCertificateProductId': ''
}
Note that I commented out two of the items above, but it didn't make any difference. I assumed you'd be having only the location codes and not the full name. If you do have them or if you can extract them from the page, you can send those as well along with other data.
I don't know if it makes any difference, but I used data instead of params:
# Send the form payload with data= (form-encoded POST body), not params=
# (query string); the session keeps cookies across the exchange.
with requests.Session() as s:
    r = s.post(url, headers = rh, data = fd)
    soup = BeautifulSoup(r.content, 'lxml')
Finally, here is the result:
>>> soup.find('span', {'class': 'currency_symbol'}).text
'$'

Categories

Resources