scrapy_splash not rendering for list of urls - python

I created a spider with scrapy_splash and hardcoded 3 URLs in start_requests.
When I run it with any single one of the URLs, it works fine for each of them.
But when I put all the URLs in a list and run them one by one, it does not work: Splash does not return the complete rendered HTML in response.body.
Kindly help.
Code:
import re
import time
import json
import scrapy
import w3lib
from scrapy_splash import SplashRequest


class SpeSpider(scrapy.Spider):
    name = 'spe'
    # allowed_domains = ['s']
    # start_urls = ['http://s/']

    without_wait_script = """
    function main(splash, args)
        splash.private_mode_enabled = false
        assert(splash:go(args.url))
        assert(splash:wait(2))
        return {
            html = splash:html(),
        }
    end
    """

    wait_script = """
    function main(splash, args)
        assert(splash:go(args.url))
        assert(splash:wait(10))
        return {
            html = splash:html(),
        }
    end
    """

    splash_headers = {
        'authority': 'www.avivainvestors.com',
        'sec-ch-ua': '"Google Chrome";v="95", "Chromium";v="95", ";Not A Brand";v="99"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '?1',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.avivainvestors.com/fr-fr/nos-expertises/nos-fonds/',
        'accept-language': 'en-US,en;q=0.9,lb;q=0.8',
    }

    def start_requests(self):
        url1 = "https://www.avivainvestors.com/fr-fr/nos-expertises/equities/uk-listed-equity-high-alpha-fund/lu0160960752-gbp/"
        url2 = "https://www.avivainvestors.com/fr-fr/nos-expertises/equities/japon-isr/fr0013340841-eur/"
        url3 = "https://www.avivainvestors.com/fr-fr/nos-expertises/fixed-income/emerging-markets-corporate-bond-fund/lu1550133976-usd/"
        urls = [url1, url2, url3]
        for url in urls:
            time.sleep(10)
            yield SplashRequest(
                url=url,
                endpoint="execute",
                callback=self.scrape_document_id,
                args={"lua_source": self.wait_script},
                splash_headers=self.splash_headers
            )

    def scrape_document_id(self, response):
        value = response.xpath('//div[@class="ec-table__cell-content ng-binding ng-scope" and text() = "Rapport annuel"]/../..//td/ec-button/@mstar-component-id').get()
        print("VALUE", value)
        v = re.search(r"\[([^]]+)\]", value).group().strip("[]")
        yield {
            "url": response.url,
            "id": v
        }

This is because you are using a yield statement, which makes start_requests a generator.
My guess is that you are just doing this:
x = SpeSpider()
x.start_requests()
which only creates a generator object from your yield statements; the function body never actually runs.
Try this:
x = SpeSpider()
list(x.start_requests())
It will run your function and produce a list, though I am not sure if this is the behaviour you want, because I don't see any code showing how you instantiate the class or what the results should look like.
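A minimal illustration of the generator point, in plain Python (this sketch is independent of Scrapy and only demonstrates the language behaviour):

def make_requests():
    # the body does not run when the function is called...
    print("building request")
    yield "request-1"

g = make_requests()   # nothing printed yet; g is just a generator object
items = list(g)       # ...it runs here, printing "building request"; items == ["request-1"]

Note that when the spider is run normally with scrapy crawl spe, the Scrapy engine iterates start_requests for you, so wrapping it in list() only matters if you are calling the method by hand.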

Related

Ebay scraper stops after 5 pages

I am a total noob using scrapy for the first time. I have set it up to get some information, but it always stops after 5 pages. I want it to scrape a lot more pages since at least 20 are available.
import scrapy
from myproject.items import EbaySold
class EbaySpider(scrapy.Spider):
name = 'EbaySold'
allowed_domains = ['www.ebay.com']
start_urls = ['https://www.ebay.com/b/Apple-Unlocked-Smartphones/9355/bn_599372?LH_Sold=1&mag=1&rt=nc&_dmd=1&_pgn=1&_sop=13']
def parse(self, response):
products = response.css('li.s-item')
product_item = EbaySold()
for product in products:
product_item['name'] = product.css('h3.s-item__title::text').get()
if product_item['name'] is None:
product_item['name'] = product.css('span.BOLD::text').get()
product_item['sold_price'] = product.css('span.POSITIVE::text').get()
product_item['date_sold'] = product.css('div.s-item__title-tag::text').get().replace('SOLD ', '')
yield product_item
next_page = response.css('a[type=next]').attrib['href']
if next_page is not None:
yield response.follow(next_page, callback=self.parse)
In your Scrapy project's settings.py file, make sure you have the following settings configured.
ROBOTSTXT_OBEY = False
COOKIES_ENABLED = True
DEFAULT_REQUEST_HEADERS = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-language': 'en-US,en;q=0.9',
    'accept-encoding': 'gzip, deflate, br',
    'sec-fetch-site': 'same-origin',
    'upgrade-insecure-requests': 1,
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
}
CONCURRENT_REQUESTS = 2 # small number
Then try running the spider again.
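As a side note (not part of the settings fix, just a hedged tweak to the question's own pagination code): response.css('a[type=next]').attrib['href'] can raise a KeyError when no next link is present, whereas reading the attribute with ::attr(href) and .get() returns None, which the existing if check already handles:

next_page = response.css('a[type=next]::attr(href)').get()  # None when there is no next link
if next_page is not None:
    yield response.follow(next_page, callback=self.parse)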

Read URLs from external file

I found the following TikTok Downloader which is working fine.
from argparse import ArgumentParser
import os
from urllib.parse import parse_qsl, urlparse

import requests


class TikTokDownloader:
    HEADERS = {
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache',
        'DNT': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36',
        'Accept': '*/*',
        'Sec-Fetch-Site': 'same-site',
        'Sec-Fetch-Mode': 'no-cors',
        'Sec-Fetch-Dest': 'video',
        'Referer': 'https://www.tiktok.com/',
        'Accept-Language': 'en-US,en;q=0.9,bs;q=0.8,sr;q=0.7,hr;q=0.6',
        'sec-gpc': '1',
        'Range': 'bytes=0-',
    }

    def __init__(self, url: str, web_id: str):
        self.__url = url
        self.__cookies = {
            'tt_webid': web_id,
            'tt_webid_v2': web_id
        }

    def __get_video_url(self) -> str:
        response = requests.get(self.__url, cookies=self.__cookies, headers=TikTokDownloader.HEADERS)
        return response.text.split('"playAddr":"')[1].split('"')[0].replace(r'\u0026', '&')

    def download(self, file_path: str):
        video_url = self.__get_video_url()
        url = urlparse(video_url)
        params = tuple(parse_qsl(url.query))
        request = requests.Request(method='GET',
                                   url='{}://{}{}'.format(url.scheme, url.netloc, url.path),
                                   cookies=self.__cookies,
                                   headers=TikTokDownloader.HEADERS,
                                   params=params)
        prepared_request = request.prepare()
        session = requests.Session()
        response = session.send(request=prepared_request)
        response.raise_for_status()
        if os.path.exists(file_path):
            choice = input('File already exists. Overwrite? (Y/N): ')
            if choice.lower() != 'y':
                return
        with open(os.path.abspath(file_path), 'wb') as output_file:
            output_file.write(response.content)


if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument('--web-id', help='Value of tt_webid or tt_webid_v2 cookie (they are the same).')
    parser.add_argument('-o', '--output', default='download.mp4', help='Full output path.')
    parser.add_argument('url', help='Video url (https://www.tiktok.com/@username/video/1234567890123456789 or https://vm.tiktok.com/a1b2c3/).')
    args = parser.parse_args()

    downloader = TikTokDownloader(args.url, args.web_id)
    downloader.download(args.output)
The issue is that I have to run this command to download each video:
python3 ./tiktok.py --web-id 1234567890123 -o ./file.mp4 https://vm.tiktok.com/...
And I have 1000 links to download. All the links are in a txt file, one per line, without commas. Like:
Https://tiktok.com/1
Https://tiktok.com/2
Https://tiktok.com/3
So I'm looking for a way to read the text file and automatically substitute each link into the command I have to run. Or should I change the actual script?
Please use my code. I have just defined a function that downloads all of those videos when you pass it the path of the file with the thousand links; preferably save this Python script in the same directory as that file.
Use the function:
A_thousand_links_jbsidis("my_file_with_1000_links.txt")
It will give each video an automatic name based on the date and time. I tested it and it works!
Here is the code by jbsidis:
from argparse import ArgumentParser
import os
from urllib.parse import parse_qsl, urlparse

import requests


class TikTokDownloaderjbsidis:
    HEADERS = {
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache',
        'DNT': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36',
        'Accept': '*/*',
        'Sec-Fetch-Site': 'same-site',
        'Sec-Fetch-Mode': 'no-cors',
        'Sec-Fetch-Dest': 'video',
        'Referer': 'https://www.tiktok.com/',
        'Accept-Language': 'en-US,en;q=0.9,bs;q=0.8,sr;q=0.7,hr;q=0.6',
        'sec-gpc': '1',
        'Range': 'bytes=0-',
    }

    def __init__(self, url, web_id):
        self.__url = url
        self.__cookies = {
            'tt_webid': web_id,
            'tt_webid_v2': web_id
        }

    def __get_video_url(self) -> str:
        response = requests.get(self.__url, cookies=self.__cookies, headers=TikTokDownloaderjbsidis.HEADERS)
        return response.text.split('"playAddr":"')[1].split('"')[0].replace(r'\u0026', '&')

    def download(self, file_path: str):
        video_url = self.__get_video_url()
        url = urlparse(video_url)
        params = tuple(parse_qsl(url.query))
        request = requests.Request(method='GET',
                                   url='{}://{}{}'.format(url.scheme, url.netloc, url.path),
                                   cookies=self.__cookies,
                                   headers=TikTokDownloaderjbsidis.HEADERS,
                                   params=params)
        prepared_request = request.prepare()
        session = requests.Session()
        response = session.send(request=prepared_request)
        response.raise_for_status()
        if os.path.exists(file_path):
            choice = str('jbsidis File already exists. Overwrite? (Y/N): ')
        print("Downloading jbsidis == " + str(file_path))
        with open(os.path.abspath(file_path), 'wb') as output_file:
            output_file.write(response.content)


import time
import random


def A_thousand_links_jbsidis(file_with_a_thousand_links):
    n = open(file_with_a_thousand_links).read()
    m = n.splitlines()  # guessing the links are one per line
    MyWebIDis = "1234567890123"  # put the id that works for you
    c = 0
    for new_url in m:
        c = c + 1
        new_auto_file_name = str(c) + " - " + str(time.strftime("_%Y%m%d_%H%M%S_")) + "_video_" + ".mp4"  # I guess they are mp4
        clean_url = str(new_url).replace("\n", "").replace("\x0a", "").replace("\x0d", "").replace(" ", "")
        downloader = TikTokDownloaderjbsidis(clean_url, MyWebIDis)
        downloader.download(new_auto_file_name)
        time.sleep(10)  # just in case the internet is not that fast, wait 10 seconds before the next download


A_thousand_links_jbsidis("my_file_with_1000_links.txt")
And here is the image showing it working (screenshot not reproduced here). I don't know why we sometimes answer questions without giving a real solution; greetings from El Salvador.
jbsidis

Multiple requests with Scrapy

I am trying to scrape the following URLs; however, I need to create two different requests, one for properties for sale and one for properties to rent, as the URLs differ.
When I run the code I have, I am only able to parse the properties that are for sale ('propbay') and not those for rent ('rentbay'), and I am not sure what I am doing wrong with the second request.
Does anyone have any suggestions? Here is my code:
import scrapy
import re
import requests


class ProbRentBaySpider(scrapy.Spider):
    name = 'prob_rent_bay'
    start_urls = [
        'https://www.propbay.co.za/for-sale-property/residential/za/western_cape/7',
        'https://www.rentbay.co.za/to-rent-property/residential/za/western_cape/7'
    ]
    headers = {
        'authority': 'www.propbay.co.za',
        'pragma': 'no-cache',
        'cache-control': 'no-cache',
        'sec-ch-ua': '"Chromium";v="86", "\\"Not\\\\A;Brand";v="99", "Google Chrome";v="86"',
        'accept': '*/*',
        'x-requested-with': 'XMLHttpRequest',
        'sec-ch-ua-mobile': '?1',
        'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Mobile Safari/537.36',
        'content-type': 'application/json',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'cors',
        'sec-fetch-dest': 'empty',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
        'cookie': '_ga=GA1.3.1758254184.1598490742; ASP.NET_SessionId=v1muaskigrgnn40m42lsqzct; __RequestVerificationToken=AIEv13vh8ksXZeG6Tf_o-vLCscKt7sYKJjwB0kz0CfqmCe8ZpYRQQdGk2BnN095p2A6wlFf7o_lVYyxe1Jro-I5vHE01; _gid=GA1.3.900892753.1605696808',
    }
    headers2 = {
        'authority': 'www.rentbay.co.za',
        'pragma': 'no-cache',
        'cache-control': 'no-cache',
        'sec-ch-ua': '"Chromium";v="86", "\\"Not\\\\A;Brand";v="99", "Google Chrome";v="86"',
        'accept': '*/*',
        'x-requested-with': 'XMLHttpRequest',
        'sec-ch-ua-mobile': '?1',
        'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Mobile Safari/537.36',
        'content-type': 'application/json',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'cors',
        'sec-fetch-dest': 'empty',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
        'cookie': 'ASP.NET_SessionId=rexgmrrgju10aw4rirulzrmk; _ga=GA1.3.225479946.1606814269; __RequestVerificationToken=az6ZATA2H0dJfBQ6KuwDwz39XGSiSuIjc4iZwRT8BGSD2surYfA6iOmkIQk2p835G51hYqJd5FFoiSQYsvx-V3Ndx6s1; _gid=GA1.3.1154633027.1607081144; _gat_gtag_UA_7413963_2=1',
    }
    base_url_sale = ['https://www.propbay.co.za/Property/ListByCityAjax?city=Cape%20Town&province=western%20cape&cityCode=199041&category=3&page=',
                     'https://www.propbay.co.za/Property/ListByCityAjax?city=Milnerton&province=western%20cape&cityCode=199014&category=3&page=']
    base_url_rent = ['https://www.rentbay.co.za/Property/ListByCityAjax?city=Cape%20Town&province=western%20cape&cityCode=199041&category=3&page=',
                     'https://www.rentbay.co.za/Property/ListByCityAjax?city=Milnerton&province=western%20cape&cityCode=199014&category=3&page=']

    def parse(self, response):
        for page in range(2, 8):  # specify page range you would like to scrape data for
            for link in self.base_url_sale:
                next_page = link + str(page)
                response = requests.get(url=next_page, headers=self.headers)
                for product in response.json()['Data']['Suburbs']:
                    area_url = 'https://www.propbay.co.za' + product['FriendlyUrl']
                    yield scrapy.Request(area_url, callback=self.parse_sale)

        for page2 in range(2, 8):  # specify page range you would like to scrape data for
            for link2 in self.base_url_rent:
                next_page2 = link2 + str(page2)
                response2 = requests.get(url=next_page2, headers=self.headers2)
                for product2 in response2.json()['Data']['Suburbs']:
                    area_url_2 = 'https://www.rentbay.co.za' + product2['FriendlyUrl']
                    yield scrapy.Request(area_url_2, callback=self.parse_rent)

    def parse_sale(self, response):
        # follow links to property pages
        for href in response.xpath('//a[@class="u-text-uppercase"]/@href').getall():
            follow_link = 'https://www.propbay.co.za' + href
            yield response.follow(follow_link, self.parse_property)
        # follow pagination links
        for href in response.xpath('//*[@id="btnNext"]/@href'):
            yield response.follow(href, self.parse_sale)

    def parse_rent(self, response):
        # follow links to property pages
        for href in response.xpath('//a[@class="u-text-uppercase"]/@href').getall():
            follow_link = 'https://www.rentbay.co.za' + href
            yield response.follow(follow_link, self.parse_property)
        # follow pagination links
        for href in response.xpath('//*[@id="btnNext"]/@href').getall():
            yield response.follow(href, self.parse_rent)

    def parse_property(self, response):
        title = response.css('span.u-text-capitalize::text').get()
        bedrooms = response.xpath('//span[contains(text(), "Bedrooms")]/following-sibling::span/text()').get()
        bedrooms = bedrooms.split()[0] if bedrooms is not None else None
        ...
Edited code:
I have tried to make separate parsing functions; however, I am only getting rental properties, and I am not sure how to also get the properties for sale.
def parse(self, response):
    for page in range(2, 8):  # specify page range you would like to scrape data for
        for link in self.base_url_sale:
            next_page = link + str(page)
            response = requests.get(url=next_page, headers=self.headers)
            for product in response.json()['Data']['Suburbs']:
                area_url = 'https://www.propbay.co.za' + product['FriendlyUrl']
                yield scrapy.Request(area_url, callback=self.parse_rent)

def parse_rent(self, response):
    for page2 in range(2, 8):  # specify page range you would like to scrape data for
        for link2 in self.base_url_rent:
            next_page2 = link2 + str(page2)
            response = requests.get(url=next_page2, headers=self.headers2)
            for product2 in response.json()['Data']['Suburbs']:
                item = dict()
                area_url_2 = 'https://www.rentbay.co.za' + product2['FriendlyUrl']
                yield scrapy.Request(area_url_2, callback=self.parse_all)

def parse_all(self, response):
    # follow links to property pages
    for href in response.xpath('//a[@class="u-text-uppercase"]/@href').getall():
        follow_link = 'https://www.propbay.co.za' + href
        yield response.follow(follow_link, self.parse_property)
    # follow pagination links
    for href in response.xpath('//*[@id="btnNext"]/@href'):
        yield response.follow(href, self.parse_all)
First things first, you need to override start_requests, specify a different callback for each URL, and then split your parse() logic into those two methods; a minimal sketch of this is shown below. Alternatively, you can at least add an if check on the URL before looping over either the propbay or the rentbay part. Right now the responses of both URLs are treated the same way in your parse(), so maybe the first time the request is incorrect because the response is for propbay, and the second time, when it would be correct, it is filtered out by the dupefilter.
For an instant fix you can try adding dont_filter=True to your requests in the parse method.
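A minimal sketch of the start_requests suggestion, reusing headers and headers2 from the question; the callback names parse_sale_listings and parse_rent_listings are hypothetical, each holding one of the two loops currently in parse(), and the wiring is untested against the site:

def start_requests(self):
    # each start URL gets its own callback, so the sale and rent logic never mix
    yield scrapy.Request(
        'https://www.propbay.co.za/for-sale-property/residential/za/western_cape/7',
        headers=self.headers, callback=self.parse_sale_listings)
    yield scrapy.Request(
        'https://www.rentbay.co.za/to-rent-property/residential/za/western_cape/7',
        headers=self.headers2, callback=self.parse_rent_listings)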

How to get a correct session_id? (Scrapy, Python)

There is a URL: https://maps.leicester.gov.uk/map/Aurora.svc/run?inspect_query=QPPRN&inspect_value=ROH9385&script=%5CAurora%5Cw3%5CPLANNING%5Cw3PlanApp_MG.AuroraScript%24&nocache=f73eee56-45da-f708-87e7-42e82982370f&resize=always
It returns the coordinates. To get the coordinates it makes 3 requests (I suppose):
the URL mentioned above
requesting a session_id
getting the coordinates using the previously mentioned session_id.
I am getting a session_id in the 2nd step, but it is wrong: I can't get the coordinates in step 3 using it. How can I know that the problem is in the session_id? When I insert the session_id taken from the browser, my code works fine and the coordinates are received.
(Screenshots in the original post show the requests made in the browser, the correct response from the browser, and the incomplete response returned by my code.)
Here is my code (it is for Scrapy framework):
import inline_requests


@inline_requests.inline_requests
def get_map_data(self, response):
    """ Getting map data. """
    map_referer = ("https://maps.leicester.gov.uk/map/Aurora.svc/run?inspect_query=QPPRN&"
                   "inspect_value=ROH9385&script=%5CAurora%5Cw3%5CPLANNING%5Cw3PlanApp_MG.AuroraScript"
                   "%24&nocache=f73eee56-45da-f708-87e7-42e82982370f&resize=always")
    response = yield scrapy.Request(
        url=map_referer,
        meta=response.meta,
        method='GET',
        dont_filter=True,
    )
    time_str = str(int(time.time()*1000))
    headers = {
        'Referer': response.url,
        'Accept': 'application/javascript, */*; q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
        'Host': 'maps.leicester.gov.uk',
        'Sec-Fetch-Dest': 'script',
        'Sec-Fetch-Mode': 'no-cors',
        'Sec-Fetch-Site': 'same-origin',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'
    }
    response.meta['handle_httpstatus_all'] = True
    url = ('https://maps.leicester.gov.uk/map/Aurora.svc/RequestSession?userName=inguest'
           '&password=&script=%5CAurora%5Cw3%5CPLANNING%5Cw3PlanApp_MG.AuroraScript%24&'
           f'callback=_jqjsp&_{time_str}=')
    reqest_session_response = yield scrapy.Request(
        url=url,
        meta=response.meta,
        method='GET',
        headers=headers,
        dont_filter=True,
    )
    session_id = re.search(r'"SessionId":"([^"]+)', reqest_session_response.text)
    session_id = session_id.group(1) if session_id else None
    print(8888888888888)
    print(session_id)
    # session_id = '954f04e2-e52c-4dd9-9046-f3f013d3f633'
    # pprn = item.get('other', {}).get('PPRN')
    pprn = 'ROH9385'  # hard coded for the current page
    if session_id and pprn:
        time_str = str(int(time.time()*1000))
        url = ('https://maps.leicester.gov.uk/map/Aurora.svc/FindValue'
               f'Location?sessionId={session_id}&value={pprn}&query=QPPRN&callback=_jqjsp'
               f'&_{time_str}=')
        coords_response = yield scrapy.Request(
            url=url,
            method='GET',
            meta=reqest_session_response.meta,
            dont_filter=True,
        )
        print(coords_response.text)
        breakpoint()
Could you please correct my code so that it can get the coordinates?
The website creates a sessionId first, then uses that sessionId to create a map layer on the server (I guess). Only then can you start requesting; otherwise it can't find the map layer under that sessionId.
import requests

# 1. request a session id
url = "https://maps.leicester.gov.uk/map/Aurora.svc/RequestSession?userName=inguest&password=&script=%5CAurora%5Cw3%5CPLANNING%5Cw3PlanApp_MG.AuroraScript%24"
res = requests.get(url, verify=False).json()
sid = res["Session"]["SessionId"]

# 2. open the script map for that session (this creates the layer on the server)
url = f"https://maps.leicester.gov.uk/map/Aurora.svc/OpenScriptMap?sessionId={sid}"
res = requests.get(url, verify=False)

# 3. only now can the location be looked up with that session id
url = f"https://maps.leicester.gov.uk/map/Aurora.svc/FindValueLocation?sessionId={sid}&value=ROH9385&query=QPPRN"
res = requests.get(url, verify=False).json()
print(res)
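If you prefer to stay inside the question's Scrapy/inline_requests flow instead of plain requests, the step that appears to be missing there is this OpenScriptMap call between RequestSession and FindValueLocation. A hedged sketch of that extra request, to be yielded right after session_id is extracted (the URL pattern is taken from the answer above; whether it needs the same headers is an assumption, and this is untested):

if session_id:
    # create the map layer for this session before asking for coordinates
    open_map_url = f"https://maps.leicester.gov.uk/map/Aurora.svc/OpenScriptMap?sessionId={session_id}"
    yield scrapy.Request(url=open_map_url, meta=response.meta, method='GET', dont_filter=True)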

Why am I getting an empty list when trying to get links that include a specific class in Python using bs4?

I'm trying to get some links that include a specific class, so I wrote this code:
from bs4 import BeautifulSoup
import requests


def getPages(requestedURLS):
    _buff = []
    for url in requestedURLS:
        try:
            _buff.append(requests.get(url))
        except requests.exceptions.RequestException as err:
            print(err)
    return _buff


def getProductList(pages):
    links = []
    for page in pages:
        content = BeautifulSoup(page.content, 'html.parser')
        links.extend(content.find_all("a", class_="sresult lvresult clearfix li shic"))
        print(links)


def main():
    pageLinks = [
        "https://www.ebay.co.uk/sch/m.html?_nkw=&_armrs=1&_from=&_ssn=carabaeuro13&_clu=2&_fcid=3&_localstpos=&_stpos=&gbr=1&_pppn=r1&scp=ce0",
        "https://www.ebay.co.uk/sch/m.html?_nkw=&_armrs=1&_from=&_ssn=carabaeuro13&_clu=2&_fcid=3&_localstpos=&_stpos=&gbr=1&_pgn=2&_skc=200&rt=nc",
        "https://www.ebay.co.uk/sch/m.html?_nkw=&_armrs=1&_from=&_ssn=carabaeuro13&_clu=2&_fcid=3&_localstpos=&_stpos=&gbr=1&_pgn=3&_skc=400&rt=nc"
    ]
    # Results are in: <div id="Results" class="results "> <ul id="ListViewInner">
    pages = getPages(pageLinks)
    productList = getProductList(pages)


if __name__ == '__main__':
    main()
You can check the links; there are lots of links that include this class, but the output is empty, as you can see below:
C:\Users\projects\getMarketData\venv\Scripts\python.exe C:/Users/projects/getMarketData/getData.py
[]
[]
[]
Process finished with exit code 0
What is wrong?
Using appropriate headers should do the trick:
import requests
from bs4 import BeautifulSoup

url = 'https://www.ebay.co.uk/sch/m.html?_nkw=&_armrs=1&_from=&_ssn=carabaeuro13&_clu=2&_fcid=3&_localstpos=&_stpos=&gbr=1&_pppn=r1&scp=ce0'
request_headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Host': 'www.ebay.co.uk',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
}
r = requests.get(url, headers=request_headers)
soup = BeautifulSoup(r.content, 'lxml')
items = soup.find_all('li', {'class': 'sresult lvresult clearfix li shic'})
Result:
print(len(items)) # 18
print(items[0]['listingid']) # 333200565336
Use the network tab of the developer tools to inspect the traffic. Check this out if you've never done this before.
