Plain Selenium requests work, but not scrapy-selenium. The page loads and I get a 200 response from the website, yet there is no error because the spider simply isn't yielding any output.
import scrapy
from scrapy import Selector
from scrapy_selenium import SeleniumRequest

class SeamdbTestSpider(scrapy.Spider):
    name = 'steam_db_test'
    start_urls = ['https://steamdb.info/graph/']

    def start_requests(self):
        for link in self.start_urls:
            yield SeleniumRequest(
                url=link,
                wait_time=10,
                callback=self.parse)

    def parse(self, response):
        driver = response.meta['driver']
        initial_page = driver.page_source
        r = Selector(text=initial_page)
        table = r.xpath('//*[@id="table-apps"]/tbody')
        rows = table.css('tr[class="app"]')[0:2]
        for element in rows:
            info_link = "https://steamdb.info" + element.css('::attr(href)').get()
            name = element.css('a ::text').get()
            yield {"Name": name, "Link": info_link}
Actually, SeleniumRequest with Scrapy is not always reliable. The same element selection works with Selenium plus bs4, but with Scrapy it returns empty output, just as you describe.
Scrapy with SeleniumRequest is not working:
import scrapy
from scrapy import Selector
from scrapy_selenium import SeleniumRequest

class SeamdbTestSpider(scrapy.Spider):
    name = 'steam_db_test'
    start_urls = ['https://steamdb.info/graph/']

    def start_requests(self):
        for link in self.start_urls:
            yield SeleniumRequest(
                url=link,
                wait_time=10,
                callback=self.parse)

    def parse(self, response):
        driver = response.meta['driver']
        initial_page = driver.page_source
        r = Selector(text=initial_page)
        rows = r.css('table#table-apps tbody tr')
        for element in rows:
            info_link = "https://steamdb.info" + element.css('td:nth-child(3) > a::attr(href)').get()
            name = element.css('td:nth-child(3) > a::text').get()
            yield {"Name": name, "Link": info_link}
Selenium with bs4 is working fine:
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
# keep Chrome open after the script finishes
options.add_experimental_option("detach", True)

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
driver.get("https://steamdb.info/graph/")
time.sleep(5)

soup = BeautifulSoup(driver.page_source, 'lxml')
for tr in soup.select('table#table-apps tbody tr'):
    link = tr.select_one('td:nth-child(3) > a').get('href')
    link = "https://steamdb.info" + link
    name = tr.select_one('td:nth-child(3) > a').text
    print(link)
    print(name)
Output:
https://steamdb.info/app/730/graphs/
Counter-Strike: Global Offensive
https://steamdb.info/app/570/graphs/
Dota 2
https://steamdb.info/app/578080/graphs/
PUBG: BATTLEGROUNDS
https://steamdb.info/app/1172470/graphs/
Apex Legends
https://steamdb.info/app/1599340/graphs/
Lost Ark
https://steamdb.info/app/271590/graphs/
Grand Theft Auto V
https://steamdb.info/app/440/graphs/
Team Fortress 2
https://steamdb.info/app/1446780/graphs/
MONSTER HUNTER RISE
https://steamdb.info/app/346110/graphs/
ARK: Survival Evolved
https://steamdb.info/app/252490/graphs/
Rust
https://steamdb.info/app/431960/graphs/
Wallpaper Engine
https://steamdb.info/app/1506830/graphs/
FIFA 22
https://steamdb.info/app/1085660/graphs/
Destiny 2
https://steamdb.info/app/1569040/graphs/
Football Manager 2022
https://steamdb.info/app/230410/graphs/
Warframe
https://steamdb.info/app/1203220/graphs/
NARAKA: BLADEPOINT
https://steamdb.info/app/359550/graphs/
Tom Clancy's Rainbow Six Siege
https://steamdb.info/app/381210/graphs/
Dead by Daylight
https://steamdb.info/app/236390/graphs/
... and so on
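If you still want to stay with scrapy-selenium rather than plain Selenium + bs4, one thing worth trying is to make the driver wait for the table rows explicitly instead of relying on wait_time alone. The sketch below is only an illustration, assuming scrapy-selenium is correctly configured in settings.py (SELENIUM_DRIVER_NAME and friends) and that the CSS selectors from the working bs4 snippet above still match; the spider name is made up for the example.

import scrapy
from scrapy_selenium import SeleniumRequest
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

class SteamDbWaitSpider(scrapy.Spider):
    # hypothetical spider, for illustration only
    name = 'steam_db_wait'
    start_urls = ['https://steamdb.info/graph/']

    def start_requests(self):
        for link in self.start_urls:
            yield SeleniumRequest(
                url=link,
                wait_time=10,
                # block until the app rows are actually present in the DOM
                wait_until=EC.presence_of_element_located(
                    (By.CSS_SELECTOR, 'table#table-apps tbody tr.app')),
                callback=self.parse)

    def parse(self, response):
        # the scrapy-selenium middleware returns the rendered page source as the response
        for row in response.css('table#table-apps tbody tr'):
            name = row.css('td:nth-child(3) > a::text').get()
            href = row.css('td:nth-child(3) > a::attr(href)').get()
            if name and href:
                yield {"Name": name, "Link": "https://steamdb.info" + href}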
Related
I am working on a stock-related project where I have been tasked with scraping all the data on a daily basis for the last 5 years, i.e. from 2016 to date. I went with Selenium because I can drive the browser and filter the data by date with button clicks, and now I want the same data that the Selenium browser displays to be fed into Scrapy.
This is the website I am working on right now.
I have written the following code inside a Scrapy spider.
class FloorSheetSpider(scrapy.Spider):
    name = "nepse"

    def start_requests(self):
        driver = webdriver.Firefox(executable_path=GeckoDriverManager().install())
        floorsheet_dates = ['01/03/2016', '01/04/2016']  # ... up to till date '01/10/2022'

        for date in floorsheet_dates:
            driver.get("https://merolagani.com/Floorsheet.aspx")
            driver.find_element(By.XPATH, "//input[@name='ctl00$ContentPlaceHolder1$txtFloorsheetDateFilter']"
                                ).send_keys(date)
            driver.find_element(By.XPATH, "(//a[@title='Search'])[3]").click()
            total_length = driver.find_element(By.XPATH,
                                               "//span[@id='ctl00_ContentPlaceHolder1_PagerControl2_litRecords']").text
            z = int((total_length.split()[-1]).replace(']', ''))
            for data in range(z, z + 1):
                driver.find_element(By.XPATH, "(//a[@title='Page {}'])[2]".format(data)).click()
                self.url = driver.page_source
                yield Request(url=self.url, callback=self.parse)

    def parse(self, response, **kwargs):
        for value in response.xpath('//tbody/tr'):
            print(value.css('td::text').extract()[1])
            print("ok" * 200)
Update: the error I get after applying the answer is:
2022-01-14 14:11:36 [twisted] CRITICAL:
Traceback (most recent call last):
File "/home/navaraj/PycharmProjects/first_scrapy/env/lib/python3.8/site-packages/twisted/internet/defer.py", line 1661, in _inlineCallbacks
result = current_context.run(gen.send, result)
File "/home/navaraj/PycharmProjects/first_scrapy/env/lib/python3.8/site-packages/scrapy/crawler.py", line 88, in crawl
start_requests = iter(self.spider.start_requests())
TypeError: 'NoneType' object is not iterable
I want to feed the current HTML content of the page into Scrapy, but I have been getting this unusual error for the past two days. Any help or suggestions would be very much appreciated.
The two solutions below are not very different. Solution 2 fits your question better, but choose whichever you prefer.
Solution 1 - create a response from the driver's page source and scrape it right away (you can also pass it as an argument to a function):
import scrapy
from selenium import webdriver
from selenium.webdriver.common.by import By
from scrapy.http import HtmlResponse


class FloorSheetSpider(scrapy.Spider):
    name = "nepse"

    def start_requests(self):
        # driver = webdriver.Firefox(executable_path=GeckoDriverManager().install())
        driver = webdriver.Chrome()
        floorsheet_dates = ['01/03/2016', '01/04/2016']  # , up to till date '01/10/2022'

        for date in floorsheet_dates:
            driver.get("https://merolagani.com/Floorsheet.aspx")
            driver.find_element(By.XPATH, "//input[@name='ctl00$ContentPlaceHolder1$txtFloorsheetDateFilter']"
                                ).send_keys(date)
            driver.find_element(By.XPATH, "(//a[@title='Search'])[3]").click()
            total_length = driver.find_element(By.XPATH,
                                               "//span[@id='ctl00_ContentPlaceHolder1_PagerControl2_litRecords']").text
            z = int((total_length.split()[-1]).replace(']', ''))
            for data in range(1, z + 1):
                driver.find_element(By.XPATH, "(//a[@title='Page {}'])[2]".format(data)).click()
                self.body = driver.page_source

                response = HtmlResponse(url=driver.current_url, body=self.body, encoding='utf-8')
                for value in response.xpath('//tbody/tr'):
                    print(value.css('td::text').extract()[1])
                    print("ok" * 200)

        # return an empty requests list
        return []
Solution 2 - with a super simple downloader middleware:
(There might be a delay before the parse method runs here, so be patient.)
import scrapy
from scrapy import Request
from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.webdriver.common.by import By


class SeleniumMiddleware(object):
    def process_request(self, request, spider):
        url = spider.driver.current_url
        body = spider.driver.page_source
        return HtmlResponse(url=url, body=body, encoding='utf-8', request=request)


class FloorSheetSpider(scrapy.Spider):
    name = "nepse"

    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {
            'tempbuffer.spiders.yetanotherspider.SeleniumMiddleware': 543,
            # 'projects_name.path.to.your.middleware': 543
        }
    }

    driver = webdriver.Chrome()

    def start_requests(self):
        # driver = webdriver.Firefox(executable_path=GeckoDriverManager().install())
        floorsheet_dates = ['01/03/2016', '01/04/2016']  # , up to till date '01/10/2022'

        for date in floorsheet_dates:
            self.driver.get("https://merolagani.com/Floorsheet.aspx")
            self.driver.find_element(By.XPATH, "//input[@name='ctl00$ContentPlaceHolder1$txtFloorsheetDateFilter']"
                                     ).send_keys(date)
            self.driver.find_element(By.XPATH, "(//a[@title='Search'])[3]").click()
            total_length = self.driver.find_element(By.XPATH,
                                                    "//span[@id='ctl00_ContentPlaceHolder1_PagerControl2_litRecords']").text
            z = int((total_length.split()[-1]).replace(']', ''))
            for data in range(1, z + 1):
                self.driver.find_element(By.XPATH, "(//a[@title='Page {}'])[2]".format(data)).click()
                self.body = self.driver.page_source
                self.url = self.driver.current_url

                yield Request(url=self.url, callback=self.parse, dont_filter=True)

    def parse(self, response, **kwargs):
        print('test ok')
        for value in response.xpath('//tbody/tr'):
            print(value.css('td::text').extract()[1])
            print("ok" * 200)
Notice that I've used Chrome, so change it back to Firefox as in your original code.
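For reference, a minimal sketch of the Firefox setup in Selenium 4 style, assuming geckodriver is handled by webdriver_manager as in the original question:

from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from webdriver_manager.firefox import GeckoDriverManager

# Selenium 4 wraps the driver path in a Service object instead of executable_path
driver = webdriver.Firefox(service=Service(GeckoDriverManager().install()))

Everything else in the middleware and the spider stays the same; only the driver construction changes.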
I am writing a scrapy spider to scrape Rightmove, a property website. The issue I'm having is that the property search, which consists of several pages of different house listings, is all located under the same URL.
This means that the usual process of identifying the URL of the 'next' page doesn't work. Is there any way, using Scrapy and not Selenium (which isn't efficient enough for this purpose), to navigate through the different pages? Please see my code and, in the image below, the source of the relevant 'next page' button.
Thanks.
class listingsSpider(scrapy.Spider):
    name = 'listings'
    start_urls = ['https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=STATION%5E1712&maxPrice=500000&radius=0.5&sortType=10&propertyTypes=&mustHave=&dontShow=&furnishTypes=&keywords=']

    def parse(self, response):
        self.logger.info('This my first spider')
        address = response.xpath('//*[@id="property-65695633"]/div/div/div[4]/div[1]/div[2]/a/address')
        listings = response.xpath('//h2[@class="propertyCard-title"]')
        for listing in listings:
            yield {
                'Listing': listing.get()
            }
        nextPage = response.xpath('//*[@id="l-container"]/div[3]/div/div/div/div[3]/button/div/svg/use')
        nextPage = nextPage.get()
        pageTest = response.css('div[class=pagination-button pagination-direction pagination-direction--next] svg a::attr(href)')
        pageTest = pageTest.get()
        if pageTest is not None:
            pageTest = response.urljoin(pageTest)
            yield scrapy.Request(pageTest, callback=self.parse)
[Image: source of the 'next page' button - https://i.stack.imgur.com/1I1J1.png]
Actually, it turns out that each page has a unique identifier in the link. For example, appending &index=24 sends you to the next page.
What you need to figure out is how to include that in the request URL. A search may span several pages, so we increment the index by 24 each time to move to the next page. Since we could keep incrementing by 24 forever, we use the number of search results as the condition to stop. It's rather sneaky to notice at first sight, but pretty easy to overcome.
Here's a scraper that can go to these next pages as requested:
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.crawler import CrawlerProcess
from scrapy.loader import ItemLoader
import requests
from bs4 import BeautifulSoup

links = []
for i in range(0, 480, 24):
    url = f'https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=STATION%5E1712&maxPrice=500000&radius=0.5&sortType=10&propertyTypes=&mustHave=&dontShow=&index={i}&furnishTypes=&keywords='
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'lxml')
    ps1 = soup.find_all('span', {'class': 'searchHeader-resultCount'})
    for ps in ps1:
        if int(ps.text.strip()) > i:
            links.append(url)
        else:
            break


class ListingsItem(scrapy.Item):
    address = Field(output_processor=TakeFirst())
    listings = Field(output_processor=TakeFirst())


class listingsSpider(scrapy.Spider):
    name = 'listings'
    start_urls = links

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                callback=self.parse
            )

    def parse(self, response):
        container = response.xpath('//div[@class="l-searchResults"]/div')
        for sales in container:
            l = ItemLoader(ListingsItem(), selector=sales)
            l.add_xpath('address', '//address[@class="propertyCard-address"]/meta[@content]')
            l.add_xpath('listings', '//h2[@class="propertyCard-title"]//text()[normalize-space()]')
            yield l.load_item()

        # self.logger.info('This my first spider')
        # address = response.xpath('//*[@id="property-65695633"]/div/div/div[4]/div[1]/div[2]/a/address')
        # listings = response.xpath('//h2[@class="propertyCard-title"]')
        # for listing in listings:
        #     yield {
        #         'Listing': listing.get()
        #     }


process = CrawlerProcess(
    settings={
        'FEED_URI': 'rightmove.jl',
        'FEED_FORMAT': 'jsonlines'
    }
)

process.crawl(listingsSpider)
process.start()
My code gets links/HTML from different "sections" of a page.
It prints two links per section; however, I only want the first one printed.
The expected output should not contain the links ending with "video", as the output of my code currently does.
from selenium import webdriver
from bs4 import BeautifulSoup
import time

driver = webdriver.Chrome()

jam = []
baseurl = 'https://meetinglibrary.asco.org'

driver.get('https://meetinglibrary.asco.org/results?meetingView=2020%20ASCO%20Virtual%20Scientific%20Program&page=1')
time.sleep(3)

page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')

productlist = soup.find_all('a', class_='ng-star-inserted')
for item in productlist:
    for link in item.find_all('a', href=True):
        jam.append(baseurl + link['href'])
print(jam)
You can add a condition check before appending to the list:
...
for item in productlist:
    ahrefs = item.find_all('a', href=True)
    for index in range(len(ahrefs)):
        if (index % 2 == 0) and ('video' not in ahrefs[index]['href']):
            jam.append(baseurl + ahrefs[index]['href'])
print(jam)
...
Let me know how it goes after trying. Good luck!
Use os.path.basename to get the last part of the URL, and use the in operator to check whether "video" is in it:
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import os

driver = webdriver.Chrome()

jam = []
baseurl = 'https://meetinglibrary.asco.org'

driver.get('https://meetinglibrary.asco.org/results?meetingView=2020%20ASCO%20Virtual%20Scientific%20Program&page=1')
time.sleep(3)

page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')

productlist = soup.find_all('a', class_='ng-star-inserted')
for item in productlist:
    for link in item.find_all('a', href=True):
        url = link['href']
        if "video" not in os.path.basename(url):
            jam.append(baseurl + url)
print(jam)
result:
['https://meetinglibrary.asco.org/record/185955/abstract',
'https://meetinglibrary.asco.org/record/185955/slide',
'https://meetinglibrary.asco.org/record/185954/abstract',
'https://meetinglibrary.asco.org/record/186048/abstract',
'https://meetinglibrary.asco.org/record/186048/slide',
'https://meetinglibrary.asco.org/record/190197/slide',
'https://meetinglibrary.asco.org/record/192623/slide',
'https://meetinglibrary.asco.org/record/185414/abstract',
'https://meetinglibrary.asco.org/record/185414/slide',
'https://meetinglibrary.asco.org/record/185415/abstract',
'https://meetinglibrary.asco.org/record/185415/slide',
'https://meetinglibrary.asco.org/record/185473/abstract',
'https://meetinglibrary.asco.org/record/185473/slide',
'https://meetinglibrary.asco.org/record/187584/slide',
'https://meetinglibrary.asco.org/record/188561/slide',
'https://meetinglibrary.asco.org/record/186710/abstract',
'https://meetinglibrary.asco.org/record/186710/slide',
'https://meetinglibrary.asco.org/record/186699/abstract',
'https://meetinglibrary.asco.org/record/186699/slide',
'https://meetinglibrary.asco.org/record/186698/abstract',
'https://meetinglibrary.asco.org/record/186698/slide',
'https://meetinglibrary.asco.org/record/187720/slide',
'https://meetinglibrary.asco.org/record/187480/abstract',
'https://meetinglibrary.asco.org/record/187480/slide',
'https://meetinglibrary.asco.org/record/191961/slide',
'https://meetinglibrary.asco.org/record/192626/slide',
'https://meetinglibrary.asco.org/record/186983/abstract',
'https://meetinglibrary.asco.org/record/186983/slide',
'https://meetinglibrary.asco.org/record/188580/abstract',
'https://meetinglibrary.asco.org/record/188580/slide',
'https://meetinglibrary.asco.org/record/189047/abstract',
'https://meetinglibrary.asco.org/record/189047/slide',
'https://meetinglibrary.asco.org/record/190223/slide',
'https://meetinglibrary.asco.org/record/190273/slide',
'https://meetinglibrary.asco.org/record/184812/abstract',
'https://meetinglibrary.asco.org/record/184812/slide',
'https://meetinglibrary.asco.org/record/184927/slide',
'https://meetinglibrary.asco.org/record/184805/abstract',
'https://meetinglibrary.asco.org/record/184805/slide',
'https://meetinglibrary.asco.org/record/184811/abstract',
'https://meetinglibrary.asco.org/record/184811/slide',
'https://meetinglibrary.asco.org/record/185576/slide',
'https://meetinglibrary.asco.org/record/190147/slide']
Brief context: I started Python two weeks ago, so don't hesitate to correct any mistake or improvement you see. I am trying to scrape data from the results club list of the site www.fff.fr.
My way of organizing it is:
Go to Homepage
Accept Cookies
Use search bar for cityname
Get result list
Follow each url of the result page
Go to each "Staff" sub-section
Extract data from this page
I started to build the Python code below, which is not working so far. I'd be really interested in feedback on how to actually make it work.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from shutil import which

chrome_path = which("chromedriver")
driver = webdriver.Chrome(executable_path=chrome_path)
driver.get("https://fff.fr")

cookie_btn = driver.find_element_by_id("didomi-notice-agree-button")
cookie_btn.click()

search_input = driver.find_element_by_xpath("/html//form[@id='proximiteSearch']//input[@id='fff_club_form_club_near_to_search_address']")
search_input.send_keys("Paris")
search_input.send_keys(Keys.ENTER)

self.html = driver.page_source
driver.close()

def parse(self, response):
    resp = Selector(text=self.html)
    clubs = resp.xpath("(//ul[contains(@id, 'listresulclub')])/li/text()")
    for club in clubs:
        name = club.xpath(".//text()").get()
        name_link = club.xpath(".//@href").get()
        url = f"https://www.ffr.fr{name_link}"
        absolute_url = url[:-10] + "/le-staff"
        # absolute_url = response.urljoin()
        yield scrapy.Request(url=absolute_url, meta={'club_name': name})
        # yield response.follow(url=name_link, callback=self.parse_country, meta={'club_name': name})

def parse_country(self, response):
    name = response.request.meta['club_name']
    contacts = response.xpath("//div[@class='coor-block-content']/ol")
    for contact in contacts:
        contact_nom = contact.xpath(".//li[1]/text()").get()
        yield {
            'club_name': name,
            'correspondant_nom': contact_nom
        }
You can try the same thing without selenium and it works:
import bs4
import requests
import sys
import re
import unicodedata
import os
import random
import datetime

Current_Date_Formatted = datetime.datetime.today().strftime('%d-%b-%Y')
time = str(Current_Date_Formatted)

filename = "footballstuff"
cityname = sys.argv[1]
filename = r"D:\Huzefa\Desktop\\" + filename + ".txt"
url = "https://www.fff.fr/resultats?search=" + cityname
res = requests.get(url)
soup = bs4.BeautifulSoup(res.text, "lxml")
file = open(filename, 'wb')
for i in soup.select("a"):
    f = i.text
    file.write(unicodedata.normalize('NFD', re.sub("[\(\[].*?[\)\]]", "", f)).encode('ascii', 'ignore'))
    file.write(unicodedata.normalize('NFD', re.sub("[\(\[].*?[\)\]]", "", os.linesep)).encode('ascii', 'ignore'))
    file.write(unicodedata.normalize('NFD', re.sub("[\(\[].*?[\)\]]", "", os.linesep)).encode('ascii', 'ignore'))
file.close()
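The snippet above only covers the city search (steps 1-4 of your plan). A possible, untested extension of the same requests + BeautifulSoup approach for steps 5-7, reusing the selectors and the "/le-staff" suffix from your own spider (so treat them as assumptions, not verified against the live site), could look like this:

import requests
from bs4 import BeautifulSoup

BASE = "https://www.fff.fr"

def club_links(cityname):
    # collect club links from the search results (selector taken from your spider)
    res = requests.get(f"{BASE}/resultats?search={cityname}")
    soup = BeautifulSoup(res.text, "lxml")
    return [a["href"] for a in soup.select("ul[id*=listresulclub] li a[href]")]

def staff_names(club_link):
    # jump straight to the club's staff page, mirroring the "/le-staff" trick
    staff_url = BASE + club_link.rstrip("/") + "/le-staff"
    res = requests.get(staff_url)
    soup = BeautifulSoup(res.text, "lxml")
    # contact block selector comes from your parse_country method
    return [ol.li.get_text(strip=True)
            for ol in soup.select("div.coor-block-content ol") if ol.li]

for link in club_links("Paris"):
    print(link, staff_names(link))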
My code is stopping short before finishing all the tasks.
It should be:
1 - get a link from the fitness class search results to go to the individual studio page.
2 - then, from the individual studio page (first for loop):
A) grab the studio name and write it to CSV.
B) grab a link to a fitness class from the class schedule.
3 - open the class page link and grab the class name (second for loop).
It completes step 2, but instead of continuing to step 3 it goes back to the initial search results page and repeats step 1 for the next studio.
What am I doing wrong? Thanks in advance!
from selenium import webdriver
from bs4 import BeautifulSoup as soup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait as browser_wait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import time
import re
import csv

# initialize the chrome browser
browser = webdriver.Chrome(executable_path=r'./chromedriver')

# URL
class_pass_url = 'https://www.classpass.com'

# Create file and write the first row; added encoding type as write was giving errors
f = open('ClassPass.csv', 'w', encoding='utf-8')
headers = 'Studio, Name, Description, Image, Address, Phone, Email, Website\n'
f.write(headers)

# classpass results page
page = "https://classpass.com/search/e8-4rb/fitness-classes/58PHLz8oWT9"
browser.get(page)

# Browser waits
browser_wait(browser, 10).until(EC.visibility_of_element_located((By.CLASS_NAME, "line")))

# Scrolls to bottom of page to reveal all classes
# browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")

# Extract page source and parse
page_source = browser.page_source
page_soup = soup(page_source, "html.parser")

# Parse class listings; looks through results and gets link to studio page
sessions = page_soup.findAll('li', {'class': '_3vk1F9nlSJQIGcIG420bsK'})

for session in sessions:
    # gets link to studio page
    session_link = class_pass_url + session.a['href']
    browser.get(session_link)
    browser_wait(browser, 10).until(EC.presence_of_element_located((By.CLASS_NAME, '_1ruz3nW6mOnylv99BOA_tm')))

    # parses studio page
    session_page_source = browser.page_source
    session_soup = soup(session_page_source, "html.parser")

    # get studio name
    try:
        studio = session_soup.find('h2', {'class': 'gamma'}).text
    except (AttributeError, TypeError,) as e:
        pass

    # write studio name
    f.write(studio.replace(',', '|') + "\n")
    print('got studio name')

    # gets link to individual class in the class schedule table
    classses = page_soup.findAll('section', {'class': '_33uV0qMCu2Sfk4M3oTJjVv'})
    for classs in classses:
        classs_link = class_pass_url + classs.a['href']
        browser.get(classs_link)
        browser_wait(browser, 10).until(EC.presence_of_element_located((By.CLASS_NAME, '_1ruz3nW6mOnylv99BOA_tm')))

        # parses individual class page
        classses_page_source = browser.page_source
        classses_soup = soup(classses_page_source, "html.parser")

        try:
            classs_name = session_soup.find('span', {'data-component': 'LocalizableMessage'}).text
        except (AttributeError, TypeError,) as e:
            pass

        # gets class names
        f.write(classs_name.replace(',', '|') + "\n")
        print('got class name')
I'm not quite sure about your goal, since your question and your code aren't explained very well. But from my point of view, I think this is what you're after.
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.options import Options


def Main():
    r = requests.get(
        "https://classpass.com/search/e8-4rb/fitness-classes/58PHLz8oWT9")
    soup = BeautifulSoup(r.text, 'html.parser')
    urls = []
    for item in soup.findAll("a", {'class': '_3Rgmjog5fetGEXICK2gVhh'}):
        item = item.get("href")
        urls.append(f"https://classpass.com{item}")
    return urls


options = Options()
options.add_argument('--headless')


def Second():
    urls = Main()
    studios = []
    links = []
    driver = webdriver.Firefox(options=options)
    for url in urls:
        print(f"Extracting: {url}")
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        studio = soup.find('h2', {'class': 'gamma'}).text
        studios.append(studio)
        for item in soup.findAll("a", {'href': True}):
            item = item.get("href")
            if item.startswith("/classes/"):
                print(item)
                links.append(f"https://www.classpass.com{item}")
    driver.quit()
    return links


def Third():
    links = Second()
    driver = webdriver.Firefox(options=options)
    for link in links:
        driver.get(link)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        try:
            name = soup.find(
                'span', {'data-component': 'LocalizableMessage'}).text
            print(name)
        except:
            pass
    driver.quit()


Third()
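Since your original script wrote the results to ClassPass.csv rather than printing them, here is a small, hedged variation of Third() that collects the class names and writes them out with the csv module instead. It reuses options and Second() from the snippet above; the function name, file name, and column headers are just illustrative.

import csv

def third_to_csv(links, path="ClassPass.csv"):
    # visit each class link, grab its name, then write everything to a CSV file
    driver = webdriver.Firefox(options=options)
    rows = []
    for link in links:
        driver.get(link)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        tag = soup.find('span', {'data-component': 'LocalizableMessage'})
        if tag:
            rows.append([link, tag.text.strip()])
    driver.quit()
    with open(path, "w", newline="", encoding="utf-8") as fh:
        writer = csv.writer(fh)
        writer.writerow(["Link", "Class name"])
        writer.writerows(rows)

third_to_csv(Second())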