I've written a scraper in Python using Scrapy in combination with Selenium to scrape some titles from a website. The CSS selectors defined within my scraper are flawless. I want my scraper to keep clicking the next-page button and parse the information embedded in each page. It does fine for the first page, but when the Selenium part comes into play, the scraper keeps clicking on the same link over and over again.
As this is my first time working with Selenium alongside Scrapy, I have no idea how to move on successfully. Any fix will be highly appreciated.
If I try it like this, it works smoothly (there is nothing wrong with the selectors):
class IncomeTaxSpider(scrapy.Spider):
name = "taxspider"
start_urls = [
'https://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx',
]
def __init__(self):
self.driver = webdriver.Chrome()
self.wait = WebDriverWait(self.driver, 10)
def parse(self,response):
self.driver.get(response.url)
while True:
for elem in self.wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR,"h1.faqsno-heading"))):
name = elem.find_element_by_css_selector("div[id^='arrowex']").text
print(name)
try:
self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "input[id$='_imgbtnNext']"))).click()
self.wait.until(EC.staleness_of(elem))
except TimeoutException:break
But my intention is to make my script run this way:
import scrapy
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
class IncomeTaxSpider(scrapy.Spider):
name = "taxspider"
start_urls = [
'https://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx',
]
def __init__(self):
self.driver = webdriver.Chrome()
self.wait = WebDriverWait(self.driver, 10)
def click_nextpage(self,link):
self.driver.get(link)
elem = self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div[id^='arrowex']")))
#It keeps clicking on the same link over and over again
self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "input[id$='_imgbtnNext']"))).click()
self.wait.until(EC.staleness_of(elem))
def parse(self,response):
while True:
for item in response.css("h1.faqsno-heading"):
name = item.css("div[id^='arrowex']::text").extract_first()
yield {"Name": name}
try:
self.click_nextpage(response.url) #initiate the method to do the clicking
except TimeoutException:break
These are the titles visible on that landing page (to let you know what I'm after):
INDIA INCLUSION FOUNDATION
INDIAN WILDLIFE CONSERVATION TRUST
VATSALYA URBAN AND RURAL DEVELOPMENT TRUST
I'm not really after the data from that site itself, so any alternative approach other than what I've tried above is of no use to me. My only intention is to find a solution related to the way I tried in my second approach.
Your initial code was almost correct, with one key piece missing from it: you were always using the same response object. The response object needs to be rebuilt from the latest page source.
Also, you were loading the link again on every call to click_nextpage, which reset the site to page 1 each time. That is why you only got pages 1 and 2 (at most). You need to load the URL only once, in the parse stage, and then let the next-page clicks happen.
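The essential change, shown in isolation (the full spider follows below): after each Selenium click, rebuild the Scrapy response from the driver's current page source so the CSS selectors run against the new page.

self.click_nextpage(response.url)  # Selenium clicks the Next arrow
response = response.replace(body=self.driver.page_source)  # refresh the response from the live DOM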
Below is the final code, working fine:
class IncomeTaxSpider(scrapy.Spider):
name = "taxspider"
start_urls = [
'https://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx',
]
def __init__(self):
self.driver = webdriver.Chrome()
self.wait = WebDriverWait(self.driver, 10)
def click_nextpage(self,link):
# self.driver.get(link)
elem = self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div[id^='arrowex']")))
#It keeps clicking on the same link over and over again
self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "input[id$='_imgbtnNext']"))).click()
self.wait.until(EC.staleness_of(elem))
def parse(self, response):
self.driver.get(response.url)
while True:
for item in response.css("h1.faqsno-heading"):
name = item.css("div[id^='arrowex']::text").extract_first()
yield {"Name": name}
try:
self.click_nextpage(response.url) #initiate the method to do the clicking
response = response.replace(body=self.driver.page_source)
except TimeoutException:break
After that change it works perfectly.
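If you also want the yielded names written to a file, Scrapy's built-in feed export can do that from the command line (assuming the spider is saved as tax_spider.py):
scrapy runspider tax_spider.py -o names.csv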
In case you need a pure Selenium solution:
driver.get("https://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx")
while True:
for item in wait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div[id^='arrowex']"))):
print(item.text)
try:
driver.find_element_by_xpath("//input[@text='Next' and not(contains(@class, 'disabledImageButton'))]").click()
except NoSuchElementException:
break
import scrapy
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from scrapy.crawler import CrawlerProcess
class IncomeTaxSpider(scrapy.Spider):
name = "taxspider"
start_urls = [
'https://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx',
]
def __init__(self):
self.driver = webdriver.Chrome()
self.wait = WebDriverWait(self.driver, 10)
link = 'https://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx'
self.driver.get(link)
def click_nextpage(self):
elem = self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div[id^='arrowex']")))
#It keeps clicking on the same link over and over again
self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "input[id$='_imgbtnNext']"))).click()
self.wait.until(EC.staleness_of(elem))
time.sleep(4)
def parse(self,response):
while True:
for item in response.css("h1.faqsno-heading"):
name = item.css("div[id^='arrowex']::text").extract_first()
yield {"Name": name}
try:
self.click_nextpage() #initiate the method to do the clicking
except TimeoutException:break
process = CrawlerProcess()
process.crawl(IncomeTaxSpider)
process.start()
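One small addition worth considering for any of these spiders: nothing above ever quits the Chrome instance once the crawl finishes. Scrapy calls a spider's closed() method when it shuts down, so a minimal clean-up hook (my addition, not part of the original answers) could be:

    def closed(self, reason):
        # called by Scrapy when the spider finishes; release the browser
        self.driver.quit()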
Whenever the page is loaded using the 'Next Page' arrow (via Selenium), it gets reset back to page 1. I'm not sure of the reason for this (maybe the JavaScript).
Hence I changed the approach to use the input field to enter the required page number and hit the ENTER key to navigate.
Here is the modified code. I hope it is useful for you.
import scrapy
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
class IncomeTaxSpider(scrapy.Spider):
name = "taxspider"
start_urls = [
'https://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx',
]
def __init__(self):
self.driver = webdriver.Firefox()
self.wait = WebDriverWait(self.driver, 10)
def click_nextpage(self,link, number):
self.driver.get(link)
elem = self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div[id^='arrowex']")))
#It keeps clicking on the same link over and over again
inputElement = self.driver.find_element_by_xpath("//input[@id='ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a_ctl00_txtPageNumber']")
inputElement.clear()
inputElement.send_keys(number)
inputElement.send_keys(Keys.ENTER)
self.wait.until(EC.staleness_of(elem))
def parse(self,response):
number = 1
while number < 10412: #Website shows it has 10411 pages.
for item in response.css("h1.faqsno-heading"):
name = item.css("div[id^='arrowex']::text").extract_first()
yield {"Name": name}
print (name)
try:
number += 1
self.click_nextpage(response.url, number) #initiate the method to do the clicking
except TimeoutException:break
Create a self.page_num attribute, or something similar:
def parse(self,response):
self.pages = self.driver.find_element_by_css_selector("#ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a_ctl00_totalRecordsDiv.act_search_footer span")
self.pages = int(self.pages.text.split('of ')[1].split(']')[0])
self.page_num = 1
while self.page_num <= self.pages:
for item in response.css("h1.faqsno-heading"):
name = item.css("div[id^='arrowex']::text").extract_first()
yield {"Name": name}
try:
self.click_nextpage(response.url) #initiate the method to do the clicking
except TimeoutException:break
def click_nextpage(self,link):
self.driver.get(link)
elem = self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div[id^='arrowex']")))
page_link = 'ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a_ctl00_lnkBtn_' + str(self.page_num)
self.page_num = self.page_num + 1
self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "input[id$='_imgbtnNext']"))).click()
self.wait.until(EC.staleness_of(elem))
I am trying to scrape the name and email of agents from this site. The code first captures the link to every profile on the first page and then visits each profile to get the name and email. But the problem is that it takes a lot of time to get the anchor tag that holds the agent's name. Here's the code:
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
class MessageIndividual(webdriver.Chrome):
def __init__(self, driver_path=r";C:/SeleniumDriver", teardown=False):
self.driver_path = driver_path
self.teardown = teardown
os.environ['PATH'] += self.driver_path
#options = webdriver.ChromeOptions()
#options.headless = True
super(MessageIndividual, self).__init__()
self.implicitly_wait(5)
self.maximize_window()
def __exit__(self, exc_type, exc_val, exc_tb):
if self.teardown:
self.quit()
def goToSite(self):
url = 'https://www.bhhs.com/agent-search-results'
self.get(url)
def getDetails(self):
mylist = [my_elem.get_attribute("href") for my_elem in WebDriverWait(self, 1000).until(
EC.visibility_of_all_elements_located((By.XPATH, "//section[#class='cmp-agent-results-list-view']/div[#class='cmp-agent-results-list-view__content container ']/div[#class='row associate pt-3 pb-3 ']/div[#class='col-6 col-sm-4 col-lg-3 order-lg-3 associate__btn-group']/section[2]/a[#href]")))]
for i in mylist:
self.execute_script("window.open('');")
self.switch_to.window(self.window_handles[1])
self.get(i)
name = WebDriverWait(self,5).until(
EC.presence_of_element_located((By.XPATH,'//h1[@class="cmp-agent__name"]/a[1]'))
)
print(name.text)
email = WebDriverWait(self,1).until(EC.presence_of_element_located((By.CLASS_NAME,'cmp-agent-details__mail')))
print(email.text)
self.close()
self.switch_to.window(self.window_handles[0])
if __name__ == '__main__':
inst = MessageIndividual(teardown=False)
inst.goToSite()
inst.getDetails()
Is there any way I can scrape the names in less time?
I have changed the XPath to identify the anchor tag and removed the new-window open in each iteration. I hope this will reduce some time.
def getDetails(self):
mylist = [my_elem.get_attribute("href") for my_elem in WebDriverWait(self, 1000).until(
EC.visibility_of_all_elements_located((By.XPATH, "//a[.//span[normalize-space(.)='agent details']]")))]
for i in mylist:
#self.execute_script("window.open('');")
#self.switch_to.window(self.window_handles[1])
self.get(i)
name = WebDriverWait(self,5).until(
EC.presence_of_element_located((By.XPATH,'//h1[@class="cmp-agent__name"]/a[1]'))
)
print(name.text)
email = WebDriverWait(self,1).until(EC.presence_of_element_located((By.CLASS_NAME,'cmp-agent-details__mail')))
print(email.text)
#self.close()
#self.switch_to.window(self.window_handles[0])
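If it is still slow, another lever, already hinted at by the commented-out lines in the question's __init__, is to run Chrome headless. A sketch of that change (treat the options handling as an assumption about your Selenium version, not a guaranteed speed-up):

    def __init__(self, driver_path=r";C:/SeleniumDriver", teardown=False):
        self.driver_path = driver_path
        self.teardown = teardown
        os.environ['PATH'] += self.driver_path
        options = webdriver.ChromeOptions()
        options.headless = True  # no visible browser window
        super(MessageIndividual, self).__init__(options=options)
        self.implicitly_wait(5)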
I'm quite new to Python and have written a script using Selenium to scrape a website. I've tried everything but can't get the loop to cycle through pages; it currently just repeats the data from the first page five times. I want to scrape all the pages for 'BR1'. Any help would be great.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
with open('rightmove.csv', 'w') as file:
file.write('PropertyCardcontent \n')
PATH = ("/usr/local/bin/chromedriver")
driver = webdriver.Chrome(PATH)
driver.get("https://www.rightmove.co.uk/house-prices.html")
print(driver.title)
elem = driver.find_element(By.NAME, 'searchLocation') # Find the search box
elem.send_keys('BR1' + Keys.RETURN)
try:
content = WebDriverWait(driver, 15).until(
EC.presence_of_element_located((By.ID,'content'))
)
finally:
time.sleep(3)
for p in range(5):
sold = content.find_elements(By.CLASS_NAME, 'sold-prices-content-wrapper ')
for solds in sold:
address = solds.find_elements(By.CLASS_NAME, 'sold-prices-content ')
for addresses in address:
result = addresses.find_elements(By.CLASS_NAME, 'results ')
for results in result:
card = results.find_elements(By.CLASS_NAME,'propertyCard')
for propertyCard in card:
header = propertyCard.find_elements(By.CLASS_NAME,'propertyCard-content')
for propertyCardcontent in header:
road = propertyCardcontent.find_elements(By.CLASS_NAME,'title')
for propertyCardcontent in header:
road = propertyCardcontent.find_elements(By.CLASS_NAME,'subTitle')
for subtitle in road:
bed = subtitle.find_elements(By.CLASS_NAME, 'propertyType')
with open('rightmove.csv', 'a') as file:
for i in range(len(result)):
file.write(header[i].text + '\n')
button = driver.find_element(By.XPATH, '//*[@id="content"]/div[2]/div[2]/div[4]/div[27]/div[3]/div')
button.click()
file.close()
time.sleep(3)
driver.quit()
Since the website link has the page number in it, I recommend you use "https://www.rightmove.co.uk/house-prices/br1.html?page=1" as the base URL and loop through the pages, changing the last part of the URL with something like an f-string.
One other thing: you don't need to implement all those nested for loops. You can simply assign each variable its specific value, since everything you need is inside one HTML block that is easy to navigate (see the sketch below).
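A minimal sketch of the URL-based pagination idea, assuming driver is already set up and pages holds the page count (the full, updated code follows):

key_word = "br1"
for page_num in range(1, pages + 1):
    # the page number is part of the URL, so no button clicking is needed
    driver.get(f"https://www.rightmove.co.uk/house-prices/{key_word}.html?page={page_num}")
    # ...collect the propertyCard elements for this page here...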
Update:
I'm sorry for being late, I had unexpected stuff (...).
I've made some changes since I use Brave, so make sure you select your own browser (Chrome, I believe); the chromedriver (ver. 102) stays the same (or whichever matches your Chrome version).
I've also got the Price and Date and stored them in a tuple.
Every record is stored as a list: [Title, propertyType, tuple of (Price, Date)].
At the end, it creates a CSV and stores everything in it with ";" as the delimiter.
If you prefer, you can split the price and date for later use; up to you.
Note: this looping method only applies to websites where the page number is included in the URL. In this case, both the search key and the page number are part of the URL.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
import time
import random
import itertools
options = Options()
options.binary_location = r'C:\Program Files\BraveSoftware\Brave-Browser\Application\brave.exe'
driver = webdriver.Chrome(options = options, service = Service("chromedriver.exe"))
key_word = "BR1".lower()
base_url = f"https://www.rightmove.co.uk/house-prices/{key_word}.html?page=1"
driver.get(base_url)
#Number of pages
pages = driver.find_element(By.XPATH, '//span[@class="pagination-label"][2]').text
pages = int(pages.strip('of'))
WebDriverWait(driver, 15).until(
EC.presence_of_element_located((By.CLASS_NAME, 'results '))
)
data = []
pc = 0
for p in range(1,pages+1):
driver.get(f"https://www.rightmove.co.uk/house-prices/{key_word}.html?page={p}")
WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.XPATH, '//div//div[@class="propertyCard"]'))
)
propertyCards = driver.find_elements(By.XPATH, '//div//div[@class="propertyCard"]')
for propertyCard in propertyCards:
title = propertyCard.find_element(By.CLASS_NAME, 'title').text
propertyType = propertyCard.find_element(By.CLASS_NAME, 'propertyType').text
price_list = propertyCard.find_elements(By.CLASS_NAME, 'price')
date_list = propertyCard.find_elements(By.CLASS_NAME, 'date-sold')
data.append([title,propertyType])
for p, d in itertools.zip_longest(price_list, date_list , fillvalue = None):
try:
price = p.text
date = d.text
data[pc].append((price, date))
except Exception as e:
print(e)
pc+=1
time.sleep(random.randint(1,4))
print(data)
with open('rightmove.csv', 'w') as file:
header = "Title;propertyType;Price_Date\n"
file.write(header)
for record in data:
file.write("{};{};{}\n".format(record[0],record[1],record[2:]))
driver.quit()
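If you later want the price and date as separate values, a small follow-up sketch that reads the rightmove.csv produced above (it assumes no field itself contains a semicolon; the Price_Date column is the string repr of a list of tuples, so ast.literal_eval can parse it back):

import ast
import csv

with open('rightmove.csv') as f:
    reader = csv.reader(f, delimiter=';')
    next(reader)  # skip the "Title;propertyType;Price_Date" header row
    for title, prop_type, price_date in reader:
        # price_date looks like "[('£350,000', '12 Jan 2022'), ...]"
        for price, date in ast.literal_eval(price_date):
            print(title, prop_type, price, date)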
You don't have to walk down the DOM element by element; you can just use an XPath or a class name (if it's unique, otherwise an XPath or CSS selector is better) and get the item you are looking for.
Anyway, follow this:
import time
import selenium.webdriver as webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome("/usr/local/bin/chromedriver")
driver.get("https://www.rightmove.co.uk/house-prices.html")
# send query
query = "BR1"
search_bar = driver.find_element(By.XPATH, '//input[@class="searchBox ac_input"]')
search_bar.send_keys(query)
search_bar.send_keys(Keys.ENTER)
# wait to result been loaded
WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.CLASS_NAME, 'propertyCard'))
)
#get amount of pages
pages = driver.find_element(By.XPATH, '//span[@class="pagination-label"][2]').text
pages = int(pages.replace('of ', ''))
data = []
i = 1
while i <= pages:
WebDriverWait(driver, 10).until(
EC.element_to_be_clickable((By.XPATH, '//div[contains(text(), "Next")]'))
).click()
# wait page load result
WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.XPATH, '//div//div[@class="propertyCard"]'))
)
propertyCards = driver.find_elements(By.XPATH, '//div//div[@class="propertyCard"]')
# loop over result and store data
for propertyCard in propertyCards:
title = propertyCard.find_element(By.CLASS_NAME, 'title').text
propertyType = propertyCard.find_element(By.CLASS_NAME, 'propertyType').text
data.append((title, propertyType))
time.sleep(1)
i += 1
print("you reach the last page")
#print the collected data
print(data)
driver.close()
I use a list of tuples because in your example you want to store two items; if you want to store more data you can use a dict and then write it to CSV with csv.DictWriter directly. Enjoy.
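A minimal sketch of that DictWriter variant, reusing propertyCards and By from the loop above (the field names are just illustrative):

import csv

rows = []
for propertyCard in propertyCards:
    rows.append({
        "title": propertyCard.find_element(By.CLASS_NAME, 'title').text,
        "propertyType": propertyCard.find_element(By.CLASS_NAME, 'propertyType').text,
    })

with open('rightmove.csv', 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=["title", "propertyType"])
    writer.writeheader()    # the field names become the header row
    writer.writerows(rows)  # each dict becomes one CSV row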
I was wondering if anyone knows of a way to instruct a Selenium script to avoid visiting/redirecting to an internal page that wasn't part of the code. Essentially, my code opens up this page:
https://cryptwerk.com/companies/?coins=1,6,11,2,3,8,17,7,13,4,25,29,24,32,9,38,15,30,43,42,41,12,40,44,20
It keeps clicking on the 'Show more' button until there is none left (at the end of the page), by which point it should have collected the links of all the products listed on the page it scrolled through, and then it visits each one in turn.
What happens instead is that it successfully clicks 'Show more' until the end of the page, but then it visits a weird promotion page of the same website instead of following each of the gathered links and scraping the further data points located on each of those newly opened pages.
In a nutshell, I would really appreciate it if someone could explain how to avoid this automated redirection. Here is the code, in case someone can nudge me in the right direction :)
from selenium.webdriver import Chrome
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
from selenium.common.exceptions import NoSuchElementException, ElementNotVisibleException
import json
import selenium.common.exceptions as exception
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
webdriver = '/Users/karimnabil/projects/selenium_js/chromedriver-1'
driver = Chrome(webdriver)
driver.implicitly_wait(5)
url = 'https://cryptwerk.com/companies/?coins=1,6,11,2,3,8,17,7,13,4,25,29,24,32,9,38,15,30,43,42,41,12,40,44,20'
driver.get(url)
links_list = []
coins_list = []
all_names = []
all_cryptos = []
all_links = []
all_twitter = []
all_locations = []
all_categories = []
all_categories2 = []
wait = WebDriverWait(driver, 2)
sign_in = driver.find_element_by_xpath("//li[@class='nav-item nav-guest']/a")
sign_in.click()
time.sleep(2)
user_name = wait.until(EC.presence_of_element_located((By.XPATH, "//input[@name='login']")))
user_name.send_keys("karimnsaber95#gmail.com")
password = wait.until(EC.presence_of_element_located((By.XPATH, "//input[@name='password']")))
password.send_keys("PleomaxCW#2")
signIn_Leave = driver.find_element_by_xpath("//div[@class='form-group text-center']/button")
signIn_Leave.click()
time.sleep(3)
while True:
try:
loadMoreButton = driver.find_element_by_xpath("//button[@class='btn btn-outline-primary']")
time.sleep(2)
loadMoreButton.click()
time.sleep(2)
except exception.StaleElementReferenceException:
print('stale element')
break
print('no more elements to show')
try:
company_links = driver.find_elements_by_xpath("//div[@class='companies-list items-infinity']/div[position() > 3]/div[@class='media-body']/div[@class='title']/a")
for link in company_links:
links_list.append(link.get_attribute('href'))
except:
pass
try:
with open("links_list.json", "w") as f:
json.dump(links_list, f)
with open("links_list.json", "r") as f:
links_list = json.load(f)
except:
pass
try:
for link in links_list:
driver.get(link)
name = driver.find_element_by_xpath("//div[@class='title']/h1").text
try:
show_more_coins = driver.find_element_by_xpath("//a[@data-original-title='Show more']")
show_more_coins.click()
time.sleep(1)
except:
pass
try:
categories = driver.find_elements_by_xpath("//div[contains(@class, 'categories-list')]/a")
categories_list = []
for category in categories:
categories_list.append(category.text)
except:
pass
try:
top_page_categories = driver.find_elements_by_xpath("//ol[@class='breadcrumb']/li/a")
top_page_categories_list = []
for category in top_page_categories:
top_page_categories_list.append(category.text)
except:
pass
coins_links = driver.find_elements_by_xpath("//div[contains(@class, 'company-coins')]/a")
all_coins = []
for coin in coins_links:
all_coins.append(coin.get_attribute('href'))
try:
location = driver.find_element_by_xpath("//div[@class='addresses mt-3']/div/div/div/div/a").text
except:
pass
try:
twitter = driver.find_element_by_xpath("//div[@class='links mt-2']/a[2]").get_attribute('href')
except:
pass
try:
print('-----------')
print('Company name is: {}'.format(name))
print('Potential Categories are: {}'.format(categories_list))
print('Potential top page categories are: {}'.format(top_page_categories_list))
print('Supporting Crypto is:{}'.format(all_coins))
print('Registered location is: {}'.format(location))
print('Company twitter profile is: {}'.format(twitter))
time.sleep(1)
except:
pass
all_names.append(name)
all_categories.append(categories_list)
all_categories2.append(top_page_categories_list)
all_cryptos.append(all_coins)
all_twitter.append(twitter)
all_locations.append(location)
except:
pass
df = pd.DataFrame(list(zip(all_names, all_categories, all_categories2, all_cryptos, all_twitter, all_locations)), columns=['Company name', 'Categories1', 'Categories2', 'Supporting Crypto', 'Twitter Handle', 'Registered Location'])
CryptoWerk_Data = df.to_csv('CryptoWerk4.csv', index=False)
Redirects happen for two reasons. In your case, either some JavaScript code runs when you click the 'load more' button for the last time, or an HTTP 3xx code is received, which is the least likely here.
So you need to identify when this JavaScript code is executed, send an ESC key before the redirect loads, and then execute the rest of your script.
You could also scrape the links and append them to your list before clicking the 'load more' button, and each time it is clicked, use an if statement to verify the link of the page you're on: if it is that of the promotion page, execute the rest of your code; otherwise click 'load more'.
page_is_same = True
while page_is_same:
    scrape_elements_add_to_list()
    click_load_more()
    current_link = verify_current_page_link()
    if current_link != link_of_scraped_page:
        page_is_same = False
        # rest of the code here
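A concrete sketch of that second idea, assuming the driver and imports from the question and that the promotion page has a URL different from the listing page:

listing_url = driver.current_url  # the companies page we started on
while True:
    try:
        loadMoreButton = driver.find_element_by_xpath("//button[@class='btn btn-outline-primary']")
        loadMoreButton.click()
        time.sleep(2)
    except NoSuchElementException:
        break  # no 'Show more' button left
    if driver.current_url != listing_url:
        # the click triggered the unwanted redirect; go back and stop loading more
        driver.back()
        break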
I have a small script that fetches company data from a website. The website is regularly updated with new company information. How can I update my CSV with new records on a periodic basis? Also, as you can see in the code, I have used an explicit range for the pages; what other solutions are possible?
The following is the code:
from selenium.webdriver import Firefox
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from time import sleep
import csv
#navigate to the ystory companies page
#start collecting data from ystory
START_URL = 'https://yourstory.com/companies/search?page=1&hitsPerPage=30'
#when the collection populates 30 elements then click on next page
class CompDeetz():
def __init__(self):
self.browser = Firefox()
self.browser.get(START_URL)
sleep(20)
self.browser.find_element_by_xpath('/html/body/div[12]/div/div/button').click()
sleep(5)
self.browser.find_element_by_xpath('/html/body/div[1]/div[4]').click()
self.database = []
def write_row(self,record):
with open('test.csv', 'a') as t:
writer = csv.writer(t)
writer.writerows(record)
def get_everything(self):
all_list = [ (a.text) for a in self.browser.find_elements_by_xpath('//tr[@class="hit"]')]
all_records = []
for company in all_list:
record = company.split('\n')
all_records.append(record)
self.write_row(all_records)
def next_page(self):
self.browser.find_element_by_xpath('//ul[@class="ais-Pagination-list"]/li[7]/a').click()
sleep(20)
def main():
t = CompDeetz()
t.get_everything()
for i in range(33):
t.next_page()
t.get_everything()
if __name__ == "__main__":
main()
Instead of having two different methods, get_everything and next_page, and calling them multiple times, you can have one method get_everything and call it once.
def get_everything(self):
    # also needs: from selenium.webdriver.common.by import By,
    # from selenium.webdriver.support import expected_conditions as EC,
    # from selenium.common.exceptions import TimeoutException
    all_records = []
    nextPage = True
    while nextPage:
        all_list = [(a.text) for a in self.browser.find_elements_by_xpath('//tr[@class="hit"]')]
        for company in all_list:
            record = company.split('\n')
            all_records.append(record)
        try:
            nextPagelink = WebDriverWait(self.browser, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//a[@aria-label='Next page']")))
            self.browser.execute_script("arguments[0].scrollIntoView();", nextPagelink)
            self.browser.execute_script("arguments[0].click();", nextPagelink)
            sleep(5)  # give the next page time to load
        except TimeoutException:
            # on the last page the next-page link is not clickable, so the wait times out
            nextPage = False
    self.write_row(all_records)
Note: take care of the pop-up that appears on the page. I hope you already have a mechanism to handle it.
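If not, a minimal sketch of one way to swallow it when it does not appear (the XPath is the one already used in the question's __init__, so treat it as an assumption about the pop-up's close button):

from selenium.common.exceptions import NoSuchElementException

try:
    # close the pop-up if it is present
    self.browser.find_element_by_xpath('/html/body/div[12]/div/div/button').click()
except NoSuchElementException:
    pass  # no pop-up this time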
I am trying to scrape agent data here. I am able to get the links from the first page. I am using a numbered loop because I know the total number of pages. I tried to run it for as long as the 'next' page option is there; I tried both 'try' and 'if not' but wasn't able to figure it out. Any help is welcome. Here is the code.
from selenium import webdriver
import time
from selenium.common.exceptions import ElementNotVisibleException, NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
options = webdriver.ChromeOptions()
options.add_argument('headless')
driver = webdriver.Chrome('C:/Users/../Downloads/cd79/chromedriver.exe', options=options)
links_total = []
driver.get("https://www.cbp.gov/contact/find-broker-by-port?field_port_location_tid=All&field_port_code_value=")
def first_links():
initial_data = driver.find_elements_by_tag_name('td')
for initial in initial_data:
page_links = initial.find_elements_by_tag_name('a')
for page in page_links:
page_link = page.get_attribute("href")
links_total.append(page_link)
driver.refresh()
if driver.find_element_by_partial_link_text('next'):
next_page = driver.find_element_by_partial_link_text('next')
next_page.click()
time.sleep(2)
new_data = driver.find_elements_by_tag_name('td')
for new in new_data:
links = new.find_elements_by_tag_name('a')
for link in links:
new_link = link.get_attribute("href")
links_total.append(new_link)
for i in range(1, 23):
first_links()
for link in links_total:
print(link)
A try/except would be the better option:
from selenium import webdriver
import time
from selenium.common.exceptions import ElementNotVisibleException, NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
options = webdriver.ChromeOptions()
options.add_argument('headless')
driver = webdriver.Chrome('C:/Users/../Downloads/cd79/chromedriver.exe', options=options)
driver.implicitly_wait(10)
# links_total = []
driver.get("https://www.cbp.gov/contact/find-broker-by-port?field_port_location_tid=All&field_port_code_value=")
def first_links(links_total=[]):
initial_data = driver.find_elements_by_tag_name('td')
for initial in initial_data:
page_links = initial.find_elements_by_tag_name('a')
for page in page_links:
page_link = page.get_attribute("href")
links_total.append(page_link)
# driver.refresh()
try:
next_page = driver.find_element_by_partial_link_text('next')
next_page.click()
time.sleep(2)
first_links(links_total)
except (TimeoutError, ElementNotVisibleException, NoSuchElementException):
print("NEXT btn not found : ")
pass
return links_total
all_links = first_links()
for link in all_links:
print(link)
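As a side note, the mutable default argument (links_total=[]) plus recursion works here, but a plain loop avoids both. A sketch using the same driver and imports as above:

def first_links():
    links_total = []
    while True:
        for initial in driver.find_elements_by_tag_name('td'):
            for page in initial.find_elements_by_tag_name('a'):
                links_total.append(page.get_attribute("href"))
        try:
            driver.find_element_by_partial_link_text('next').click()
            time.sleep(2)
        except NoSuchElementException:
            break  # no 'next' link on the last page
    return links_total

all_links = first_links()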
You don't actually need to use Selenium. You could do it with BeautifulSoup like so:
import requests
from bs4 import BeautifulSoup
page_num=0
url_cbp = r"https://www.cbp.gov/contact/find-broker-by-port?field_port_location_tid=All&field_port_code_value=&page={}"
def get_links(links_total=[], page_num=0):
page = requests.get(url_cbp.format(page_num))
soup = BeautifulSoup(page.content, 'html.parser')
results = soup.find(id='region-content')
table_cells = results.find_all('td', class_='views-field')
for cell in table_cells:
# print(cell )
# print('\n\n')
cell_link = cell.find('a')
page_link = cell_link["href"]
links_total.append(page_link)
next_page = results.find('li', class_='pager-next')
if next_page:
page_num += 1
get_links(links_total, page_num)
return links_total
all_links = get_links()
for link in all_links:
print(link)