How to stop selenium scraper from redirecting to another internal weblink of the scraped website? - python

Was wondering if anyone knows of a way to instruct a Selenium script to avoid visiting/redirecting to an internal page that wasn't part of the code. Essentially, my code opens up this page:
https://cryptwerk.com/companies/?coins=1,6,11,2,3,8,17,7,13,4,25,29,24,32,9,38,15,30,43,42,41,12,40,44,20
keeps clicking the "show more" button until there is none left (at the end of the page), by which point it should have collected the links of all the products listed on the page it scrolled through, and then visit each one respectively.
What happens instead is that it successfully clicks "show more" until the end of the page, but then it visits a weird promotion page of the same website instead of following each of the gathered links and scraping the further data points located on each of those newly opened pages.
In a nutshell, I would incredibly appreciate it if someone can explain how to avoid this automated redirection! And this is the code, in case someone can kindly nudge me in the right direction :)
from selenium.webdriver import Chrome
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
from selenium.common.exceptions import NoSuchElementException, ElementNotVisibleException
import json
import selenium.common.exceptions as exception
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
webdriver = '/Users/karimnabil/projects/selenium_js/chromedriver-1'
driver = Chrome(webdriver)
driver.implicitly_wait(5)
url = 'https://cryptwerk.com/companies/?coins=1,6,11,2,3,8,17,7,13,4,25,29,24,32,9,38,15,30,43,42,41,12,40,44,20'
driver.get(url)
links_list = []
coins_list = []
all_names = []
all_cryptos = []
all_links = []
all_twitter = []
all_locations = []
all_categories = []
all_categories2 = []
wait = WebDriverWait(driver, 2)
sign_in = driver.find_element_by_xpath("//li[@class='nav-item nav-guest']/a")
sign_in.click()
time.sleep(2)
user_name = wait.until(EC.presence_of_element_located((By.XPATH, "//input[@name='login']")))
user_name.send_keys("karimnsaber95@gmail.com")
password = wait.until(EC.presence_of_element_located((By.XPATH, "//input[@name='password']")))
password.send_keys("PleomaxCW@2")
signIn_Leave = driver.find_element_by_xpath("//div[@class='form-group text-center']/button")
signIn_Leave.click()
time.sleep(3)
while True:
    try:
        loadMoreButton = driver.find_element_by_xpath("//button[@class='btn btn-outline-primary']")
        time.sleep(2)
        loadMoreButton.click()
        time.sleep(2)
    except exception.StaleElementReferenceException:
        print('stale element')
        break
print('no more elements to show')
try:
    company_links = driver.find_elements_by_xpath("//div[@class='companies-list items-infinity']/div[position() > 3]/div[@class='media-body']/div[@class='title']/a")
    for link in company_links:
        links_list.append(link.get_attribute('href'))
except:
    pass
try:
    with open("links_list.json", "w") as f:
        json.dump(links_list, f)
    with open("links_list.json", "r") as f:
        links_list = json.load(f)
except:
    pass
try:
    for link in links_list:
        driver.get(link)
        name = driver.find_element_by_xpath("//div[@class='title']/h1").text
        try:
            show_more_coins = driver.find_element_by_xpath("//a[@data-original-title='Show more']")
            show_more_coins.click()
            time.sleep(1)
        except:
            pass
        try:
            categories = driver.find_elements_by_xpath("//div[contains(@class, 'categories-list')]/a")
            categories_list = []
            for category in categories:
                categories_list.append(category.text)
        except:
            pass
        try:
            top_page_categories = driver.find_elements_by_xpath("//ol[@class='breadcrumb']/li/a")
            top_page_categories_list = []
            for category in top_page_categories:
                top_page_categories_list.append(category.text)
        except:
            pass
        coins_links = driver.find_elements_by_xpath("//div[contains(@class, 'company-coins')]/a")
        all_coins = []
        for coin in coins_links:
            all_coins.append(coin.get_attribute('href'))
        try:
            location = driver.find_element_by_xpath("//div[@class='addresses mt-3']/div/div/div/div/a").text
        except:
            pass
        try:
            twitter = driver.find_element_by_xpath("//div[@class='links mt-2']/a[2]").get_attribute('href')
        except:
            pass
        try:
            print('-----------')
            print('Company name is: {}'.format(name))
            print('Potential Categories are: {}'.format(categories_list))
            print('Potential top page categories are: {}'.format(top_page_categories_list))
            print('Supporting Crypto is:{}'.format(all_coins))
            print('Registered location is: {}'.format(location))
            print('Company twitter profile is: {}'.format(twitter))
            time.sleep(1)
        except:
            pass
        all_names.append(name)
        all_categories.append(categories_list)
        all_categories2.append(top_page_categories_list)
        all_cryptos.append(all_coins)
        all_twitter.append(twitter)
        all_locations.append(location)
except:
    pass
df = pd.DataFrame(list(zip(all_names, all_categories, all_categories2, all_cryptos, all_twitter, all_locations)), columns=['Company name', 'Categories1', 'Categories2', 'Supporting Crypto', 'Twitter Handle', 'Registered Location'])
CryptoWerk_Data = df.to_csv('CryptoWerk4.csv', index=False)

Redirects happen for one of two reasons: either some JavaScript runs when you click the load-more button for the last time, or the server returns an HTTP 3xx status code, which is the least likely cause in your case.
So you need to identify when that JavaScript code is executed and send an ESC key before the new page loads, then execute the rest of your script.
You could also scrape the links and append them to your list before clicking the load-more button, and each time it is clicked, add an if statement to verify the URL of the page you are on: if it is the promotion page, execute the rest of your code; otherwise click load more.
while page_is_same:
    scrape_elements_add_to_list()
    click_load_more()
    current_link = verify_current_page_link()
    if current_link != link_of_scraped_page:
        page_is_same = False
        # rest of the code here
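As a rough illustration of that check, the sketch below collects the visible links before each click and compares driver.current_url afterwards, navigating back as soon as the site redirects. It reuses the question's locators and assumes the promotion page lives under a different URL path than /companies/; treat the selectors and the driver path as assumptions, not a tested implementation.
import time
from selenium.webdriver import Chrome

driver = Chrome('/path/to/chromedriver')  # hypothetical driver path
start_url = 'https://cryptwerk.com/companies/?coins=1,6,11,2,3,8,17,7,13,4,25,29,24,32,9,38,15,30,43,42,41,12,40,44,20'
driver.get(start_url)

links_list = []
while True:
    # scrape the links that are already rendered before clicking "show more"
    for a in driver.find_elements_by_xpath(
            "//div[@class='companies-list items-infinity']/div[position() > 3]"
            "/div[@class='media-body']/div[@class='title']/a"):
        links_list.append(a.get_attribute('href'))
    try:
        driver.find_element_by_xpath("//button[@class='btn btn-outline-primary']").click()
    except Exception:
        break  # no "show more" button left
    time.sleep(2)
    # if the click triggered a client-side redirect (e.g. to the promotion page), go back and stop
    if driver.current_url.split('?')[0] != start_url.split('?')[0]:
        driver.back()
        break

links_list = list(dict.fromkeys(links_list))  # de-duplicate while keeping order
for link in links_list:
    driver.get(link)
    # ...scrape each company page as in the original script...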

Related

Want to scrape titles, dates, links, and content from IOL website but can't

I am new to web scraping, and I am trying to scrape the titles, dates, links, and contents of news articles on this website: https://www.iol.co.za/news/south-africa/eastern-cape.
The titles of the articles have different class names and heading (h) tags. I was able to scrape the dates, links, and titles using the h tags. However, when I tried to store them in a pandas dataframe, I received the following error: ValueError: All arrays must be of the same length.
I also wrote code to get the content of each article using the links, and I got an error there as well. I would be thankful for any assistance.
I have tried different options to scrape the titles by creating a list of the different class names, but to no avail.
Please see my code below:
import sys, time
from bs4 import BeautifulSoup
import requests
import pandas as pd
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from datetime import timedelta
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import re
art_title = [] # to store the titles of all news article
art_date = [] # to store the dates of all news article
art_link = [] # to store the links of all news article
pagesToGet = ['south-africa/eastern-cape']
for i in range(0, len(pagesToGet)):
    print('processing page : \n')
    url = 'https://www.iol.co.za' + str(pagesToGet[i])
    print(url)
    driver = webdriver.Chrome(ChromeDriverManager().install())
    driver.maximize_window()
    # time.sleep(5)  # allows you to pause your code before you retrieve the elements from the webpage. Additionally, to
    # prevent the chrome driver opening a new instance for every url, open the browser outside of the loop.
    # an exception might be thrown, so the code should be in a try-except block
    try:
        # use the browser to get the url. This is a suspicious command that might blow up.
        driver.get("https://www.iol.co.za/news/" + str(pagesToGet[i]))
    except Exception as e:  # this describes what to do if an exception is thrown
        error_type, error_obj, error_info = sys.exc_info()  # get the exception information
        print('ERROR FOR LINK:', url)  # print the link that caused the problem
        print(error_type, 'Line:', error_info.tb_lineno)  # print error info and line that threw the exception
        continue  # ignore this page. Abandon this and go back.
    time.sleep(3)  # Allow 3 seconds for the web page to open
    # Code to scroll the screen to the end and click on more news till the 15th page before scraping all the news
    k = 1
    while k <= 2:
        scroll_pause_time = 1  # You can set your own pause time. My laptop is a bit slow so I use 1 sec
        screen_height = driver.execute_script("return window.screen.height;")  # get the screen height of the web
        i = 1
        while True:
            # scroll one screen height each time
            driver.execute_script("window.scrollTo(0, {screen_height}*{i});".format(screen_height=screen_height, i=i))
            i += 1
            time.sleep(scroll_pause_time)
            # update scroll height each time after scrolled, as the scroll height can change after we scrolled the page
            scroll_height = driver.execute_script("return document.body.scrollHeight;")
            # Break the loop when the height we need to scroll to is larger than the total scroll height
            if (screen_height) * i > scroll_height:
                break
        driver.find_element(By.CSS_SELECTOR, '.Articles__MoreFromButton-sc-1mrfc98-0').click()
        k += 1
        time.sleep(1)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    news = soup.find_all('article', attrs={'class': 'sc-ifAKCX'})
    print(len(news))
    # Getting titles, dates, and links
    for j in news:
        # Article title
        title = j.findAll(re.compile('^h[1-6]'))
        for news_title in title:
            art_title.append(news_title.text)
        # Article dates
        dates = j.find('p', attrs={'class': 'sc-cIShpX'})
        if dates is not None:
            date = dates.text
            split_date = date.rsplit('|', 1)[1][10:].rsplit('<', 1)[0]
            art_date.append(split_date)
        # Article links
        address = j.find('a').get('href')
        news_link = 'https://www.iol.co.za' + address
        art_link.append(news_link)

df = pd.DataFrame({'Article_Title': art_title, 'Date': art_date, 'Source': art_link})
# Getting contents
new_articles = ...struggling to write the code
df['Content'] = news_articles
df.to_csv('data.csv')
driver.quit()
I think this is what you are looking for:
# Needed libs
from selenium.webdriver import ActionChains, Keys
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium import webdriver

# Initialize driver and navigate
driver = webdriver.Chrome()
driver.maximize_window()
url = 'https://www.iol.co.za/news/south-africa/eastern-cape'
wait = WebDriverWait(driver, 5)
driver.get(url)
time.sleep(3)

# Take the articles
articles = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//article//*[(name()='h1' or name()='h2' or name()='h3' or name()='h4' or name()='h5' or name()='h6' or name()='h7') and string-length(text()) > 0]/ancestor::article")))

# For every article we take what we want
for article in articles:
    header = article.find_element(By.XPATH, ".//*[name()='h1' or name()='h2' or name()='h3' or name()='h4' or name()='h5' or name()='h6' or name()='h7']")
    print(header.get_attribute('textContent'))
    author_and_date = article.find_elements(By.XPATH, ".//*[name()='h1' or name()='h2' or name()='h3' or name()='h4' or name()='h5' or name()='h6' or name()='h7']/following-sibling::p[1]")
    if author_and_date:
        print(author_and_date[0].get_attribute('textContent'))
    else:
        print("No author found")
    link = article.find_element(By.XPATH, ".//a")
    print(link.get_attribute('href'))
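On the ValueError from the question: it comes from art_title, art_date, and art_link ending up with different lengths. One way to avoid it is to build one record per article so the columns always stay aligned. A minimal sketch continuing from the loop above (the .//p and .//a sub-locators are assumptions about the card markup, not verified selectors):
import pandas as pd

records = []
for article in articles:
    header = article.find_element(By.XPATH, ".//*[name()='h1' or name()='h2' or name()='h3' or name()='h4' or name()='h5' or name()='h6']")
    date_candidates = article.find_elements(By.XPATH, ".//p")  # assumed location of the date text
    link_candidates = article.find_elements(By.XPATH, ".//a")
    records.append({
        'Article_Title': header.get_attribute('textContent'),
        'Date': date_candidates[0].get_attribute('textContent') if date_candidates else None,
        'Source': link_candidates[0].get_attribute('href') if link_candidates else None,
    })

# one row per article, so every column has the same length and no ValueError is raised
df = pd.DataFrame(records)
df.to_csv('data.csv', index=False)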

How to run 'implicitly_wait()' in a 'for loop' with respect to Web Scraping using Python?

Actually, I want to scrape the 'title' and 'product description' of all the products across all the pages, and then save them into a '.csv' file.
URL: https://www.nykaa.com/makeup/body-art/c/3024?page_no=1&sort=popularity&ptype=lst&id=3024&root=nav_2&dir=desc&order=popularity&eq=desktop
This is what I have tried:
from msilib.schema import Error
from os import sep
from tkinter import ON
from turtle import goto
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import numpy as np
from random import randint
import pandas as pd
import requests
import csv
title_list = []
para_list = []
expiry_list = []
country_list = []
importer_list = []
address_list = []
myDict = {'body-art': 3024}
browser = webdriver.Chrome(
r'C:\Users\paart\.wdm\drivers\chromedriver\win32\97.0.4692.71\chromedriver.exe')
browser.maximize_window()
browser.implicitly_wait(20)
for item_name in myDict:
    page_num = 1
    while True:
        try:
            page = f"https://www.nykaa.com/makeup/{item_name}/c/{myDict[item_name]}?page_no={page_num}&sort=popularity&ptype=lst&id={myDict[item_name]}&root=nav_2&dir=desc&order=popularity&eq=desktop"
            print(page)
            requests.get(page)
            soup = BeautifulSoup(requests.get(page).content, 'html.parser')
            urls = [item.get("href")
                    for item in soup.find_all("a", class_="css-qlopj4")]
            # print(urls)
            if len(urls) == 0:
                break
            for i in range(0, 2):  # Since it's a huge amount of data, I have taken only 2 products per page; otherwise it would be range(0, 30) to cover all the products on each individual page.
                try:
                    url = urls[i]
                    browser.get("https://www.nykaa.com" + url)
                    title_data = browser.find_elements(
                        By.CLASS_NAME, 'css-1gc4x7i').text
                    print(title_data)
                    for t in title_data:
                        title_list.append(t)
                    browser.execute_script("document.body.style.zoom='50%'")
                    browser.execute_script("document.body.style.zoom='100%'")
                    # Creates "load more" button object.
                    browser.implicitly_wait(20)
                    loadMore = browser.find_element(
                        By.XPATH, "/html/body/div[1]/div/div[3]/div[1]/div[2]/div/div/div[2]")
                    loadMore.click()
                    browser.implicitly_wait(20)
                    desc_data = browser.find_elements(By.ID, 'content-details')
                    for desc in desc_data:
                        para_details = browser.find_element(By.XPATH,
                            '//*[@id="content-details"]/p[1]').text
                        para_list.append(para_details)
                        expiry = browser.find_element(By.XPATH,
                            '//*[@id="content-details"]/p[2]').text
                        expiry_list.append(expiry)
                        country = browser.find_element(By.XPATH,
                            '//*[@id="content-details"]/p[3]').text
                        country_list.append(country)
                        importer = browser.find_element(By.XPATH,
                            '//*[@id="content-details"]/p[4]').text
                        importer_list.append(importer)
                        address = browser.find_element(By.XPATH,
                            '//*[@id="content-details"]/p[5]').text
                        address_list.append(address)
                except:
                    break
        except:
            break
        page_num += 1
title_list = [i.split('.css', 1)[0] for i in title_list]
print(*title_list, sep="\n")
print(*para_list, sep="\n")
print(*expiry_list, sep="\n")
print(*country_list, sep="\n")
print(*importer_list, sep="\n")
print(*address_list, "\n")
data_new = {"Title": title_list, "Para": para_list, "Expiry": expiry_list,
"Country": country_list, "Importer": importer_list, "Address": address_list}
df = pd.DataFrame(data_new)
df.to_csv("nykaa_makeup_bodyArt_new.csv")
# print(df)
The output I am receiving is:
DevTools listening on ws://127.0.0.1:30887/devtools/browser/a222842a-7ce3-4070-a684-7e8bb8772279
https://www.nykaa.com/makeup/body-art/c/3024?page_no=1&sort=popularity&ptype=lst&id=3024&root=nav_2&dir=desc&order=popularity&eq=desktop
https://www.nykaa.com/makeup/body-art/c/3024?page_no=2&sort=popularity&ptype=lst&id=3024&root=nav_2&dir=desc&order=popularity&eq=desktop
https://www.nykaa.com/makeup/body-art/c/3024?page_no=3&sort=popularity&ptype=lst&id=3024&root=nav_2&dir=desc&order=popularity&eq=desktop
https://www.nykaa.com/makeup/body-art/c/3024?page_no=4&sort=popularity&ptype=lst&id=3024&root=nav_2&dir=desc&order=popularity&eq=desktop
https://www.nykaa.com/makeup/body-art/c/3024?page_no=5&sort=popularity&ptype=lst&id=3024&root=nav_2&dir=desc&order=popularity&eq=desktop
PS E:\Web Scraping - Nykaa>
I think that, due to the implicitly_wait() function, it's not able to fetch the product's title and description. After my code runs, the '.csv' file is created, but it's blank. Maybe I am wrong. Please help me with this. Do I need to add or change some parts of the code?
Thanks 🙏🏻
There is no need to set browser.implicitly_wait multiple times.
browser.implicitly_wait sets the timeout for how long the driver will keep polling the DOM in order to locate an element on the page before it raises an exception.
browser.implicitly_wait is normally set once per driver session.
It is definitely not a pause command like time.sleep.
So, in case you need to put a pause in your code, you should use time.sleep, although this is generally not recommended.
Also, it's much preferable to use Expected Conditions explicit waits rather than browser.implicitly_wait, since browser.implicitly_wait only waits for element presence, i.e. it releases as soon as the element appears in the DOM, even though it may not be completely rendered yet.
In order to wait for an element to be completely rendered and to contain its text, you should use something like
wait.until(EC.visibility_of_element_located((By.XPATH, "/html/body/div[1]/div/div[3]/div[1]/div[2]/div/div/div[2]")))
where "/html/body/div[1]/div/div[3]/div[1]/div[2]/div/div/div[2]" is the XPath of the element you wish to get the text from.
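As a minimal sketch of that pattern, reusing the page URL and the XPath from the question (a shorter id- or class-based locator would be more robust in practice):
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser = webdriver.Chrome()
wait = WebDriverWait(browser, 20)  # one explicit-wait object, reused for every lookup

browser.get("https://www.nykaa.com/makeup/body-art/c/3024?page_no=1&sort=popularity&ptype=lst&id=3024&root=nav_2&dir=desc&order=popularity&eq=desktop")

# blocks until the element is actually visible (rendered), not merely present in the DOM
load_more = wait.until(EC.visibility_of_element_located(
    (By.XPATH, "/html/body/div[1]/div/div[3]/div[1]/div[2]/div/div/div[2]")))
load_more.click()

# the same wait object can then be reused for the description paragraphs
first_para = wait.until(EC.visibility_of_element_located(
    (By.XPATH, '//*[@id="content-details"]/p[1]')))
print(first_para.text)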

How to periodically fetch records from a website using selenium?

I have a small script that fetches company data from a website. This website gets regularly updated with new company information. How can I update my csv with new records on a periodic basis? Also, as you can see in the code, I have used an explicit range for the pages; what other solutions are possible?
The following is the code -
from selenium.webdriver import Firefox
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from time import sleep
import csv

#navigate to the ystory companies page
#start collecting data from ystory
START_URL = 'https://yourstory.com/companies/search?page=1&hitsPerPage=30'
#when the collection populates 30 elements then click on next page

class CompDeetz():
    def __init__(self):
        self.browser = Firefox()
        self.browser.get(START_URL)
        sleep(20)
        self.browser.find_element_by_xpath('/html/body/div[12]/div/div/button').click()
        sleep(5)
        self.browser.find_element_by_xpath('/html/body/div[1]/div[4]').click()
        self.database = []

    def write_row(self, record):
        with open('test.csv', 'a') as t:
            writer = csv.writer(t)
            writer.writerows(record)

    def get_everything(self):
        all_list = [(a.text) for a in self.browser.find_elements_by_xpath('//tr[@class="hit"]')]
        all_records = []
        for company in all_list:
            record = company.split('\n')
            all_records.append(record)
        self.write_row(all_records)

    def next_page(self):
        self.browser.find_element_by_xpath('//ul[@class="ais-Pagination-list"]/li[7]/a').click()
        sleep(20)

def main():
    t = CompDeetz()
    t.get_everything()
    for i in range(33):
        t.next_page()
        t.get_everything()

if __name__ == "__main__":
    main()
Instead of having two different methods, get_everything and next_page, and calling them multiple times, you can have one method get_everything and call it once.
# additional imports needed for this version:
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

def get_everything(self):
    all_records = []
    nextPage = True
    while nextPage:
        all_list = [(a.text) for a in self.browser.find_elements_by_xpath('//tr[@class="hit"]')]
        for company in all_list:
            record = company.split('\n')
            all_records.append(record)
        try:
            nextPagelink = WebDriverWait(self.browser, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//a[@aria-label='Next page']")))
            self.browser.execute_script("arguments[0].scrollIntoView();", nextPagelink)
            self.browser.execute_script("arguments[0].click();", nextPagelink)
            sleep(5)  # for next page to load
        # On the last page the next-page link is not available, so the wait times out
        except TimeoutException:
            nextPage = False
    self.write_row(all_records)
Note: take care of the pop-up that appears on the page. I hope you already have a mechanism to handle it.
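If not, a minimal sketch for dismissing such a pop-up (the close-button XPath is the one from the question's __init__ and is an assumption about the current page layout):
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

def dismiss_popup(browser, timeout=10):
    """Click the pop-up's close button if it shows up; ignore it otherwise."""
    try:
        close_button = WebDriverWait(browser, timeout).until(
            EC.element_to_be_clickable((By.XPATH, '/html/body/div[12]/div/div/button')))
        close_button.click()
    except TimeoutException:
        pass  # no pop-up appeared within the timeout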

how to save data from multiple pages using webdriver into a single csv

So I'm trying to save data from Google Scholar using Selenium (webdriver), and so far I can print the data that I want, but when I save it into a CSV it only saves the first page.
from selenium import webdriver
from selenium.webdriver.common.by import By
# Import statements for explicit wait
from selenium.webdriver.support.ui import WebDriverWait as W
from selenium.webdriver.support import expected_conditions as EC
import time
import csv
from csv import writer
exec_path = r"C:\Users\gvste\Desktop\proyecto\chromedriver.exe"
URL = r"https://scholar.google.com/citations?view_op=view_org&hl=en&authuser=2&org=8337597745079551909"
button_locators = ['//*[@id="gsc_authors_bottom_pag"]/div/button[2]', '//*[@id="gsc_authors_bottom_pag"]/div/button[2]', '//*[@id="gsc_authors_bottom_pag"]/div/button[2]']
wait_time = 3
driver = webdriver.Chrome(executable_path=exec_path)
driver.get(URL)
wait = W(driver, wait_time)
#driver.maximize_window()

for j in range(len(button_locators)):
    button_link = wait.until(EC.element_to_be_clickable((By.XPATH, button_locators[j])))
    address = driver.find_elements_by_class_name("gsc_1usr")
    #for post in address:
    #    print(post.text)
    time.sleep(4)
    with open('post.csv', 'a') as s:
        for i in range(len(address)):
            addresst = address
            #if addresst == 'NONE':
            #    addresst = str(address)
            #else:
            addresst = address[i].text.replace('\n', ',')
            s.write(addresst + '\n')
    button_link.click()
    time.sleep(4)
#driver.quit()
You only get the first page's data because your program stops after it clicks the next-page button. You have to put all of that in a for loop.
Notice I wrote range(7) because I know there are 7 pages to open; in reality we should never do that. Imagine if we had thousands of pages. We should add some logic to check whether the "next page" button exists and loop until it doesn't (see the sketch after the code below).
exec_path = r"C:\Users\gvste\Desktop\proyecto\chromedriver.exe"
URL = r"https://scholar.google.com/citations?view_op=view_org&hl=en&authuser=2&org=8337597745079551909"
button_locators = "/html/body/div/div[8]/div[2]/div/div[12]/div/button[2]"
wait_time = 3
driver = webdriver.Chrome(executable_path=exec_path)
driver.get(URL)
wait = W(driver, wait_time)
time.sleep(4)

# 7 pages. In reality, we should get this number programmatically
for page in range(7):
    # read data from new page
    address = driver.find_elements_by_class_name("gsc_1usr")
    # write to file
    with open('post.csv', 'a') as s:
        for i in range(len(address)):
            addresst = address[i].text.replace('\n', ',')
            s.write(addresst + '\n')
    # find and click next page button
    button_link = wait.until(EC.element_to_be_clickable((By.XPATH, button_locators)))
    button_link.click()
    time.sleep(4)
Also, in the future you should look to change all these time.sleep calls to wait.until, because sometimes your page loads quicker and the program could do its job faster; or, even worse, your network might lag and that would break your script.
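A minimal sketch of that idea, replacing the fixed range(7) and the sleeps with a loop that waits for the result cards and stops when the next-page button is disabled (whether Google Scholar marks the last page by putting a disabled attribute on this button is an assumption):
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get("https://scholar.google.com/citations?view_op=view_org&hl=en&authuser=2&org=8337597745079551909")
wait = WebDriverWait(driver, 10)
next_button_xpath = '//*[@id="gsc_authors_bottom_pag"]/div/button[2]'  # same locator as the question

with open('post.csv', 'a') as s:
    while True:
        # wait for the author cards instead of sleeping a fixed time
        profiles = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, "gsc_1usr")))
        for profile in profiles:
            s.write(profile.text.replace('\n', ',') + '\n')
        next_button = driver.find_element(By.XPATH, next_button_xpath)
        # assumption: the button carries a "disabled" attribute on the last page
        if next_button.get_attribute("disabled"):
            break
        next_button.click()
        # wait until the old results go stale so the next read sees the new page
        wait.until(EC.staleness_of(profiles[0]))

driver.quit()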

In selenium how to find out the exact number of XPATH links with different ids?

With Python 3 and Selenium I want to automate searches on a public information site. On this site it is necessary to enter the name of a person, then select the spelling chosen for that name (with or without accents, or name variations), access a page with the list of lawsuits found, and from that list access the page of each case.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.common.keys import Keys
import time
import re
Name that will be searched
name = 'JOSE ROBERTO ARRUDA'
Create path, search start link, and empty list to store information
firefoxPath="/home/abraji/Documentos/Code/geckodriver"
link = 'https://ww2.stj.jus.br/processo/pesquisa/?aplicacao=processos.ea'
processos = []
Call driver and go to first search page
driver = webdriver.Firefox(executable_path=firefoxPath)
driver.get(link)
Position cursor, fill and click
WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#idParteNome'))).click()
time.sleep(1)
driver.find_element_by_xpath('//*[@id="idParteNome"]').send_keys(name)
time.sleep(6)
WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#idBotaoPesquisarFormularioExtendido'))).click()
Mark all spelling possibilities for searching
WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#idBotaoMarcarTodos'))).click()
WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#idBotaoPesquisarMarcados'))).click()
time.sleep(1)
Check how many pages of data there are - to be used in "for range"
capta = driver.find_element_by_xpath('//*[@id="idDivBlocoPaginacaoTopo"]/div/span/span[2]').text
print(capta)
paginas = int(re.search(r'\d+', capta).group(0))
paginas = int(paginas) + 1
print(paginas)
Capture routine
for acumula in range(1, paginas):
    # Fill the field with the page number and press enter
    driver.find_element_by_xpath('//*[@id="idDivBlocoPaginacaoTopo"]/div/span/span[2]/input').send_keys(acumula)
    driver.find_element_by_xpath('//*[@id="idDivBlocoPaginacaoTopo"]/div/span/span[2]/input').send_keys(Keys.RETURN)
    time.sleep(2)
    # Captures the number of processes found on the current page - qt
    qt = driver.find_element_by_xpath('//*[@id="idDivBlocoMensagem"]/div/b').text
    qt = int(qt) + 2
    print(qt)
    # Iterate from found number of processes
    for item in range(2, qt):
        # Find the XPATH of each process link - start at number 2
        vez = '//*[@id="idBlocoInternoLinhasProcesso"]/div[' + str(item) + ']/span[1]/span[1]/span[1]/span[2]/a'
        print(vez)
        # Access the direct link and click
        element = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, vez)))
        element.click()
        # Run tests to get data
        try:
            num_unico = driver.find_element_by_xpath('//*[@id="idProcessoDetalhesBloco1"]/div[6]/span[2]/a').text
        except NoSuchElementException:
            num_unico = "sem_numero_unico"
        try:
            nome_proc = driver.find_element_by_xpath('//*[@id="idSpanClasseDescricao"]').text
        except NoSuchElementException:
            nome_proc = "sem_nome_encontrado"
        try:
            data_autu = driver.find_element_by_xpath('//*[@id="idProcessoDetalhesBloco1"]/div[5]/span[2]').text
        except NoSuchElementException:
            data_autu = "sem_data_encontrada"
        # Fills dictionary and list
        dicionario = {"num_unico": num_unico,
                      "nome_proc": nome_proc,
                      "data_autu": data_autu
                      }
        processos.append(dicionario)
        # Return a page to click on next process
        driver.execute_script("window.history.go(-1)")
# Close driver
driver.quit()
In this case I captured the number of link pages (3) and the total number of links (84). So my initial idea was to run the "for" three times and, within each run, split the 84 links.
The direct address of each link is in the XPATH (//*[@id="idBlocoInternoLinhasProcesso"]/div[41]/span[1]/span[1]/span[1]/span[2]/a), in which I substitute the "item" value in order to click.
For example, when it arrives at number 42 I get an error, because the first page only goes up to 41.
My problem is how to go to the second page and then restart only the inner "for" loop.
I think the ideal would be to know the exact number of links on each of the three pages.
Anyone have any ideas?
The code below replaces your "Capture routine":
wait = WebDriverWait(driver, 20)
#...
while True:
    links = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//span[contains(@class,'classSpanNumeroRegistro')]")))
    print("links len", len(links))
    for i in range(1, len(links) + 1):
        # Access the direct link and click
        wait.until(EC.element_to_be_clickable((By.XPATH, f"(//span[contains(@class,'classSpanNumeroRegistro')])[{i}]//a"))).click()
        # Run tests to get data
        try:
            num_unico = driver.find_element_by_xpath('//*[@id="idProcessoDetalhesBloco1"]/div[6]/span[2]/a').text
        except NoSuchElementException:
            num_unico = "sem_numero_unico"
        try:
            nome_proc = driver.find_element_by_xpath('//*[@id="idSpanClasseDescricao"]').text
        except NoSuchElementException:
            nome_proc = "sem_nome_encontrado"
        try:
            data_autu = driver.find_element_by_xpath('//*[@id="idProcessoDetalhesBloco1"]/div[5]/span[2]').text
        except NoSuchElementException:
            data_autu = "sem_data_encontrada"
        # Fills dictionary and list
        dicionario = {"num_unico": num_unico,
                      "nome_proc": nome_proc,
                      "data_autu": data_autu
                      }
        processos.append(dicionario)
        # Return a page to click on next process
        driver.execute_script("window.history.go(-1)")
    # wait.until(EC.presence_of_element_located((By.CLASS_NAME, "classSpanPaginacaoImagensDireita")))
    next_page = driver.find_elements_by_css_selector(".classSpanPaginacaoProximaPagina")
    if len(next_page) == 0:
        break
    next_page[0].click()
You can try running the loop for as long as the next-page button is present on the screen. The logic will look like this:
try:
    next_page = driver.find_element_by_class_name('classSpanPaginacaoProximaPagina')
    if next_page.is_displayed():
        next_page.click()
except NoSuchElementException:
    print('next page does not exist')
