scraping with selenium cant click on clickable text

scraping with selenium cant click on clickable text - python

I am trying to scrape some data from yahoo finance, for each stock, I want to get the historical data. Taking the Apple stock. I should go to https://finance.yahoo.com/quote/AAPL/history?p=AAPL and choose "MAX" from "Time Period". so
I believe the script I wrote so far is getting the date element, but somehow clicking on it to be able to choose "MAX" is not working.
here is my whole script:
# using linux here
project_path = os.getcwd()
driver_path = project_path + "/" + "chromedriver"
yahoo_finance = "https://finance.yahoo.com/quote/"
driver = webdriver.Chrome(driver_path)
def get_data(symbol='AAPL'):
stock_history_link = yahoo_finance + symbol + '/history?p=' + symbol
driver.get(stock_history_link)
date_picker = '//div[contains(#class, "D(ib)") and contains(#class, "Pos(r)") and contains(#class, "Cur(p)")' \
'and contains(#class, "O(n):f")]'
try:
print("I am inside")
date_picker_2 = "//div[#class='Pos(r) D(ib) O(n):f Cur(p)']"
date_picker_element = driver.find_element_by_xpath(date_picker_2)
print("date_picker_element: ", date_picker_element)
date_picker_element.click()
try:
print("I will be waiting for the date")
my_dropdown = WebDriverWait(driver, 100).until(
EC.presence_of_element_located((By.ID, 'dropdown-menu'))
)
print(my_dropdown)
print("I am not waiting anymore")
except TimeoutException as e:
print("wait timed out")
print(e)
except WebDriverException:
print("Something went wrong while trying to pick the max date")
if __name__ == '__main__':
try:
get_data()
except:
pass
# finally:
# driver.quit()

To click the button with Max just open it up and target it.
driver.get("https://finance.yahoo.com/quote/AAPL/history?p=AAPL")
wait = WebDriverWait(driver, 10)
wait.until(EC.element_to_be_clickable((By.XPATH, "//span[#class='C($linkColor) Fz(14px)']"))).click()
wait.until(EC.element_to_be_clickable((By.XPATH, "//button[#data-value='MAX']"))).click()
Element:
<button class="Py(5px) W(45px) Fz(s) C($tertiaryColor) Cur(p) Bd Bdc($seperatorColor) Bgc($lv4BgColor) Bdc($linkColor):h Bdrs(3px)" data-value="MAX"><span>Max</span></button>
Imports:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

You have the wrong xpath for the date_picker_2:
date_picker_2 = '//*[#id="Col1-1-HistoricalDataTable-Proxy"]/section/div[1]/div[1]/div[1]/div/div/div/span'
Using requests:
import requests
import datetime
end = int(datetime.datetime.strptime(datetime.date.today().isoformat(), "%Y-%m-%d").timestamp())
url = f"https://finance.yahoo.com/quote/AAPL/history?period1=345427200&period2={end}&interval=1d&filter=history&frequency=1d&includeAdjustedClose=true"
requests.get(url)
Gets you to the same end page.

Related

Webscraping Multiple Pages in Python with Selenium - loop not working

I'm quite new to python and have written a script using selenium to scrape a website. I've tried everything but can't get the loop to cycle through pages. It currently just repeats the data on the first page 5 times. I want to scrape all the pages for 'BR1' any help would be great, currently the script below only scrapes the first page 5 times.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
with open('rightmove.csv', 'w') as file:
file.write('PropertyCardcontent \n')
PATH = ("/usr/local/bin/chromedriver")
driver = webdriver.Chrome(PATH)
driver.get("https://www.rightmove.co.uk/house-prices.html")
print(driver.title)
elem = driver.find_element(By.NAME, 'searchLocation') # Find the search box
elem.send_keys('BR1' + Keys.RETURN)
try:
content = WebDriverWait(driver, 15).until(
EC.presence_of_element_located((By.ID,'content'))
)
finally:
time.sleep(3)
for p in range(5):
sold = content.find_elements(By.CLASS_NAME, 'sold-prices-content-wrapper ')
for solds in sold:
address = solds.find_elements(By.CLASS_NAME, 'sold-prices-content ')
for addresses in address:
result = addresses.find_elements(By.CLASS_NAME, 'results ')
for results in result:
card = results.find_elements(By.CLASS_NAME,'propertyCard')
for propertyCard in card:
header = propertyCard.find_elements(By.CLASS_NAME,'propertyCard-content')
for propertyCardcontent in header:
road = propertyCardcontent.find_elements(By.CLASS_NAME,'title')
for propertyCardcontent in header:
road = propertyCardcontent.find_elements(By.CLASS_NAME,'subTitle')
for subtitle in road:
bed = subtitle.find_elements(By.CLASS_NAME, 'propertyType')
with open('rightmove.csv', 'a') as file:
for i in range(len(result)):
file.write(header[i].text + '\n')
button = driver.find_element(By.XPATH, '//*[#id="content"]/div[2]/div[2]/div[4]/div[27]/div[3]/div')
button.click()
file.close()
time.sleep(3)
driver.quit()

Since the website link has page number on it, I recommend you put the base url as "https://www.rightmove.co.uk/house-prices/br1.html?page=1", and loop through the pages while changing the last index of the url with methods like format string.
One other thing, you don't need to implement all those for loops, you can simply assign each variable to its specific value since everything you need is inside an html block which is easy to navigate on it.
Update:
I'm sorry for being late, had unexpected stuff(...).
I've made some changes as I use Brave, so make sure you select your browser, Chrome I believe, the chromedriver(ver:102) stays the same (or depending your Chrome version).
I've also got the Price and Date and stored them in a tuple.
Every record is stored in a list[Title, propertyType, tupleof(Price_Date)]
At the end, it creates a csv and stores everything inside with a ";" as delimter.
You can if you prefer split the price and date for later use, up to you.
Note: This looping method only applies to websites in which the number of page is included within the URL. In this case, both the key and number of page is included in the URL.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
import time
import random
import itertools
options = Options()
options.binary_location = r'C:\Program Files\BraveSoftware\Brave-Browser\Application\brave.exe'
driver = webdriver.Chrome(options = options, service = Service("chromedriver.exe"))
key_word = "BR1".lower()
base_url = f"https://www.rightmove.co.uk/house-prices/{key_word}.html?page=1"
driver.get(base_url)
#Number of pages
pages = driver.find_element(By.XPATH, '//span[#class="pagination-label"][2]').text
pages = int(pages.strip('of'))
WebDriverWait(driver, 15).until(
EC.presence_of_element_located((By.CLASS_NAME, 'results '))
)
data = []
pc = 0
for p in range(1,pages+1):
driver.get(f"https://www.rightmove.co.uk/house-prices/{key_word}.html?page={p}")
WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.XPATH, '//div//div[#class="propertyCard"]'))
)
propertyCards = driver.find_elements(By.XPATH, '//div//div[#class="propertyCard"]')
for propertyCard in propertyCards:
title = propertyCard.find_element(By.CLASS_NAME, 'title').text
propertyType = propertyCard.find_element(By.CLASS_NAME, 'propertyType').text
price_list = propertyCard.find_elements(By.CLASS_NAME, 'price')
date_list = propertyCard.find_elements(By.CLASS_NAME, 'date-sold')
data.append([title,propertyType])
for p, d in itertools.zip_longest(price_list, date_list , fillvalue = None):
try:
price = p.text
date = d.text
data[pc].append((price, date))
except Exception as e:
print(e)
pc+=1
time.sleep(random.randint(1,4))
print(data)
with open('rightmove.csv', 'w') as file:
header = "Title;propertyType;Price_Date\n"
file.write(header)
for record in data:
file.write("{};{};{}\n".format(record[0],record[1],record[2:]))
driver.quit()

You don't have to go down to dom elem by elem, you can just use xpath or class_name (if it's unique, otherwise it's better xpath or css-selector) and get the item you are looking for.
Anyway follow this:
import time
import selenium.webdriver as webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome("/usr/local/bin/chromedriver")
driver.get("https://www.rightmove.co.uk/house-prices.html")
# send query
query = "BR1"
search_bar = driver.find_element(By.XPATH, '//input[#class="searchBox ac_input"]')
search_bar.send_keys(query)
search_bar.send_keys(Keys.ENTER)
# wait to result been loaded
WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.CLASS_NAME, 'propertyCard'))
)
#get amount of pages
pages = driver.find_element(By.XPATH, '//span[#class="pagination-label"][2]').text
pages = int(pages.replace('of ', ''))
data = []
i = 1
while i <= pages:
WebDriverWait(driver, 10).until(
EC.element_to_be_clickable((By.XPATH, '//div[contains(text(), "Next")]'))
).click()
# wait page load result
WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.XPATH, '//div//div[#class="propertyCard"]'))
)
propertyCards = driver.find_elements(By.XPATH, '//div//div[#class="propertyCard"]')
# loop over result and store data
for propertyCard in propertyCards:
title = propertyCard.find_element(By.CLASS_NAME, 'title').text
propertyType = propertyCard.find_element(By.CLASS_NAME, 'propertyType').text
data.append((title, propertyType))
time.sleep(1)
i += 1
print("you reach the last page")
#get number of results
printf(data)
driver.close()
I use a list of tuple cause in your example you want store 2 item, if you want store more data you can use a dict and then convert into csv with Dictwriter directly. Enjoy.

How do I press load more button while scraping comments on Instagram with Selenium Python

I'm working on a project that can scrape comments off posts on instagram and write them into an excel file.
Here's my code:
from selenium.webdriver.common.by import By
from selenium import webdriver
import time
import sys
import pandas as pd
from pandas import ExcelWriter
import os.path
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
url = [
"https://www.instagram.com/p/CcVTqRtJ2gj/",
"https://www.instagram.com/p/CcXpLHepve-/",
]
user_names = []
user_comments = []
driver = driver = webdriver.Chrome("C:\chromedriver.exe")
driver.get(url[0])
time.sleep(3)
username = WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='username']")))
password = WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.CSS_SELECTOR,"input[name='password']")))
username.clear()
username.send_keys("username")
password.clear()
password.send_keys("pwd")
Login_button = (
WebDriverWait(driver, 2)
.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button[type='submit']")))
.click()
)
time.sleep(4)
not_now = (
WebDriverWait(driver, 30)
.until(
EC.element_to_be_clickable((By.XPATH, '//button[contains(text(), "Not Now")]'))
)
.click()
)
for n in url:
try:
driver.get(n)
time.sleep(3)
load_more_comment = driver.find_element_by_xpath("//button[class='wpO6b ']")
print("Found {}".format(str(load_more_comment)))
i = 0
while load_more_comment.is_displayed() and i < 10:
load_more_comment.click()
time.sleep(1.5)
load_more_comment = driver.find_element_by_xpath(
"//button[class='wpO6b ']"
)
print("Found {}".format(str(load_more_comment)))
i += 1
user_names.pop(0)
user_comments.pop(0)
except Exception as e:
print(e)
pass
comment = driver.find_elements_by_class_name("gElp9 ")
for c in comment:
container = c.find_element_by_class_name("C4VMK")
name = container.find_element_by_class_name("_6lAjh ").text
content = container.find_element_by_class_name("MOdxS ").text
content = content.replace("\n", " ").strip().rstrip()
user_names.append(name)
user_comments.append(content)
print(content)
user_names.pop(0)
user_comments.pop(0)
# export(user_names, user_comments)
driver.close()
df = pd.DataFrame(list(zip(user_names, user_comments)), columns=["Name", "Comments"])
# df.to_excel("Anime Content Engagement.xlsx")
print(df)
And the load-more-comments part, doesn't seem to work.
Since there are more than one buttons with the same class name, I"m not able to choose the right button to click on. And I'm a beginner so if there's anyone with any solution to how I can solve this it would be great.

you can select by aria-label text:
driver.find_element_by_css_selector("svg._8-yf5[aria-label='TEXT']")
i believe the text inside changes according to instagram language, put it according to what appears on your

How to stop selenium scraper from redirecting to another internal weblink of the scraped website?

Was wondering if anyone knows of a way for instructing a selenium script to avoid visiting/redirecting to an internal page that wasn't part of the code. Essentially, my code opens up this page:
https://cryptwerk.com/companies/?coins=1,6,11,2,3,8,17,7,13,4,25,29,24,32,9,38,15,30,43,42,41,12,40,44,20
keeps clicking on show more button until there's none (at end of page) - which by then - it should have collected the links of all the products listed on the page it scrolled through till the end, then visit each one respectively.
What happens instead, it successfully clicks on show more till the end of the page, but then it visits this weird promotion page of the same website instead of following each of the gathered links respectively and then scraping further data points located off each of those newly opened ones.
In a nutshell, would incredibly appreciate it if someone can explain how to avoid this automated redirection on its own! And this is the code in case someone can gratefully nudge me in the right direction :)
from selenium.webdriver import Chrome
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
from selenium.common.exceptions import NoSuchElementException, ElementNotVisibleException
import json
import selenium.common.exceptions as exception
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
webdriver = '/Users/karimnabil/projects/selenium_js/chromedriver-1'
driver = Chrome(webdriver)
driver.implicitly_wait(5)
url = 'https://cryptwerk.com/companies/?coins=1,6,11,2,3,8,17,7,13,4,25,29,24,32,9,38,15,30,43,42,41,12,40,44,20'
driver.get(url)
links_list = []
coins_list = []
all_names = []
all_cryptos = []
all_links = []
all_twitter = []
all_locations = []
all_categories = []
all_categories2 = []
wait = WebDriverWait(driver, 2)
sign_in = driver.find_element_by_xpath("//li[#class='nav-item nav-guest']/a")
sign_in.click()
time.sleep(2)
user_name = wait.until(EC.presence_of_element_located((By.XPATH, "//input[#name='login']")))
user_name.send_keys("karimnsaber95#gmail.com")
password = wait.until(EC.presence_of_element_located((By.XPATH, "//input[#name='password']")))
password.send_keys("PleomaxCW#2")
signIn_Leave = driver.find_element_by_xpath("//div[#class='form-group text-center']/button")
signIn_Leave.click()
time.sleep(3)
while True:
try:
loadMoreButton = driver.find_element_by_xpath("//button[#class='btn btn-outline-primary']")
time.sleep(2)
loadMoreButton.click()
time.sleep(2)
except exception.StaleElementReferenceException:
print('stale element')
break
print('no more elements to show')
try:
company_links = driver.find_elements_by_xpath("//div[#class='companies-list items-infinity']/div[position() > 3]/div[#class='media-body']/div[#class='title']/a")
for link in company_links:
links_list.append(link.get_attribute('href'))
except:
pass
try:
with open("links_list.json", "w") as f:
json.dump(links_list, f)
with open("links_list.json", "r") as f:
links_list = json.load(f)
except:
pass
try:
for link in links_list:
driver.get(link)
name = driver.find_element_by_xpath("//div[#class='title']/h1").text
try:
show_more_coins = driver.find_element_by_xpath("//a[#data-original-title='Show more']")
show_more_coins.click()
time.sleep(1)
except:
pass
try:
categories = driver.find_elements_by_xpath("//div[contains(#class, 'categories-list')]/a")
categories_list = []
for category in categories:
categories_list.append(category.text)
except:
pass
try:
top_page_categories = driver.find_elements_by_xpath("//ol[#class='breadcrumb']/li/a")
top_page_categories_list = []
for category in top_page_categories:
top_page_categories_list.append(category.text)
except:
pass
coins_links = driver.find_elements_by_xpath("//div[contains(#class, 'company-coins')]/a")
all_coins = []
for coin in coins_links:
all_coins.append(coin.get_attribute('href'))
try:
location = driver.find_element_by_xpath("//div[#class='addresses mt-3']/div/div/div/div/a").text
except:
pass
try:
twitter = driver.find_element_by_xpath("//div[#class='links mt-2']/a[2]").get_attribute('href')
except:
pass
try:
print('-----------')
print('Company name is: {}'.format(name))
print('Potential Categories are: {}'.format(categories_list))
print('Potential top page categories are: {}'.format(top_page_categories_list))
print('Supporting Crypto is:{}'.format(all_coins))
print('Registered location is: {}'.format(location))
print('Company twitter profile is: {}'.format(twitter))
time.sleep(1)
except:
pass
all_names.append(name)
all_categories.append(categories_list)
all_categories2.append(top_page_categories_list)
all_cryptos.append(all_coins)
all_twitter.append(twitter)
all_locations.append(location)
except:
pass
df = pd.DataFrame(list(zip(all_names, all_categories, all_categories2, all_cryptos, all_twitter, all_locations)), columns=['Company name', 'Categories1', 'Categories2', 'Supporting Crypto', 'Twitter Handle', 'Registered Location'])
CryptoWerk_Data = df.to_csv('CryptoWerk4.csv', index=False)

Redirect calls happen for two reasons, in your case either by executing some javascript code when clicking the last time on the load more button or by receiving an HTTP 3xx code, which is the least likely in your case.
So you need to identify when this javascript code is executed and send an ESC_KEY before it loads and then executing the rest of your script.
You could also scrape the links and append them to your list before clicking the load more button and each time it is clicked, make an if statement the verify the link of the page you're in, if it is that of the promotion page then execute the rest of your code, else click load more.
while page_is_same:
scrape_elements_add_to_list()
click_load_more()
verify_current_page_link()
if current_link_is_same != link_of_scraped_page:
page_is_same = False
# rest of the code here

In selenium how to find out the exact number of XPATH links with different ids?

With Python3 and selenium I want to automate the search on a public information site. In this site it is necessary to enter the name of a person, then select the spelling chosen for that name (without or with accents or name variations), access a page with the list of lawsuits found and in this list you can access the page of each case.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.common.keys import Keys
import time
import re
Name that will be searched
name = 'JOSE ROBERTO ARRUDA'
Create path, search start link, and empty list to store information
firefoxPath="/home/abraji/Documentos/Code/geckodriver"
link = 'https://ww2.stj.jus.br/processo/pesquisa/?aplicacao=processos.ea'
processos = []
Call driver and go to first search page
driver = webdriver.Firefox(executable_path=firefoxPath)
driver.get(link)
Position cursor, fill and click
WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#idParteNome'))).click()
time.sleep(1)
driver.find_element_by_xpath('//*[#id="idParteNome"]').send_keys(name)
time.sleep(6)
WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#idBotaoPesquisarFormularioExtendido'))).click()
Mark all spelling possibilities for searching
WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#idBotaoMarcarTodos'))).click()
WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#idBotaoPesquisarMarcados'))).click()
time.sleep(1)
Check how many pages of data there are - to be used in "for range"
capta = driver.find_element_by_xpath('//*[#id="idDivBlocoPaginacaoTopo"]/div/span/span[2]').text
print(capta)
paginas = int(re.search(r'\d+', capta).group(0))
paginas = int(paginas) + 1
print(paginas)
Capture routine
for acumula in range(1, paginas):
# Fill the field with the page number and press enter
driver.find_element_by_xpath('//*[#id="idDivBlocoPaginacaoTopo"]/div/span/span[2]/input').send_keys(acumula)
driver.find_element_by_xpath('//*[#id="idDivBlocoPaginacaoTopo"]/div/span/span[2]/input').send_keys(Keys.RETURN)
time.sleep(2)
# Captures the number of processes found on the current page - qt
qt = driver.find_element_by_xpath('//*[#id="idDivBlocoMensagem"]/div/b').text
qt = int(qt) + 2
print(qt)
# Iterate from found number of processes
for item in range(2, qt):
# Find the XPATH of each process link - start at number 2
vez = '//*[#id="idBlocoInternoLinhasProcesso"]/div[' + str(item) + ']/span[1]/span[1]/span[1]/span[2]/a'
print(vez)
# Access the direct link and click
element = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, vez)))
element.click()
# Run tests to get data
try:
num_unico = driver.find_element_by_xpath('//*[#id="idProcessoDetalhesBloco1"]/div[6]/span[2]/a').text
except NoSuchElementException:
num_unico = "sem_numero_unico"
try:
nome_proc = driver.find_element_by_xpath('//*[#id="idSpanClasseDescricao"]').text
except NoSuchElementException:
nome_proc = "sem_nome_encontrado"
try:
data_autu = driver.find_element_by_xpath('//*[#id="idProcessoDetalhesBloco1"]/div[5]/span[2]').text
except NoSuchElementException:
data_autu = "sem_data_encontrada"
# Fills dictionary and list
dicionario = {"num_unico": num_unico,
"nome_proc": nome_proc,
"data_autu": data_autu
}
processos.append(dicionario)
# Return a page to click on next process
driver.execute_script("window.history.go(-1)")
# Close driver
driver.quit()
In this case I captured the number of link pages (3) and the total number of links (84). So my initial idea was to do the "for" three times and within them split the 84 links
The direct address of each link is in XPATH (//*[#id="idBlocoInternoLinhasProcesso"]/div[41]/span[1]/span[1]/span[1]/span[2]/a) which I replace with the "item" to click
For example, when it arrives at number 42 I have an error because the first page only goes up to 41
My problem is how to go to the second page and then restart only "for" secondary
I think the ideal would be to know the exact number of links on each of the three pages
Anyone have any ideas?

Code below is "Capture routine":
wait = WebDriverWait(driver, 20)
#...
while True:
links = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//span[contains(#class,'classSpanNumeroRegistro')]")))
print("links len", len(links))
for i in range(1, len(links) + 1):
# Access the direct link and click
.until(EC.element_to_be_clickable((By.XPATH, f"(//span[contains(#class,'classSpanNumeroRegistro')])[{i}]//a"))).click()
# Run tests to get data
try:
num_unico = driver.find_element_by_xpath('//*[#id="idProcessoDetalhesBloco1"]/div[6]/span[2]/a').text
except NoSuchElementException:
num_unico = "sem_numero_unico"
try:
nome_proc = driver.find_element_by_xpath('//*[#id="idSpanClasseDescricao"]').text
except NoSuchElementException:
nome_proc = "sem_nome_encontrado"
try:
data_autu = driver.find_element_by_xpath('//*[#id="idProcessoDetalhesBloco1"]/div[5]/span[2]').text
except NoSuchElementException:
data_autu = "sem_data_encontrada"
# Fills dictionary and list
dicionario = {"num_unico": num_unico,
"nome_proc": nome_proc,
"data_autu": data_autu
}
processos.append(dicionario)
# Return a page to click on next process
driver.execute_script("window.history.go(-1)")
# wait.until(EC.presence_of_element_located((By.CLASS_NAME, "classSpanPaginacaoImagensDireita")))
next_page = driver.find_elements_by_css_selector(".classSpanPaginacaoProximaPagina")
if len(next_page) == 0:
break
next_page[0].click()

You can try run the loop until next button is present on the screen. the logic will look like this,
try:
next_page = driver.find_element_by_class_name('classSpanPaginacaoProximaPagina')
if(next_page.is_displayed()):
next_page.click()
except NoSuchElementException:
print('next page does not exists')

Why ActionChains(driver).move_to_element(elem).click().perform() twice

I try to crawl the wechat public accounts includes the key word through "http://weixin.sogou.com/"
But i find i must use twice ActionChains(driver).move_to_element(nextpage).click().perform()，it can still work,and go to the next page !
who can tell me why and how to fix ! Thank you!
The source code are as follow , and sorry the comments are in the Chinese .
# coding=utf-8
import time
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
key = u"江南大学" #搜索的关键词
driver = webdriver.Chrome()
driver.get("http://weixin.sogou.com/")
assert u'搜狗微信' in driver.title
elem = driver.find_element_by_id("upquery")
elem.clear()
elem.send_keys(key)
button = driver.find_element_by_class_name("swz2") #搜索公众号
button.click()
WebDriverWait(driver,10).until(
EC.title_contains(key)
)
count = 0
while True:
for i in range(10):
try:
wechat_name = driver.find_element_by_xpath("//*[#id=\"sogou_vr_11002301_box_{}\"]/div[2]/h3".format(i)).text
print wechat_name
wechat_id = driver.find_element_by_xpath("//*[#id=\"sogou_vr_11002301_box_{}\"]/div[2]/h4/span/label".format(i)).text
print wechat_id
wechat_intro = driver.find_element_by_xpath("//*[#id=\"sogou_vr_11002301_box_{}\"]/div[2]/p[1]/span[2]".format(i)).text
print wechat_intro
print "*************************"
count += 1
except:
pass
try:
nextpage = driver.find_element_by_xpath("//*[#id=\"sogou_next\"]") #下一页的按钮
actions = ActionChains(driver)
actions.move_to_element(nextpage)
actions.click().
actions.perform()
actions = ActionChains(driver)
actions.move_to_element(nextpage)
actions.click().
actions.perform()
except Exception,e:
print e
break
driver.quit()
print count

You can chain your action, so no need to do perform after each action.
actions = ActionChains(driver)
actions.move_to_element(nextpage)
actions.click(nextpage)
actions.perform()
OR
actions = ActionChains(driver)
actions.move_to_element(nextpage)
actions.click(nextpage).perform()

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

scraping with selenium cant click on clickable text - python

Related

Webscraping Multiple Pages in Python with Selenium - loop not working

How do I press load more button while scraping comments on Instagram with Selenium Python

How to stop selenium scraper from redirecting to another internal weblink of the scraped website?

In selenium how to find out the exact number of XPATH links with different ids?

Why ActionChains(driver).move_to_element(elem).click().perform() twice

Categories

Resources