Find XPath Paragraph (following-sibling?) - python

I'm using Selenium with Python 2.7.10 and would like to grab the paragraph following the "Description" header on this page: http://etfdb.com/etf/ROBO/
from selenium import webdriver as driver
from selenium.common.exceptions import NoSuchElementException
def scrape(driver, key):
try:
find_value = driver.find_element_by_xpath("//span[#class='panel__sub-heading' and . = '%s']/following-sibling::p" % key).text
except NoSuchElementException:
print "Not Found"
return None
else:
value = re.search(r"(.+)", find_value).group().encode("utf-8")
print value
return value
description = scrape(driver, "Description")
The XPath I'm using is incorrect because it yields no result. What would be the correct way to find the paragraph following the header "Description"?

This is not a span tag - it's h3:
//h3[#class='panel__sub-heading' and . = '%s']/following-sibling::p

Related

Unable to print foodpanda product links using selenium python

As i want to extract link from a href tag but it no print any result from https://www.foodpanda.pk/restaurants/new?lat=24.9414896&lng=67.1676002&vertical=restaurants
from selenium import webdriver
driver = webdriver.Chrome('F:/chromedriver')
driver.get("https://www.foodpanda.pk/restaurants/new?lat=24.9414896&lng=67.1676002&vertical=restaurants")
# response = scrapy.Selector(text=driver.page_source)
list = driver.find_elements_by_css_selector("ul.vendor-list li")
length = len(driver.find_elements_by_css_selector("ul.vendor-list li"))
for i in range(length):
try:
name = driver.find_elements_by_css_selector(".headline .name")[i].text
time = driver.find_elements_by_css_selector(".badge-info")[i].text.strip()
rating = driver.find_elements_by_css_selector(".rating")[i].text
dealtag = driver.find_elements_by_css_selector(".multi-tag")[i].text
link = driver.find_elements_by_css_selector(".vendor [href]")[i].text
print(name,link,time,rating,dealtag)
except:
pass
Please read the code, This code is working fine in my computer.
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 30)
driver.get('https://www.foodpanda.pk/restaurants/new?lat=24.9414896&lng=67.1676002&vertical=restaurants')
Vendor_list = driver.find_elements_by_xpath("//figure[#class=\"vendor-tile item\"]/ancestor::li")
for vendor in Vendor_list:
print("-------------------")
print("Restaurant Name :- " + vendor.find_element_by_xpath(".//span[#class=\"name fn\"]").text)
print("Badge :- " + vendor.find_element_by_xpath(".//span[#class=\"badge-info\"]").text[:2] +
vendor.find_element_by_xpath(".//span[#class=\"badge-info\"]/span").text)
try:
print("Rating :- " + vendor.find_element_by_xpath(".//span[#class=\"rating\"]").text)
except:
print("No Rating Available")
try:
print("Muti Tag :- " + vendor.find_element_by_xpath(".//span[#class=\"multi-tag\"]").text)
except:
print("No Tag Info")
print("Vendor URL :- " + vendor.find_element_by_xpath(".//a").get_attribute("href"))
If it solves your problem then please mark it as answer.
There are no elements with exact class name vendor there.
You should use something like //*[contains(#class,'vendor')]//a[#href]
I used Xpath since I prefer working with it, but you can also use similar css_selector

How to stop selenium scraper from redirecting to another internal weblink of the scraped website?

Was wondering if anyone knows of a way for instructing a selenium script to avoid visiting/redirecting to an internal page that wasn't part of the code. Essentially, my code opens up this page:
https://cryptwerk.com/companies/?coins=1,6,11,2,3,8,17,7,13,4,25,29,24,32,9,38,15,30,43,42,41,12,40,44,20
keeps clicking on show more button until there's none (at end of page) - which by then - it should have collected the links of all the products listed on the page it scrolled through till the end, then visit each one respectively.
What happens instead, it successfully clicks on show more till the end of the page, but then it visits this weird promotion page of the same website instead of following each of the gathered links respectively and then scraping further data points located off each of those newly opened ones.
In a nutshell, would incredibly appreciate it if someone can explain how to avoid this automated redirection on its own! And this is the code in case someone can gratefully nudge me in the right direction :)
from selenium.webdriver import Chrome
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
from selenium.common.exceptions import NoSuchElementException, ElementNotVisibleException
import json
import selenium.common.exceptions as exception
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
webdriver = '/Users/karimnabil/projects/selenium_js/chromedriver-1'
driver = Chrome(webdriver)
driver.implicitly_wait(5)
url = 'https://cryptwerk.com/companies/?coins=1,6,11,2,3,8,17,7,13,4,25,29,24,32,9,38,15,30,43,42,41,12,40,44,20'
driver.get(url)
links_list = []
coins_list = []
all_names = []
all_cryptos = []
all_links = []
all_twitter = []
all_locations = []
all_categories = []
all_categories2 = []
wait = WebDriverWait(driver, 2)
sign_in = driver.find_element_by_xpath("//li[#class='nav-item nav-guest']/a")
sign_in.click()
time.sleep(2)
user_name = wait.until(EC.presence_of_element_located((By.XPATH, "//input[#name='login']")))
user_name.send_keys("karimnsaber95#gmail.com")
password = wait.until(EC.presence_of_element_located((By.XPATH, "//input[#name='password']")))
password.send_keys("PleomaxCW#2")
signIn_Leave = driver.find_element_by_xpath("//div[#class='form-group text-center']/button")
signIn_Leave.click()
time.sleep(3)
while True:
try:
loadMoreButton = driver.find_element_by_xpath("//button[#class='btn btn-outline-primary']")
time.sleep(2)
loadMoreButton.click()
time.sleep(2)
except exception.StaleElementReferenceException:
print('stale element')
break
print('no more elements to show')
try:
company_links = driver.find_elements_by_xpath("//div[#class='companies-list items-infinity']/div[position() > 3]/div[#class='media-body']/div[#class='title']/a")
for link in company_links:
links_list.append(link.get_attribute('href'))
except:
pass
try:
with open("links_list.json", "w") as f:
json.dump(links_list, f)
with open("links_list.json", "r") as f:
links_list = json.load(f)
except:
pass
try:
for link in links_list:
driver.get(link)
name = driver.find_element_by_xpath("//div[#class='title']/h1").text
try:
show_more_coins = driver.find_element_by_xpath("//a[#data-original-title='Show more']")
show_more_coins.click()
time.sleep(1)
except:
pass
try:
categories = driver.find_elements_by_xpath("//div[contains(#class, 'categories-list')]/a")
categories_list = []
for category in categories:
categories_list.append(category.text)
except:
pass
try:
top_page_categories = driver.find_elements_by_xpath("//ol[#class='breadcrumb']/li/a")
top_page_categories_list = []
for category in top_page_categories:
top_page_categories_list.append(category.text)
except:
pass
coins_links = driver.find_elements_by_xpath("//div[contains(#class, 'company-coins')]/a")
all_coins = []
for coin in coins_links:
all_coins.append(coin.get_attribute('href'))
try:
location = driver.find_element_by_xpath("//div[#class='addresses mt-3']/div/div/div/div/a").text
except:
pass
try:
twitter = driver.find_element_by_xpath("//div[#class='links mt-2']/a[2]").get_attribute('href')
except:
pass
try:
print('-----------')
print('Company name is: {}'.format(name))
print('Potential Categories are: {}'.format(categories_list))
print('Potential top page categories are: {}'.format(top_page_categories_list))
print('Supporting Crypto is:{}'.format(all_coins))
print('Registered location is: {}'.format(location))
print('Company twitter profile is: {}'.format(twitter))
time.sleep(1)
except:
pass
all_names.append(name)
all_categories.append(categories_list)
all_categories2.append(top_page_categories_list)
all_cryptos.append(all_coins)
all_twitter.append(twitter)
all_locations.append(location)
except:
pass
df = pd.DataFrame(list(zip(all_names, all_categories, all_categories2, all_cryptos, all_twitter, all_locations)), columns=['Company name', 'Categories1', 'Categories2', 'Supporting Crypto', 'Twitter Handle', 'Registered Location'])
CryptoWerk_Data = df.to_csv('CryptoWerk4.csv', index=False)
Redirect calls happen for two reasons, in your case either by executing some javascript code when clicking the last time on the load more button or by receiving an HTTP 3xx code, which is the least likely in your case.
So you need to identify when this javascript code is executed and send an ESC_KEY before it loads and then executing the rest of your script.
You could also scrape the links and append them to your list before clicking the load more button and each time it is clicked, make an if statement the verify the link of the page you're in, if it is that of the promotion page then execute the rest of your code, else click load more.
while page_is_same:
scrape_elements_add_to_list()
click_load_more()
verify_current_page_link()
if current_link_is_same != link_of_scraped_page:
page_is_same = False
# rest of the code here

In selenium how to find out the exact number of XPATH links with different ids?

With Python3 and selenium I want to automate the search on a public information site. In this site it is necessary to enter the name of a person, then select the spelling chosen for that name (without or with accents or name variations), access a page with the list of lawsuits found and in this list you can access the page of each case.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.common.keys import Keys
import time
import re
Name that will be searched
name = 'JOSE ROBERTO ARRUDA'
Create path, search start link, and empty list to store information
firefoxPath="/home/abraji/Documentos/Code/geckodriver"
link = 'https://ww2.stj.jus.br/processo/pesquisa/?aplicacao=processos.ea'
processos = []
Call driver and go to first search page
driver = webdriver.Firefox(executable_path=firefoxPath)
driver.get(link)
Position cursor, fill and click
WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#idParteNome'))).click()
time.sleep(1)
driver.find_element_by_xpath('//*[#id="idParteNome"]').send_keys(name)
time.sleep(6)
WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#idBotaoPesquisarFormularioExtendido'))).click()
Mark all spelling possibilities for searching
WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#idBotaoMarcarTodos'))).click()
WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#idBotaoPesquisarMarcados'))).click()
time.sleep(1)
Check how many pages of data there are - to be used in "for range"
capta = driver.find_element_by_xpath('//*[#id="idDivBlocoPaginacaoTopo"]/div/span/span[2]').text
print(capta)
paginas = int(re.search(r'\d+', capta).group(0))
paginas = int(paginas) + 1
print(paginas)
Capture routine
for acumula in range(1, paginas):
# Fill the field with the page number and press enter
driver.find_element_by_xpath('//*[#id="idDivBlocoPaginacaoTopo"]/div/span/span[2]/input').send_keys(acumula)
driver.find_element_by_xpath('//*[#id="idDivBlocoPaginacaoTopo"]/div/span/span[2]/input').send_keys(Keys.RETURN)
time.sleep(2)
# Captures the number of processes found on the current page - qt
qt = driver.find_element_by_xpath('//*[#id="idDivBlocoMensagem"]/div/b').text
qt = int(qt) + 2
print(qt)
# Iterate from found number of processes
for item in range(2, qt):
# Find the XPATH of each process link - start at number 2
vez = '//*[#id="idBlocoInternoLinhasProcesso"]/div[' + str(item) + ']/span[1]/span[1]/span[1]/span[2]/a'
print(vez)
# Access the direct link and click
element = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, vez)))
element.click()
# Run tests to get data
try:
num_unico = driver.find_element_by_xpath('//*[#id="idProcessoDetalhesBloco1"]/div[6]/span[2]/a').text
except NoSuchElementException:
num_unico = "sem_numero_unico"
try:
nome_proc = driver.find_element_by_xpath('//*[#id="idSpanClasseDescricao"]').text
except NoSuchElementException:
nome_proc = "sem_nome_encontrado"
try:
data_autu = driver.find_element_by_xpath('//*[#id="idProcessoDetalhesBloco1"]/div[5]/span[2]').text
except NoSuchElementException:
data_autu = "sem_data_encontrada"
# Fills dictionary and list
dicionario = {"num_unico": num_unico,
"nome_proc": nome_proc,
"data_autu": data_autu
}
processos.append(dicionario)
# Return a page to click on next process
driver.execute_script("window.history.go(-1)")
# Close driver
driver.quit()
In this case I captured the number of link pages (3) and the total number of links (84). So my initial idea was to do the "for" three times and within them split the 84 links
The direct address of each link is in XPATH (//*[#id="idBlocoInternoLinhasProcesso"]/div[41]/span[1]/span[1]/span[1]/span[2]/a) which I replace with the "item" to click
For example, when it arrives at number 42 I have an error because the first page only goes up to 41
My problem is how to go to the second page and then restart only "for" secondary
I think the ideal would be to know the exact number of links on each of the three pages
Anyone have any ideas?
Code below is "Capture routine":
wait = WebDriverWait(driver, 20)
#...
while True:
links = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//span[contains(#class,'classSpanNumeroRegistro')]")))
print("links len", len(links))
for i in range(1, len(links) + 1):
# Access the direct link and click
.until(EC.element_to_be_clickable((By.XPATH, f"(//span[contains(#class,'classSpanNumeroRegistro')])[{i}]//a"))).click()
# Run tests to get data
try:
num_unico = driver.find_element_by_xpath('//*[#id="idProcessoDetalhesBloco1"]/div[6]/span[2]/a').text
except NoSuchElementException:
num_unico = "sem_numero_unico"
try:
nome_proc = driver.find_element_by_xpath('//*[#id="idSpanClasseDescricao"]').text
except NoSuchElementException:
nome_proc = "sem_nome_encontrado"
try:
data_autu = driver.find_element_by_xpath('//*[#id="idProcessoDetalhesBloco1"]/div[5]/span[2]').text
except NoSuchElementException:
data_autu = "sem_data_encontrada"
# Fills dictionary and list
dicionario = {"num_unico": num_unico,
"nome_proc": nome_proc,
"data_autu": data_autu
}
processos.append(dicionario)
# Return a page to click on next process
driver.execute_script("window.history.go(-1)")
# wait.until(EC.presence_of_element_located((By.CLASS_NAME, "classSpanPaginacaoImagensDireita")))
next_page = driver.find_elements_by_css_selector(".classSpanPaginacaoProximaPagina")
if len(next_page) == 0:
break
next_page[0].click()
You can try run the loop until next button is present on the screen. the logic will look like this,
try:
next_page = driver.find_element_by_class_name('classSpanPaginacaoProximaPagina')
if(next_page.is_displayed()):
next_page.click()
except NoSuchElementException:
print('next page does not exists')

using Enum to build locator for selenium test

I come out one question about selenium web test. Since all elements require their own xpath or css selector to be action by selenium webdriver.
I have tried to use python Enum and create something like
file: elementEnum.py
from enum import Enum
class PageA(Enum):
pElementA = '//div[{}]'
pElementB = '//a[.="{}"]'
pElementC = '//div[#class={}]'
class PageB(Enum):
pElementA = '//button[.="{}""]'
pElementB = '//table/tr[{}]/td[{}]'
but it turns out lots of time I require to build the string in python format function and it does not see pythonoic.
from elementEnum import *
driver.find_element_by_xpath('//div[#class="aaaaaa"]/{}'.format((PageA.pElementA.value).format(1)))
driver.find_element_by_xpath('{}/{}'.format(PageA.pElementA.value).format(1), PageA.pElementB.value.format(2)))
driver.find_element_by_xpath('{}/{}'.format(PageB.pElementB.value).format(1, 3), PageA.pElementA.value.format(2)))
What is the best way for me to list out all corresponse element and their locator.
you could use the
EC.visibility_of_element_located to locate the element
http://selenium-python.readthedocs.io/waits.html
sample code :
class SeleniumBaseClass(object):
def __init__(self,driver):
self.driver = driver
def open(self,URL):
self.driver.get(URL)
def driverURLChange(self,URL):
print("change URL" + URL)
self.driver.get(URL)
def currentUrl(self):
print("URL " + self.driver.current_url)
return self.driver.current_url
def switchNewWindow(self):
self.driver.switch_to_window(self.driver.window_handles[1])
return self.driver.title
def locateElement(self, loc):
try:
print(loc)
element = WebDriverWait(self.driver,10).until(EC.visibility_of_element_located(loc))
return element
except:
print ("cannot find {0} element".format(loc))
return None
and you could
password_loc =(By.NAME,'password')
webdriver = SeleniumBaseClass(driver)
webdriver.locateElement(password_loc )
this could you pass the tuple to locate the element

selenium python stuck on this line

Why does my code:
self.driver.find_element_by_xpath("//*[text()[contains(.,'%s')]]" % ('Sorry'))
Get stuck and won't pass this line? Even if I do something like:
driver.implicitly_wait(30)
self.driver.find_element_by_xpath("//*[text()[contains(.,'%s')]]" % ('Sorry'))
Complete code:
# gets stuck here
if self.is_text_present('Hello'):
print 'Hello'
# rest of code
def is_text_present(self, text):
try:
self.driver.find_element_by_xpath('//*[contains(text(), "%s")]' % (text))
except NoSuchElementException, e:
return False
return True
Your XPath can be simplified to
"//*[contains(text(),'%s')]" % ('Sorry')
Perhaps try something like:
import contextlib
import selenium.webdriver as webdriver
import selenium.webdriver.support.ui as ui
with contextlib.closing(webdriver.Firefox()) as driver:
...
# Set up a WebDriverWait instance that will poll for up to 10 seconds
wait = ui.WebDriverWait(driver, 10)
# wait.until returns the value of the callback
elt = wait.until(
lambda driver: driver.find_element_by_xpath(
"//*[contains(text(),'%s')]" % ('Sorry')
))
From the docs:
This waits up to 10 seconds before throwing a TimeoutException or if
it finds the element will return it in 0 - 10 seconds.
To debug the problem you might try saving the HTML source to a file right before calling find_element_by_xpath so you can see what the driver is seeing. Is the XPath valid for that HTML?.
def is_text_present(self, text):
with open('/tmp/debug.html', 'w') as f:
time.sleep(5) # crude, but perhaps effective enough for debugging here.
f.write(driver.page_source)
try:
self.driver.find_element_by_xpath('//*[contains(text(), "%s")]' % (text))
except NoSuchElementException, e:
return False
return True

Categories

Resources