I am currently working on a web crawler that should be able to download the text of articles from a Dutch newspaper archive. The first link works correctly, but the second link suddenly raises an error that I do not know how to fix.
It seems that Selenium is unable to click the button on the second link, while it succeeds in doing so on the first link.
Do you know what causes the second link (the Telegraaf page) to fail?
UPDATED CODE:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import pandas as pd
import numpy as np
import re
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains

# Set up the path to the chrome driver
driver = webdriver.Chrome()
html = driver.find_element_by_tag_name('html')

all_details = []
for c in range(1, 2):
    try:
        driver.get("https://www.delpher.nl/nl/kranten/results?query=kernenergie&facets%5Bpapertitle%5D%5B%5D=Algemeen+Dagblad&facets%5Bpapertitle%5D%5B%5D=De+Volkskrant&facets%5Bpapertitle%5D%5B%5D=De+Telegraaf&facets%5Bpapertitle%5D%5B%5D=Trouw&page={}&sortfield=date&cql%5B%5D=(date+_gte_+%2201-01-1970%22)&cql%5B%5D=(date+_lte_+%2201-01-2018%22)&coll=ddd".format(c))
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        incategory = driver.find_elements_by_class_name("search-result")
        print(driver.current_url)
        links = [i.find_element_by_class_name("thumbnail search-result__thumbnail").get_attribute("href") for i in incategory]
        # Let's loop through each link to access the page of each article
        for link in links:
            # get one article url
            driver.get(link)
            # newspaper
            newspaper = driver.find_element_by_xpath("//*[@id='content']/div[2]/div/div[2]/header/h1/span[2]")
            # date of the article
            date = driver.find_element_by_xpath("//*[@id='content']/div[2]/div/div[2]/header/div/ul/li[1]")
            # click button and find title
            div_element = WebDriverWait(driver, 60).until(expected_conditions.presence_of_element_located((By.XPATH, '//*[@id="object"]/div/div/div')))
            hover = ActionChains(driver).move_to_element(div_element)
            hover.perform()
            div_element.click()
            button = WebDriverWait(driver, 90).until(expected_conditions.presence_of_element_located((By.XPATH, '//*[@id="object-viewer__ocr-button"]')))
            hover = ActionChains(driver).move_to_element(button)
            hover.perform()
            button.click()
            element = driver.find_element_by_css_selector(".object-viewer__ocr-panel-results")
            driver.execute_script("$(arguments[0]).click();", element)
            # content of the article
            try:
                content = driver.find_elements_by_xpath("//*[contains(text(), 'kernenergie')]").text
            except:
                content = None
            # Define a dictionary with the details we need
            r = {
                "1Newspaper": newspaper.text,
                "2Date": date.text,
                "3Content": content,
            }
            # append r to all_details
            all_details.append(r)
    except Exception as e:
        print(str(e))
        pass

# save the information into a CSV file
df = pd.DataFrame(all_details)
df = df.to_string()
time.sleep(3)
driver.close()
So you have a few problems.
driver.implicitly_wait(10)
should only be used once.
links = [i.find_element_by_class_name("search-result__thumbnail-link").get_attribute("href") for i in incategory]
is a more useful way to get all the links.
print(driver.current_url)
could replace
print("https://www.delpher.nl/nl/kranten/results?query=kernenergie&facets%5Bpapertitle%5D%5B%5D=Algemeen+Dagblad&facets%5Bpapertitle%5D%5B%5D=De+Volkskrant&facets%5Bpapertitle%5D%5B%5D=De+Telegraaf&facets%5Bpapertitle%5D%5B%5D=Trouw&page={}&sortfield=date&cql%5B%5D=(date+_gte_+%2201-01-1970%22)&cql%5B%5D=(date+_lte_+%2201-01-2018%22)&coll=ddd".format(c))
There is no need for url=link:
for link in links:
    driver.get(link)
Your title actually doesn't exist on the second page. Use something like this for all the values:
try:
    content = driver.find_element_by_xpath('//*[@id="object-viewer__ocr-panel"]/div[2]/div[5]').text
except:
    content = None
# Define a dictionary
r = {
    "1Newspaper": newspaper,
    "2Date": date,
    "3Title": title,
    "4Content": content,
}
You can replace your exception handling with the following to figure out which line is causing the problem:
except Exception as e:
    print(str(e))
    pass
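Putting those pieces together, a minimal sketch of the inner loop could look like the one below. It reuses the selectors from the question (they are an assumption, not verified against the live site) and clicks the OCR button through JavaScript after an explicit wait, so an overlapping element cannot intercept the click:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

for link in links:
    driver.get(link)
    newspaper = driver.find_element_by_xpath("//*[@id='content']/div[2]/div/div[2]/header/h1/span[2]").text
    date = driver.find_element_by_xpath("//*[@id='content']/div[2]/div/div[2]/header/div/ul/li[1]").text
    # wait until the OCR button is clickable, then click it via JavaScript
    button = WebDriverWait(driver, 30).until(
        EC.element_to_be_clickable((By.ID, "object-viewer__ocr-button"))
    )
    driver.execute_script("arguments[0].click();", button)
    try:
        content = driver.find_element_by_xpath('//*[@id="object-viewer__ocr-panel"]/div[2]/div[5]').text
    except Exception:
        content = None
    all_details.append({"1Newspaper": newspaper, "2Date": date, "3Content": content})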
It might be that the button you are trying to reach is inside an iframe, which means you have to switch to that frame before searching by XPath:
iframe = driver.find_element_by_tag_name('iframe')
driver.switch_to.frame(iframe)
Also, there is a possibility that the object you're trying to click is not visible yet, which could be solved with a timeout (an explicit wait).
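For example, a minimal sketch combining both ideas could look like this; the button ID is taken from the question's XPath and is otherwise an assumption:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# switch into the iframe first, if the button lives inside one
iframes = driver.find_elements_by_tag_name('iframe')
if iframes:
    driver.switch_to.frame(iframes[0])

# then wait until the button is actually clickable before clicking it
button = WebDriverWait(driver, 30).until(
    EC.element_to_be_clickable((By.ID, "object-viewer__ocr-button"))
)
button.click()

# switch back to the main document afterwards
driver.switch_to.default_content()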
Related
I need to parse a list of YouTube video links for a given search query with a filter applied.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
import time
name = 'hello world'
driver = webdriver.Firefox()
driver.get('https://www.youtube.com')
id_serth = driver.find_element(By.NAME, "search_query")
id_serth.send_keys(name)
button_serth = driver.find_element(By.ID, "search-icon-legacy")
time.sleep(1)
button_serth.click()
time.sleep(3)
button_filtr = driver.find_element(By.CLASS_NAME ,"ytd-toggle-button-renderer")
button_filtr.click()
time.sleep(3)
button_filtrtode=driver.find_element(By.CLASS_NAME, "ytd-search-filter-renderer")
button_filtrtode.click()
#linkvideo2 = driver.find_element(By.ID, "video-title").get_attribute("href")
time.sleep(3)
urltek = driver.current_url
linkvideo2 = driver.find_element(By.XPATH, '//*[@id="video-title"]').get_attribute('href')
print (linkvideo2)
print (urltek)
I get the answer None. P.S. I just recently started learning Selenium, so don't beat me up :)
None
https://www.youtube.com/results?search_query=hello+world&sp=EgIIAQ%253D%253D
The problem is that YouTube uses the ID video-title for multiple elements.
Your current query is getting a <yt-formatted-string> element which doesn't have the attribute href.
Change
linkvideo2 = driver.find_element(By.XPATH, '//*[@id="video-title"]').get_attribute('href')
to
linkvideo2 = driver.find_element(By.XPATH, '//a[@id="video-title"]').get_attribute('href')
in order to get the <a> element. Then your code will print the link to the first found video.
If you want to print all the found links, you could use something like this:
videos = driver.find_elements(By.TAG_NAME, "ytd-video-renderer")
for video in videos:
    link = video.find_element(By.XPATH, './/a[@id="video-title"]')
    print(link.get_attribute('href'))
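As a side note, the fixed time.sleep() calls make this kind of script flaky. A sketch of the basic search-and-collect flow using explicit waits instead (assuming the same element names used above: search_query, search-icon-legacy, video-title) could look like this:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Firefox()
wait = WebDriverWait(driver, 10)

driver.get('https://www.youtube.com')
search = wait.until(EC.element_to_be_clickable((By.NAME, "search_query")))
search.send_keys('hello world')
wait.until(EC.element_to_be_clickable((By.ID, "search-icon-legacy"))).click()

# wait until at least one result link is present, then collect the hrefs
wait.until(EC.presence_of_element_located((By.XPATH, '//a[@id="video-title"]')))
for video in driver.find_elements(By.TAG_NAME, "ytd-video-renderer"):
    print(video.find_element(By.XPATH, './/a[@id="video-title"]').get_attribute('href'))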
Hey everyone, I am trying to scrape a website. The scraping itself is not much of a problem, but the site has a "load more" button, so before scraping I want to keep clicking that button until it no longer exists. I wrote some code to do that, but when I run it I get two errors that occur at random: sometimes they appear, sometimes they don't. The first is a StaleElementReferenceException, and the second is that the element is not clickable because it is being overlapped. To stop the overlapping I added a line that scrolls back to the top of the page whenever the load button is clicked, but now the button gets overlapped up there instead. I am not able to solve these two problems. Here is the code:
# importing webdriver from selenium
from numpy.lib.npyio import load
from selenium import webdriver
from selenium.common.exceptions import ElementClickInterceptedException, ElementNotInteractableException, ElementNotSelectableException, NoSuchElementException, StaleElementReferenceException
import csv
import pandas as pd
from selenium.webdriver.common.keys import Keys

# selecting Firefox as the browser
# in order to select Chrome
# webdriver.Chrome() will be used
path = r"D:\\Chromedriver.exe"
driver = webdriver.Chrome(path)

# URL of the website
url = "https://www.carecredit.com/doctor-locator/results/Any-Profession/Any-Specialty//?Sort=D&Radius=75&Page=1"
driver.maximize_window()

# opening link in the browser
driver.get(url)
driver.implicitly_wait(100)

search_bar = driver.find_element_by_xpath('//*[@id="location-display"]')
search_bar.click()
search_bar.send_keys("00704")
search_button = driver.find_element_by_xpath('//*[@id="dl-locate-button-new"]')
search_button.click()
driver.implicitly_wait(50)

distance = driver.find_element_by_xpath('//*[@id="dl-radius-new"]')
distance.click()
driver.implicitly_wait(100)

cookies = driver.find_element_by_xpath('//*[@id="onetrust-accept-btn-handler"]')
cookies.click()

i = 0
loadMoreButtonExists = True
while loadMoreButtonExists:
    try:
        driver.implicitly_wait(100)
        load_more = driver.find_element_by_xpath('//*[@id="next-page"]')
        load_more.click()
        driver.find_element_by_tag_name('body').send_keys(Keys.CONTROL + Keys.HOME)
        print("done" + str(i))
        i = i + 1
        driver.implicitly_wait(30)
    except ElementNotInteractableException:
        loadMoreButtonExists = False
        print("done loading button")

practise_name = driver.find_elements_by_class_name('page-results')
try:
    for element in practise_name:
        practise_name = element.find_element_by_class_name('dl-result-header dl-map-hover')
        address = element.find_element_by_class_name('dl-result-address')
        phone_no = element.find_element_by_class_name('no-mobile')
        # l=[]
        # l1 = []
        # temp_list=[]
        # l.append(element.text)
        # print(l)
        print(practise_name.text)
        print(address.text)
        print(phone_no.text)
except Exception as e:
    print(e)

driver.close()
driver.close()
And if my code reaches the place where I actually scrape the data, at that point it shows an invalid ID even though I checked the ID and it is correct. So please check that part too.
Thank you
Try it like below and confirm.
You can catch the respective exceptions like below:
# Imports Required:
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import ElementClickInterceptedException, StaleElementReferenceException, TimeoutException
import time
...
wait = WebDriverWait(driver, 10)  # a wait object like this is assumed below
wait.until(EC.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler"))).click()
while True:
    try:
        load_more = wait.until(EC.visibility_of_element_located((By.ID, "next-page")))
        driver.execute_script("arguments[0].scrollIntoView(true);", load_more)
        driver.execute_script("window.scrollBy(0,-200);")
        load_more.click()
        time.sleep(2)
    except ElementClickInterceptedException:  # This might not occur since we are scrolling so that the element is visible
        print("in ElementClickInterceptedException")
        pass
    except StaleElementReferenceException:  # This might not occur either, since we look up the Load more element every time
        print("in StaleElementReferenceException")
        pass
    except TimeoutException:  # TimeoutException means there is no Load more button left
        print("in TimeoutException")
        break
...
I'm trying to create a 'bot' to buy a graphics card. I've downloaded a pre-made script, and I'm trying to adjust it to my needs.
The script takes me to the site in Firefox and finds the button I am looking for using the following code:
findAllCards = soup.find('button', {'class': 'Button__StyledButton-iESSlv dJJJCD Button-dtUzzq kHUYTy'})
This works. However, when I am trying to click the button, I am unable to, as I have no idea what I am supposed to put here:
driverWait(driver, 'css', '.space-b center')
Webpage I'm using to test is:
https://www.currys.co.uk/gbuk/gaming/console-gaming/controllers/xbox-wireless-controller-carbon-black-10211565-pdt.html
Full code here:
driver.get(url)
while True:
    html = driver.page_source
    soup = bs4.BeautifulSoup(html, 'html.parser')
    wait = WebDriverWait(driver, 15)
    wait2 = WebDriverWait(driver, 2)
    try:
        findAllCards = soup.find('button', {'class': 'Button__StyledButton-iESSlv dJJJCD Button-dtUzzq kHUYTy'})
        if findAllCards:
            print(f'Button Found!: {findAllCards.get_text()}')
            # Clicking Add to Cart.
            time.sleep(.3)
            print('Click')
            driverWait(driver, 'css', '.space-b center')
            print('Click1')
            time.sleep(2)
Thanks :)
Your findAllCards above returns 3 web elements, not 1. Assuming you are trying to click on the Add to Basket button:
findAllCards = driver.find_element_by_xpath("//div[#id='product-actions']//div[#data-component='add-to-basket-button-wrapper']//button")
findAllCards.click()
There is something odd about the element becoming interactable, so the click is done via JavaScript here:
wait = WebDriverWait(driver, 10)
driver.get(url)
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,"button#onetrust-accept-btn-handler"))).click()
try:
    elem = wait.until(EC.presence_of_element_located((By.XPATH, "//button[contains(.,'Add to basket')]")))
    driver.execute_script("arguments[0].click();", elem)
except Exception as e:
    print(str(e))
Imports:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
So I am scraping reviews and skin type from Sephora and have run into a problem identifying how to get elements off of the page.
Sephora.com loads reviews dynamically after you scroll down the page, so I have switched from Beautiful Soup to Selenium to get the reviews.
The reviews have no ID, no name, and no CSS identifier that seems to be stable. The XPath doesn't seem to be recognized each time I try to use it, whether I copy it from Chrome or from Firefox.
Here is an example of the HTML from the inspected element that I loaded in chrome:
Inspect Element view from the desired page
My Attempts thus far:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
driver = webdriver.Chrome("/Users/myName/Downloads/chromedriver")
url = 'https://www.sephora.com/product/the-porefessional-face-primer-P264900'
driver.get(url)
reviews = driver.find_elements_by_xpath(
    "//div[@id='ratings-reviews']//div[@data-comp='Ellipsis Box ']")
print("REVIEWS:", reviews)
Output:
| => /Users/myName/anaconda3/bin/python "/Users/myName/Documents/ScrapeyFile Group/attempt32.py"
REVIEWS: []
(base)
So basically an empty list.
ATTEMPT 2:
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
# Open up a Firefox browser and navigate to web page.
driver = webdriver.Firefox()
driver.get(
    "https://www.sephora.com/product/squalane-antioxidant-cleansing-oil-P416560?skuId=2051902&om_mmc=ppc-GG_1165716902_56760225087_pla-420378096665_2051902_257731959107_9061275_c&country_switch=us&lang=en&ds_rl=1261471&gclid=EAIaIQobChMIisW0iLbK6AIVaR6tBh005wUTEAYYBCABEgJVdvD_BwE&gclsrc=aw.ds"
)
#Scroll to bottom of page b/c its dynamically loading
html = driver.find_element_by_tag_name('html')
html.send_keys(Keys.END)
#scrape stats and comments
comments = driver.find_elements_by_css_selector("div.css-7rv8g1")
print("!!!!!!Comments!!!!!")
print(comments)
OUTPUT:
| => /Users/MYNAME/anaconda3/bin/python /Users/MYNAME/Downloads/attempt33.py
!!!!!!Comments!!!!!
[]
(base)
Empty again. :(
I get the same results when I try to use different element selectors:
#scrape stats and comments
comments = driver.find_elements_by_class_name("css-7rv8g1")
I also get nothing when I tried this:
comments = driver.find_elements_by_xpath(
    "//div[@data-comp='GridCell Box']//div[@data-comp='Ellipsis Box ']")
and this (notice that the space after 'Ellipsis Box' is gone):
comments = driver.find_elements_by_xpath(
    "//div[@data-comp='GridCell Box']//div[@data-comp='Ellipsis Box']")
I have tried using the solutions outlined here and here, but to no avail. I think there is something I don't understand about the page or Selenium that I am missing, since this is my first time using Selenium, so I'm a super newbie :(
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
import time

driver = webdriver.Chrome(executable_path=r"")
driver.maximize_window()
wait = WebDriverWait(driver, 20)
driver.get("https://www.sephora.fr/p/black-ink---classic-line-felt-liner---eyeliner-feutre-precis-waterproof-P3622017.html")

scrolls = 1
while True:
    scrolls -= 1
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
    time.sleep(3)
    if scrolls < 0:
        break

reviewText = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//ol[@class='bv-content-list bv-content-list-reviews']//li//div[@class='bv-content-summary-body']//div[1]")))
for textreview in reviewText:
    print(textreview.text)
Output:
I've been scraping reviews from Sephora and basically, even if there is plenty of room for improvement, it works like this:
Clicks on "reviews" to access reviews
Loads all reviews by scrolling until there aren't any review left to load
Finds review text and skin type by CSS SELECTOR
def load_all_reviews(driver):
    while True:
        try:
            driver.execute_script(
                "arguments[0].scrollIntoView(true);",
                WebDriverWait(driver, 10).until(
                    EC.visibility_of_element_located(
                        (By.CSS_SELECTOR, ".bv-content-btn-pages-load-more")
                    )
                ),
            )
            driver.execute_script(
                "arguments[0].click();",
                WebDriverWait(driver, 20).until(
                    EC.element_to_be_clickable(
                        (By.CSS_SELECTOR, ".bv-content-btn-pages-load-more")
                    )
                ),
            )
        except Exception as e:
            break

def get_review_text(review):
    try:
        return review.find_element(By.CLASS_NAME, "bv-content-summary-body-text").text
    except:
        return "NA"  # in case it doesn't find a review

def get_skin_type(review):
    try:
        return review.find_element(By.XPATH, '//*[@id="BVRRContainer"]/div/div/div/div/ol/li[2]/div[1]/div/div[2]/div[5]/ul/li[4]/span[2]').text
    except:
        return "NA"  # in case it doesn't find a skin type
To use these, you've got to create a webdriver and first call the load_all_reviews() function.
Then you've got to find the reviews with:
reviews = driver.find_elements(By.CSS_SELECTOR, ".bv-content-review")
and finally, for each review, you can call the get_review_text() and get_skin_type() functions:
for review in reviews:
    print(get_review_text(review))
    print(get_skin_type(review))
I came across a different type of problem while scraping a webpage using Python. When an image is clicked, new information concerning its flavor comes up under the image. My goal is to parse all the flavors connected to each image. My script can parse the flavors of the currently active image but breaks after clicking on a new image. A little tweak in my loop should lead me in the right direction.
I've tried with:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get("https://www.optigura.com/uk/product/gold-standard-100-whey/")
wait = WebDriverWait(driver, 10)

while True:
    items = wait.until(EC.presence_of_element_located((By.XPATH, "//div[@class='colright']//ul[@class='opt2']//label")))
    for item in items.find_elements_by_xpath("//div[@class='colright']//ul[@class='opt2']//label"):
        print(item.text)
    try:
        links = driver.find_elements_by_xpath("//span[@class='img']/img")
        for link in links:
            link.click()
    except:
        break

driver.quit()
The picture underneath may clarify what I could not:
I tweaked the code to properly click on the links and to check if the current listed item's text matches with the active listed item's text. If they match, you can safely go on parsing without worrying that you are parsing the same thing over and over again. Here you go:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException

driver = webdriver.Chrome()
driver.get("https://www.optigura.com/uk/product/gold-standard-100-whey/")
wait = WebDriverWait(driver, 10)

links = driver.find_elements_by_xpath("//span[@class='img']/img")
for idx, link in enumerate(links):
    while True:
        try:
            link.click()
            # keep clicking until the active item's text matches the link we just clicked
            while driver.find_elements_by_xpath("//span[@class='size']")[idx].text != driver.find_elements_by_xpath("//div[@class='colright']//li[@class='active']//span")[1].text:
                link.click()
            print(driver.find_elements_by_xpath("//span[@class='size']")[idx].text)
            items = wait.until(EC.presence_of_element_located((By.XPATH, "//div[@class='colright']//ul[@class='opt2']//label")))
            for item in items.find_elements_by_xpath("//div[@class='colright']//ul[@class='opt2']//label"):
                print(item.text)
        except StaleElementReferenceException:
            continue
        break
driver.quit()
I do not think this has much to do with Python; it is mostly JavaScript and AJAX.
The JavaScript part is:
$(document).on("click", ".product-details .custom-radio input:not(.active input)", function() {
    var elm = $(this);
    var root = elm.closest(".product-details");
    var option = elm.closest(".custom-radio");
    var opt, opt1, opt2, ip, ipr;
    elm.closest("ul").find("li").removeClass("active");
    elm.closest("li").addClass("active");
    if (option.hasClass("options1")) {
        ip = root.find(".options1").data("ip");
        opt = root.find(".options2").data("opt");
        opt1 = root.find(".options1 li.active input").val();
        opt2 = root.find(".options2 li.active input").data("opt-sel");
    } else
        ipr = root.find(".options2 input:checked").val();
    $.ajax({
        type: "POST",
        url: "/product/ajax/details.php",
        data: {
            opt: opt,
            opt1: opt1,
            opt2: opt2,
            ip: ip,
            ipr: ipr
        },
So you can just construct the params (using a CSS selector will be better than XPath in this case), POST them, and parse the JSON results.
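For illustration, a minimal sketch of that idea with requests might look like the following. The endpoint path and parameter names are taken from the JavaScript above, but the actual values of opt, opt1, opt2, ip and ipr have to be scraped from the product page first, and the shape of the response is an assumption:
import requests

# Endpoint taken from the $.ajax call above; the payload values are placeholders
# that would normally come from the page's data-* attributes and active inputs.
url = "https://www.optigura.com/product/ajax/details.php"
payload = {
    "opt": "",    # root.find(".options2").data("opt")
    "opt1": "",   # value of the active input in .options1
    "opt2": "",   # data-opt-sel of the active input in .options2
    "ip": "",     # root.find(".options1").data("ip")
    "ipr": "",    # checked input value in .options2
}

response = requests.post(url, data=payload)
response.raise_for_status()
print(response.json())  # assuming the endpoint returns JSON, as suggested above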