I am scraping reviews and skin type from Sephora and have run into a problem identifying how to get elements off of the page.
Sephora.com loads reviews dynamically as you scroll down the page, so I have switched from Beautiful Soup to Selenium to get the reviews.
The reviews have no ID, no name, and no CSS identifier that seems to be stable. The XPath isn't recognized each time I try to use it, whether I copy it from Chrome or from Firefox.
Here is an example of the HTML from the inspected element that I loaded in chrome:
[Screenshot: Inspect Element view of the desired page]
My Attempts thus far:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
driver = webdriver.Chrome("/Users/myName/Downloads/chromedriver")
url = 'https://www.sephora.com/product/the-porefessional-face-primer-P264900'
driver.get(url)
reviews = driver.find_elements_by_xpath(
"//div[#id='ratings-reviews']//div[#data-comp='Ellipsis Box ']")
print("REVIEWS:", reviews)
Output:
| => /Users/myName/anaconda3/bin/python "/Users/myName/Documents/ScrapeyFile Group/attempt32.py"
REVIEWS: []
(base)
So basically an empty list.
ATTEMPT 2:
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
# Open up a Firefox browser and navigate to web page.
driver = webdriver.Firefox()
driver.get(
"https://www.sephora.com/product/squalane-antioxidant-cleansing-oil-P416560?skuId=2051902&om_mmc=ppc-GG_1165716902_56760225087_pla-420378096665_2051902_257731959107_9061275_c&country_switch=us&lang=en&ds_rl=1261471&gclid=EAIaIQobChMIisW0iLbK6AIVaR6tBh005wUTEAYYBCABEgJVdvD_BwE&gclsrc=aw.ds"
)
#Scroll to bottom of page b/c its dynamically loading
html = driver.find_element_by_tag_name('html')
html.send_keys(Keys.END)
#scrape stats and comments
comments = driver.find_elements_by_css_selector("div.css-7rv8g1")
print("!!!!!!Comments!!!!!")
print(comments)
OUTPUT:
| => /Users/MYNAME/anaconda3/bin/python /Users/MYNAME/Downloads/attempt33.py
!!!!!!Comments!!!!!
[]
(base)
Empty again. :(
I get the same results when I try to use different element selectors:
#scrape stats and comments
comments = driver.find_elements_by_class_name("css-7rv8g1")
I also get nothing when I tried this:
comments = driver.find_elements_by_xpath(
"//div[#data-comp='GridCell Box']//div[#data-comp='Ellipsis Box ']")
and This (notice the space after Ellipsis Box is gone :
comments = driver.find_elements_by_xpath(
"//div[#data-comp='GridCell Box']//div[#data-comp='Ellipsis Box']")
I have tried using the solutions outlined here and here, but to no avail -- I think there is something about the page or Selenium that I am missing, since this is my first time using Selenium and I'm a complete newbie :(
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
import time
from selenium import webdriver
driver = webdriver.Chrome(executable_path=r"")
driver.maximize_window()
wait = WebDriverWait(driver, 20)
driver.get("https://www.sephora.fr/p/black-ink---classic-line-felt-liner---eyeliner-feutre-precis-waterproof-P3622017.html")
scrolls = 1
while True:
    scrolls -= 1
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
    time.sleep(3)
    if scrolls < 0:
        break
reviewText = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//ol[@class='bv-content-list bv-content-list-reviews']//li//div[@class='bv-content-summary-body']//div[1]")))
for textreview in reviewText:
    print(textreview.text)
Output:
I've been scraping reviews from Sephora and basically, even if there is plenty of room for improvement, it works like this:
Clicks on "reviews" to access the reviews
Loads all reviews by scrolling until there aren't any reviews left to load
Finds review text and skin type by CSS selector
def load_all_reviews(driver):
    while True:
        try:
            driver.execute_script(
                "arguments[0].scrollIntoView(true);",
                WebDriverWait(driver, 10).until(
                    EC.visibility_of_element_located(
                        (By.CSS_SELECTOR, ".bv-content-btn-pages-load-more")
                    )
                ),
            )
            driver.execute_script(
                "arguments[0].click();",
                WebDriverWait(driver, 20).until(
                    EC.element_to_be_clickable(
                        (By.CSS_SELECTOR, ".bv-content-btn-pages-load-more")
                    )
                ),
            )
        except Exception as e:
            break

def get_review_text(review):
    try:
        return review.find_element(By.CLASS_NAME, "bv-content-summary-body-text").text
    except:
        return "NA"  # in case it doesn't find a review

def get_skin_type(review):
    try:
        return review.find_element(By.XPATH, '//*[@id="BVRRContainer"]/div/div/div/div/ol/li[2]/div[1]/div/div[2]/div[5]/ul/li[4]/span[2]').text
    except:
        return "NA"  # in case it doesn't find a skin type
To use those, you've got to create a webdriver and first call the load_all_reviews() function.
Then you've got to find the reviews with:
reviews = driver.find_elements(By.CSS_SELECTOR, ".bv-content-review")
and finally you can call the get_review_text() and get_skin_type() functions for each review:
for review in reviews:
    print(get_review_text(review))
    print(get_skin_type(review))
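Putting it together, a minimal sketch (assuming chromedriver is available on your PATH and using one of the Sephora product URLs from the question; the review-widget selectors are the ones from the functions above and may have changed since):
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()  # assumes chromedriver is on the PATH
driver.get("https://www.sephora.com/product/the-porefessional-face-primer-P264900")
load_all_reviews(driver)  # keep clicking "load more" until the button disappears
reviews = driver.find_elements(By.CSS_SELECTOR, ".bv-content-review")
for review in reviews:
    print(get_review_text(review))
    print(get_skin_type(review))
driver.quit()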
I'm a beginner at programming. I'm trying to get my Instagram follower list, but I only get 12 followers. I first tried clicking the box and scrolling down, but it didn't work.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
url= "https://www.instagram.com/"
driver.get(url)
time.sleep(1)
kullaniciAdiGir = driver.find_element(By.XPATH, "//*[@id='loginForm']/div/div[1]/div/label/input")
kullaniciAdiGir.send_keys("USERNAME")
sifreGir = driver.find_element(By.XPATH, "//input[@name='password']")
sifreGir.send_keys("PASS")
girisButonu = driver.find_element(By.XPATH, "//*[@id='loginForm']/div/div[3]/button/div").click()
time.sleep(5)
driver.get(url="https://www.instagram.com/USERNAME/")
time.sleep(3)
kutucuk= driver.get(url="https://www.instagram.com/USERNAME/followers/")
time.sleep(5)
box = driver.find_element(By.XPATH, "//div[@class='xs83m0k xl56j7k x1iy3rx x1n2onr6 x1sy10c2 x1h5jrl4 xieb3on xmn8rco x1hfn5x7 x13wlyjk x1v7wizp x1l0w46t xa3vuyk xw8ag78']")
box.click()
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(5)
takipciler = driver.find_elements(By.CSS_SELECTOR, "._ab8y._ab94._ab97._ab9f._ab9k._ab9p._abcm")
for takipci in takipciler:
    print(takipci.text)
time.sleep(10)
How can I fix it? How can I scroll down inside the box? Thanks.
You can select multiple elements with this:
# get all followers
followers = driver.find_elements(By.CSS_SELECTOR, "._ab8y._ab94._ab97._ab9f._ab9k._ab9p._abcm")
# loop over each follower
for user in followers:
    print(user.text)  # do something with each follower here
Using CSS selectors is, in my opinion, much easier.
Also note I used find_elements, not find_element, as the latter only returns a single result.
As the data is loaded dynamically, you will in fact have to scroll to make the site load more results, then compare what you have against what is already loaded -- probably by executing some JavaScript, like scrolling the last element in that container into view, as sketched below.
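For example, a rough sketch of that scroll-and-compare loop (the CSS selector is copied from your code and is an assumption, since Instagram's generated class names change frequently):
import time
from selenium.webdriver.common.by import By

def load_all_followers(driver, pause=2, max_rounds=50):
    seen = 0
    followers = []
    for _ in range(max_rounds):
        followers = driver.find_elements(By.CSS_SELECTOR, "._ab8y._ab94._ab97._ab9f._ab9k._ab9p._abcm")
        if not followers or len(followers) == seen:
            break  # nothing new was loaded, so we have them all
        seen = len(followers)
        # scroll the last loaded follower into view so the dialog fetches more
        driver.execute_script("arguments[0].scrollIntoView(true);", followers[-1])
        time.sleep(pause)
    return followers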
or take a look here for an alternative solution
https://www.folkstalk.com/2022/10/how-to-get-a-list-of-followers-on-instagram-python-with-code-examples.html
or look at Instagram's API; there is probably something in there for getting your followers.
I am new to the Selenium framework and I must say it is an awesome library. I am basically trying to get all links from a webpage that have a particular id, "pagination", and separate them from links that don't have that id, because I want to go through all the pages behind those links.
for j in browser.find_elements(By.CSS_SELECTOR, "div#col-content > div.main-menu2.main-menu-gray strong a[href]"):
    print(j.get_property('href'))
The code above gets all the links with and without pagination.
Example links with pagination:
https://www.oddsportal.com/soccer/africa/africa-cup-of-nations-2015/results/
https://www.oddsportal.com/soccer/england/premier-league-2020-2021/results/
https://www.oddsportal.com/soccer/africa/africa-cup-of-nations-2021/results/
https://www.oddsportal.com/soccer/africa/africa-cup-of-nations-2019/results/
Example links without pagination:
https://www.oddsportal.com/soccer/africa/africa-cup-of-nations/results/
In my code, I check whether the given ID exists on the page with pagination = browser.find_element(By.ID, "pagination"), but I stumble on an error. I understand the reason for the error: the ID "pagination" does not exist on some of the pages.
no such element: Unable to locate element: {"method":"css selector","selector":"[id="pagination"]"}
I changed the above code to pagination = browser.find_elements(By.ID, "pagination"), which returns links both with and without pagination. So my question is: how can I get the links that have a particular id from a list of links?
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.by import By
import time
import tqdm
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
#define our URL
url = 'https://oddsportal.com/results/'
path = r'C:\Users\Glodaris\OneDrive\Desktop\Repo\Scraper\chromedriver.exe'
options = ChromeOptions()
options.headless = True
# options=options
browser = Chrome(executable_path=path, options=options)
browser.get(url)
title = browser.title
print('Title', title)
links = []
for i in browser.find_elements(By.CSS_SELECTOR, "div#archive-tables tbody tr[xsid='1'] td a[href]"):
    links.append(i.get_property('href'))
arr = []
condition = True
while condition:
    for link in links:
        second_link = browser.get(link)
        for j in browser.find_elements(By.CSS_SELECTOR, "div#col-content > div.main-menu2.main-menu-gray strong a[href]"):
            browser.implicitly_wait(2)
            pagination = browser.find_element(By.ID, "pagination")
            if pagination:
                print(pagination.get_property('href'))
            else:
                print(j.get_property('href'))
    try:
        browser.find_elements("xpath", "//*[@id='pagination']/a[6]")
    except:
        condition = False
As you are using Selenium, you are able to actually click on the pagination's forward button to navigate through pages.
The following example will test for the cookie button, scrape the data from the main table as a dataframe, and check whether there is pagination; if not, it will stop there. If there is pagination, it will navigate to the next page, get the data from the table, navigate to the next page, and so on, until the table data from the current page is identical to the table data from the previous page, at which point it will stop. It can handle any number of pages. The setup in the code below is for Linux; what you need to pay attention to is the imports part, as well as the part after you define the browser/driver.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time as t
import pandas as pd
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
webdriver_service = Service("chromedriver/chromedriver") ## path to where you saved chromedriver binary
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)
# url='https://www.oddsportal.com/soccer/africa/africa-cup-of-nations/results/'
url = 'https://www.oddsportal.com/soccer/africa/africa-cup-of-nations-2021/results/'
browser.get(url)
try:
    WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.ID, "onetrust-reject-all-handler"))).click()
except Exception as e:
    print('no cookie button!')
games_table = WebDriverWait(browser, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "table[id='tournamentTable']")))
try:
    initial_games_table_data = games_table.get_attribute('outerHTML')
    dfs = pd.read_html(initial_games_table_data)
    print(dfs[0])
except Exception as e:
    print(e, 'Unfortunately, no matches can be displayed because there are no odds available from your selected bookmakers.')
while True:
    browser.execute_script("window.scrollTo(0,document.body.scrollHeight);")
    t.sleep(1)
    try:
        forward_button = WebDriverWait(browser, 20).until(EC.element_to_be_clickable((By.XPATH, "//div[@id='pagination']//span[text()='»']")))
        forward_button.click()
    except Exception as e:
        print(e, 'no pagination, stopping here')
        break
    games_table = WebDriverWait(browser, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "table[id='tournamentTable']")))
    dfs = pd.read_html(games_table.get_attribute('outerHTML'))
    games_table_data = games_table.get_attribute('outerHTML')
    if games_table_data == initial_games_table_data:
        print('this is the last page')
        break
    print(dfs[0])
    initial_games_table_data = games_table_data
    print('went to next page')
    t.sleep(3)
You are seeing the error message...
no such element: Unable to locate element: {"method":"css selector","selector":"[id="pagination"]"}
...as not all of the pages contain the element:
<div id="pagination">
<a ...>
<a ...>
<a ...>
</div>
Solution
In these cases your best approach would be to wrap the code block in a try-except block as follows:
for j in browser.find_elements(By.CSS_SELECTOR, "div#col-content > div.main-menu2.main-menu-gray strong a[href]"):
    try:
        WebDriverWait(browser, 20).until(EC.visibility_of_element_located((By.ID, "pagination")))
        print([my_elem.get_attribute("href") for my_elem in WebDriverWait(browser, 20).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "div#pagination a[href*='page']")))])
    except:
        print("Pagination not available")
        continue
Note: You have to add the following imports:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
Update
A couple of things to note.
The (By.ID, "pagination") element doesn't have an href attribute, but several of its descendants do, so you may get conflicting results.
As you are using WebDriverWait, remember to remove all instances of implicitly_wait(), as mixing implicit and explicit waits can cause unpredictable wait times. For example, setting an implicit wait of 10 seconds and an explicit wait of 15 seconds could cause a timeout to occur after 20 seconds.
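If the goal is just to split the collected links into pages with and without pagination, a minimal sketch (reusing the links list and browser from your own code) can rely on find_elements(), which returns an empty list instead of raising when nothing matches:
paginated, plain = [], []
for link in links:
    browser.get(link)
    # find_elements() returns [] when the id is absent, so no exception is thrown
    if browser.find_elements(By.ID, "pagination"):
        paginated.append(link)
    else:
        plain.append(link)
print("with pagination:", paginated)
print("without pagination:", plain)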
I'm quite a noob in Python and right now I'm building a web scraper in Selenium that should take all the URLs for the products in the clicked 'tab' on the web page. But my code takes the URLs from the first 'tab'. Code below. Thank you guys, I'm starting to get kind of frustrated lol.
Screenshot
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from lxml import html
PATH = 'C:\Program Files (x86)\chromedriver.exe'
driver = webdriver.Chrome(PATH)
url = 'https://www.alza.sk/vypredaj-akcia-zlava/e0.htm'
driver.get(url)
driver.find_element_by_xpath('//*[@id="tabs"]/ul/li[2]').click()
links = []
try:
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'blockFilter')))
    link = driver.find_elements_by_xpath("//a[@class='name browsinglink impression-binded']")
    for i in link:
        links.append(i.get_attribute('href'))
finally:
    driver.quit()
print(links)
To select current tab:
current_tab = driver.current_window_handle
To switch between tabs (the first form is deprecated in newer Selenium versions; prefer the second):
driver.switch_to_window(driver.window_handles[1])
driver.switch_to.window(driver.window_handles[-1])
Assuming you have located the link element that opens the new tab as tab_link, you should try:
from selenium.webdriver.common.action_chains import ActionChains
action = ActionChains(driver)
action.key_down(Keys.CONTROL).click(tab_link).key_up(Keys.CONTROL).perform()
Also, apparently the li doesn't have a click event. Are you sure the element you are getting with '//*[@id="tabs"]/ul/li[2]' has the aria-selected property set to true, or any of these classes: ui-tabs-active ui-state-active?
If not, you should call click on the a tag inside this li.
Then you should increase the timeout parameter of your WebDriverWait to guarantee that the div is loaded.
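For instance, a minimal sketch of clicking the anchor inside the second tab and then collecting the product links (the XPath and class names mirror the ones from the question and are not verified against the live page):
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

wait = WebDriverWait(driver, 20)
# click the <a> inside the second tab instead of the <li> itself
tab_link = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="tabs"]/ul/li[2]/a')))
tab_link.click()
# wait for the newly selected tab's product list to render before scraping
wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'blockFilter')))
links = [a.get_attribute('href') for a in driver.find_elements(By.XPATH, "//a[contains(@class, 'browsinglink')]")]
print(links)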
Currently I am working on a web crawler that should be able to download the text of a Dutch newspaper archive. The first link works correctly, but the second link suddenly produces an error which I do not know how to fix.
It seems that Selenium is unable to click the button on the second link, while it succeeds in doing so on the first link.
Do you know what causes the second link (the Telegraaf page) to fail?
UPDATE CODE:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import pandas as pd
import numpy as np
import re
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
#Set up the path to the chrome driver
driver = webdriver.Chrome()
html = driver.find_element_by_tag_name('html')
all_details = []
for c in range(1,2):
    try:
        driver.get("https://www.delpher.nl/nl/kranten/results?query=kernenergie&facets%5Bpapertitle%5D%5B%5D=Algemeen+Dagblad&facets%5Bpapertitle%5D%5B%5D=De+Volkskrant&facets%5Bpapertitle%5D%5B%5D=De+Telegraaf&facets%5Bpapertitle%5D%5B%5D=Trouw&page={}&sortfield=date&cql%5B%5D=(date+_gte_+%2201-01-1970%22)&cql%5B%5D=(date+_lte_+%2201-01-2018%22)&coll=ddd".format(c))
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        incategory = driver.find_elements_by_class_name("search-result")
        print(driver.current_url)
        links = [i.find_element_by_class_name("thumbnail search-result__thumbnail").get_attribute("href") for i in incategory]
        # Let's loop through each link to access the page of each book
        for link in links:
            # get one book url
            driver.get(link)
            # newspaper
            newspaper = driver.find_element_by_xpath("//*[@id='content']/div[2]/div/div[2]/header/h1/span[2]")
            # date of the article
            date = driver.find_element_by_xpath("//*[@id='content']/div[2]/div/div[2]/header/div/ul/li[1]")
            # click button and find title
            div_element = WebDriverWait(driver, 60).until(expected_conditions.presence_of_element_located((By.XPATH, '//*[@id="object"]/div/div/div')))
            hover = ActionChains(driver).move_to_element(div_element)
            hover.perform()
            div_element.click()
            button = WebDriverWait(driver, 90).until(expected_conditions.presence_of_element_located((By.XPATH, '//*[@id="object-viewer__ocr-button"]')))
            hover = ActionChains(driver).move_to_element(button)
            hover.perform()
            button.click()
            element = driver.find_element_by_css_selector(".object-viewer__ocr-panel-results")
            driver.execute_script("$(arguments[0]).click();", element)
            # content of article
            try:
                content = driver.find_elements_by_xpath("//*[contains(text(), 'kernenergie')]").text
            except:
                content = None
            # Define a dictionary with the details we need
            r = {
                "1Newspaper": newspaper.text,
                "2Date": date.text,
                "3Content": content,
            }
            # append r to all_details
            all_details.append(r)
    except Exception as e:
        print(str(e))
        pass

# save the information into a CSV file
df = pd.DataFrame(all_details)
df = df.to_string()
time.sleep(3)
driver.close()
So you have some problems.
driver.implicitly_wait(10)
Should only be used once
links = [ i.find_element_by_class_name("search-result__thumbnail-link").get_attribute("href") for i in incategory]
Is a more useful way to get all links
print(driver.current_url)
Could replace
print("https://www.delpher.nl/nl/kranten/results?query=kernenergie&facets%5Bpapertitle%5D%5B%5D=Algemeen+Dagblad&facets%5Bpapertitle%5D%5B%5D=De+Volkskrant&facets%5Bpapertitle%5D%5B%5D=De+Telegraaf&facets%5Bpapertitle%5D%5B%5D=Trouw&page={}&sortfield=date&cql%5B%5D=(date+_gte_+%2201-01-1970%22)&cql%5B%5D=(date+_lte_+%2201-01-2018%22)&coll=ddd".format(c))
No need for url=link
for link in links:
    driver.get(link)
Your title doesn't actually get picked up on the second page. Use something like this for all the values:
try:
    content = driver.find_element_by_xpath('//*[@id="object-viewer__ocr-panel"]/div[2]/div[5]').text
except:
    content = None
# Define a dictionary
r = {
"1Newspaper":newspaper,
"2Date":date,
"3Title": title,
"4Content": content,
}
You can replace your exception handler with the following to figure out which line causes the problem:
except Exception as e:
    print(str(e))
    pass
It might be that the button you are trying to reach is inside an iframe, which means you have to switch into that frame before searching for the XPath:
iframe = driver.find_element_by_tag_name('iframe')
driver.switch_to.frame(iframe)
There is also a possibility that the object you're trying to click is not visible yet, which could be solved with an explicit wait/timeout.
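A rough sketch combining both ideas, assuming there is at most one iframe on the page (the OCR-button XPath is the one from your own code):
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

iframes = driver.find_elements_by_tag_name('iframe')
if iframes:
    driver.switch_to.frame(iframes[0])  # search inside the frame if one exists
button = WebDriverWait(driver, 30).until(
    EC.element_to_be_clickable((By.XPATH, '//*[@id="object-viewer__ocr-button"]')))
button.click()
driver.switch_to.default_content()  # switch back to the main document afterwards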
I'm trying to scrape more than 10 pages of reviews from https://www.innisfree.com/kr/ko/ProductReviewList.do
However, when I move to the next page and try to get the new page's reviews, I still get only the first page's reviews.
I used driver.execute_script("goPage(2)") and also time.sleep(5), but my code only gives me the first page's reviews.
(I did not use a for-loop, just to see whether the results differ between page 1 and page 2. I imported BeautifulSoup and Selenium.)
Here is my code:
url = "https://www.innisfree.com/kr/ko/ProductReviewList.do"
chromedriver = r'C:\Users\hhm\Downloads\chromedriver_win32\chromedriver.exe'
driver = webdriver.Chrome(chromedriver)
driver.get(url)
print("this is page 1")
driver.execute_script("goPage(1)")
nTypes = soup.select('.reviewList ul .newType div[class^=reviewCon] .reviewConTxt')
for nType in nTypes:
    product = nType.select_one('.pdtName').text
    print(product)
print('\n')
print("this is page 2")
driver.execute_script("goPage(2)")
time.sleep(5)
nTypes = soup.select('.reviewList ul .newType div[class^=reviewCon] .reviewConTxt')
for nType in nTypes:
    product = nType.select_one('.pdtName').text
    print(product)
If your second page opens as a new window, then you need to switch to that window and point your Selenium control at it.
Example:
# Opens a new tab
driver.execute_script("window.open()")
# Switch to the newly opened tab
driver.switch_to.window(driver.window_handles[1])
Source:
How to switch to new window in Selenium for Python?
https://www.techbeamers.com/switch-between-windows-selenium-python/
Try the following code. You need to click on each pagination link to reach the next page; you will get all 100 review comments.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
url = "https://www.innisfree.com/kr/ko/ProductReviewList.do"
chromedriver = r'C:\Users\hhm\Downloads\chromedriver_win32\chromedriver.exe'
driver = webdriver.Chrome(chromedriver)
driver.get(url)
for i in range(2, 12):
    time.sleep(2)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    nTypes = soup.select('.reviewList ul .newType div[class^=reviewCon] .reviewConTxt')
    for nType in nTypes:
        product = nType.select_one('.pdtName').text
        print(product)
    if i == 11:
        break
    nextbutton = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//span[@class='num']/a[text()='" + str(i) + "']")))
    driver.execute_script("arguments[0].click();", nextbutton)