WebScrapping with Selenium and BeaufitulSoup can't find anything - python

I am trying to extract all the description in the links in the class="publication u-padding-xs-ver js-publication" of this website: https://www.sciencedirect.com/browse/journals-and-books?accessType=openAccess&accessType=containsOpenAccess
I tried both with BeautifulSoup and Selenium but I can't extract anything. You can see in the image below the result I got
result
Here is the code I am using
options = Options()
options.add_argument("headless")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
ul = driver.find_element(By.ID, "publication-list")
print("Links")
allLi = ul.find_elements(By.TAG_NAME, "li")
for li in allLi:
print("Links " + str(count) + " " + li.text)

You are missing waits.
You have to wait for elements to become visible before accessing them.
The best approach to do that is with use of WebDriverWait expected_conditions explicit waits.
The following code works
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
options = Options()
options.add_argument("start-maximized")
webdriver_service = Service('C:\webdrivers\chromedriver.exe')
driver = webdriver.Chrome(options=options, service=webdriver_service)
wait = WebDriverWait(driver, 20)
url = "https://www.sciencedirect.com/browse/journals-and-books?accessType=openAccess&accessType=containsOpenAccess"
driver.get(url)
ul = wait.until(EC.visibility_of_element_located((By.ID, "publication-list")))
allLi = wait.until(EC.presence_of_all_elements_located((By.TAG_NAME, "li")))
print(len(allLi))
the output is:
167

Related

How can I wait for element attribute value with Selenium 4 n python?

I want to handle the progress bar to be stopped after a certain percent let's say 70%. So far I have got the solutions, they all are using .attributeToBe() method. But in selenium 4 I don't have that method present. How can I handle that?
This is the demo link - https://demoqa.com/progress-bar
I have tried to do that using explicit wait like others but I didn't find .attributrToBe() method in selenium 4. Is it possible to do that using loop?
You can use text_to_be_present_in_element_attribute expected_conditions.
The following code works:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
options = Options()
options.add_argument("start-maximized")
webdriver_service = Service('C:\webdrivers\chromedriver.exe')
driver = webdriver.Chrome(options=options, service=webdriver_service)
wait = WebDriverWait(driver, 60)
url = "https://demoqa.com/progress-bar"
driver.get(url)
wait.until(EC.element_to_be_clickable((By.ID, "startStopButton"))).click()
wait.until(EC.text_to_be_present_in_element_attribute((By.CSS_SELECTOR, '[role="progressbar"]'),"aria-valuenow","70"))
wait.until(EC.element_to_be_clickable((By.ID, "startStopButton"))).click()
UPD
For the second page the following code works as well:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
options = Options()
options.add_argument("start-maximized")
webdriver_service = Service('C:\webdrivers\chromedriver.exe')
driver = webdriver.Chrome(options=options, service=webdriver_service)
wait = WebDriverWait(driver, 60)
url = "http://uitestingplayground.com/progressbar"
driver.get(url)
wait.until(EC.element_to_be_clickable((By.ID, "startButton"))).click()
wait.until(EC.text_to_be_present_in_element_attribute((By.CSS_SELECTOR, '[role="progressbar"]'),"aria-valuenow","75"))
wait.until(EC.element_to_be_clickable((By.ID, "stopButton"))).click()

Can't find element by name using selenium

I'm using selenium 4.7.2 and can't find the element by its name. The following code returns NoSuchElementException error:
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
# Get the website using the Chrome webbdriver
browser = webdriver.Chrome()
browser.get('https://www.woofshack.com/en/cloud-chaser-waterproof-softshell-dog-jacket-ruffwear-rw-5102.html')
# Print out the result
price = browser.find_element(By.NAME, 'data-price-665')
print("Price: " + price.text)
# Close the browser
time.sleep(3)
browser.close()
What's wrong in using find_element method?
Looks like you are using a wrong locator here. I see no element with name attribute value 'data-price-665' on that page.
The following code is working:
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
options = Options()
options.add_argument("start-maximized")
webdriver_service = Service('C:\webdrivers\chromedriver.exe')
driver = webdriver.Chrome(service=webdriver_service, options=options)
wait = WebDriverWait(driver, 20)
actions = ActionChains(driver)
url = "https://www.woofshack.com/en/cloud-chaser-waterproof-softshell-dog-jacket-ruffwear-rw-5102.html"
driver.get(url)
price = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "#product-price-665 .price")))
print("Price: " + price.text)
The output is:
Price: €112.95

xpath give empty output using selenium

I am not getting price they give me empty output this is page link https://www.amazon.com/dp/B00M0DWQYI?th=1
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support.select import Select
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
url='https://www.amazon.com/dp/B00M0DWQYI?th=1'
PATH="C:\Program Files (x86)\chromedriver.exe"
driver =webdriver.Chrome(PATH)
driver.get(url)
item=dict()
try:
item['price'] = driver.find_element(By.XPATH, "//div[#id='corePrice_feature_div'] //span[#class='a-offscreen']").text
except:
item['price']=''
print(item)
You may want to wait for that element to properly load, prior to locating it:
[...]
wait = WebDriverWait(driver, 10)
item['price'] = wait.until(EC.element_to_be_clickable((By.XPATH, "//div[#id='corePrice_feature_div']//span[#class='a-offscreen']"))).text
Selenium documentation can be found at https://www.selenium.dev/documentation/
EDIT: Here is a complete example of how you can get that information:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time as t
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument('disable-notifications')
chrome_options.add_argument("window-size=1920,1080")
webdriver_service = Service("chromedriver/chromedriver") ## path to where you saved chromedriver binary
driver = webdriver.Chrome(service=webdriver_service, options=chrome_options)
wait = WebDriverWait(driver, 5)
items = dict()
driver.get('https://www.amazon.com/dp/B00M0DWQYI?th=1')
t.sleep(1)
driver.refresh()
items['price'] = wait.until(EC.element_to_be_clickable((By.XPATH, '//div[#id="corePrice_feature_div"]//span[#class="a-price aok-align-center"]'))).text.replace('\n', '.')
print(items)
Result in terminal:
{'price': '$32.98'}
You need to wait for element visibility and then to extract it's text.
The following Selenium code works:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
options = Options()
options.add_argument("start-maximized")
webdriver_service = Service('C:\webdrivers\chromedriver.exe')
driver = webdriver.Chrome(service=webdriver_service, options=options)
url = 'https://www.amazon.com/dp/B00M0DWQYI'
driver.get(url)
wait = WebDriverWait(driver, 10)
print(wait.until(EC.visibility_of_element_located((By.XPATH, "//div[#id='corePrice_feature_div']"))).text)
The output is
$32
98
You can use bs4 and it will work fine
from bs4 import BeautifulSoup
soup = BeautifulSoup(driver.page_source, 'lxml')
try:
item['price'] = soup.find('input', id="attach-base-product-price").get('value')
except:
item['price'] = ''
finally:
driver.close()
driver.quit()
print(item)

Selenium Python Paste Not Working in Headless Mode

Python selenium paste isnt working in headless mode, i tried CONTROL + V, SHIFT + INSERT, with pyperclip3, pyperclip, klembord, but nothing seems to working, here is the code
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import pyperclip3
import time
Desc = """<p><iframe src="https://www.youtube.com/embed/-grLLLTza6k" frameborder="0" allowfullscreen=""></iframe></p><p><img src="https://images-na.ssl-images-amazon.com/images/I/71Zxjh0AdpL.png" style="width:100%;max-width:450px;clear:both;"></p><p></p><h2>10 Undeniable Reasons People Hate cheats</h2>"""
options = Options()
options.binary_location = r'CHROME_BINARY_LOCATION'
options.add_argument("--headless")
webdriver.Chrome(executable_path=r'CHROME_DRIVER_LOCATION', options=options)
driver.implicitly_wait(10)
driver.get("http://google.com")
time.sleep(2)
pyperclip3.copy(Desc)
element = driver.find_element(By.XPATH, '//input[#name="q"]')
# element.send_keys(Keys.CONTROL, 'v')
a = pyperclip3.paste()
element.send_keys(Keys.SHIFT, Keys.INSERT)
element.send_keys(Keys.CONTROL, 'v')
time.sleep(2)
driver.save_screenshot("image.png")
driver.close()
driver.quit()
I tried the below code with explicit waits, and it seems to do the job with pyperclip :
options = webdriver.ChromeOptions()
options.add_argument("--disable-infobars")
options.add_argument("--start-maximized")
options.add_argument("--disable-extensions")
options.add_experimental_option("prefs", {"profile.default_content_setting_values.notifications": 2})
options.add_argument('--window-size=1920,1080')
options.add_argument("--headless")
options.add_experimental_option("prefs", {"profile.default_content_settings.cookies": 2})
driver = webdriver.Chrome(executable_path=driver_path, options = options)
driver.implicitly_wait(30)
driver.maximize_window()
driver.get("http://google.com")
wait = WebDriverWait(driver, 20)
desc = '''<p><iframe src="https://www.youtube.com/embed/-grLLLTza6k" frameborder="0" allowfullscreen=""></iframe></p><p><img src="https://images-na.ssl-images-amazon.com/images/I/71Zxjh0AdpL.png" style="width:100%;max-width:450px;clear:both;"></p><p></p><h2>10 Undeniable Reasons People Hate cheats</h2>'''
pyperclip.copy(desc)
time.sleep(1)
wait.until(EC.visibility_of_element_located((By.NAME, 'q'))).send_keys(pyperclip.paste())
print("Succesful")
driver.save_screenshot("image.png")
Imports :
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pyperclip as pyperclip

Headless Chrome not loading page

Headless Chrome does not load the page and gets stuck at:
wait = WebDriverWait(driver, 30)
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".price-link:nth-child(1) .team-name")))
Why is it doing this? Is this a bug as it works perfectly in normal chrome and prints h below except in headless...
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
options = Options()
options.add_argument("--headless")
options.add_argument("window-size=1400,600")
driver = webdriver.Chrome(chrome_options=options)
#driver = webdriver.Chrome()
driver.set_window_size(1024, 600)
driver.maximize_window()
driver.get('https://www.sportsbet.com.au/betting/soccer?LeftNav')
print('?')
wait = WebDriverWait(driver, 30)
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".price-link:nth-child(1) .team-name")))
print('h')
Another example:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
options = Options()
options.add_argument("--headless")
options.add_argument("window-size=1400,600")
driver = webdriver.Chrome(chrome_options=options)
#driver = webdriver.Chrome()
driver.set_window_size(1024, 600)
driver.maximize_window()
import time
driver.get('https://www.sportsbet.com.au/betting/soccer?LeftNav')
import time
time.sleep(10)
langs = driver.find_elements_by_css_selector(".price-link:nth-child(1) .team-name")
langs_text = []
for lang in langs:
print(lang.text)
langs_text.append(lang.text)
print('h')
Job prints with removed wait until though nothing is scraped. Page not loading is likely issue.
Try to use the following code:
element = WebDriverWait(driver, 30).until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".price-link:nth-child(1) .team-name")))
driver.execute_script("arguments[0].scrollIntoView(true);", element)
Hope it helps you!

Categories

Resources