I'm trying to scrape a website (https://harleytherapy.com/therapists?page=1) that looks like it's been generated by JavaScript, and the element I'm trying to scrape (the ul with id="downshift-7-menu") doesn't appear in the "Page source" but only after I click on "Inspect element".
I tried to find a solution here and so far this is the code I was able to come up with (a combination of Selenium + BeautifulSoup):
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import time
url = "https://harleytherapy.com/therapists?page=1"
options = webdriver.ChromeOptions()
options.add_argument('headless')
capa = DesiredCapabilities.CHROME
capa["pageLoadStrategy"] = "none"
driver = webdriver.Chrome(chrome_options=options, desired_capabilities=capa)
driver.set_window_size(1440,900)
driver.get(url)
time.sleep(15)
plain_text = driver.page_source
soup = BeautifulSoup(plain_text, 'html')
therapist_menu_id = "downshift-7-menu"
print(soup.find(id=therapist_menu_id))
I thought that allowing Selenium to wait for 15 seconds would make sure that all elements are loaded but I still can't find any element with id downshift-7-menu in the soup. Do you guys know what's wrong with my code?
The element with ID downshift-7-menu is loaded only after opening the THERAPIST dropdown menu; you can do that by scrolling it into view and then clicking on it. You should also consider replacing sleep with an explicit wait:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
wait = WebDriverWait(driver, 15)
# scroll the dropdown into view to load it
side_menu = wait.until(EC.visibility_of_element_located((By.CLASS_NAME, 'inner-a377b5')))
last_height = driver.execute_script("return arguments[0].scrollHeight", side_menu)
while True:
driver.execute_script("arguments[0].scrollTo(0, arguments[0].scrollHeight);", side_menu)
new_height = driver.execute_script("return arguments[0].scrollHeight", side_menu)
if new_height == last_height:
break
last_height = new_height
# open the menu
wait.until(EC.visibility_of_element_located((By.ID, 'downshift-7-input'))).click()
# wait for the option to load
therapist_menu_id = 'downshift-7-menu'
wait.until(EC.presence_of_element_located((By.ID, therapist_menu_id)))
soup = BeautifulSoup(driver.page_source, 'html.parser')  # re-parse now that the menu is in the DOM
print(soup.find(id=therapist_menu_id))
I want to click on each product on AliExpress and do something with it.
However, I kept running into an ElementClickInterceptedException.
Please verify that the code is correct before answering the question if you are using ChatGPT or any other AI to help with this problem.
These are the things that I tried
for supplier in suppliers:
driver.execute_script("arguments[0].scrollIntoView();", supplier)
actions = ActionChains(driver)
actions.move_to_element(supplier).click().perform()
for supplier in suppliers:
driver.execute_script("arguments[0].scrollIntoView();", supplier)
actions = ActionChains(driver)
actions.move_to_element(supplier)
    wait.until(EC.visibility_of_element_located((By.XPATH, ".//*[@class='list--gallery--34TropR']//span/a")))
try:
supplier.click()
except ElementClickInterceptedException:
print('object not on screen')
However, this still gives me the highest click-through rate:
for supplier in suppliers:
try:
supplier.click()
print('Supplier clicked')
time.sleep(1)
except ElementClickInterceptedException:
print('object not on screen')
This is how I initialized the driver and loaded the elements.
search_key = "Motor+toy+boat"
suppliers = []
print("https://www.aliexpress.com/premium/"+search_key+".html?spm=a2g0o.best.1000002.0&initiative_id=SB_20221218233848&dida=y")
# create a webdriver object and set the path to the Chrome driver
service = Service('../venv/chromedriver.exe')
driver = webdriver.Chrome(service=service)
# navigate to the Aliexpress website
driver.get("https://www.aliexpress.com/")
# Wait for the page to load
wait = WebDriverWait(driver, 10)
wait.until(EC.presence_of_element_located((By.ID, "search-key")))
# wait for the page to load
driver.implicitly_wait(10)
driver.get("https://www.aliexpress.com/premium/"+search_key+".html?spm=a2g0o.best.1000002.0&initiative_id=SB_20221218233848&dida=y")
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
driver.execute_script("window.scrollBy(0, 800);")
sleep(1)
new_height = driver.execute_script("return document.body.scrollHeight")
if new_height == last_height:
print(new_height, last_height)
break
last_height = new_height
for element in driver.find_elements(By.XPATH, "//*[contains(@class, 'manhattan--container--1lP57Ag cards--gallery--2o6yJVt')]"):
suppliers.append(element)
A couple of issues I have identified:
It is detecting a bot, so after a couple of runs it will stop identifying the element. Use --disable-blink-features in the Chrome options.
Once you iterate the list, it is clicking somewhere else; just wait for a second and then click, and it will work.
The added code will click only the elements visible on the page. If you need to click more, you need to scroll the page first and then click (see the sketch after the code below).
You can check the count of total visible elements on the page.
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import time
chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option("excludeSwitches", ['enable-automation'])
chrome_options.add_experimental_option('useAutomationExtension', False)
chrome_options.add_argument('--disable-blink-features=AutomationControlled')
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()),options=chrome_options)
driver.get("https://www.aliexpress.com/w/wholesale-uk.html")
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//*[contains(@class, 'manhattan--container--1lP57Ag cards--gallery--2o6yJVt')]"))).click()
suppliers = WebDriverWait(driver, 10).until(EC.visibility_of_all_elements_located((By.XPATH, ".//*[@class='list--gallery--34TropR']//span/a")))
print("Total visible element on the page: " + str(len(suppliers)))
for supplier in suppliers:
time.sleep(1)
supplier.click()
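If you need more than what is visible on the first screen, here is a rough sketch of the scroll-then-click idea, building on the driver created above; the XPath is the one from the question, and AliExpress class names change often, so treat it as an assumption:
from selenium.common.exceptions import ElementClickInterceptedException
import time

# scroll in steps so the lazily loaded product cards get rendered
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollBy(0, 800);")
    time.sleep(1)
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height
# now collect the (larger) list of cards and click them one by one
suppliers = WebDriverWait(driver, 10).until(EC.visibility_of_all_elements_located((By.XPATH, ".//*[@class='list--gallery--34TropR']//span/a")))
print("Total elements after scrolling: " + str(len(suppliers)))
for supplier in suppliers:
    driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", supplier)
    time.sleep(1)
    try:
        supplier.click()
        # note: if a click navigates away, you may need driver.back() before the next click
    except ElementClickInterceptedException:
        print("element still covered, skipping")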
I am not getting all the links. There are 403 links on this page, but I am getting only 68. I also used the scroll-down method and it moves to the end of the page, but it still does not give all the links. Is there anything I am doing wrong? Kindly guide me. This is the page link: https://www.ocado.com/search?entry=frozen
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support.select import Select
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
url='https://www.ocado.com/search?entry=frozen'
PATH="C:\Program Files (x86)\chromedriver.exe"
driver =webdriver.Chrome(PATH)
driver.get(url)
SCROLL_PAUSE_TIME = 50
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
# Scroll down to bottom
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
# Wait to load page
time.sleep(SCROLL_PAUSE_TIME)
# Calculate new scroll height and compare with last scroll height
new_height = driver.execute_script("return document.body.scrollHeight")
if new_height == last_height:
break
last_height = new_height
t = driver.find_elements(By.XPATH, "//div[@class='fop-contentWrapper']")
for l in t:
    links = l.find_element(By.XPATH, ".//a[starts-with(@href, '/products')]").get_attribute("href")
print(links)
This should be enough:
# Needed libs
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
# We create the driver
driver = webdriver.Chrome()
# We maximize the window, because otherwise the page layout will be different
driver.maximize_window()
# We navigate to the url
url='https://www.ocado.com/search?entry=frozen'
driver.get(url)
# We click the accept button on the cookies pop-up
WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, "//button[#id='onetrust-accept-btn-handler']"))).click()
# We take the show_more_button, which is at the bottom
show_more_button = driver.find_element(By.XPATH, "//button[text()='Show more']")
# We take the latest product which contains the info we want; it will be more or less in the middle of the page because it is the last one loaded
last_element_with_link = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, "//div[#class='fop-contentWrapper']/a[last()]")))
# If the show_more_button location minus the last_element_with_link location is bigger than 500 px, we have not reached the end of the list yet
while show_more_button.location['y'] - last_element_with_link.location['y'] > 500:
# We get the location of the new last_element_with_link because we should have more elements
last_element_with_link = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, "//div[#class='fop-contentWrapper']/a[last()]")))
    # We scroll until we arrive at the position of this new last_element_with_link
print(f"Scroll to px: {last_element_with_link.location['y']}")
driver.execute_script(f"window.scrollTo(0, {last_element_with_link.location['y']})")
# small sleep to give time to the page
time.sleep(0.1)
# Here we know we are at the bottom, so we can take the links
list_of_elements = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.XPATH, "//div[#class='fop-contentWrapper']/a")))
print(len(list_of_elements))
# For each element we print the url
for element in list_of_elements:
print(element.get_attribute('href'))
Actually there are 403 products per page
https://fbref.com/en/squads/0cdc4311/Augsburg-Stats provides buttons to transform a table to CSV, which I would like to scrape. I click the buttons like this:
elements = driver.find_elements(By.XPATH, '//button[text()="Get table as CSV (for Excel)"]')
for element in elements:
element.click()
but I get an exception
ElementNotInteractableException: Message: element not interactable
This is the element I am trying to click.
Here's the full code (I added Adblock plus as a Chrome extension, which should be configured to test locally):
import pandas as pd
import bs4
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
import time
import os
#activate adblock plus
path_to_extension = '/home/andreas/.config/google-chrome/Default/Extensions/cfhdojbkjhnklbpkdaibdccddilifddb/3.11.4_0'
options = Options()
options.add_argument('load-extension=' + path_to_extension)
#uses Chrome driver in usr/bin/ from https://chromedriver.chromium.org/downloads
driver = webdriver.Chrome(options=options)
#wait and switching back to tab with desired source
time.sleep(5)
driver.switch_to.window(driver.window_handles[0])
NO_OF_PREV_SEASONS = 5
df = pd.DataFrame()
urls = ['https://fbref.com/en/squads/247c4b67/Arminia-Stats']
for url in urls:
driver.get(url)
html = driver.page_source
soup = bs4.BeautifulSoup(html, 'html.parser')
#click button -> accept cookies
element = driver.find_element(By.XPATH, '//button[text()="AGREE"]')
element.click()
for i in range(NO_OF_PREV_SEASONS):
elements = driver.find_elements(By.XPATH, '//button[text()="Get table as CSV (for Excel)"]')
for element in elements:
element.click()
#todo: get data
#click button -> navigate to next page
time.sleep(5)
element = driver.find_element(By.LINK_TEXT, "Previous Season")
element.click()
driver.quit()
The button is inside the drop-down list (i.e. <span>Share & Export</span>), so you need to hover over it first.
e.g.
from selenium.webdriver.common.action_chains import ActionChains
action_chain = ActionChains(driver)
hover = driver.find_element_by_xpath("// span[contains(text(),'Share & Export')]")
action_chain.move_to_element(hover).perform() # hover to show drop down list
driver.execute_script("window.scrollTo(0, 200)") # scroll down a bit
time.sleep(1) # wait for scrolling
button = driver.find_element_by_xpath("// button[contains(text(),'Get table as CSV (for Excel)')]")
action_chain.move_to_element(button).click().perform() # move to button and click
time.sleep(3)
This also happens to me sometimes. One way to overcome this problem is by getting the X and Y coordinates of this button and clicking on it.
import pyautogui
for element in elements:
element_pos = element.location
element_size = element.size
    x_coordinate, y_coordinate = element_pos['x'], element_pos['y']
e_width, e_height = element_size['width'], element_size['height']
click_x = x_coordinate + e_width/2
click_y = y_coordinate + e_height/2
    pyautogui.click(click_x, click_y)
Another solution you may try is to click on the tag that contains this button.
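Here is a minimal sketch of that second idea, reusing the button XPath from the question; the JavaScript click at the end is an extra fallback I would add, not part of the original suggestion:
from selenium.webdriver.common.by import By

button = driver.find_element(By.XPATH, '//button[text()="Get table as CSV (for Excel)"]')
parent = button.find_element(By.XPATH, "./..")  # the tag that contains the button
try:
    parent.click()
except Exception:
    # fall back to a JavaScript click, which is not blocked by overlapping elements
    driver.execute_script("arguments[0].click();", button)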
There are several issues here:
You have to click and open the Share & Export tab and then click the Get table as CSV button.
You have to scroll the page to access the non-first tables.
So, your code can be something like this:
import pandas as pd
import bs4
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
import time
import os
#activate adblock plus
path_to_extension = '/home/andreas/.config/google-chrome/Default/Extensions/cfhdojbkjhnklbpkdaibdccddilifddb/3.11.4_0'
options = Options()
options.add_argument('load-extension=' + path_to_extension)
options.add_argument("window-size=1920,1080")
#uses Chrome driver in usr/bin/ from https://chromedriver.chromium.org/downloads
driver = webdriver.Chrome(options=options)
actions = ActionChains(driver)
#wait and switching back to tab with desired source
time.sleep(5)
#driver.switch_to.window(driver.window_handles[0])
NO_OF_PREV_SEASONS = 5
df = pd.DataFrame()
urls = ['https://fbref.com/en/squads/247c4b67/Arminia-Stats']
for url in urls:
driver.get(url)
html = driver.page_source
soup = bs4.BeautifulSoup(html, 'html.parser')
#click button -> accept cookies
element = driver.find_element(By.XPATH, '//button[text()="AGREE"]')
element.click()
for i in range(NO_OF_PREV_SEASONS):
        elements = driver.find_elements(By.XPATH, "//div[@class='section_heading_text']//li[@class='hasmore']")
for element in elements:
actions.move_to_element(element).perform()
time.sleep(0.5)
element.click()
            WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.XPATH, "//button[@tip='Get a link directly to this table on this page']"))).click()
#todo: get data
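For the #todo: get data part, one possibility (a sketch, not part of the original answer; it assumes the tables are still present in the rendered DOM) is to let pandas parse them straight from the page source:
import pandas as pd

# parse every <table> currently rendered on the page into DataFrames;
# fbref layouts differ per table, so inspect them before concatenating
tables = pd.read_html(driver.page_source)
print(f"Found {len(tables)} tables on {url}")
for t in tables[:2]:
    print(t.head())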
I want to scrape all tweets from Twitter using Selenium. For this I want to go to the bottom of the page. I tried a lot, but it shows "Back to top" as shown in the image.
How can I get to the bottom of the page / make "Back to top" disappear using Selenium, or how can I scrape all the tweets with some other approach?
import pandas as pd
import selenium
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
driver=webdriver.Firefox(executable_path="/home/piyush/geckodriver")
url="https://twitter.com/narendramodi"
driver.get(url)
time.sleep(6)
lastHeight = driver.execute_script("return document.body.scrollHeight")
while True:
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(6)
newHeight = driver.execute_script("return document.body.scrollHeight")
if newHeight == lastHeight:
break
lastHeight = newHeight
soup=BeautifulSoup(driver.page_source.encode("utf-8"),"html.parser")
tweet=[p.text for p in soup.find_all("p",class_="tweet-text")]
Here is an image of the inspected "Back to top" element.
Here is the output image.
Just briefly looking at twitter, it appears that the content is generated on scrolling, meaning you need to scrape and parse the data as you scroll rather than after.
I would suggest moving
soup = BeautifulSoup(driver.page_source.encode("utf-8"),"html.parser")
tweet = [p.text for p in soup.find_all("p",class_="tweet-text")]
into your while loop after the scroll:
while True:
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
###
soup = BeautifulSoup(driver.page_source.encode("utf-8"),"html.parser")
tweet = [p.text for p in soup.find_all("p",class_="tweet-text")]
###
time.sleep(6)
newHeight = driver.execute_script("return document.body.scrollHeight")
if newHeight == lastHeight:
break
lastHeight = newHeight
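Note that tweet is overwritten on every pass; since Twitter can drop older tweets from the DOM as the list grows, you may want to accumulate the texts across iterations instead, e.g. (a sketch of the same loop):
all_tweets = []
seen = set()
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    soup = BeautifulSoup(driver.page_source.encode("utf-8"), "html.parser")
    for p in soup.find_all("p", class_="tweet-text"):
        if p.text not in seen:  # keep each tweet only once
            seen.add(p.text)
            all_tweets.append(p.text)
    time.sleep(6)
    newHeight = driver.execute_script("return document.body.scrollHeight")
    if newHeight == lastHeight:
        break
    lastHeight = newHeight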
If this doesn't work you are probably being fingerprinted and labeled as a bot by Twitter.
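If that turns out to be the case, one commonly suggested (and by no means guaranteed) mitigation for Firefox is to start it with the dom.webdriver.enabled preference turned off:
from selenium import webdriver

options = webdriver.FirefoxOptions()
# commonly suggested preference to make automation less detectable; effectiveness varies
options.set_preference("dom.webdriver.enabled", False)
driver = webdriver.Firefox(executable_path="/home/piyush/geckodriver", options=options)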
I've written a script in Python to scrape names from a slow-loading webpage. There are 1000 names on that page, and the full content can only be loaded when the browser is scrolled all the way down. My script can successfully reach the lowest portion of the page and parse all the names. The issue I'm facing is that I've used a hardcoded delay, 5 seconds in this case, which makes the browser wait unnecessarily even when the items are already loaded. So how can I use an explicit wait to overcome this situation and parse all the items?
Here is the script I've written so far:
from selenium import webdriver
import time
driver = webdriver.Chrome()
driver.get("http://fortune.com/fortune500/list/")
check_height = driver.execute_script("return document.body.scrollHeight;")
while True:
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(5)
height = driver.execute_script("return document.body.scrollHeight;")
if height == check_height:
break
check_height = height
listElements = driver.find_elements_by_css_selector(".company-title")
for item in listElements:
print(item.text)
You can add an explicit wait as below:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium import webdriver
driver = webdriver.Chrome()
driver.get("http://fortune.com/fortune500/list/")
check_height = driver.execute_script("return document.body.scrollHeight;")
while True:
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
try:
WebDriverWait(driver, 10).until(lambda driver: driver.execute_script("return document.body.scrollHeight;") > check_height)
check_height = driver.execute_script("return document.body.scrollHeight;")
    except TimeoutException:
break
listElements = driver.find_elements_by_css_selector(".company-title")
for item in listElements:
print(item.text)
This should allow you to avoid hardcoding time.sleep(); instead you're just waiting for the height value to change, or breaking the loop in case the height is still the same 10 seconds after scrolling.
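If you would rather wait on the items themselves than on the page height, a variation (a sketch, not part of the answer above) is to wait after each scroll for the number of .company-title elements to grow:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException

driver = webdriver.Chrome()
driver.get("http://fortune.com/fortune500/list/")
count = len(driver.find_elements(By.CSS_SELECTOR, ".company-title"))
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    try:
        # wait up to 10 seconds for more names to be rendered than before
        WebDriverWait(driver, 10).until(
            lambda d: len(d.find_elements(By.CSS_SELECTOR, ".company-title")) > count)
        count = len(driver.find_elements(By.CSS_SELECTOR, ".company-title"))
    except TimeoutException:
        break  # no new names appeared, assume we reached the end of the list
for item in driver.find_elements(By.CSS_SELECTOR, ".company-title"):
    print(item.text)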
You need to use explicit waits, like this:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Firefox()
driver.get("http://somedomain/url_that_delays_loading")
try:
element = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.ID, "myDynamicElement"))
)
finally:
driver.quit()
More details here http://selenium-python.readthedocs.io/waits.html