Not able to scrape data because of dynamic changes in element identities - Python

Hey guys, I was trying to scrape Zomato restaurants that have ratings above 4 from https://www.zomato.com/pune/order-food-online?delivery_subzone=1165, but the class names (and pretty much everything else) change every few elements.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

driver = webdriver.Chrome(executable_path='./chromedriver.exe')
driver.get('https://www.zomato.com/pune/order-food-online?delivery_subzone=1165')

rating = WebDriverWait(driver, 10).until(
    EC.presence_of_all_elements_located((By.XPATH, '//p[@class="sc-1hez2tp-0 sc-lhdg1m-2 hDJwRc"]'))
)

for item in rating:
    stars = item.text
    if float(stars) > 4.0:  # compare numerically rather than as strings
        title = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, "//p[@class='sc-1hez2tp-0 sc-izFuNb jbErXF']"))
        )
        time.sleep(10)

driver.close()
Please help, guys. I'm doing it with Selenium.

Go to the page.
Filter out the restaurants with 4.0+ ratings using the filters provided on the page, via the XPath //div[contains(text(),'Rating: 4.0+')] (use a click() method).
All of the restaurant cards have the image alt text Restaurant Card, so you can use the CSS selector img[alt='Restaurant Card'] to get all the cards that appear after filtering, and keep them in a count variable.
As you keep scrolling, keep adding to this count variable.
Edit: here is the whole script, which gives a count of 117 restaurants:
import time
from selenium import webdriver
from bs4 import BeautifulSoup

##### Web scraper for an infinite scrolling page #####
driver = webdriver.Chrome(executable_path=r"path_to-chromedriver")
driver.get("https://www.zomato.com/pune/delivery-in-budhwar-peth")
time.sleep(10)  # Allow 10 seconds for the web page to open

# Apply the "Rating: 4.0+" filter
driver.find_element_by_xpath("//div[contains(text(),'Rating: 4.0+')]").click()

scroll_pause_time = 1  # You can set your own pause time. My laptop is a bit slow so I use 1 sec
screen_height = driver.execute_script("return window.screen.height;")  # get the screen height of the web page
i = 1
count = 0

while True:
    # scroll one screen height each time
    driver.execute_script("window.scrollTo(0, {screen_height}*{i});".format(screen_height=screen_height, i=i))
    i += 1
    time.sleep(scroll_pause_time)
    # update scroll height each time after scrolling, as it can change as the page loads more content
    scroll_height = driver.execute_script("return document.body.scrollHeight;")
    # Break the loop when the height we need to scroll to is larger than the total scroll height
    if screen_height * i > scroll_height:
        break

soup = BeautifulSoup(driver.page_source, "html.parser")
for img in soup.find_all('img', alt='Restaurant Card'):
    count += 1
print('Count of all restaurants is', count)
driver.quit()
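If you want more than a count, the same alt-text anchor can be reused to pull each card's link. A minimal sketch, assuming each card <img> sits inside an <a> pointing at the restaurant page (an assumption about Zomato's markup, not verified against the live page):
# Sketch: collect the href for every filtered card.
# Assumes each <img alt="Restaurant Card"> is nested inside an <a> tag;
# check the live DOM before relying on this.
links = []
for img in soup.find_all('img', alt='Restaurant Card'):
    a = img.find_parent('a')  # nearest enclosing anchor, if any
    if a is not None and a.get('href'):
        links.append(a['href'])
print('Collected', len(links), 'restaurant links')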

Related

Scrape multiple pages with the same url using Python Selenium

I have the following code that scrapes some information I need from a website. However, there are 61 pages I need to go through and scrape the same data from, which requires clicking the 'Next' button to move to the next page while the URL stays the same.
I know it is possible to use driver.find_element_by_link_text('Next').click() to go to the next page, but I am not sure how to include this in my code.
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

driver = webdriver.Chrome()
driver.get('https://mspotrace.org.my/Sccs_list')
time.sleep(20)

# Get list of elements
elements = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.XPATH, "//a[@title='View on Map']")))

# Loop through element popups and pull details of facilities into DF
pos = 0
df = pd.DataFrame(columns=['facility_name', 'other_details'])
for element in elements:
    try:
        data = []
        element.click()
        time.sleep(10)
        facility_name = driver.find_element_by_xpath('//h4[@class="modal-title"]').text
        other_details = driver.find_element_by_xpath('//div[@class="modal-body"]').text
        time.sleep(5)
        data.append(facility_name)
        data.append(other_details)
        df.loc[pos] = data
        WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button[aria-label='Close'] > span"))).click()  # close popup window
        print("Scraping info for", facility_name)
        time.sleep(15)
        pos += 1
    except Exception:
        alert = driver.switch_to.alert
        print("No geo location information")
        alert.accept()
print(df)
Answering your question, "I don't know how I would put it in my code":
The counter iii is used to repeat your existing code for each of the remaining pages.
I cannot test the entire code, but I tested the loops.
For the sake of simplicity, in the code below I removed the element scraping so I could focus the test on repeating the clicks on the Next button, which is your question.
If you are going to test it on your side, make sure you replace
print('your stuff would stay here!')
with the actual element-scraping block from your original code.
Hope it helps!
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

driver = webdriver.Chrome()
driver.get('https://mspotrace.org.my/Sccs_list')
time.sleep(20)

# Get list of elements
elements = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.XPATH, "//a[@title='View on Map']")))

# Loop through element popups and pull details of facilities into DF
pos = 0
df = pd.DataFrame(columns=['facility_name', 'other_details'])
for iii in range(1, 60):
    for element in elements:
        print('your stuff would stay here!')

    # click Next
    btnNext = driver.find_element(By.XPATH, '//*[@id="dTable_next"]/a')
    driver.execute_script("arguments[0].scrollIntoView();", btnNext)
    driver.execute_script("arguments[0].click();", btnNext)
    time.sleep(5)

    # print current df. You may want to store it and print at the end only?
    print(df)

    # Get list of elements again
    elements = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.XPATH, "//a[@title='View on Map']")))

    # Resetting vars again
    pos = 0
    df = pd.DataFrame(columns=['facility_name', 'other_details'])
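Following up on the comment in the code above ("You may want to store it and print in the end only?"): rather than resetting df on every page, you can keep one frame per page and combine them once at the end. A minimal sketch of that pattern (the per-page scraping is elided, as above):
import pandas as pd

frames = []
for iii in range(1, 60):
    page_df = pd.DataFrame(columns=['facility_name', 'other_details'])
    # ... fill page_df with this page's rows, then click Next as shown above ...
    frames.append(page_df)

# One concat at the end instead of printing page by page
result = pd.concat(frames, ignore_index=True)
print(result)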

Want to scrape titles, dates, links, and content from the IOL website but can't

I am new to web scraping, and I am trying to scrape the titles, dates, links, and contents of news articles on this website: https://www.iol.co.za/news/south-africa/eastern-cape.
The titles of the articles have different class names and heading (h) tags. I was able to scrape the dates, links, and titles using the h tags. However, when I tried to store them in a pandas DataFrame, I received the error: ValueError: All arrays must be of the same length.
I also wrote code to get the content of each article using the links, and I got an error there as well. I would be thankful for any assistance.
I have tried different options to scrape the titles by creating a list of the different class names, but to no avail.
Please see my code below:
import sys, time
from bs4 import BeautifulSoup
import requests
import pandas as pd
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from datetime import timedelta
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import re

art_title = []  # to store the titles of all news articles
art_date = []   # to store the dates of all news articles
art_link = []   # to store the links of all news articles

pagesToGet = ['south-africa/eastern-cape']

for i in range(0, len(pagesToGet)):
    print('processing page : \n')
    url = 'https://www.iol.co.za/news/' + str(pagesToGet[i])
    print(url)
    driver = webdriver.Chrome(ChromeDriverManager().install())
    driver.maximize_window()
    # time.sleep(5) lets you pause the code before retrieving the elements from the webpage. Additionally, to
    # prevent the chromedriver opening a new instance for every url, open the browser outside of the loop.
    # an exception might be thrown, so the code should be in a try-except block
    try:
        # use the browser to get the url. This is a suspicious command that might blow up.
        driver.get("https://www.iol.co.za/news/" + str(pagesToGet[i]))
    except Exception as e:  # this describes what to do if an exception is thrown
        error_type, error_obj, error_info = sys.exc_info()  # get the exception information
        print('ERROR FOR LINK:', url)  # print the link that caused the problem
        print(error_type, 'Line:', error_info.tb_lineno)  # print error info and the line that threw the exception
        continue  # ignore this page. Abandon this and go back.
    time.sleep(3)  # Allow 3 seconds for the web page to open

    # Code to scroll the screen to the end and click on "more news" till the 15th page before scraping all the news
    k = 1
    while k <= 2:
        scroll_pause_time = 1  # You can set your own pause time. My laptop is a bit slow so I use 1 sec
        screen_height = driver.execute_script("return window.screen.height;")  # get the screen height of the web page
        i = 1
        while True:
            # scroll one screen height each time
            driver.execute_script("window.scrollTo(0, {screen_height}*{i});".format(screen_height=screen_height, i=i))
            i += 1
            time.sleep(scroll_pause_time)
            # update scroll height each time after scrolling, as it can change as the page loads
            scroll_height = driver.execute_script("return document.body.scrollHeight;")
            # Break the loop when the height we need to scroll to is larger than the total scroll height
            if screen_height * i > scroll_height:
                break
        driver.find_element(By.CSS_SELECTOR, '.Articles__MoreFromButton-sc-1mrfc98-0').click()
        k += 1
        time.sleep(1)

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    news = soup.find_all('article', attrs={'class': 'sc-ifAKCX'})
    print(len(news))

    # Getting titles, dates, and links
    for j in news:
        # Article title
        title = j.findAll(re.compile('^h[1-6]'))
        for news_title in title:
            art_title.append(news_title.text)
        # Article dates
        dates = j.find('p', attrs={'class': 'sc-cIShpX'})
        if dates is not None:
            date = dates.text
            split_date = date.rsplit('|', 1)[1][10:].rsplit('<', 1)[0]
            art_date.append(split_date)
        # Article links
        address = j.find('a').get('href')
        news_link = 'https://www.iol.co.za' + address
        art_link.append(news_link)

df = pd.DataFrame({'Article_Title': art_title, 'Date': art_date, 'Source': art_link})

# Getting contents
new_articles = ...struggling to write the code
df['Content'] = news_articles
df.to_csv('data.csv')
driver.quit()
I think this is what you are looking for. Iterating article by article, instead of filling separate lists, keeps each title matched to its date and link, which also avoids the "All arrays must be of the same length" error:
# Needed libs
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Initialize driver and navigate
driver = webdriver.Chrome()
driver.maximize_window()
url = 'https://www.iol.co.za/news/south-africa/eastern-cape'
wait = WebDriverWait(driver, 5)
driver.get(url)
time.sleep(3)

# Take the articles: every <article> that contains a non-empty heading
articles = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//article//*[(name()='h1' or name()='h2' or name()='h3' or name()='h4' or name()='h5' or name()='h6') and string-length(text()) > 0]/ancestor::article")))

# For every article we take what we want
for article in articles:
    header = article.find_element(By.XPATH, ".//*[name()='h1' or name()='h2' or name()='h3' or name()='h4' or name()='h5' or name()='h6']")
    print(header.get_attribute('textContent'))
    author_and_date = article.find_elements(By.XPATH, ".//*[name()='h1' or name()='h2' or name()='h3' or name()='h4' or name()='h5' or name()='h6']/following-sibling::p[1]")
    if author_and_date:
        print(author_and_date[0].get_attribute('textContent'))
    else:
        print("No author found")
    link = article.find_element(By.XPATH, ".//a")
    print(link.get_attribute('href'))
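If you also want this in a pandas DataFrame and a CSV, collecting one dict per article keeps every row complete, which is exactly what avoids the ValueError from the question. A sketch building on the loop above (the None fallback for a missing author/date line is an assumption about how you want gaps handled):
import pandas as pd

heading_xpath = ".//*[name()='h1' or name()='h2' or name()='h3' or name()='h4' or name()='h5' or name()='h6']"

rows = []
for article in articles:
    header = article.find_element(By.XPATH, heading_xpath)
    dates = article.find_elements(By.XPATH, heading_xpath + "/following-sibling::p[1]")
    link = article.find_element(By.XPATH, ".//a")
    rows.append({
        'Article_Title': header.get_attribute('textContent'),
        'Date': dates[0].get_attribute('textContent') if dates else None,  # None keeps rows aligned
        'Source': link.get_attribute('href'),
    })

df = pd.DataFrame(rows)  # one dict per article, so all columns have the same length
df.to_csv('data.csv', index=False)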

Cannot extract/load all hrefs from an iframe (inside an HTML page) while parsing a webpage

I am really struggling with this case and have been trying all day. Please, I need your help. I am trying to scrape this webpage: https://decisions.scc-csc.ca/scc-csc/en/d/s/index.do?cont=&ref=&d1=2012-01-01&d2=2022-01-31&p=&col=1&su=16&or=
I want to get all 137 hrefs (137 documents).
The code I used:
from urllib.parse import urljoin
import time
from bs4 import BeautifulSoup
from selenium.webdriver.common.keys import Keys

def test(self):
    final_url = 'https://decisions.scc-csc.ca/scc-csc/en/d/s/index.do?cont=&ref=&d1=2012-01-01&d2=2022-01-31&p=&col=1&su=16&or='
    self.driver.get(final_url)
    soup = BeautifulSoup(self.driver.page_source, 'html.parser')
    iframe = soup.find('iframe')
    src = iframe['src']
    base = 'https://decisions.scc-csc.ca/'
    main_url = urljoin(base, src)
    self.driver.get(main_url)
    browser = self.driver
    elem = browser.find_element_by_tag_name("body")
    no_of_pagedowns = 20
    while no_of_pagedowns:
        elem.send_keys(Keys.PAGE_DOWN)
        time.sleep(0.2)
        no_of_pagedowns -= 1
The problem is that it only loads the first 25 documents (hrefs), and I don't know how to load the rest.
This code scrolls down until all the elements are visible, then saves the URLs of the PDFs in the list pdfs. Notice that all the work is done with Selenium, without using BeautifulSoup:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome(service=Service(your_chromedriver_path))
driver.get('https://decisions.scc-csc.ca/scc-csc/en/d/s/index.do?cont=&ref=&d1=2012-01-01&d2=2022-01-31&p=&col=1&su=16&or=')

# wait for the iframe to be loaded and then switch to it
WebDriverWait(driver, 20).until(EC.frame_to_be_available_and_switch_to_it((By.ID, "decisia-iframe")))

# in this case number_of_results = 137
number_of_results = int(driver.find_element(By.XPATH, "//h2[contains(., 'result')]").text.split()[0])

pdfs = []
while len(pdfs) < number_of_results:
    pdfs = driver.find_elements(By.CSS_SELECTOR, 'a[title="Download the PDF version"]')
    # scroll down to the last visible row
    driver.execute_script('arguments[0].scrollIntoView({block: "center", behavior: "smooth"});', pdfs[-1])
    time.sleep(1)

pdfs = [pdf.get_attribute('href') for pdf in pdfs]
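Once the hrefs are collected you can switch back out of the iframe and sanity-check the count against what the page reported; a small follow-up sketch:
# Back to the top-level document (we switched into the iframe earlier)
driver.switch_to.default_content()

# Sanity check: one link per reported result (137 in this case)
assert len(pdfs) == number_of_results, f"expected {number_of_results} links, got {len(pdfs)}"
print(f"collected {len(pdfs)} PDF links")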

Print item from new window - Selenium/Python

My code goes to a webpage, finds the table, and clicks on each row (each row, when clicked, opens a new window). From this new window I want to scrape one piece of information (faculty), which I can't seem to figure out.
Here is my code:
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import requests

driver = webdriver.Chrome()
driver.get('https://aaaai.planion.com/Web.User/SearchSessions?ACCOUNT=AAAAI&CONF=AM2021&USERPID=PUBLIC&ssoOverride=OFF')
time.sleep(3)
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')
eachRow = driver.find_elements_by_class_name('clickdiv')
for item in eachRow:
    item.click()  # opens the new window for each row
    time.sleep(2)
    faculty = driver.find_elements_by_xpath('//*[@id="W1"]/div/div/div/div[2]/div[2]/table/tbody/tr[7]/td/table/tbody/tr/td[2]/b')
    print(faculty)
    driver.find_element_by_class_name('XX').click()  # closes window
Use find_element (not find_elements) and .text:
driver.get('https://aaaai.planion.com/Web.User/SearchSessions?ACCOUNT=AAAAI&CONF=AM2021&USERPID=PUBLIC&ssoOverride=OFF')
time.sleep(3)
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')
eachRow = driver.find_elements_by_class_name('clickdiv')
for item in eachRow:
    item.click()  # opens the new window for each row
    time.sleep(2)
    faculty = driver.find_element_by_xpath("//td[@valign='MIDDLE']/b")
    print(faculty.text)
    driver.find_element_by_class_name('XX').click()
A better way:
wait = WebDriverWait(driver, 5)
for item in eachRow:
    item.click()  # opens the new window for each row
    faculty = wait.until(EC.presence_of_element_located((By.XPATH, "//td[@valign='MIDDLE']/b")))
    print(faculty.text)
    driver.find_element_by_class_name('XX').click()
Import
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
Outputs
Zaimat Beiro
Débora Shibayama Guterres, DÉBORA S GUTERRES
Joong K. Cho
Tao Zhu
Caroline Horner, MD FAAAAI
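One more refinement: the close control (the 'XX' class from the question) can be waited on too, so the close click doesn't race the popup animation. A sketch of the same loop with both waits:
wait = WebDriverWait(driver, 5)
for item in eachRow:
    item.click()  # opens the popup for this row
    faculty = wait.until(EC.presence_of_element_located((By.XPATH, "//td[@valign='MIDDLE']/b")))
    print(faculty.text)
    # wait until the close control is clickable before closing the popup
    wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'XX'))).click()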

Selenium Python clicking only works half the time

I am trying to write a unit test for my website that runs through all the links and reports whether the site is working. But I am having trouble: the program cannot consistently click the link in the site's navigation bar. I've tried multiple waits: implicit, explicit, expected conditions. The page loads, and half the time it clicks the link and goes to that part of the site; the other half, the program just stops and nothing is clicked.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import ActionChains

PATH = r"C:\Program Files (x86)\chromedriver.exe"
drive = webdriver.Chrome(PATH)
drive.get("https://www.blackhempfamily.com/")
wait = WebDriverWait(drive, 10)
link = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, "Why Black Hemp?")))
link.click()
Using the XPath below would be a better locator:
wait.until(EC.element_to_be_clickable((By.XPATH, "//p[text()='Why Black Hemp?']")))
The element you're searching for is not a link; it's a paragraph (p). I added a sleep call to give the page more load time.
Try this code:
time.sleep(3)
wait = WebDriverWait(drive, 10)
# link = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, "Why Black Hemp?")))
link = drive.find_element_by_xpath('//*[@id="idh09fqo2label"]')
link.click()
So, it took a while... but I think I was able to figure this out. The actions you need to perform are:
Click "Why Black Hemp?"
Wait until the page stops scrolling
Scroll to the top of the page
Wait until the page stops scrolling
Attempt to scroll down so you can get the nav bar to display
Repeat until your heart is content / the test passes with "A-OK"
In order for this to be achieved, you need to have the following imports
from selenium import webdriver
from selenium.webdriver.chrome.webdriver import WebDriver as ChromeWebDriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait as DriverWait
from selenium.webdriver.support import expected_conditions as DriverConditions
from selenium.common.exceptions import WebDriverException
import time
Step 1 - Click your "Why Black Hemp?" nav bar element
chrome_driver.find_element(By.XPATH, "//nav[contains(@id, 'navContainer')]//p[text()='Why Black Hemp?']/../../..").click()
Step 2 - Check to see if our page is still scrolling
# Checks to see if our page is still scrolling
is_same_position = False
while is_same_position == False:
    windowPosition1 = chrome_driver.execute_script("return document.body.scrollHeight;")
    time.sleep(2)
    windowPosition2 = chrome_driver.execute_script("return document.body.scrollHeight;")
    if windowPosition1 == windowPosition2:
        is_same_position = True
        final_window_position = windowPosition1
Step 3 - Scroll to the top of the page
chrome_driver.execute_script("window.scrollTo(0, {0})".format((0 - final_window_position)))
Step 4 - Check to see if our page is still scrolling
# Checks to see if our page is still scrolling
is_same_position = False
while is_same_position == False:
    windowPosition1 = chrome_driver.execute_script("return document.body.scrollHeight;")
    time.sleep(2)
    windowPosition2 = chrome_driver.execute_script("return document.body.scrollHeight;")
    if windowPosition1 == windowPosition2:
        is_same_position = True
Step 5 - Attempt to scroll down until our header tag does not have the style of visibility: hidden
# Scrolls down until our nav bar is displayed
for scrollNum in range(10):
    chrome_driver.execute_script("window.scrollTo(0, {0})".format(scrollNum * 100 + 200))
    time.sleep(2)
    if is_displayed(chrome_driver, "//header[contains(@style, 'visibility: hidden')]") == False:
        break
Step 6 - Repeat until your heart is content
MAIN CODE - For Reference
from selenium import webdriver
from selenium.webdriver.chrome.webdriver import WebDriver as ChromeWebDriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait as DriverWait
from selenium.webdriver.support import expected_conditions as DriverConditions
from selenium.common.exceptions import WebDriverException
import time

def get_chrome_driver():
    """This sets up our Chrome Driver and returns it as an object"""
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("window-size=1500,1000")
    # Removes the "This is being controlled by automation" alert / notification
    chrome_options.add_experimental_option("excludeSwitches", ['enable-automation'])
    path_to_chrome = r"F:\Selenium_Drivers\Windows_Chrome85_Driver\chromedriver.exe"
    return webdriver.Chrome(executable_path=path_to_chrome,
                            options=chrome_options)

def wait_displayed(driver: ChromeWebDriver, xpath: str, timeout: int = 3):
    """Waits for the element to be present; raises if it never shows up"""
    try:
        DriverWait(driver, timeout).until(
            DriverConditions.presence_of_element_located(locator=(By.XPATH, xpath))
        )
    except:
        raise WebDriverException(f'Timeout: Failed to find {xpath}')

def is_displayed(driver: ChromeWebDriver, xpath: str, timeout: int = 3):
    """Returns True if the element shows up within the timeout, else False"""
    try:
        webElement = DriverWait(driver, timeout).until(
            DriverConditions.presence_of_element_located(locator=(By.XPATH, xpath))
        )
        return webElement is not None
    except:
        return False

# Gets our chrome driver and opens our site
chrome_driver = get_chrome_driver()
chrome_driver.get("https://www.blackhempfamily.com/")

# Repeats this 5 times
for repeat in range(5):
    print("Attempt to click our link. Try #{0}".format(repeat + 1))
    is_same_position = False
    final_window_position = 0

    # Checks to see if our website's elements display
    wait_displayed(chrome_driver, "//nav[contains(@id, 'navContainer')]")
    wait_displayed(chrome_driver, "//nav[contains(@id, 'navContainer')]//p[text()='Why Black Hemp?']")
    wait_displayed(chrome_driver, "//nav[contains(@id, 'navContainer')]//p[text()='Shop Black Hemp']")

    # Clicks our "Why Black Hemp?" tab
    chrome_driver.find_element(By.XPATH, "//nav[contains(@id, 'navContainer')]//p[text()='Why Black Hemp?']/../../..").click()

    # Checks to see if our page is still scrolling
    while is_same_position == False:
        windowPosition1 = chrome_driver.execute_script("return document.body.scrollHeight;")
        time.sleep(2)
        windowPosition2 = chrome_driver.execute_script("return document.body.scrollHeight;")
        if windowPosition1 == windowPosition2:
            is_same_position = True
            final_window_position = windowPosition1

    # Checks to see if our "Natural Moisture" text displays
    wait_displayed(chrome_driver, "(//h2//span[contains(., 'Natural Moisture')]/../..)[1]")

    # Scrolls back to the top of the page
    chrome_driver.execute_script("window.scrollTo(0, {0})".format(0 - final_window_position))
    is_same_position = False

    # Checks to see if our page is still scrolling
    while is_same_position == False:
        windowPosition1 = chrome_driver.execute_script("return document.body.scrollHeight;")
        time.sleep(2)
        windowPosition2 = chrome_driver.execute_script("return document.body.scrollHeight;")
        if windowPosition1 == windowPosition2:
            is_same_position = True

    # Scrolls down until our nav bar is displayed
    for scrollNum in range(10):
        chrome_driver.execute_script("window.scrollTo(0, {0})".format(scrollNum * 100 + 200))
        time.sleep(2)
        if is_displayed(chrome_driver, "//header[contains(@style, 'visibility: hidden')]") == False:
            break

chrome_driver.quit()
chrome_driver.stop_client()
print('Congratulations! You clicked your link multiple times!')
Try it with XPath instead, and wait for the element to be present (not clickable), as it is a paragraph. This worked for me:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import ActionChains

PATH = r"C:\Program Files (x86)\chromedriver.exe"
drive = webdriver.Chrome(PATH)
drive.get("https://www.blackhempfamily.com/")
linkWait = EC.presence_of_element_located((By.XPATH, "//div/p[contains(., 'Why Black Hemp?')]"))
WebDriverWait(drive, 10).until(linkWait)
link = drive.find_element_by_xpath("//div/p[contains(., 'Why Black Hemp?')]")
link.click()
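If the click still fails intermittently after these changes, wrapping the wait-and-click in a small retry helper is a common pattern; a sketch, assuming the intermittent failures surface as timeouts or intercepted clicks (the attempt count is arbitrary):
from selenium.common.exceptions import ElementClickInterceptedException, TimeoutException

def click_with_retries(driver, locator, attempts=3):
    """Try a wait-and-click a few times before giving up."""
    wait = WebDriverWait(driver, 10)
    for _ in range(attempts):
        try:
            wait.until(EC.element_to_be_clickable(locator)).click()
            return True
        except (ElementClickInterceptedException, TimeoutException):
            time.sleep(1)  # let overlays/animations settle, then retry
    return False

# Usage with the paragraph locator from the answers above:
# click_with_retries(drive, (By.XPATH, "//div/p[contains(., 'Why Black Hemp?')]"))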
