Could someone assist me with an issue? I am trying to scrape the dish names that carry a MUST TRY tag, but I don't know why my script is printing the list of all dishes.
CODE :
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
driver = webdriver.Chrome(executable_path='./chromedriver.exe')
driver.get("https://www.zomato.com/pune/bedekar-tea-stall-sadashiv-peth/order")
screen_height = driver.execute_script("return window.screen.height;") # get the screen height of the web
i = 1
count = 0
scroll_pause_time = 1
while True:
    # scroll one screen height each time
    driver.execute_script("window.scrollTo(0, {screen_height}*{i});".format(screen_height=screen_height, i=i))
    i += 1
    time.sleep(scroll_pause_time)
    # update the scroll height each time after scrolling, as it can change as the page loads
    scroll_height = driver.execute_script("return document.body.scrollHeight;")
    # Break the loop when the height we need to scroll to is larger than the total scroll height
    if (screen_height) * i > scroll_height:
        break
driver.execute_script("window.scrollTo(0, 0);")
# block of code I am struggling with
dish_divs = driver.find_elements_by_xpath("//div[@class = 'sc-1s0saks-11 cYGeYt']")
for items in dish_divs:
    if items.find_element(By.XPATH, "//div[contains(text(),'MUST TRY')]"):
        name = items.find_element(By.CSS_SELECTOR, 'h4.sc-1s0saks-15.iSmBPS')
        print(name.text)
    else:
        continue
driver.close()
OUTPUT :
['Misal Slice', 'Shev Chivda', 'Kharvas [Sugar]', 'Extra Rassa [1 Vati]', 'Taak', 'Extra Slice', 'Misal Slice', 'Kharvas [Jaggery]', 'Solkadhi', 'Kokam', 'Nimboo Sharbat', 'Shev Chivda', 'Batata Chivda', 'Misal Slice', 'Extra Kanda [1 Vati]', 'Extra Slice', 'Extra Rassa [1 Vati]', 'Coffee Kharvas', 'Rose Kharvas', 'Shengdana Ladoo', 'Chirota', 'Kharvas [Sugar]', 'Kharvas [Jaggery]', 'Chocolate Fudge', 'Taak', 'Kokam', 'Flavored Milk', 'Nimboo Sharbat', 'Solkadhi', 'Dahi']
EXPECTED OUTPUT :
only the dishes that carry the MUST TRY tag. My script is getting all the names, not just the tagged ones.
just try this xpath :
//div[text()='MUST TRY']/../../../h4
and use it in code like this:
for name in driver.find_elements(By.XPATH, "//div[text()='MUST TRY']/../../../h4"):
    print(name.text)
Instead of
dish_divs = driver.find_elements_by_xpath("//div[@class = 'sc-1s0saks-11 cYGeYt']")
for items in dish_divs:
    if items.find_element(By.XPATH, "//div[contains(text(),'MUST TRY')]"):
        name = items.find_element(By.CSS_SELECTOR, 'h4.sc-1s0saks-15.iSmBPS')
        print(name.text)
    else:
        continue
You can use
dish_divs = driver.find_elements_by_xpath('//div[@class="sc-1s0saks-1 dpXgPd"]/preceding-sibling::h4')
for items in dish_divs:
    print(items.text)
This will make your code more readable and easier to maintain.
Here, in items.find_element(By.XPATH, "//div[contains(text(),'MUST TRY')]"), you're using an absolute XPath (it searches all elements from the root of the document). In fact you need a relative XPath (one that searches only within the current element):
items.find_element(By.XPATH, ".//div[contains(text(),'MUST TRY')]")
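For completeness, a minimal sketch of the corrected loop. I've swapped find_element for find_elements so a card without the tag yields an empty list instead of raising NoSuchElementException; the class names are the auto-generated ones from your question and may have changed:
items = driver.find_elements_by_xpath("//div[@class = 'sc-1s0saks-11 cYGeYt']")
for item in items:
    # the leading dot scopes the search to this card only
    tags = item.find_elements(By.XPATH, ".//div[contains(text(),'MUST TRY')]")
    if tags:
        name = item.find_element(By.CSS_SELECTOR, 'h4.sc-1s0saks-15.iSmBPS')
        print(name.text)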
You can get the same result using a single XPath:
//div[div/div[@type="tag"][.="MUST TRY"]]/preceding-sibling::h4[1]/text()
Also, I don't recommend parsing HTML with Selenium; it's really slow for this. I recommend using lxml or BeautifulSoup instead.
You can use the above XPath like this:
from lxml import html
....
content = driver.page_source
tree = html.fromstring(content)
titles = tree.xpath('//div[div/div[@type="tag"][.="MUST TRY"]]/preceding-sibling::h4[1]/text()')
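Since text() returns plain strings here rather than WebElements, you can print the results directly, e.g.:
for title in titles:
    print(title.strip())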
I'm scraping news articles from a website where there is no load-more button on a specific category page; the article links are generated as I scroll down. I wrote a function that takes a category_page_url and limit_loading (how many times I want to scroll down) and returns all the links of the news articles displayed on that page.
Category page link = https://www.scmp.com/topics/trade
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

def get_article_links(url, limit_loading):
    options = webdriver.ChromeOptions()
    lists = ['disable-popup-blocking']
    caps = DesiredCapabilities().CHROME
    caps["pageLoadStrategy"] = "normal"
    options.add_argument("--window-size=1920,1080")
    options.add_argument("--disable-extensions")
    options.add_argument("--disable-notifications")
    options.add_argument("--disable-Advertisement")
    options.add_argument("--disable-popup-blocking")
    driver = webdriver.Chrome(executable_path=r"E:\chromedriver\chromedriver.exe", options=options)  # add your chrome path
    driver.get(url)
    last_height = driver.execute_script("return document.body.scrollHeight")
    loading = 0
    while loading < limit_loading:
        loading += 1
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(8)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
    article_links = []
    bsObj = BeautifulSoup(driver.page_source, 'html.parser')
    for i in bsObj.find('div', {'class': 'content-box'}).find('div', {'class': 'topic-article-container'}).find_all('h2', {'class': 'article__title'}):
        article_links.append(i.a['href'])
    return article_links
Assuming I want to scroll 5 times in this category page,
get_article_links('https://www.scmp.com/topics/trade', 5)
But even if I change limit_loading, it returns only the links from the first page, so I must have made some mistake in the scrolling part. Please help me with this.
Instead of scrolling using the body's scrollHeight property, I checked whether there was an appropriate element after the list of articles to scroll to. I noticed this aptly named div:
<div class="topic-content__load-more-anchor" data-v-db98a5c0=""></div>
Accordingly, I primarily changed the while loop in your function get_article_links to scroll to this div using location_once_scrolled_into_view after finding the div before the loop starts, as follows:
loading = 0
end_div = driver.find_element('class name', 'topic-content__load-more-anchor')
while loading < limit_loading:
    loading += 1
    print(f'scrolling to page {loading}...')
    end_div.location_once_scrolled_into_view
    time.sleep(2)
If we now call the function with different limit_loading values, we get different counts of unique news links. Here are a couple of runs:
>>> ar_links = get_article_links('https://www.scmp.com/topics/trade', 2)
>>> len(ar_links)
scrolling to page 1...
scrolling to page 2...
90
>>> ar_links = get_article_links('https://www.scmp.com/topics/trade', 3)
>>> len(ar_links)
scrolling to page 1...
scrolling to page 2...
scrolling to page 3...
120
Hey guys, I was trying to scrape the Zomato restaurants that have ratings above 4.0 from https://www.zomato.com/pune/order-food-online?delivery_subzone=1165, but the class names and everything keep changing after the next few elements.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import json
driver = webdriver.Chrome(executable_path='./chromedriver.exe')
driver.get('https://www.zomato.com/pune/order-food-online?delivery_subzone=1165')
rating = WebDriverWait(driver, 10).until(
    EC.presence_of_all_elements_located((By.XPATH, '//p[@class="sc-1hez2tp-0 sc-lhdg1m-2 hDJwRc"]'))
)
for item in rating:
    stars = item.text
    if stars > '4.0':
        title = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, "//p[@class='sc-1hez2tp-0 sc-izFuNb jbErXF']"))
        )
        time.sleep(10)
driver.close()
Please, guys, I'm doing it with Selenium.
Go to the page.
Filter out the restaurants with 4.0+ ratings using the filters provided above - using the xpath //div[contains(text(),'Rating: 4.0+')] (use a click() method).
All of the cards of the restaurants have the image alt of Restaurant Card. So you can use the css selector img[alt='Restaurant Card'] to get all the cards appearing after filtering, and keep them in some count variable.
As you keep scrolling, you need to keep adding to this count variable.
Edit: Here is the whole script for you - which gives the count of restaurants as 117
import time
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
from urllib.parse import urljoin

##### Web scraper for infinite scrolling page #####
driver = webdriver.Chrome(executable_path=r"path_to-chromedriver")
driver.get("https://www.zomato.com/pune/delivery-in-budhwar-peth")
time.sleep(10)  # Allow 10 seconds for the web page to open
driver.find_element_by_xpath("//div[contains(text(),'Rating: 4.0+')]").click()
scroll_pause_time = 1  # You can set your own pause time. My laptop is a bit slow so I use 1 sec
screen_height = driver.execute_script("return window.screen.height;")  # get the screen height of the web
i = 1
count = 0
while True:
    # scroll one screen height each time
    driver.execute_script("window.scrollTo(0, {screen_height}*{i});".format(screen_height=screen_height, i=i))
    i += 1
    time.sleep(scroll_pause_time)
    # update the scroll height each time after scrolling, as it can change as the page loads
    scroll_height = driver.execute_script("return document.body.scrollHeight;")
    # Break the loop when the height we need to scroll to is larger than the total scroll height
    if (screen_height) * i > scroll_height:
        break
soup = BeautifulSoup(driver.page_source, "html.parser")
for img in soup.find_all('img', alt='Restaurant Card'):
    count += 1
print('Count of all rests is', count)
driver.quit()
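If you also want the restaurant links rather than just a count, here is a small sketch; it assumes each Restaurant Card image sits inside an anchor tag, which is worth verifying in devtools since Zomato's markup changes often:
for img in soup.find_all('img', alt='Restaurant Card'):
    link = img.find_parent('a')  # assumed wrapping anchor; verify in the page source
    if link is not None:
        print(link.get('href'))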
I'm trying to scrape a company's job offers from LinkedIn. I need to scroll a section of the page (one with an inner scrollbar). I have been trying this:
1.
scroll_active = WebDriverWait(driver, 40).until(EC.presence_of_element_located((By.CSS_SELECTOR, "body > div.application-outlet > div.authentication-outlet > div.job-search-ext > div > div > section.jobs-search__left-rail > div > div > ul")))
scroll_active.location_once_scrolled_into_view
while driver.find_element_by_tag_name('div'):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    Divs = driver.find_element_by_tag_name('div').text
    if 'End of Results' in Divs:
        print('end')
        break
    else:
        continue
I need to extract the 'href' values.
If anyone is facing this, I hope this helps: you just have to choose the right element to scroll.
my_xpath = WebDriverWait(driver, 40).until(EC.presence_of_element_located((By.XPATH, "/html/body/div[8]/div[3]/div[3]/div/div/section[1]/div/div")))
driver.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight', my_xpath)
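If you need to walk the whole inner list rather than jump once, here is a sketch that repeats the scrollTop assignment until the container's scrollHeight stops growing. It assumes the driver is already on the page, and the XPath is the one from above, which may need adjusting for your page:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

panel = WebDriverWait(driver, 40).until(EC.presence_of_element_located(
    (By.XPATH, "/html/body/div[8]/div[3]/div[3]/div/div/section[1]/div/div")))
last_height = driver.execute_script("return arguments[0].scrollHeight", panel)
while True:
    # scroll the inner container itself, not the window
    driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", panel)
    time.sleep(2)  # give lazy-loaded results time to render
    new_height = driver.execute_script("return arguments[0].scrollHeight", panel)
    if new_height == last_height:
        break
    last_height = new_height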
Why do you need to scroll here?
It seems like you can get all of the elements with this command:
elements = driver.find_elements(By.XPATH, "//a[@class='result-card__full-card-link']")
and the full script looks like:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get('https://www.linkedin.com/jobs/search/?f_C=1110%2C12800%2C5115950%2C3165553%2C603115%2C10916%2C8331%2C3297950%2C8238%2C5509188%2C3093%2C2625246%2C1112%2C947572%2C11018069%2C407323&geoId=92000000')
time.sleep(3)

def element_present():
    try:
        driver.find_element(By.XPATH, "//button[@class='infinite-scroller__show-more-button infinite-scroller__show-more-button--visible']")
    except Exception:
        return False
    return True

while not element_present():
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

elements = driver.find_elements(By.XPATH, "//a[@class='result-card__full-card-link']")
hrefs = [el.get_attribute('href') for el in elements]
print(hrefs)
print(len(hrefs))
driver.quit()
I might have missed something, but it seems to work well.
I am writing code in Python using the Selenium module, and I want to scroll a list that is on a different layer in the same window. Imagine you go to Instagram, click on followers, and then wish to scroll down to the bottom so that Selenium can make a list of all the users who follow that page.
My problem is that my code scrolls the layer below, which is the user's wall.
def readingFollowers(self):
    self.driver.find_element_by_xpath("//a[contains(@href, '/followers')]")\
        .click()
    sleep(2.5)
    scroll_box = self.driver.find_element_by_xpath('/html/body/div[4]/div/div[2]')
    # Get scroll height
    last_height = self.driver.execute_script("return arguments[0].scrollHeight", scroll_box)
    while True:
        # Scroll down to bottom
        self.driver.execute_script("window.scrollTo(0, arguments[0].scrollHeight);", scroll_box)
        # Wait to load page
        sleep(1)
        # Calculate new scroll height and compare with last scroll height
        new_height = self.driver.execute_script("return arguments[0].scrollHeight", scroll_box)
        if new_height == last_height:
            break
        last_height = new_height
I have used Google Chrome, and the inspected elements should (most probably) be the same on all systems.
If you need the complete code, just ask in a comment in case you are not able to understand the problem; I can give you the code required to recreate the situation for better understanding.
I assume that you are already logged-in on the IG account.
def readingFollowers(self):
    # click followers
    self.driver.find_element_by_xpath('//a[@class="-nal3 "]').click()
    time.sleep(5)
    pop_up = self.driver.find_element_by_xpath('//div[@class="isgrP"]')
    height = self.driver.execute_script("return arguments[0].scrollHeight", pop_up)
    initial_height = height
    # default follower count is 12
    followers_count = 12
    while True:
        # scroll the pop-up itself, not the window behind it
        self.driver.execute_script("arguments[0].scrollBy(0,arguments[1])", pop_up, initial_height)
        time.sleep(5)
        # count loaded followers
        count = len(self.driver.find_elements_by_xpath('//div[@class="PZuss"]/li'))
        if count == followers_count:
            break
        followers_count = count
        # add height because the list is expanding
        initial_height += initial_height
It took me some time but it works.
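If you then want the usernames themselves, here is a sketch you could append at the end of readingFollowers, assuming the same (auto-generated, possibly outdated) class names and that the first anchor in each row holds the username:
followers = []
for li in self.driver.find_elements_by_xpath('//div[@class="PZuss"]/li'):
    username = li.find_element_by_tag_name('a').text  # assumed: first link text is the username
    if username:
        followers.append(username)
print(followers)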
I'm trying to loop through a dropdown menu at this url: https://www.accuform.com/safety-sign/danger-danger-authorized-personnel-only-MADM006
So, for example, the first dropdown menu - under options - lists out different materials and I want to select each one in turn and then gather some other information from the webpage before moving on to the next material. Here is my current code:
driver = webdriver.Firefox()
driver.get('https://www.accuform.com/safety-sign/danger-danger-authorized-personnel-only-MADM006')
time.sleep(3)
driver.find_element_by_id('x-mark-icon').click()
select = Select(driver.find_element_by_name('Wiqj7mb4rsAq9LB'))
options = select.options
optionsList = []
driver.find_elements_by_class_name('select-wrapper')[0].click()
element = driver.find_element_by_xpath("//select[@name='Wiqj7mb4rsAq9LB']")
actions = ActionChains(driver)
actions.move_to_element(element).perform()
# driver.execute_script("arguments[0].scrollIntoView();", element)
for option in options:  # iterate over the options, place attribute value in list
    optionsList.append(option.get_attribute("value"))
for optionValue in optionsList:
    print("starting loop on option %s" % optionValue)
    # select = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//select[@name='Wiqj7mb4rsAq9LB']")))
    # select = Select(select)
    select.select_by_value(optionValue)
I started with just the loop, but got this error:
ElementNotInteractableException: Message: Element <option> could not be scrolled into view
I then added the WebDriverWait and got a TimeoutException error.
I then realized I should probably click on the wrapper in which the dropdown is held, so I added the click, which does pop up the menu, but I still got the TimeoutException.
So I thought maybe I should move to the element, which I tried with the action chain lines, and I got this error:
WebDriverException: Message: TypeError: rect is undefined
I tried to avoid that error by using this code instead:
# driver.execute_script("arguments[0].scrollIntoView();", element)
That just resulted in the TimeoutException again.
I'm pretty new to Python and Selenium and have basically just been modifying code from SO answers to similar questions, but nothing has worked.
I'm using python 3.6 and the current versions of Selenium and firefox webdriver.
If anything is unclear or if you need more info just let me know.
Thanks so much!
EDIT: Based on the answer and comments by Kajal Kunda, I've updated my code to the following:
material_dropdown = driver.find_element_by_xpath("//input[@class='select-dropdown']")
driver.execute_script("arguments[0].click();", material_dropdown)
materials = driver.find_elements_by_css_selector("div.select-wrapper ul.dropdown-content li")
for material in materials:
    # material_dropdown = driver.find_element_by_xpath("//input[@class='select-dropdown']")
    # driver.execute_script("arguments[0].click();", material_dropdown)
    # materials = driver.find_elements_by_css_selector("div.select-wrapper ul.dropdown-content li")
    material_ele = material.find_element_by_tag_name('span')
    if material_ele.text != '':
        material_ele.click()
        time.sleep(5)
        price = driver.find_element_by_class_name("dataPriceDisplay")
        print(price.text)
The result is that it successfully prints the price for the first type of material, but then it returns:
StaleElementReferenceException: Message: The element reference of <li class=""> is stale;...
I've tried variations with the commented-out lines inside and outside of the loop, but I always get a version of the StaleElementReferenceException error.
Any suggestions?
Thanks!
You could do the whole thing with requests. Grab the dropdown lists from the options on the page, then concatenate the value attributes into a requests url that retrieves JSON containing all the info on the page. The same principle applies for adding in the other dropdown values. The id for each dropdown selection is the value attribute of the option in that dropdown, and these ids appear in the url I show, separated by // for each dropdown selection.
import requests
from bs4 import BeautifulSoup as bs
url = 'https://www.accuform.com/product/getSku/danger-danger-authorized-personnel-only-MADM006/1/false/null//{}//WHFIw3xXmQx8zlz//6wr93DdrFo5JV//WdnO0RpwKpc4fGF'
startURL = 'https://www.accuform.com/safety-sign/danger-danger-authorized-personnel-only-MADM006'
res = requests.get(startURL)
soup = bs(res.content, 'lxml')
materials = [item['value'] for item in soup.select('#Wiqj7mb4rsAq9LB option')]
sizes = [item['value'] for item in soup.select('#WvXESrTyQjM3Ciw option')]
languages = [item['value'] for item in soup.select('#WUYWGMePtpmpmhy option')]
units = [item['value'] for item in soup.select('#W91eqaJ0WPXwe9b option')]
for material in materials:
    data = requests.get(url.format(material)).json()
    soup = bs(data['dataMaterialBullets'], 'lxml')
    lines = [item.text for item in soup.select('li')]
    print(lines)
    print(data['dataPriceDisplay'])
    # etc......
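To vary a second dropdown too, you can parameterise the corresponding slot in the url the same way. A sketch, assuming the second // slot is the size id (worth confirming by watching the request in the browser's network tab):
url2 = 'https://www.accuform.com/product/getSku/danger-danger-authorized-personnel-only-MADM006/1/false/null//{}//{}//6wr93DdrFo5JV//WdnO0RpwKpc4fGF'
for material in materials:
    for size in sizes:
        data = requests.get(url2.format(material, size)).json()
        print(material, size, data['dataPriceDisplay'])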
Try the code below. It should work.
driver = webdriver.Firefox()
driver.get('https://www.accuform.com/safety-sign/danger-danger-authorized-personnel-only-MADM006')
time.sleep(3)
driver.find_element_by_id('x-mark-icon').click()
material_dropdown = driver.find_element_by_xpath("//input[@class='select-dropdown']")
driver.execute_script("arguments[0].click();", material_dropdown)
# Code for material dropdown
materials = driver.find_elements_by_css_selector("div.select-wrapper ul.dropdown-content li")
material_optionsList = []
for material in materials:
    material_ele = material.find_element_by_tag_name('span')
    if material_ele.text != '':
        material_optionsList.append(material_ele.text)
print(material_optionsList)
driver.execute_script("arguments[0].click();", material_dropdown)
size_dropdown = driver.find_element_by_xpath("(//input[@class='select-dropdown'])[2]")
driver.execute_script("arguments[0].click();", size_dropdown)
# Code for size dropdown
Sizes = driver.find_elements_by_css_selector("div.select-wrapper ul.dropdown-content li")
size_optionsList = []
for size in Sizes:
    size_ele = size.find_element_by_tag_name('span')
    if size_ele.text != '':
        size_optionsList.append(size_ele.text)
driver.execute_script("arguments[0].click();", size_dropdown)
Output :
[u'Adhesive Vinyl', u'Plastic', u'Adhesive Dura-Vinyl', u'Aluminum', u'Dura-Plastic\u2122', u'Aluma-Lite\u2122', u'Dura-Fiberglass\u2122', u'Accu-Shield\u2122']
Hope you can do the remaining dropdowns the same way. Let me know if it works for you.
EDIT: Code to loop through the materials and get the price value of each.
for material in range(len(materials)):
    material_ele = materials[material]
    if material_ele.text != '':
        # material_optionsList.append(material_ele.text)
        # material_ele.click()
        driver.execute_script("arguments[0].click();", material_ele)
        time.sleep(2)
        price = driver.find_element_by_id("priceDisplay")
        print(price.text)
        time.sleep(2)
        # re-open the dropdown and re-locate the list items, because the DOM
        # re-renders after each selection and the old references go stale
        material_dropdown = driver.find_element_by_xpath("//input[@class='select-dropdown']")
        driver.execute_script("arguments[0].click();", material_dropdown)
        materials = driver.find_elements_by_css_selector("div.select-wrapper ul.dropdown-content li")
        material += 2
Output :
$8.31
$9.06
$13.22
$15.91
$15.91