Selenium scraping issues with a site having a popup window with endless scroll - Python

I am trying to scrape a website that populates a list of providers. The site makes you go through a list of options and then finally populates the list of providers in a popup that has an endless/continuous scroll.
I have tried:
from selenium.webdriver.common.action_chains import ActionChains
element = driver.find_element_by_id("my-id")
actions = ActionChains(driver)
actions.move_to_element(element).perform()
but this code didn't work.
I tried something similar to this:
driver.execute_script("arguments[0].scrollIntoView();", list)
but this didn't move anything; it just stayed on the first 20 providers.
I tried this alternative:
main = driver.find_element_by_id('mainDiv')
recentList = main.find_elements_by_class_name('nameBold')
for list in recentList:
    driver.execute_script("arguments[0].scrollIntoView(true);", list)
    time.sleep(20)
but I ended up with this error message:
selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
The code that worked the best was this one:
while True:
    # Scroll down to bottom
    element_inside_popup = driver.find_element_by_xpath('//*[@id="mainDiv"]')
    element_inside_popup.send_keys(Keys.END)
    # Wait to load page
    time.sleep(3)
but this is an endless scroll that I don't know how to stop, since "while True:" will always be true.
Any help with this would be great. Thanks in advance.
This is my code so far:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
import pandas as pd
PATH = '/Users/AnthemScraper/venv/chromedriver'
driver = webdriver.Chrome(PATH)
#location for the website
driver.get('https://shop.anthem.com/sales/eox/abc/ca/en/shop/plans/medical/snq?execution=e1s13')
print(driver.title)
#entering the zipcode
search = driver.find_element_by_id('demographics.zip5')
search.send_keys(90210)
#making the scraper sleep for 5 seconds while the page loads
time.sleep(5)
#entering first name and DOB then hitting next
search = driver.find_element_by_id('demographics.applicants0.firstName')
search.send_keys('juelz')
search = driver.find_element_by_id('demographics.applicants0.dob')
search.send_keys('01011990')
driver.find_element_by_xpath('//*[#id="button/shop/getaquote/next"]').click()
#hitting the next button
driver.find_element_by_xpath('//*[#id="hypertext/shop/estimatesavings/skipthisstep"]').click()
#making the scraper sleep for 2 seconds while the page loads
time.sleep(2)
#clicking the no option to view all the health plans
driver.find_element_by_xpath('//*[#id="radioNoID"]').click()
driver.find_element_by_xpath('/html/body/div[4]/div[11]/div/button[2]/span').click()
#making the scraper sleep for 2 seconds while the page loads
time.sleep(2)
driver.find_element_by_xpath('//*[#id="hypertext/shop/medical/showmemydoctorlink"]/span').click()
time.sleep(2)
#section to choose the specialist. here we are choosing all
find_specialist = driver.find_element_by_xpath('//*[@id="specializedin"]')
#this is the method for a dropdown
select_provider = Select(find_specialist)
select_provider.select_by_visible_text('All Specialties')
#choosing the distance. Here we click on 50 miles
choose_mile_radius = driver.find_element_by_xpath('//*[@id="distanceInMiles"]')
select_provider = Select(choose_mile_radius)
select_provider.select_by_visible_text('50 miles')
driver.find_element_by_xpath('/html/body/div[4]/div[11]/div/button[2]/span').click()
#handling the endless scroll
while True:
    time.sleep(20)
    # Scroll down to bottom
    element_inside_popup = driver.find_element_by_xpath('//*[@id="mainDiv"]')
    element_inside_popup.send_keys(Keys.END)
    # Wait to load page
    time.sleep(3)
#the block below allows us to grab the majority of the data. we will have to split it up in pandas since this info
#is nested inside classes
time.sleep(5)
main = driver.find_element_by_id('mainDiv')
sections = main.find_elements_by_class_name('firstRow')
pcp_info = []
#print(section.text)
for pcp in sections:
    #the site stores the information inside inner classes, which makes it difficult to scrape.
    #the solution is to pull the entire text in the block and clean it up afterwards
    #innerText allows us to pull just the text inside the blocks
    first_blox = pcp.find_element_by_class_name('table_content_colone').get_attribute('innerText')
    second_blox = pcp.find_element_by_class_name('table_content_coltwo').get_attribute('innerText')
    #creating columns and rows and assigning them
    pcp_items = {
        'first_block' : [first_blox],
        'second_block' : [second_blox]
    }
    pcp_info.append(pcp_items)
df = pd.DataFrame(pcp_info)
print(df)
df.to_csv('yerp.csv',index=False)
#driver.quit()
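One way to stop the endless-scroll loop would be to count how many provider rows have loaded after each scroll and break once the count stops growing. A minimal sketch, reusing the mainDiv and firstRow selectors from the code above and assuming the popup stops adding rows once every provider has been loaded:
last_count = 0
while True:
    element_inside_popup = driver.find_element_by_xpath('//*[@id="mainDiv"]')
    element_inside_popup.send_keys(Keys.END)
    time.sleep(3)  # give the popup time to load the next batch
    rows = driver.find_elements_by_class_name('firstRow')
    if len(rows) == last_count:
        # no new rows appeared after scrolling, so assume the end was reached
        break
    last_count = len(rows)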

Related

Selenium - HTML doesn't always update after a click. Content in the browser changes, but I often get the same HTML from prior to the click

I'm trying to set up a simple webscraping script to pull every hyperlink from the discover cards on Bandcamp.
Here is my code:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.select import Select
import time
browser = webdriver.Chrome()
all_links = []
page = 1
url = "https://bandcamp.com/?g=all&s=new&p=0&gn=0&f=digital&w=-1"
browser.get(url)
while page < 6:
    page += 1
    # wait until discover cards are loaded
    test = WebDriverWait(browser, 20).until(EC.element_to_be_clickable(
        (By.XPATH, '//*[@id="discover"]/div[9]/div[2]/div/div[1]/div/table/tbody/tr[1]/td[1]/a/div')))
    # scrape hyperlinks for each of the 8 albums shown
    titles = browser.find_elements(By.CLASS_NAME, "item-title")
    links = [title.get_attribute('href') for title in titles[-8:]]
    all_links = all_links + links
    print(links)
    # pagination - click through the page buttons as the links are scraped
    page_nums = browser.find_elements(By.CLASS_NAME, 'item-page')
    for page_num in page_nums:
        if page_num.text.isnumeric():
            if int(page_num.text) == page:
                page_num.click()
                time.sleep(20)  # I've tried multiple long wait times as well as WebDriverWaits on different elements to see if the HTML will update, but I haven't seen a positive effect
                break
I'm using print(links) to see where this is going wrong. In the Selenium browser, it clicks through the pages fine. Note that pagination via the URL parameters doesn't seem possible, as the discover cards often won't load unless you click the page buttons towards the bottom of the page. BeautifulSoup and Requests don't work either, for the same reason. The print function is returning the following:
['https://neubauten.bandcamp.com/album/stimmen-reste-musterhaus-7?from=discover-new', 'https://cirka1.bandcamp.com/album/time?from=discover-new', 'https://futuramusicsound.bandcamp.com/album/yoga-meditation?from=discover-new', 'https://deathsoundbatrecordings.bandcamp.com/album/real-mushrooms-dsbep092?from=discover-new', 'https://riacurley.bandcamp.com/album/take-me-album?from=discover-new', 'https://terracuna.bandcamp.com/album/el-origen-del-viento?from=discover-new', 'https://hyper-music.bandcamp.com/album/hypermusic-vol-4?from=discover-new', 'https://defisis1.bandcamp.com/album/priceless?from=discover-new']
['https://jarnosalo.bandcamp.com/album/here-lies-ancient-blob?from=discover-new', 'https://andreneitzel.bandcamp.com/album/allegasi-gold-2?from=discover-new', 'https://moonraccoon.bandcamp.com/album/prequels?from=discover-new', 'https://lolivone.bandcamp.com/album/live-at-the-berklee-performance-center?from=discover-new', 'https://nilswrasse.bandcamp.com/album/a-calling-from-the-desert-to-the-sea-original-motion-picture-soundtrack?from=discover-new', 'https://whitereaperaskingride.bandcamp.com/album/asking-for-a-ride?from=discover-new', 'https://collageeffect.bandcamp.com/album/emerald-network?from=discover-new', 'https://foxteethnj.bandcamp.com/album/through-the-blue?from=discover-new']
['https://jarnosalo.bandcamp.com/album/here-lies-ancient-blob?from=discover-new', 'https://andreneitzel.bandcamp.com/album/allegasi-gold-2?from=discover-new', 'https://moonraccoon.bandcamp.com/album/prequels?from=discover-new', 'https://lolivone.bandcamp.com/album/live-at-the-berklee-performance-center?from=discover-new', 'https://nilswrasse.bandcamp.com/album/a-calling-from-the-desert-to-the-sea-original-motion-picture-soundtrack?from=discover-new', 'https://whitereaperaskingride.bandcamp.com/album/asking-for-a-ride?from=discover-new', 'https://collageeffect.bandcamp.com/album/emerald-network?from=discover-new', 'https://foxteethnj.bandcamp.com/album/through-the-blue?from=discover-new']
['https://jarnosalo.bandcamp.com/album/here-lies-ancient-blob?from=discover-new', 'https://andreneitzel.bandcamp.com/album/allegasi-gold-2?from=discover-new', 'https://moonraccoon.bandcamp.com/album/prequels?from=discover-new', 'https://lolivone.bandcamp.com/album/live-at-the-berklee-performance-center?from=discover-new', 'https://nilswrasse.bandcamp.com/album/a-calling-from-the-desert-to-the-sea-original-motion-picture-soundtrack?from=discover-new', 'https://whitereaperaskingride.bandcamp.com/album/asking-for-a-ride?from=discover-new', 'https://collageeffect.bandcamp.com/album/emerald-network?from=discover-new', 'https://foxteethnj.bandcamp.com/album/through-the-blue?from=discover-new']
['https://finitysounds.bandcamp.com/album/kreme?from=discover-new', 'https://mylittlerobotfriend.bandcamp.com/album/amen-break?from=discover-new', 'https://electrinityband.bandcamp.com/album/rise?from=discover-new', 'https://abyssal-void.bandcamp.com/album/ritualist?from=discover-new', 'https://plataformarecs.bandcamp.com/album/v-a-david-lynch-experience?from=discover-new', 'https://hurricaneturtles.bandcamp.com/album/industrial-synth?from=discover-new', 'https://blackwashband.bandcamp.com/album/2?from=discover-new', 'https://worldwide-bitchin-records.bandcamp.com/album/wack?from=discover-new']
Each time it correctly pulls the first 8 albums on page 1, then for pages 2-4 it repeats the 8 albums from page 2, for pages 5-7 it repeats the 8 albums from page 5, and so on. Even though the page updates (and the URL changes) in the Selenium browser, Selenium does not seem to recognize any changes to the HTML, so it repeats the same titles. Any idea where I've gone wrong?
Your definition of titles, i.e.
titles = browser.find_elements(By.CLASS_NAME, "item-title")
is a bad idea because item-title is the class of many elements in the page. Another bad idea is picking titles[-8:]. It may sound good because you assume that each time you click a page the new elements are added at the end, but that is not always the case. Yours is one of those cases where elements are not added sequentially.
So let's start by considering a class exclusive to the cards, for example discover-item. Open the DevTools, press CTRL+F and enter .discover-item. When the url is first loaded, it finds 8 results. Click the next page and it finds 16 results; click again and it finds 24 results. To better see what's going on, I suggest you run the following each time you click the "next" button.
el = driver.find_elements(By.CSS_SELECTOR, '.discover-item')
for i, e in enumerate(el):
    print(i, e.get_attribute('innerText').replace('\n', ' - '))
In particular, when arriving at page 3, you will see that the first item shown on page 1 (which in my case prints as 0 and friends - Jacob Stanley - alternative) is now printed at a different position (in my case 8 and friends - Jacob Stanley - alternative). What happened is that the items of page 3 were added at the beginning of the list, and that is why titles[-8:] was a bad choice.
So a better choice is to consider all cards each time you go to the next page, instead of the last 8 only (notice that the HTML of this site can contain no more than 24 cards), and then add all current cards to a set (since a set cannot contain duplicates, only new elements will be added).
# scroll to cards
cards = WebDriverWait(driver,20).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, ".discover-item")))
driver.execute_script('arguments[0].scrollIntoView({block: "center", behavior: "smooth"});', cards[0])
time.sleep(1)
items = set()
while 1:
    links = driver.find_elements(By.CSS_SELECTOR, '.discover-item .item-title')
    # extract info and store it
    for idx, card in enumerate(cards):
        tit_art_gen = card.get_attribute('innerText').replace('\n', ' - ')
        href = links[idx].get_attribute('href')
        # print(idx, tit_art_gen)
        items.add(tit_art_gen + ' - ' + href)
    # click 'next' button if it is not disabled
    next_button = driver.find_element(By.XPATH, "//a[.='next']")
    if 'disabled' in next_button.get_attribute('class'):
        print('last page reached')
        break
    else:
        next_button.click()
        # wait until new elements are loaded
        cards_new = cards.copy()
        while cards_new == cards:
            cards_new = driver.find_elements(By.CSS_SELECTOR, '.discover-item')
            time.sleep(.5)
        cards = cards_new.copy()
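If only the hyperlinks are needed afterwards, the collected strings can be split back apart; a small sketch, relying on the ' - ' separator used when the set was built above:
# each entry ends with the album href, separated by ' - '
hrefs = [item.rsplit(' - ', 1)[-1] for item in items]
print(len(hrefs), 'unique links collected')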

Printing YouTube Video Titles to File w/Selenium & Python, MacOS

I am trying to create a basic web scraper that takes input from the user to search for a video on YouTube and then writes x number of titles to a file. I have tried using the ID of the video title, XPATH, and CSS_SELECTOR, but am unable to get the program to do everything I want it to. I either get stuck in the automated browser (it doesn't quit) or I am unable to write the web element to the file. This is my current code:
# Importing the necessary modules for the program to run.
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver import ActionChains
from selenium import webdriver
# This is the first part of the program. It asks the user for a search query and how many results they
# want to see.
request = input('Search for: ')
split_request = request.split()
url = 'https://www.Youtube.com/results?search_query=' + '+'.join(split_request) + '+'
query_amount = int(input('How many results do you want to see? '))
driver = webdriver.Safari()
actions = ActionChains(driver)
driver.get(url)
i = 0
# Creating a new file called 'YouTube Search Results.txt' and opening it in write mode.
with open('YouTube Search Results.txt', mode='w') as file:
pass
try:
# Waiting for the elements with the ID 'video-title' to load.
WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.XPATH, '//*[#id="video-title"]/yt-formatted-string/text()')))
# This is the part of the program that scrolls down the page to load more results.
query = (driver.find_elements(By.XPATH, '//*[#id="video-title"]/yt-formatted-string/text()'))
while i < query_amount:
actions.scroll_by_amount(0, 10)
query = (driver.find_elements(By.XPATH, '//*[#id="video-title"]/yt-formatted-string/text()'))
i = len(query)
# Counting the number of results and breaking the loop when the number of results is greater than
# the amount of results the user wants to see.
count = 0
for title in query:
if count > query_amount:
break
# This is the part of the program that writes the results to the file.
else:
count += 1
with open('YouTube Search Results.txt', mode='a') as file:
file.write(title.text + '\n')
finally:
driver.quit()
I believe the issue is in the last block but I am unable to figure it out.
Thanks in advance for your help!
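For what it's worth, the /text() step in the XPath above is a likely culprit, since Selenium locators have to resolve to elements rather than text nodes. Below is a minimal sketch of an alternative, assuming YouTube's result links still carry id="video-title" (and that enough results exist to satisfy the request); url and query_amount are the values built by the script above:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

driver = webdriver.Safari()
driver.get(url)  # url built from the search query as above
WebDriverWait(driver, 20).until(
    EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'a#video-title')))
titles = driver.find_elements(By.CSS_SELECTOR, 'a#video-title')
while len(titles) < query_amount:
    # scroll the window to trigger YouTube's lazy loading of more results
    driver.execute_script('window.scrollBy(0, 2000);')
    titles = driver.find_elements(By.CSS_SELECTOR, 'a#video-title')
with open('YouTube Search Results.txt', mode='w') as file:
    for title in titles[:query_amount]:
        # the anchor's title attribute holds the full video title
        file.write(title.get_attribute('title') + '\n')
driver.quit()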

PYTHON scrapy selenium WebDriverWait

Experts, I am looking for your help, if you don't mind.
Recently I have been working on a web crawler using Scrapy and Selenium in Python, and I'm stuck.
I just want to ask whether it is possible to still get empty results even after using the statement
WebDriverWait(driver, 100, 0.1).until(EC.presence_of_all_elements_located((By.XPATH, xxxxx)))
to get those elements. Also, it doesn't even take 100 seconds before the empty result comes back. Why?
By the way, it is random: the problem can happen anywhere, at any time.
Could the empty results have something to do with my network connection?
Could you help me, or give me some opinions or suggestions about the question above?
Thanks a lot!
-----------------------supplementary notes-----------------------
Thanks for the heads up.
In summary, I used Scrapy and Selenium to crawl a review site and write the username, posting time, comment content, etc. to a .xlsx file via pipeline.py. I wanted it to be as fast as possible while still gathering complete information.
Each page has many comments, and because the review text is long it is collapsed, which means almost 20 comments per page have their own expand button.
Therefore, I need to use Selenium to click the expand button and then use the driver to fetch the complete comment. It takes a bit of time for the content to load after the expand button is clicked, and I believe the time it takes depends on the network speed, so using WebDriverWait seems like a wise choice here. In my experience the default parameters timeout=10 and poll_frequency=0.5 are too slow and error-prone, so I considered using timeout=100 and poll_frequency=0.1.
However, the problem is that every time I run the project with scrapy crawl spider from the command line, there are always several comments that come back empty, and each time the location of the empty ones is different. I've thought about using time.sleep() to force a pause, but that would take a lot of time if every page did it, even though it is certainly a reliable way to get complete information. It also looks inelegant and a little clumsy in my opinion.
Have I expressed my question clearly?
-------------------------------add something--------------------------------
By "getting empty somewhere" I mean that some of the scraped comments come back blank.
---------------------------add my code--------------------------2022/5/18
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
driver.get('https://movie.douban.com/subject/5045678/reviews?start=0')
users = [el.text for el in driver.find_elements(By.CSS_SELECTOR, 'a[class=name]')]
dates = [el.text for el in driver.find_elements(By.CSS_SELECTOR, 'span[class=main-meta]')]
full_content, words = [], []
unfolds = WebDriverWait(driver, 100, 0.1).until(EC.presence_of_all_elements_located((By.XPATH, "//a[@class='unfold']")))
# Here's how I think about and design my loop body.
# I click the expand button, then grab the text, then fold it back up, then move on to the next one.
for i in range(len(unfolds)):
    unfolds[i].click()
    time.sleep(1)
    # After the javascript runs, the div[@class='review-content clearfix'] appears,
    # and some of the full review content will be put in a <p></p> tag
    find_full_content_p = WebDriverWait(driver, 100, 0.1).until(EC.presence_of_all_elements_located((By.XPATH, "//div[@class='review-content clearfix']/p")))
    full_content_p = [j.text for j in find_full_content_p]
    # and some of it will just be put in div[@class='review-content clearfix'] itself.
    find_full_content_div = WebDriverWait(driver, 100, 0.1).until(EC.presence_of_all_elements_located((By.XPATH, "//div[@class='review-content clearfix']")))
    full_content_div = [j.text for j in find_full_content_div]
    # merge the two lists
    full_content_p.extend(full_content_div)
    full_content.append("".join(full_content_p))
    words.append(len("".join(full_content_p)))
    time.sleep(1)
    # then fold it back up
    WebDriverWait(driver, 100, 0.1).until(EC.element_to_be_clickable((By.XPATH, "//a[@class='fold']"))).click()
driver.close()
pd.DataFrame({"users":users, "dates":dates, "full_content":full_content, "words":words})
And this is the code of an expert I genuinely respect, named sound wave (slightly modified; the core code has not been changed):
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
# from selenium.webdriver.chrome.service import Service
driver = webdriver.Chrome()
driver.get('https://movie.douban.com/subject/5045678/reviews?start=0')
users = [el.text for el in driver.find_elements(By.CSS_SELECTOR, 'a[class=name]')]
dates = [el.text for el in driver.find_elements(By.CSS_SELECTOR, 'span[class=main-meta]')]
reviews, words = [], []
for review in driver.find_elements(By.CSS_SELECTOR, 'div.review-short'):
    show_more = review.find_elements(By.CSS_SELECTOR, 'a.unfold')
    if show_more:
        # scroll to the show more button, needed to avoid ElementClickInterceptedException
        driver.execute_script('arguments[0].scrollIntoView({block: "center"});', show_more[0])
        show_more[0].click()
        review = review.find_element(By.XPATH, 'following-sibling::div')
        while review.get_attribute('class') == 'hidden':
            time.sleep(0.2)
    review = review.find_element(By.CSS_SELECTOR, 'div.review-content')
    reviews.append(review.text)
    words.append(len(review.text))
    print('done', len(reviews), end='\r')
pd.DataFrame({"users":users,"dates":dates,"reviews":reviews,"words":words})
NEW
Added code for the site douban. To export the scraped data to a csv see the pandas code in the OLD section below
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
driver = webdriver.Chrome(service=Service('...'))
driver.get('https://movie.douban.com/subject/5045678/reviews?start=0')
users = [el.text for el in driver.find_elements(By.CSS_SELECTOR, 'a[class=name]')]
dates = [el.text for el in driver.find_elements(By.CSS_SELECTOR, 'span[class=main-meta]')]
reviews = []
for review in driver.find_elements(By.CSS_SELECTOR, 'div.review-short'):
    show_more = review.find_elements(By.CSS_SELECTOR, 'a.unfold')
    if show_more:
        # scroll to the show more button, needed to avoid ElementClickInterceptedException
        driver.execute_script('arguments[0].scrollIntoView({block: "center"});', show_more[0])
        show_more[0].click()
        review = review.find_element(By.XPATH, 'following-sibling::div')
        while review.get_attribute('class') == 'hidden':
            time.sleep(0.2)
    review = review.find_element(By.CSS_SELECTOR, 'div.review-content')
    reviews.append(review.text)
    print('done', len(reviews), end='\r')
OLD
For the website you mentioned (imdb.com), in order to scrape the hidden content there is no need to click on the show more button, because the text is already loaded in the HTML code; it is simply not shown on the site. So you can scrape all the comments in a single pass. The code below stores users, dates and reviews in separate lists, and finally saves the data to a .csv file.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
driver = webdriver.Chrome(service=Service(chromedriver_path))
driver.get('https://www.imdb.com/title/tt1683526/reviews')
# sets a maximum waiting time for .find_element() and similar commands
driver.implicitly_wait(10)
reviews = [el.get_attribute('innerText') for el in driver.find_elements(By.CSS_SELECTOR, 'div.text')]
users = [el.text for el in driver.find_elements(By.CSS_SELECTOR, 'span.display-name-link')]
dates = [el.text for el in driver.find_elements(By.CSS_SELECTOR, 'span.review-date')]
# store data in a csv file
import pandas as pd
df = pd.DataFrame(list(zip(users,dates,reviews)), columns=['user','date','review'])
df.to_csv(r'C:\Users\your_name\Desktop\data.csv', index=False)
To print a single review you can do something like this
i = 0
print(f'User: {users[i]}\nDate: {dates[i]}\n{reviews[i]}')
the output (truncated) is
User: dschmeding
Date: 26 February 2012
Wow! I was not expecting this movie to be this engaging. Its one of those films...

How to get all the data from a webpage that uses a lazy-loading method?

I've written a script in Python using Selenium to scrape the name and price of different products from the Redmart website. My scraper clicks on a link, goes to its target page, and parses data from there. However, the issue I'm facing with this crawler is that it scrapes very few items from a page because of the webpage's lazy-loading behavior. How can I get all the data from each page by controlling the lazy-loading process? I tried the "execute_script" method but did it wrongly. Here is the script I'm trying with:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
driver.get("https://redmart.com/bakery")
wait = WebDriverWait(driver, 10)
counter = 0
while True:
    try:
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "li.image-facets-pill")))
        driver.find_elements_by_css_selector('img.image-facets-pill-image')[counter].click()
        counter += 1
    except IndexError:
        break
    # driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    for elems in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "li.productPreview"))):
        name = elems.find_element_by_css_selector('h4[title] a').text
        price = elems.find_element_by_css_selector('span[class^="ProductPrice__"]').text
        print(name, price)
    driver.back()
driver.quit()
I guess you could use Selenium for this, but if speed is your concern, after @Andersson crafted the code for you in another question on Stack Overflow, you should instead replicate the API calls that the site uses and extract the data from the JSON, like the site does.
If you use the Chrome Inspector you'll see that, for each of the categories in your outer while loop (the try block in your original code), the site calls an API that returns the overall categories of the site. All this data can be retrieved like so:
import requests

categories_api = 'https://api.redmart.com/v1.5.8/catalog/search?extent=0&depth=1'
r = requests.get(categories_api).json()
For the next API calls you need to grab the uris concerning the bakery stuff. This can be done like so:
bakery_item = [e for e in r['categories'] if e['title'] == 'Bakery']
children = bakery_item[0]['children']
uris = [c['uri'] for c in children]
uris will now be a list of strings (['bakery-bread', 'breakfast-treats-212', 'sliced-bread-212', 'wraps-pita-indian-breads', 'rolls-buns-212', 'baked-goods-desserts', 'loaves-artisanal-breads-212', 'frozen-part-bake', 'long-life-bread-toast', 'speciality-212']) that you pass on to another API, found with the Chrome Inspector, that the site uses to load content.
This API has the following form (default returns a smaller pageSize but I bumped it to 500 to be somewhat sure you get all data in one request):
items_API = 'https://api.redmart.com/v1.5.8/catalog/search?pageSize=500&sort=1024&category={}'
for uri in uris:
    r = requests.get(items_API.format(uri)).json()
    products = r['products']
    for product in products:
        name = product['title']
        # testing for promo_price - if it's 0.0, go with the normal price
        price = product['pricing']['promo_price']
        if price == 0.0:
            price = product['pricing']['price']
        print("Name: {}. Price: {}".format(name, price))
Edit: If you still want to stick with Selenium, you could insert something like this to handle the lazy loading. Questions about scrolling have been answered several times before, so yours is actually a duplicate. In the future you should show what you tried (your own effort on the execute_script part) and include the traceback.
check_height = driver.execute_script("return document.body.scrollHeight;")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(5)
    height = driver.execute_script("return document.body.scrollHeight;")
    if height == check_height:
        break
    check_height = height

selenium, webdriver.page_source not refreshing after click

I am trying to copy a web page's list of addresses for a given community service to a new document so I can geocode all of the locations on a map. Instead of being able to get a list of all the parcels, I can only download one at a time, and each page is limited to 25 parcel numbers. As such, this would be extremely time-consuming.
I want to develop a script that will look at the page source (everything, including the 25 addresses, which are contained in a table tag), click the next-page button, copy the next page, and so on until the max page is reached. Afterwards, I can format the text to be geocoding-compatible.
The code below does all of this, except it only copies the first page over and over again, even though I can clearly see that the program has successfully navigated to the next page:
from selenium import webdriver
from bs4 import BeautifulSoup
import time

# Open chrome
br = webdriver.Chrome()
raw_input("Navigate to web page. Press enter when done: ")
pg_src = br.page_source.encode("utf")
soup = BeautifulSoup(pg_src)
max_page = 122 #int(max_page)
#open a text doc to write the results to
f = open(r'C:\Geocoding\results.txt', 'w')
# write results page by page until max page number is reached
pg_cnt = 1 # start on 1 as we should already have the first page
while pg_cnt < max_page:
    tble_elems = soup.findAll('table')
    soup = BeautifulSoup(str(tble_elems))
    f.write(str(soup))
    time.sleep(5)
    pg_cnt += 1
    # clicks the next button
    br.find_element_by_xpath("//div[@class='next button']").click()
    # give some time for the page to load
    time.sleep(5)
    # get the new page source (THIS IS THE PART THAT DOESN'T SEEM TO BE WORKING)
    page_src = br.page_source.encode("utf")
    soup = BeautifulSoup(pg_src)
f.close()
I faced the same problem.
The problem, I think, is that some JavaScript is not completely loaded.
All you need to do is wait till the object is loaded. The code below worked for me:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
delay = 10  # seconds
try:
    myElem = WebDriverWait(drivr, delay).until(EC.presence_of_element_located((By.CLASS_NAME, 'legal-attribute-row')))
except:
    print("Loading took too much time!")
