I am attempting to get some data from the EPA website, but unfortunately I am not able to capture all of the data points. I suspect this is due to a combination of scrolling and waiting for the tag to become visible; however, I have been working on this since yesterday with no luck.
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from selenium.webdriver.common.keys import Keys
import numpy as np
options = webdriver.ChromeOptions()
path = '/Users/<user>/Applications/chromedriver'
options.set_headless(True)
driver = webdriver.Chrome(chrome_options= options, executable_path=path)
url = 'https://edap.epa.gov/public/single/?appid=73b2b6a5-70c6-4820-b3fa-186ac094f10d&obj=b5bf280c-3488-4e46-84f6-58e2a0c34108&opt=noanimate%2Cnoselections&select=clearall'
driver.set_window_size(1920, 1080)
driver.get(url)
SCROLL_PAUSE_TIME = 0.5
# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")
rin_data = []
while True:
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CLASS_NAME, "qv-st-value-overflow")))
    soup = BeautifulSoup(driver.page_source, "html.parser")
    tableURL = soup.select('.qv-st-value-overflow')
    for rin_val in tableURL:
        rin_data.append(rin_val.get_text())
    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)
    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height
The page uses a WebSocket, not Ajax, to load the data, and you need to scroll the table[ng-style="tableStyles.content"] rather than the body. That requires a custom scroll or a simulated mouse-wheel scroll. The wheel function is taken from here.
SCROLL_PAUSE_TIME = 2
driver.get(url)
# add mouse wheel function to the page
driver.execute_script('''
window.scrollTable = function() {
    var element = document.querySelector('table[ng-style="tableStyles.content"]')
    var box = element.getBoundingClientRect();
    var deltaY = box.height;
    var clientX = box.left + (box.width / 2);
    var clientY = box.top + (box.height / 2);
    var target = element.ownerDocument.elementFromPoint(clientX, clientY);
    for (var e = target; e; e = e.parentElement) {
        if (e === element) {
            target.dispatchEvent(new WheelEvent('wheel', {view: window, bubbles: true, cancelable: true, clientX: clientX, clientY: clientY, deltaY: deltaY}));
        }
    }
}
''')
rin_data = []
while True:
    WebDriverWait(driver, 30).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, 'tr[class^="qv-st-data-row"]'))
    )
    last_position = driver.find_element_by_css_selector(".scrollbar-thumb").get_attribute('style')
    rows = driver.find_elements_by_css_selector('tr[class^="qv-st-data-row"]')
    for row in rows:
        rin_data.append(row.text)
    # Scroll down the table
    driver.execute_script('scrollTable()')
    # Wait for content to arrive over the WebSocket; this pause may need to be increased
    time.sleep(SCROLL_PAUSE_TIME)
    # Calculate new scroll position and compare with last scroll position
    new_position = driver.find_element_by_css_selector(".scrollbar-thumb").get_attribute('style')
    if new_position == last_position:
        break
Note that in this case you don't need to use BeautifulSoup.
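If you later need the individual cell values, here is a minimal post-processing sketch for the collected rows (assuming, as Selenium's .text usually does for table rows, that the cells come back separated by newlines):
# Hypothetical post-processing: split each collected row string into its cells.
rin_table = [row.split('\n') for row in rin_data]
print(rin_table[:5])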
Here is my code.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
url_list = []
content_list = ""
text = "b%C3%A1nh%20crown"
url_maxpage = "https://shopee.vn/search?brands=3372239&keyword=" + text + "&noCorrection=true&page=0"
driver.get(url_maxpage)
by = webdriver.common.by.By
max_page = driver.find_elements(by=By.CLASS_NAME, value='shopee-mini-page-controller__total')
num=int(max_page[0].text)
for i in range(num):  # Crawl from 1 to max page
    url = "https://shopee.vn/search?brands=3372239&keyword=" + text + "&noCorrection=true&page=" + str(i)
    driver.get(url)
    by = webdriver.common.by.By
    time.sleep(0.5)
    div_href = driver.find_elements(by.CLASS_NAME, "col-xs-2-4")
    hlink = []
    for i in range(int(len(div_href))):
        hlink_list = div_href[i].find_elements(by.TAG_NAME, "a")[0].get_attribute('href')
        hlink.append(hlink_list)
# Remove duplicates in list
my_list = hlink
my_set = set(my_list)
my_list = list(my_set)
Output: []
Using the code above, I want to traverse the page and crawl the links for each product.
But I don't know why it outputs an empty list.
Any help would be greatly appreciated.
I modified the for statement as follows, but only 15 are output.
for i in range(num):  # Crawl from 1 to max page
    url = "https://shopee.vn/search?brands=3372239&keyword=" + text + "&noCorrection=true&page=" + str(i)
    driver.get(url)
    wait = WebDriverWait(driver, 20)
    SCROLL_PAUSE_SEC = 10
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(SCROLL_PAUSE_SEC)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
    hlink = []
    elements = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".col-xs-2-4 a")))
    for element in elements:
        hlink_list = element.get_attribute('href')
        hlink.append(hlink_list)
You need to wait for the elements to be loaded.
The code below will give you the first 15 hrefs on the page.
To get all 60 you will need to scroll the page, since only 15 elements are loaded initially; see the sketch after the code below.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
options = Options()
options.add_argument("start-maximized")
webdriver_service = Service(r'C:\webdrivers\chromedriver.exe')
driver = webdriver.Chrome(service=webdriver_service, options=options)
url = 'https://shopee.vn/search?brands=3372239&keyword=b%C3%A1nh%20crown&noCorrection=true&page=0'
driver.get(url)
wait = WebDriverWait(driver, 20)
hlink = []
elements = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".col-xs-2-4 a")))
for element in elements:
    hlink_list = element.get_attribute('href')
    hlink.append(hlink_list)
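To collect the rest once the page has loaded, you can combine the wait above with the scroll loop from the question. A minimal sketch, assuming the same driver and wait objects and that all cards render after scrolling to the bottom:
import time
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    # Scroll to the bottom so the lazily loaded cards render
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(1)
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height
# Collect every product link once the page has stopped growing
elements = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".col-xs-2-4 a")))
hlink = list({element.get_attribute('href') for element in elements})  # de-duplicate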
from bs4 import BeautifulSoup
import requests
from csv import writer
from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
#selenium Path
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get('https://www.nike.com/w/mens-shoes-nik1zy7ok')
#PAGE SCROLLING
# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")
SCROLL_PAUSE_TIME = 1
while True:
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)
    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height
#BS4 taking Selenium Driver Source
soup = BeautifulSoup(driver.page_source, 'html.parser')
lists = soup.find_all('div', class_='product-card__info disable-animations for--product')
#Iterate Through HTML and filter out content as needed and store inside shoes.csv
with open('shoes.csv', 'w', encoding='utf8', newline='') as f:
    thewriter = writer(f)
    header = ['Name', 'Price']
    thewriter.writerow(header)
    for list in lists:
        try:
            name = list.find('div', class_='product-card__title').text
            price = list.find('div', class_='product-price css-11s12ax is--current-price').text
        except:
            print("\nList finished!")
            break
        info = [name, price]
        thewriter.writerow(info)
        print(info)
#testing for other tag
soup2 = BeautifulSoup(driver.page_source, 'html.parser')
lists2 = soup2.find_all('div', class_='product-card__info disable-animations for--product')
#testing
with open('shoes2.csv', 'w', encoding='utf8', newline='') as f:
    thewriter = writer(f)
    header = ['Name', 'Price']
    thewriter.writerow(header)
    for list in lists2:
        try:
            names = list.find('div', class_='product-card__title').text
            prices = list.find('div', class_='product-price is--current-price css-s56yt7').text
        except:
            print("\nList finished!")
            break
        info2 = [names, prices]
        thewriter.writerow(info2)
        print(info2)
The intent is to build a web scraper that searches the Nike men's shoes store and outputs a CSV file with the name and price of each item.
The website shows 500+ items, but I'm only able to gather about 100 of them.
I double-checked all the tags and noticed that when I print out the HTML it skips items randomly! If anyone could tell me why, I would greatly appreciate it.
UPDATE: SOLVED using purely Selenium!
I will be using the webdriver option for a headless browser to further reduce resource load. Any tips for making it more efficient would be appreciated.
import requests
from csv import writer
from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import itertools
#selenium Path
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get('https://www.nike.com/w/mens-shoes-nik1zy7ok')
#PAGE SCROLLING
# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")
SCROLL_PAUSE_TIME = .5
while True:
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)
    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height
Wname=driver.find_elements(By.CLASS_NAME, "product-card__title")
Wprice=driver.find_elements(By.CLASS_NAME, "product-card__price")
Wcolor=driver.find_elements(By.CLASS_NAME, "product-card__product-count")
# Make 3 separate lists to translate the elements over to text
name=[]
price=[]
color=[]
for i in Wname:
    name.append(i.text)
for i in Wprice:
    price.append(i.text)
for i in Wcolor:
    color.append(i.text)
#making CSV
new_list=[]
with open('Menshoes.csv', 'w', encoding='utf8', newline='') as f:
    thewriter = writer(f)
    header = ['Name', 'Price', '#Color']
    thewriter.writerow(header)
    for n, p, q in itertools.zip_longest(name, price, color):
        if n:
            new_list.append(n)
        if p:
            new_list.append(p)
        if q:
            new_list.append(q)
        info = [n, p, q]
        thewriter.writerow(info)
Take a closer look at your selection with BeautifulSoup: it only returns the specific cards in the ResultSet that match your class specification:
soup.find_all('div', class_='product-card__info disable-animations for--product')
You should use a more general selection to get more cards shown in your ResultSet:
soup.find_all('div', class_='product-card__body')
Note: also avoid shadowing built-in names like list.
While scrolling, allow a bit more time or wait until all elements are loaded. I stored the results in a list of dicts so the data is a bit more structured, and I could run some checks (len() or set()) on it before writing it to CSV; see the sketch after the code below.
...
import csv  # needed for csv.DictWriter below
last_height = driver.execute_script("return document.body.scrollHeight")
SCROLL_PAUSE_TIME = 2
while True:
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)
    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height
# BS4 taking Selenium driver source
soup = BeautifulSoup(driver.page_source, 'html.parser')
data = []
for e in soup.find_all('div', class_='product-card__body'):
    data.append({
        'name': e.select_one('.product-card__titles').text,
        'price': e.select_one('.is--current-price').text
    })
with open('shoes.csv', 'w', encoding='utf8', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=data[0].keys())
    writer.writeheader()
    writer.writerows(data)
...
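For example, a quick sanity check on the collected list of dicts before writing the CSV (just a sketch; de-duplicating via a set of value tuples is one option):
print(len(data))                               # total cards collected
unique_rows = {tuple(d.values()) for d in data}
print(len(unique_rows))                        # unique (name, price) pairs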
I'm a newbie with Selenium,
and I'm trying to figure out how to scroll infinitely.
I have tried almost everything other Stack Overflow answers suggest.
1.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko")
driver = webdriver.Chrome('chromedriver', options=chrome_options)
driver.set_window_size(1320, 550)
exchange_link = "https://icodrops.com/ico-stats/"
driver.get(exchange_link)
wait = WebDriverWait(driver, 10)
SCROLL_PAUSE_TIME = 0.5
# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)
    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height
from selenium.webdriver.common.keys import Keys
Number = wait.until(EC.presence_of_element_located((By.XPATH,'html[1]/body[1]/div[1]/div[1]/div[1]/main[1]/div[1]/div[4]/div[2]/div[1]/div[1]/div[1]')))
lastElement = Number.find_elements(By.XPATH,'div')[-1]
lastElement.send_keys(Keys.NULL)
Number = wait.until(EC.presence_of_element_located((By.XPATH,'html[1]/body[1]/div[1]/div[1]/div[1]/main[1]/div[1]/div[4]/div[2]/div[1]/div[1]/div[1]')))
lastElement = Number.find_elements(By.XPATH,'div')[-1]
lastElement.location_once_scrolled_into_view
etc
driver.execute_script("var scrollingElement = (document.scrollingElement || document.body);scrollingElement.scrollTop = scrollingElement.scrollHeight;")
driver.execute_script("document.getElementById('mydiv').scrollIntoView();")
I don't know what else I can do.
I have spent a lot of time trying to fix it.
Thanks for all the replies, but nothing works.
I tried two things:
while True:
    if j == 900:
        break
    try:
        ele = wait.until(EC.visibility_of_element_located((By.XPATH, f"(//div[@id='market-ico-stat-container']/div)[{j}]")))
        driver.execute_script("arguments[0].scrollIntoView(true);", ele)
        ico_name = wait.until(EC.presence_of_element_located((By.XPATH, f'/html[1]/body[1]/div[1]/div[1]/div[1]/main[1]/div[1]/div[5]/div[2]/div[1]/div[1]/div[1]/div[{j}]/a[1]/div[1]/div[1]/div[2]/h3/a'))).get_attribute("textContent")
        print(j)
        print(ico_name)
        j += 1
    except:
        break
But the result is the same: from item 51 onward it can't crawl, which means the page isn't scrolling down.
You should scroll each web element one by one with the help of execute_script
Code:
driver = webdriver.Chrome(driver_path)
driver.maximize_window()
wait = WebDriverWait(driver, 30)
driver.get("https://icodrops.com/ico-stats/")
j = 1
while True:
    ele = wait.until(EC.visibility_of_element_located((By.XPATH, f"(//div[@id='market-ico-stat-container']/div)[{j}]")))
    driver.execute_script("arguments[0].scrollIntoView(true);", ele)
    time.sleep(0.5)
    name = ele.find_element(By.XPATH, ".//descendant::h3//a").get_attribute('innerText')
    print(name)
    j = j + 1
    # the code below is just in case you want to break out of the infinite loop
    if j > 50:
        break
Output:
Ambire Wallet
Himo World
Highstreet
Decimated
Planet Sandbox
BENQI
DeHorizon
Mines Of Dalarnia
MonoX
Lobis
AntEx
Titan Hunters
Tempus
The Realm Defenders
Aurora
XDEFI Wallet
Libre DeFi
Genopets
Mytheria
ReSource
Defactor
PlaceWar
CryptoXpress
Cryowar
Numbers Protocol
Dragon Kart
Trusted Node
Cere Network
Elemon
Meta Spatial
YIN Finance
Ardana
CropBytes
Good Games Guild
Ariadne
ThorSwap
Solend
GooseFX
Galactic Arena
DotOracle
Scallop
AcknoLedger
Clearpool
Sandclock
ArtWallet
Aurory
BloXmove
WonderHero
Lazio Fan Token
Hero Arena
Process finished with exit code 0
Imports:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
The above code will never break on its own and will execute indefinitely; to avoid this, you should introduce a maximum limit like this:
if j == 500:
    break
However, the web application seems to detect the Selenium script.
I was able to scroll this page with the following code changes:
1. Add extra options to make the script undetected (it was blocked as a bot before).
2. Add a keyboard action, ARROW_UP; this does the trick, and content starts to load after the JS scroll.
3. Add a 5-second timeout so the new content can load.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko")
# extra options
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_argument('--disable-blink-features=AutomationControlled')
driver = webdriver.Chrome('chromedriver', options=chrome_options)
driver.set_window_size(1320, 550)
exchange_link = "https://icodrops.com/ico-stats/"
driver.get(exchange_link)
SCROLL_PAUSE_TIME = 5  # 5 seconds
time.sleep(SCROLL_PAUSE_TIME)
# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")
for x in range(0, 10):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    driver.find_element(By.XPATH, "//body").send_keys(Keys.ARROW_UP)
    time.sleep(SCROLL_PAUSE_TIME)
    new_height = driver.execute_script("return document.body.scrollHeight")
    print('current Y: ' + str(new_height))
    if new_height == last_height:
        break
    last_height = new_height
driver.close()
Output:
current Y: 9792
current Y: 32542
current Y: 68942
current Y: 82592
current Y: 82592
I've tested this with Selenium 4, Chrome 97, Windows.
This code could be improved and optimized, but I hope it works as a starting point.
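If you also want to pull the project names once the scrolling loop has finished (before driver.close()), here is a minimal sketch; the XPath is borrowed from the other answer and is an assumption about the page structure:
# Hypothetical extraction step, run after the scroll loop and before driver.close()
cards = driver.find_elements(By.XPATH, "//div[@id='market-ico-stat-container']/div//h3//a")
names = [card.get_attribute('innerText') for card in cards]
print(len(names), names[:5])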
The code below is what I have so far, but it only pulls data for the first 25 items, i.e. those shown on the page before scrolling down for more:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
start_time = time.time()
s = requests.Session()
#Get URL and extract content
response = s.get('https://www.linkedin.com/jobs/search?keywords=It%20Business%20Analyst&location=Boston%2C%20Massachusetts%2C%20United%20States&geoId=102380872&trk=public_jobs_jobs-search-bar_search-submit&position=1&pageNum=0')
soup = BeautifulSoup(response.text, 'html.parser')
items = soup.find('ul', {'class': 'jobs-search__results-list'})
job_titles = [i.text.strip('\n ') for i in items.find_all('h3', {'class': 'base-search-card__title'})]
job_companies = [i.text.strip('\n ') for i in items.find_all('h4', {'class': 'base-search-card__subtitle'})]
job_locations = [i.text.strip('\n ') for i in items.find_all('span', {'class': 'job-search-card__location'})]
job_links = [i["href"].strip('\n ') for i in items.find_all('a', {'class': 'base-card__full-link'})]
a = pd.DataFrame({'Job Titles': job_titles})
b = pd.DataFrame({'Job Companies': job_companies})
c = pd.DataFrame({'Job Locations': job_locations})
value_counts1 = a['Job Titles'].value_counts()
value_counts2 = b['Job Companies'].value_counts()
value_counts3 = c['Job Locations'].value_counts()
l1 = [f"{key} - {value_counts1[key]}" for key in value_counts1.keys()]
l2 = [f"{key} - {value_counts2[key]}" for key in value_counts2.keys()]
l3 = [f"{key} - {value_counts3[key]}" for key in value_counts3.keys()]
data = l1, l2, l3
df = pd.DataFrame(
data, index=['Job Titles', 'Job Companies', 'Job Locations'])
df = df.T
print(df)
print("--- %s seconds ---" % (time.time() - start_time))
I would like to pull data for more than the first 25 items, is there an efficient way of being able to do this?
Get the container that holds the desired data by inspecting the page; you can then scrape the infinite-scroll page with the Selenium WebDriver using window.scrollTo().
For more, check:
crawl site that has infinite scrolling using python
or web-scraping-infinite-scrolling-with-selenium
The best way is to create a function to scroll down:
# Scroll function
# This function takes two arguments. The driver that is being used and a timeout.
# The driver is used to scroll and the timeout is used to wait for the page to load.
def scroll(driver, timeout):
    scroll_pause_time = timeout
    # Get scroll height
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Wait to load page
        time.sleep(scroll_pause_time)
        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # If heights are the same it will exit the function
            break
        last_height = new_height
Then you can use the scroll function to scroll the desired page:
import time
import pandas as pd
from seleniumwire import webdriver
# Create a new instance of the Firefox driver
driver = webdriver.Firefox()
# move to some url
driver.get('your_url')
# use "scroll" function to scroll the page every 5 seconds
scroll(driver, 5)
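Once the scroll has finished, the fully loaded page can be parsed exactly as in the question. A short sketch, assuming the LinkedIn search URL was passed to driver.get() and the class names from the question still apply:
from bs4 import BeautifulSoup
# Parse the fully scrolled page with the same selectors used in the question
soup = BeautifulSoup(driver.page_source, 'html.parser')
items = soup.find('ul', {'class': 'jobs-search__results-list'})
job_titles = [i.text.strip('\n ') for i in items.find_all('h3', {'class': 'base-search-card__title'})]
print(len(job_titles))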
First time asking here. I hope someone can help me with this; it's driving me crazy!
I'm trying to scrape a used-car webpage from my country. The data loads as you scroll down, so the first part of the code scrolls down to load the webpage.
I'm trying to get the link of every car published there, which is why I'm using find_elements_by_xpath in the try-except part.
The problem is that the cars show up in batches of 11 on every load (scroll down), so the same 11 XPaths repeat with every scroll,
meaning XPaths from
"//*[@id='w1']/div[1]/div/div[1]/a"
to
"//*[@id='w11']/div[1]/div/div[1]/a"
All libraries are imported at the start of the code, don't worry.
from selenium import webdriver
from bs4 import BeautifulSoup
import time
links = []
url = ('https://buy.olxautos.cl/buscar?VehiculoEsSearch%5Btipo_valor%5D=1&VehiculoEsSearch%5Bprecio_range%5D=3990000%3B15190000')
driver = webdriver.Chrome('')
driver.get(url)
time.sleep(5)
SCROLL_PAUSE_TIME = 3
# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)
    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height
try:
    zelda = driver.find_elements_by_xpath("//*[@id='w1']/div[1]/div/div[1]/a").get_attribute('href')
    links.append(zelda)
except:
    pass
print(links)
So the expected output of this code would be something like this:
['link_car_1', 'link_car_12', 'link_car_23', '...']
But when I run this code, it returns an empty list. When I run it with find_element_by_xpath it returns the first link. What am I doing wrong 😭😭? I just can't figure it out!
Thanks!
You get only one link because the XPath is not the same for all the links. You can use bs4 to extract the links from the driver's page source, as shown below.
from bs4 import BeautifulSoup
import lxml
links = []
url = ('https://buy.olxautos.cl/buscar?VehiculoEsSearch%5Btipo_valor%5D=1&VehiculoEsSearch%5Bprecio_range%5D=3990000%3B15190000')
driver = webdriver.Chrome(executable_path = Path)
driver.get(url)
time.sleep(5)
SCROLL_PAUSE_TIME = 3
# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    # Scroll down to bottom
    page_source_ = driver.page_source
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)
    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    # use BeautifulSoup to extract links
    sup = BeautifulSoup(page_source_, 'lxml')
    sub_ = sup.findAll('div', {'class': 'owl-item active'})
    for link_ in sub_:
        link = link_.find('a', href=True)
        # link = 'https://buy.olxautos.cl' + link['href']  # if needed (adding prefix)
        links.append(link['href'])
    if new_height == last_height:
        break
    last_height = new_height
print('>> Total length of list : ', len(links))
print('\n',links)
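If you would rather stay with Selenium only, another option is a single relative locator with find_elements (plural) instead of the per-item XPaths. This is a sketch under the assumption that each card sits in an owl-item container, as the class used above suggests:
# Alternative sketch: collect every card link with one relative CSS selector.
anchors = driver.find_elements_by_css_selector("div.owl-item a[href]")
links = list({a.get_attribute('href') for a in anchors})  # de-duplicate
print(len(links), links[:3])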