python selenium: empty list is output as a crawling result

Here is my code.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

url_list = []
content_list = ""
text = "b%C3%A1nh%20crown"

url_maxpage = "https://shopee.vn/search?brands=3372239&keyword=" + text + "&noCorrection=true&page=0"
driver.get(url_maxpage)
by = webdriver.common.by.By

max_page = driver.find_elements(by=By.CLASS_NAME, value='shopee-mini-page-controller__total')
num = int(max_page[0].text)

for i in range(num):  # Crawl from 1 to max page
    url = "https://shopee.vn/search?brands=3372239&keyword=" + text + "&noCorrection=true&page=" + str(i)
    driver.get(url)
    by = webdriver.common.by.By
    time.sleep(0.5)
    div_href = driver.find_elements(by.CLASS_NAME, "col-xs-2-4")
    hlink = []
    for i in range(int(len(div_href))):
        hlink_list = div_href[i].find_elements(by.TAG_NAME, "a")[0].get_attribute('href')
        hlink.append(hlink_list)

# Remove duplicates in list
my_list = hlink
my_set = set(my_list)
my_list = list(my_set)
Output: []
Using the code above, I want to traverse the page and crawl the links for each product.
But I don't know why it outputs an empty list.
Any help would be greatly appreciated.
I modified the for statement as follows, but only 15 links are output.
for i in range(num):  # Crawl from 1 to max page
    url = "https://shopee.vn/search?brands=3372239&keyword=" + text + "&noCorrection=true&page=" + str(i)
    driver.get(url)
    wait = WebDriverWait(driver, 20)

    SCROLL_PAUSE_SEC = 10
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(SCROLL_PAUSE_SEC)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    hlink = []
    elements = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".col-xs-2-4 a")))
    for element in elements:
        hlink_list = element.get_attribute('href')
        hlink.append(hlink_list)

You need to wait for the elements to be loaded.
The code below will give you the first 15 hrefs on the page.
To get all 60 you will need to scroll the page, since only 15 elements are loaded initially (see the scrolling sketch after the code).
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

options = Options()
options.add_argument("start-maximized")

webdriver_service = Service(r'C:\webdrivers\chromedriver.exe')
driver = webdriver.Chrome(service=webdriver_service, options=options)

url = 'https://shopee.vn/search?brands=3372239&keyword=b%C3%A1nh%20crown&noCorrection=true&page=0'
driver.get(url)
wait = WebDriverWait(driver, 20)

hlink = []
elements = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".col-xs-2-4 a")))
for element in elements:
    hlink_list = element.get_attribute('href')
    hlink.append(hlink_list)
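To collect all 60 links per page you can combine that wait with the scroll loop from the question. This is only a sketch, assuming Shopee keeps lazy-loading items into .col-xs-2-4 as you scroll; the pause may need tuning:

import time

last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    # scroll to the bottom so the next batch of lazy-loaded products renders
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)  # give the page time to append new items
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

# collect the hrefs only after everything has been rendered
elements = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".col-xs-2-4 a")))
hlink = list(dict.fromkeys(element.get_attribute('href') for element in elements))  # de-duplicated, order kept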

Related

"TypeError: 'function' object is not iterable", why am I getting this error?

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait as wait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

web = 'https://www.amazon.com'
driver_path = 'V:\Python Project\chromedriver_win32\chromedriver.exe'
options = webdriver.ChromeOptions()
options.add_argument('--headless')

s = Service('V:\Python Project\chromedriver_win32\chromedriver.exe')
driver = webdriver.Chrome(service=s)
driver.maximize_window()  # For maximizing window
driver.implicitly_wait(30)
# driver = webdriver.Chrome(options=options, executable_path=driver_path)
driver.get(web)
driver.implicitly_wait(5)

keyword = "Table"
search = driver.find_element(By.ID, "twotabsearchtextbox")
search.send_keys(keyword)

# click search button
search_button = driver.find_element(By.ID, 'nav-search-submit-button')
search_button.click()
driver.implicitly_wait(5)

product_asin = []
product_name = []
product_price = []
product_ratings = []
product_ratings_num = []
product_link = []

items = (EC.presence_of_all_elements_located((By.CLASS_NAME, "s-result-item sasin")))  # => Error on this line
for item in items:
    # find name
    name = item.find_element(By.CLASS_NAME, "a-size-medium a-color-base a-text-normal")
    product_name.append(name.text)

    # find ASIN number
    data_asin = item.get_attribute("data-asin")
    product_asin.append(data_asin)

    # find price
    whole_price = item.find_element(By.CLASS_NAME, "a-price-whole")
    fraction_price = item.find_element(By.CLASS_NAME, "a-price-fraction")
    if whole_price != [] and fraction_price != []:
        price = '.'.join([whole_price[0].text, fraction_price[0].text])
    else:
        price = 0
    product_price.append(price)

    # find ratings box
    ratings_box = item.find_element(By.CLASS_NAME, "a-row a-size-small")
    # find ratings and ratings_num
    if ratings_box != []:
        ratings = ratings_box[0].get_attribute('aria-label')
        ratings_num = ratings_box[1].get_attribute('aria-label')
    else:
        ratings, ratings_num = 0, 0
    product_ratings.append(ratings)
    product_ratings_num.append(str(ratings_num))

    # find link
    link = item.find_element(By.CLASS_NAME, "a-link-normal a-text-normal").get_attribute("href")
    product_link.append(link)

driver.quit()

# to check data scraped
print(product_name)
print(product_asin)
print(product_price)
print(product_ratings)
print(product_ratings_num)
print(product_link)
For the code above I am getting the following error:
for item in items:
TypeError: 'function' object is not iterable
I am working on the above code, but it is giving me the error "TypeError: 'function' object is not iterable". It should be working fine, as it is mostly correct, but I don't know what is missing that causes the error. Can anyone please provide me with a solution?
EC.presence_of_all_elements_located on its own will not return the elements; it is an expected condition that only checks whether the items you want are available. It has to be passed to a WebDriverWait, for example like below:
WebDriverWait(self.browser, 5).until(
    EC.visibility_of_all_elements_located((By.CLASS_NAME, "s-result-item sasin")))
rows = self.browser.find_elements(
    By.CLASS_NAME, "s-result-item sasin")
self.assertIn(
    rowItem,
    [row.text for row in rows]
)
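Applied to the question's code, the key point is that the WebDriverWait(...).until(...) call is what returns the list of elements, so that is what you iterate over. A minimal sketch, reusing the driver from the question and the single class name s-result-item used in the answer below:

from selenium.webdriver.support.ui import WebDriverWait

# the .until(...) call is what actually yields the elements
items = WebDriverWait(driver, 10).until(
    EC.presence_of_all_elements_located((By.CLASS_NAME, "s-result-item"))
)
for item in items:
    print(item.get_attribute("data-asin"))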
I also think there is a mistake in the class name; can you check the code below?
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait as wait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

web = 'https://www.amazon.com'
driver_path = 'chromedriver_win32\chromedriver.exe'
options = webdriver.ChromeOptions()
options.add_argument('--headless')

# ser=Service(executable_path='chromedriver_win32\chromedriver.exe')
driver = webdriver.Chrome(executable_path='chromedriver_win32\chromedriver.exe')
driver.maximize_window()  # For maximizing window
driver.implicitly_wait(30)
# driver = webdriver.Chrome(options=options, executable_path=driver_path)
driver.get(web)
driver.implicitly_wait(5)

keyword = "Table"
search = driver.find_element(By.ID, "twotabsearchtextbox")
search.send_keys(keyword)

# click search button
search_button = driver.find_element(By.ID, 'nav-search-submit-button')
search_button.click()
driver.implicitly_wait(5)

product_asin = []
product_name = []
product_price = []
product_ratings = []
product_ratings_num = []
product_link = []

EC.presence_of_all_elements_located((By.CLASS_NAME, "s-result-item"))
rows = driver.find_elements(
    By.CLASS_NAME, "s-result-item")
print(rows)
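For context (my addition, not part of the answer above): By.CLASS_NAME accepts a single class name only, so a compound value like "s-result-item sasin" will generally not match the intended elements, and some Selenium versions reject it as an invalid selector. If you need to match on several classes at once, a CSS selector with the class names chained by dots is the usual workaround, e.g.:

# hypothetical equivalent of the compound class name from the question
rows = driver.find_elements(By.CSS_SELECTOR, ".s-result-item.sasin")
print(len(rows))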

Selenium script captures first few items out of 100

I've created a script to scrape different collection names from a webpage, traversing multiple pages. The script can parse the first 13 names out of 100 from each page. One such collection name looks like Pudgy Penguins. How can I capture all 100 names instead of only 13 from different pages of that site using selenium?
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

link = "https://opensea.io/rankings"

def scroll_to_the_bottom():
    check_height = driver.execute_script("return document.body.scrollHeight;")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        try:
            WebDriverWait(driver, 5).until(lambda driver: driver.execute_script("return document.body.scrollHeight;") > check_height)
            check_height = driver.execute_script("return document.body.scrollHeight;")
        except TimeoutException:
            break

def get_collection_names(link):
    driver.get(link)
    while True:
        scroll_to_the_bottom()
        for item in WebDriverWait(driver, 10).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "[role='listitem'] [class$='Ranking--row']"))):
            collection_name = WebDriverWait(item, 10).until(EC.visibility_of_element_located((By.CSS_SELECTOR, "[class$='Ranking--collection-name']"))).text
            yield collection_name
        try:
            button = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//button[contains(@class,'Buttonreact__StyledButton')][./i[contains(.,'arrow_forward')]]")))
            driver.execute_script('arguments[0].click();', button)
            WebDriverWait(driver, 10).until(EC.staleness_of(item))
        except Exception as e:
            return

if __name__ == '__main__':
    driver = webdriver.Chrome()
    for collection_name in get_collection_names(link):
        print(collection_name)
Scrolling to the bottom of every page seems not to have any effect on the number of results the script produces.
I have checked your description and the page source, and I think the problem is that there are many elements, so they are not all loaded at once. To solve this, scroll down to the bottom step by step. I have changed the scroll_to_the_bottom function as below:
def scroll_to_the_bottom():
    H = driver.execute_script('return document.body.scrollHeight;')
    h = 0
    while True:
        h += 300
        if h >= H:
            break
        driver.execute_script("window.scrollTo({}, {});".format(0, h))
        time.sleep(1)
Embedding the above into your code, it can be changed as below:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

link = "https://opensea.io/rankings"

def get_collection_names(link):
    driver.get(link)
    unique_items = set()
    while True:
        item = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "[class$='Ranking--collection-name']")))
        H = driver.execute_script('return document.body.scrollHeight;')
        h = 0
        while True:
            h += 300
            if h >= H:
                break
            for element in WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "[class$='Ranking--collection-name']"))):
                if element.text not in unique_items:
                    yield element.text
                    unique_items.add(element.text)
            driver.execute_script("window.scrollTo(0, {});".format(h))
            time.sleep(1)
        try:
            button = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//button[contains(@class,'Buttonreact__StyledButton')][./i[contains(.,'arrow_forward')]]")))
            driver.execute_script('arguments[0].click();', button)
            WebDriverWait(driver, 10).until(EC.staleness_of(item))
        except Exception as e:
            return

if __name__ == '__main__':
    driver = webdriver.Chrome()
    for item in get_collection_names(link):
        print(item)
    driver.quit()
Hope this is helpful for you. Thanks.

Waiting for data table to load after click / Selenium

I am trying to read the data table from the Indian Central Pollution Control Board using selenium/python. Here is an example of the output.
I am essentially following the approach presented here: https://github.com/RachitKamdar/Python-Scraper.
Thanks to @Prophet, I was able to read data from the first page (Select element using XPATH with Python?), but I cannot get selenium to wait for the data table to reload when switching to page 2.
I tried to add a WebDriverWait instruction, but this does not seem to work. Any help would be greatly appreciated. Thanks.
Here is what I tried to do:
browser.find_element_by_tag_name("select").send_keys("100")
WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.XPATH, "//*[@id='DataTables_Table_0_paginate']/span/a")))
maxpage = int(browser.find_elements(By.XPATH, "//*[@id='DataTables_Table_0_paginate']/span/a")[-1].text)
i = 1
while i < maxpage + 1:
    browser.find_element(By.XPATH, "//*[@id='DataTables_Table_0_paginate']/span/a[contains(text(),'{}')]".format(i)).click()
    WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.ID, "DataTables_Table_0_wrapper")))
    # this works ok for page 1
    # this does not wait after the click for the data table to update. As a result res is wrong for page 2 [empty].
    res = browser.page_source
    soup = BeautifulSoup(res, 'html.parser')
    soup = soup.find(id='DataTables_Table_0')
    ...
    i = i + 1
Update 1:
Following Prophet's suggestion, I made the following modification:
browser.find_element_by_tag_name("select").send_keys("100")
WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.ID, "DataTables_Table_0_wrapper")))
WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.XPATH, "//*[@id='DataTables_Table_0_paginate']/span/a")))
maxpage = int(browser.find_elements(By.XPATH, "//*[@id='DataTables_Table_0_paginate']/span/a")[-1].text)
print(maxpage)
i = 1
while i < maxpage + 1:
    WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.ID, "DataTables_Table_0_wrapper")))
    res = browser.page_source
    soup = BeautifulSoup(res, 'html.parser')
    soup = soup.find(id='DataTables_Table_0')
    if i == 1:
        data = getValsHtml(soup)
    else:
        data = data.append(getValsHtml(soup))
    print(i)
    print(data)
    i = i + 1
    browser.find_element(By.XPATH, '//a[@class="paginate_button next"]').click()
This still crashes on page 2 (data is empty). In addition, data should contain 100 items from page 1 but only contains 10. The maxpage number is correct (15).
Update 2:
Here is the whole script after incorporating Prophet's recommendations [the original script follows https://github.com/RachitKamdar/Python-Scraper].
This only retrieves 10 points from the first page and fails to switch to the next page.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import Select
import pandas as pd  # needed for pd.DataFrame in getValsHtml

def getValsHtml(table):
    data = []
    heads = table.find_all('th')
    data.append([ele.text.strip() for ele in heads])
    rows = table.find_all('tr')
    for row in rows:
        cols = row.find_all('td')
        cols = [ele.text.strip() for ele in cols]
        data.append([ele for ele in cols])  # Get rid of empty values
    data.pop(1)
    data = pd.DataFrame(data[1:], columns=data[0])
    return data

def parameters(br, param):
    br.find_element_by_class_name("list-filter").find_element_by_tag_name("input").send_keys(param)
    br.find_elements_by_class_name("pure-checkbox")[1].click()
    br.find_element_by_class_name("list-filter").find_element_by_tag_name("input").clear()

timeout = 60
url = 'https://app.cpcbccr.com/ccr/#/caaqm-dashboard-all/caaqm-landing/data'
chdriverpath = "/net/f1p/my_soft/chromedriver"

option = webdriver.ChromeOptions()
browser = webdriver.Chrome(executable_path="{}".format(chdriverpath), chrome_options=option)
browser.get(url)

station = "Secretariat, Amaravati - APPCB"
state = "Andhra Pradesh"
city = "Amaravati"
sd = ['01', 'Jan', '2018']
ed = ['31', 'Dec', '2021']
duration = "24 Hours"

WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.CLASS_NAME, "toggle")))
browser.find_elements_by_class_name("toggle")[0].click()
browser.find_element_by_tag_name("input").send_keys(state)
browser.find_element_by_class_name("options").click()
browser.find_elements_by_class_name("toggle")[1].click()
browser.find_element_by_tag_name("input").send_keys(city)
browser.find_element_by_class_name("options").click()
browser.find_elements_by_class_name("toggle")[2].click()
browser.find_element_by_tag_name("input").send_keys(station)
browser.find_element_by_class_name("options").click()
browser.find_elements_by_class_name("toggle")[4].click()
browser.find_element_by_class_name("filter").find_element_by_tag_name("input").send_keys(duration)
browser.find_element_by_class_name("options").click()
browser.find_element_by_class_name("c-btn").click()

for p in ['NH3']:
    print(p)
    try:
        parameters(browser, p)
    except:
        print("miss")
        browser.find_element_by_class_name("list-filter").find_element_by_tag_name("input").clear()
        pass

browser.find_element_by_class_name("wc-date-container").click()
browser.find_element_by_class_name("month-year").click()
browser.find_element_by_id("{}".format(sd[1].upper())).click()
browser.find_element_by_class_name("year-dropdown").click()
browser.find_element_by_id("{}".format(int(sd[2]))).click()
browser.find_element_by_xpath('//span[text()="{}"]'.format(int(sd[0]))).click()

browser.find_elements_by_class_name("wc-date-container")[1].click()
browser.find_elements_by_class_name("month-year")[1].click()
browser.find_elements_by_id("{}".format(ed[1].upper()))[1].click()
browser.find_elements_by_class_name("year-dropdown")[1].click()
browser.find_element_by_id("{}".format(int(ed[2]))).click()
browser.find_elements_by_xpath('//span[text()="{}"]'.format(int(ed[0])))[1].click()

browser.find_elements_by_tag_name("button")[-1].click()

next_page_btn_xpath = '//a[@class="paginate_button next"]'
actions = ActionChains(browser)

# This is how you should treat the Select drop down
select = Select(browser.find_element_by_tag_name("select"))
select.select_by_value('100')

WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.XPATH, '//div[@class="dataTables_wrapper no-footer"]')))
WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.XPATH, "//*[@id='DataTables_Table_0_paginate']/span/a")))
maxpage = int(browser.find_elements(By.XPATH, "//*[@id='DataTables_Table_0_paginate']/span/a")[-1].text)

i = 1
while i < maxpage + 1:
    res = browser.page_source
    soup = BeautifulSoup(res, 'html.parser')
    soup = soup.find(id='DataTables_Table_0')
    if i == 1:
        data = getValsHtml(soup)
    else:
        data = data.append(getValsHtml(soup))
    print(i)
    print(data)
    i = i + 1
    # scroll to the next page btn and then click it
    next_page_btn = browser.find_element_by_xpath(next_page_btn_xpath)
    actions.move_to_element(next_page_btn).perform()
    browser.find_element(By.XPATH, next_page_btn).click()

browser.quit()
Instead of
browser.find_element(By.XPATH, "//*[@id='DataTables_Table_0_paginate']/span/a[contains(text(),'{}')]".format(i)).click()
try clicking on this element:
browser.find_element(By.XPATH, '//a[@class="paginate_button next"]').click()
It is simply the next-page button, and it will not change depending on which page you are on.
Also, instead of
WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.ID, "DataTables_Table_0_wrapper")))
try this:
WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.XPATH, '//div[@class="dataTables_wrapper no-footer"]')))
This element will be the same for all the pages, while the one you are trying to use is defined for the first page only.
UPD
The correct code should be like this:
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import Select

next_page_btn_xpath = '//a[@class="paginate_button next"]'
actions = ActionChains(browser)

# This is how you should treat the Select drop down
select = Select(browser.find_element_by_tag_name("select"))
select.select_by_value('100')

WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.XPATH, '//div[@class="dataTables_wrapper no-footer"]')))
maxpage = int(browser.find_elements(By.XPATH, "//*[@id='DataTables_Table_0_paginate']/span/a")[-1].text)

i = 1
while i < maxpage + 1:
    res = browser.page_source
    soup = BeautifulSoup(res, 'html.parser')
    soup = soup.find(id='DataTables_Table_0')
    if i == 1:
        data = getValsHtml(soup)
    else:
        data = data.append(getValsHtml(soup))
    print(i)
    print(data)
    i = i + 1
    # scroll to the next page btn and then click it
    next_page_btn = browser.find_element_by_xpath(next_page_btn_xpath)
    actions.move_to_element(next_page_btn).perform()
    next_page_btn.click()
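One further idea that is not part of the answer above (an assumption on my part, given that the table keeps the DataTables_Table_0 id and re-renders its rows): after clicking the next-page button, explicitly wait for a row captured before the click to go stale, so page_source is only read once the new page's data is in the DOM. A rough sketch:

old_first_row = browser.find_element(By.CSS_SELECTOR, "#DataTables_Table_0 tbody tr")
next_page_btn.click()
# the old row is detached from the DOM once the table has been redrawn
WebDriverWait(browser, timeout).until(EC.staleness_of(old_first_row))
res = browser.page_source  # now safe to parse the new page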

if or try loop for an element in a page selenium

I am trying to scrape agents data here. I am able to get the links from the first page. I am using numbered loops because I know the total number of pages. I tried to run this as long as the "next" page option is there. I tried both "try" and "if not" but wasn't able to figure it out. Any help is welcome. Here is the code.
from selenium import webdriver
import time
from selenium.common.exceptions import ElementNotVisibleException, NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

options = webdriver.ChromeOptions()
options.add_argument('headless')
driver = webdriver.Chrome('C:/Users/../Downloads/cd79/chromedriver.exe', options=options)

links_total = []
driver.get("https://www.cbp.gov/contact/find-broker-by-port?field_port_location_tid=All&field_port_code_value=")

def first_links():
    initial_data = driver.find_elements_by_tag_name('td')
    for initial in initial_data:
        page_links = initial.find_elements_by_tag_name('a')
        for page in page_links:
            page_link = page.get_attribute("href")
            links_total.append(page_link)
    driver.refresh()
    if driver.find_element_by_partial_link_text('next'):
        next_page = driver.find_element_by_partial_link_text('next')
        next_page.click()
        time.sleep(2)
        new_data = driver.find_elements_by_tag_name('td')
        for new in new_data:
            links = new.find_elements_by_tag_name('a')
            for link in links:
                new_link = link.get_attribute("href")
                links_total.append(new_link)

for i in range(1, 23):
    first_links()

for link in links_total:
    print(link)
A try/except would be the better option:
from selenium import webdriver
import time
from selenium.common.exceptions import ElementNotVisibleException, NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

options = webdriver.ChromeOptions()
options.add_argument('headless')
driver = webdriver.Chrome('C:/Users/../Downloads/cd79/chromedriver.exe', options=options)
driver.implicitly_wait(10)

# links_total = []
driver.get("https://www.cbp.gov/contact/find-broker-by-port?field_port_location_tid=All&field_port_code_value=")

def first_links(links_total=[]):
    initial_data = driver.find_elements_by_tag_name('td')
    for initial in initial_data:
        page_links = initial.find_elements_by_tag_name('a')
        for page in page_links:
            page_link = page.get_attribute("href")
            links_total.append(page_link)
    # driver.refresh()
    try:
        next_page = driver.find_element_by_partial_link_text('next')
        next_page.click()
        time.sleep(2)
        first_links(links_total)
    except (TimeoutError, ElementNotVisibleException, NoSuchElementException):
        print("NEXT btn not found : ")
        pass
    return links_total

all_links = first_links()
for link in all_links:
    print(link)
You don't actually need to use Selenium. You could do it with BeautifulSoup like so:
import requests
from bs4 import BeautifulSoup

page_num = 0
url_cbp = r"https://www.cbp.gov/contact/find-broker-by-port?field_port_location_tid=All&field_port_code_value=&page={}"

def get_links(links_total=[], page_num=0):
    page = requests.get(url_cbp.format(page_num))
    soup = BeautifulSoup(page.content, 'html.parser')
    results = soup.find(id='region-content')
    table_cells = results.find_all('td', class_='views-field')
    for cell in table_cells:
        # print(cell)
        # print('\n\n')
        cell_link = cell.find('a')
        page_link = cell_link["href"]
        links_total.append(page_link)
    next_page = results.find('li', class_='pager-next')
    if next_page:
        page_num += 1
        get_links(links_total, page_num)
    return links_total

all_links = get_links()
for link in all_links:
    print(link)
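A small caveat for both answers above (my addition): first_links(links_total=[]) and get_links(links_total=[], page_num=0) use a mutable list as a default argument, so the same list object is shared across calls within one interpreter session; calling either function a second time would also return the links collected the first time. A defensive variant looks like this:

def get_links(links_total=None, page_num=0):
    # create a fresh list per top-level call instead of sharing one default list
    if links_total is None:
        links_total = []
    # ... same scraping and pagination logic as above ...
    return links_total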

python selenium driver not scrolling to collect all data points

I am attempting to get some data from the EPA website. Unfortunately I am not able to capture all of the data points; I theorize this is due to a combination of scrolling and waiting for the tag to become visible. However, I have been working on this since yesterday with no luck.
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from selenium.webdriver.common.keys import Keys
import numpy as np

options = webdriver.ChromeOptions()
path = '/Users/<user>/Applications/chromedriver'
options.set_headless(True)
driver = webdriver.Chrome(chrome_options=options, executable_path=path)
url = 'https://edap.epa.gov/public/single/?appid=73b2b6a5-70c6-4820-b3fa-186ac094f10d&obj=b5bf280c-3488-4e46-84f6-58e2a0c34108&opt=noanimate%2Cnoselections&select=clearall'
driver.set_window_size(1920, 1080)
driver.get(url)

SCROLL_PAUSE_TIME = 0.5

# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")

rin_data = []
while True:
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CLASS_NAME, "qv-st-value-overflow")))
    soup = BeautifulSoup(driver.page_source, "html.parser")
    tableURL = soup.select('.qv-st-value-overflow')
    for rin_val in tableURL:
        rin_data.append(rin_val.get_text())

    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)

    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height
It uses WebSocket, not Ajax, to get the data, and you need to scroll the table[ng-style="tableStyles.content"] rather than the body, which needs a custom scroll or a mouse-wheel scroll. The function is taken from here.
SCROLL_PAUSE_TIME = 2

driver.get(url)

# add mouse wheel function to the page
driver.execute_script('''
window.scrollTable = function() {
    var element = document.querySelector('table[ng-style="tableStyles.content"]')
    var box = element.getBoundingClientRect();
    var deltaY = box.height;
    var clientX = box.left + (box.width / 2);
    var clientY = box.top + (box.height / 2);
    var target = element.ownerDocument.elementFromPoint(clientX, clientY);
    for (var e = target; e; e = e.parentElement) {
        if (e === element) {
            target.dispatchEvent(new WheelEvent('wheel', {view: window, bubbles: true, cancelable: true, clientX: clientX, clientY: clientY, deltaY: deltaY}));
        }
    }
}
''')

rin_data = []
while True:
    WebDriverWait(driver, 30).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, 'tr[class^="qv-st-data-row"]'))
    )
    last_position = driver.find_element_by_css_selector(".scrollbar-thumb").get_attribute('style')

    rows = driver.find_elements_by_css_selector('tr[class^="qv-st-data-row"]')
    for row in rows:
        rin_data.append(row.text)

    # Scroll down the table
    driver.execute_script('scrollTable()')

    # Wait to load content from Websocket, maybe need to increase
    time.sleep(SCROLL_PAUSE_TIME)

    # Calculate new scroll position and compare with last scroll position
    new_position = driver.find_element_by_css_selector(".scrollbar-thumb").get_attribute('style')
    if new_position == last_position:
        break
Note that in this case you don't need to use BeautifulSoup.
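If you want individual cell values rather than one text blob per row, the td elements can also be read directly with Selenium, under the same selector assumptions as the answer above:

for row in driver.find_elements_by_css_selector('tr[class^="qv-st-data-row"]'):
    cells = [td.text for td in row.find_elements_by_css_selector('td')]
    rin_data.append(cells)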
