I am using the code below to try to scrape product data from 90 pages; however, the data from the first and last pages are missing from the list object when it completes. Due to the nature of the website I cannot use Scrapy or Beautiful Soup, so I am trying to navigate page by page with the Selenium web driver. I have tried adjusting number_of_pages to the actual number of pages + 1, which still skipped the first and last pages. I have also tried setting page_to_start_clicking to 0, which produces a timeout error. Unfortunately I cannot share more about the source because of the authentication. Thank you in advance for the help!
wait = WebDriverWait(driver, 20)
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#ResultsPerPageBottom > nav > span.next'))).click()  # next button
number_of_pages = 90  # PROBLEM: 1st & last pages missed
page_to_start_clicking = 1  # error if 0
# range set from 0; skips 1st and last page
for i in range(0, number_of_pages):
    time.sleep(2)
    for ele in driver.find_elements(By.CSS_SELECTOR, 'div.srp-item-body'):
        driver.execute_script("arguments[0].scrollIntoView(true);", ele)
        print(ele.text)
    wait.until(EC.element_to_be_clickable((By.LINK_TEXT, f"{page_to_start_clicking}"))).click()
    page_to_start_clicking = page_to_start_clicking + 1
This was the code from the solution described in the comments.
# Scrape & pagination
wait = WebDriverWait(driver, 20)
number_of_pages = 91
listings = []
for i in range(0, number_of_pages):
    time.sleep(2)
    for ele in driver.find_elements(By.CSS_SELECTOR, 'div.srp-item-body'):
        driver.execute_script("arguments[0].scrollIntoView(true);", ele)
        listings.append(ele.text)
    wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#ResultsPerPageBottom > nav > span.next'))).click()
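As a possible refinement (a sketch, not part of the original solution): instead of the fixed time.sleep, you can wait for a result card from the old page to go stale after clicking next, which also avoids clicking next on the final page. The selectors are the same ones from the question.
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

wait = WebDriverWait(driver, 20)
number_of_pages = 90
listings = []
for i in range(number_of_pages):
    items = driver.find_elements(By.CSS_SELECTOR, 'div.srp-item-body')
    for ele in items:
        driver.execute_script("arguments[0].scrollIntoView(true);", ele)
        listings.append(ele.text)
    if i < number_of_pages - 1:
        wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '#ResultsPerPageBottom > nav > span.next'))).click()
        if items:
            # The old card detaching from the DOM signals the next page is rendering.
            wait.until(EC.staleness_of(items[0]))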
I need to access only the products on pages 2 to 5 of the link below; the variable is at the end of the link, where it changes according to the page sequence.
driver.get(url)
classe = driver.find_elements(By.XPATH, "//*[@class='LinksShowcase_UrlContainer__kMj_n']/p")
pages = 1
for x in url:
    driver.get("https://br.ebay.com/b/Portable-Audio/15052/bn_1642614?_pgn=" + str(pages))
    sleep(2)
    for i in classe:
        #pages += 1
        sleep(0.5)
        links.append(i.text)
        print(links)
    sleep(2)
To get pages 2-5, you can iterate using the range() function:
for page in range(2, 6):
    driver.get("https://br.ebay.com/b/Portable-Audio/15052/bn_1642614?_pgn=" + str(page))
I want to build a recommendation system for webtoons, so I am collecting webtoon data. Currently, I have written code to scrape the URLs of the toons on the Kakao Webtoon page.
def extract_from_page(page_link):
    links = []
    driver = webdriver.Chrome()
    driver.get(page_link)
    elems = driver.find_elements_by_css_selector(".h-full.relative")
    for elem in elems:
        link = elem.get_attribute('href')
        if link:
            links.append({'id': int(link.split('/')[-1]), 'link': link})
    print(len(links))
    return links
This code works on the weekly pages (https://webtoon.kakao.com/original-webtoon, https://webtoon.kakao.com/original-novel).
However, on the page that shows finished toons (https://webtoon.kakao.com/original-webtoon?tab=complete), it only retrieves 13 URLs, for the 13 webtoons at the top of the page.
I found a similar post (web scraping gives only first 4 elements on a page) and added scrolling, but nothing changed.
I would appreciate it if you could tell me the cause and solution.
Try like below.
driver.get("https://webtoon.kakao.com/original-webtoon?tab=complete")
wait = WebDriverWait(driver,30)
j = 1
for i in range(5):
# Wait for the elements to load/appear
wait.until(EC.presence_of_all_elements_located((By.XPATH, "//a[contains(#href,'content')]")))
# Get all the elements which contains href value
links = driver.find_elements(By.XPATH,"//a[contains(#href,'content')]")
# Iterate to print the links
for link in links:
print(f"{j} : {link.get_attribute('href')}")
j += 1
# Scroll to the last element of the list links
driver.execute_script("arguments[0].scrollIntoView(true);",links[len(links)-1])
Output:
1 : https://webtoon.kakao.com/content/%EB%B0%A4%EC%9D%98-%ED%96%A5/1532
2 : https://webtoon.kakao.com/content/%EB%B8%8C%EB%A0%88%EC%9D%B4%EC%BB%A42/596
3 : https://webtoon.kakao.com/content/%ED%86%A0%EC%9D%B4-%EC%BD%A4%ED%94%8C%EB%A0%89%EC%8A%A4/1683
...
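Note that each pass re-reads every anchor currently on the page, so the printed list repeats earlier links. A variant (a sketch, under the same XPath assumption) that only prints hrefs it has not seen before:
seen = set()
for i in range(5):
    links = wait.until(EC.presence_of_all_elements_located(
        (By.XPATH, "//a[contains(@href,'content')]")))
    for link in links:
        href = link.get_attribute('href')
        if href not in seen:
            seen.add(href)
            print(f"{len(seen)} : {href}")
    # Scroll the last anchor into view to trigger the lazy loader.
    driver.execute_script("arguments[0].scrollIntoView(true);", links[-1])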
I am attempting to scrape data from multiple pages (36) of a website to gather the document number and the revision number for each available document and save them to two different lists. If I run the code block below for each individual page, it works perfectly. However, when I added the while loop to run through all 36 pages, it loops, but only the data from the first page is saved.
#sam.gov website
url = 'https://sam.gov/search/?index=sca&page=1&sort=-modifiedDate&pageSize=25&sfm%5Bstatus%5D%5Bis_active%5D=true&sfm%5BwdPreviouslyPerformedWrapper%5D%5BpreviouslyPeformed%5D=prevPerfNo%2F'
#webdriver
driver = webdriver.Chrome(options=options_, executable_path=r'C:/Users/439528/Python Scripts/Spyder/chromedriver.exe')
driver.get(url)
#get rid of pop up window
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#sds-dialog-0 > button > usa-icon > i-bs > svg'))).click()
#list of revision numbers
revision_num = []
#empty list for all the WD links
WD_num = []
substring = '2015'
current_page = 0
while True:
    current_page += 1
    if current_page == 36:
        #find all elements on the page named "field name"; for each one, get the text.
        #if the text is 'Revision Number', get the 'sibling' element, which is the
        #actual revision number, and append its text to the revision_num list.
        elements = driver.find_elements_by_class_name('sds-field__name')
        wd_links = driver.find_elements_by_class_name('usa-link')
        for i in elements:
            element = i.text
            if element == 'Revision Number':
                revision_numbers = i.find_elements_by_xpath("./following-sibling::div")
                for x in revision_numbers:
                    a = x.text
                    revision_num.append(a)
        #find all links that contain the partial text 2015 and put the WD text into the WD_num list
        for link in wd_links:
            wd = link.text
            if substring in wd:
                WD_num.append(wd)
        print('Last Page Complete!')
        break
    else:
        #same scrape as above for the intermediate pages
        elements = driver.find_elements_by_class_name('sds-field__name')
        wd_links = driver.find_elements_by_class_name('usa-link')
        for i in elements:
            element = i.text
            if element == 'Revision Number':
                revision_numbers = i.find_elements_by_xpath("./following-sibling::div")
                for x in revision_numbers:
                    a = x.text
                    revision_num.append(a)
        #find all links that contain the partial text 2015 and put the WD text into the WD_num list
        for link in wd_links:
            wd = link.text
            if substring in wd:
                WD_num.append(wd)
        #click on next page
        click_icon = WebDriverWait(driver, 5, 0.25).until(EC.visibility_of_element_located([By.ID, 'bottomPagination-nextPage']))
        click_icon.click()
        WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.ID, 'main-container')))
Things I've tried:
I added WebDriverWait in order to slow the script down so the page can load and/or the elements become clickable/locatable.
I declared the empty lists outside the loop so they are not overwritten on each iteration.
I have edited the while loop multiple times to either count up to 36 (while current_page < 37) or to move the counter to the top or bottom of the loop.
Any ideas? TIA.
EDIT: added screenshot of 'field name'
I have refactored your code and made things very simple.
driver = webdriver.Chrome(options=options_, executable_path=r'C:/Users/439528/Python Scripts/Spyder/chromedriver.exe')
revision_num = []
WD_num = []
for page in range(1, 37):
    url = 'https://sam.gov/search/?index=sca&page={}&sort=-modifiedDate&pageSize=25&sfm%5Bstatus%5D%5Bis_active%5D=true&sfm%5BwdPreviouslyPerformedWrapper%5D%5BpreviouslyPeformed%5D=prevPerfNo%2F'.format(page)
    driver.get(url)
    if page == 1:
        # The pop-up only appears once, on the first page.
        WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#sds-dialog-0 > button > usa-icon > i-bs > svg'))).click()
    wd_links = WebDriverWait(driver, 10).until(EC.visibility_of_all_elements_located((By.XPATH, "//a[contains(@class,'usa-link') and contains(.,'2015')]")))
    revision_numbers = WebDriverWait(driver, 10).until(EC.visibility_of_all_elements_located((By.XPATH, "//div[@class='sds-field__name' and text()='Revision Number']/following-sibling::div")))
    for wd_link in wd_links:
        WD_num.append(wd_link.text)
    for revision_number in revision_numbers:
        revision_num.append(revision_number.text)
print(revision_num)
print(WD_num)
Since you know there are only 36 pages to iterate, you can pass the page value directly in the URL.
Wait for the elements to be visible using WebDriverWait.
Construct your XPath so that it identifies the elements uniquely, which removes the need for the if/else branches.
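As a follow-up, if the two lists stay aligned (one revision number per matching 2015 link on each card, which is an assumption about the page layout), you could pair them up:
# Hypothetical pairing of the two lists collected above.
records = [{'wd': wd, 'revision': rev} for wd, rev in zip(WD_num, revision_num)]
print(records[:3])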
I'm trying to do a simple Python Selenium automation on a website that is blocked by a dialog; the dialog's paragraph has to be scrolled all the way down before you can pass into the website.
I tried to use the code below to scroll the paragraph, but it was unsuccessful.
driver = webdriver.Chrome('chromedriver')
driver.maximize_window()
driver.implicitly_wait(30)
driver.get('https://www.fidelity.com.hk/en/our-funds/mpf')
wait = WebDriverWait(driver, 20)
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'div[data-action="button68"]'))).click()
time.sleep(1)
ele = driver.find_element_by_css_selector('.content-scrolling-behavior')
driver.execute_script("return arguments[0].scrollIntoView(true);", ele)
html capture
I would appreciate any feedback on how to consistently select an option from the dropdown noted in the code provided. Here is the website I am looking at: https://www.fidelity.com.hk/en/our-funds/mpf
You can scroll using ActionChains like this:
Also, in that div there are 27 li tags, so I index them with XPath and move the driver focus to each li one by one.
Sample code :
driver.implicitly_wait(30)
driver.maximize_window()
driver.get("https://www.fidelity.com.hk/en/our-funds/mpf")
wait = WebDriverWait(driver, 20)
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'div[data-action="button68"]'))).click()
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div.container")))
list_size = len(wait.until(EC.visibility_of_all_elements_located((By.XPATH, "//ul[#class='list']/li"))))
print(list_size)
j = 1
for i in range(list_size):
ActionChains(driver).move_to_element(wait.until(EC.visibility_of_element_located((By.XPATH, f"(//ul[#class='list']/li)[{j}]")))).perform()
j = j + 1
time.sleep(1)
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "div[class$='btn-confirm']"))).click()
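On Selenium 4.2 or newer, ActionChains also exposes a native scroll helper, so the same walk over the list items can be written more directly (a sketch under the same XPath assumption):
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By

# Scroll each list item into view using the W3C scroll action.
for li in driver.find_elements(By.XPATH, "//ul[@class='list']/li"):
    ActionChains(driver).scroll_to_element(li).perform()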
This should work
ele = driver.find_element_by_css_selector('div.content-scrolling-behavior')
driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", ele)
UPD
Try this instead:
ele = driver.find_element_by_css_selector('div.container')
driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", ele)
I'm trying to scrape a company's job offers from LinkedIn. I need to scroll a section in the page (with an inner scrollbar). I have been trying this:
1.
scroll_active = WebDriverWait(driver, 40).until(EC.presence_of_element_located((By.CSS_SELECTOR, "body > div.application-outlet > div.authentication-outlet > div.job-search-ext > div > div > section.jobs-search__left-rail > div > div > ul")))
scroll_active.location_once_scrolled_into_view
while driver.find_element_by_tag_name('div'):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    Divs = driver.find_element_by_tag_name('div').text
    if 'End of Results' in Divs:
        print('end')
        break
    else:
        continue
I need to extract the 'href' values.
If anyone is facing this, I hope this helps: you just have to choose carefully the element that you want to scroll.
my_xpath = WebDriverWait(driver, 40).until(EC.presence_of_element_located((By.XPATH, "/html/body/div[8]/div[3]/div[3]/div/div/section[1]/div/div")))
driver.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight', my_xpath)
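If a single jump is not enough (these lazy-loaded lists keep growing as you scroll), a common pattern is to keep scrolling the same container until its scrollHeight stops changing. A sketch, assuming the element located above:
import time

panel = my_xpath  # the scrollable results container located above
last_height = driver.execute_script('return arguments[0].scrollHeight', panel)
while True:
    driver.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight', panel)
    time.sleep(2)  # give the lazy loader time to append new items
    new_height = driver.execute_script('return arguments[0].scrollHeight', panel)
    if new_height == last_height:
        break  # nothing new loaded, so we have reached the bottom
    last_height = new_height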
Why do you need to scroll here?
It seems like you can get all of the elements with this command:
elements = driver.find_elements(By.XPATH, "//a[@class='result-card__full-card-link']")
and the whole script looks like this:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get('https://www.linkedin.com/jobs/search/?f_C=1110%2C12800%2C5115950%2C3165553%2C603115%2C10916%2C8331%2C3297950%2C8238%2C5509188%2C3093%2C2625246%2C1112%2C947572%2C11018069%2C407323&geoId=92000000')
time.sleep(3)

def element_present():
    try:
        driver.find_element(By.XPATH, "//button[@class='infinite-scroller__show-more-button infinite-scroller__show-more-button--visible']")
    except Exception:
        return False
    return True

while not element_present():
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

elements = driver.find_elements(By.XPATH, "//a[@class='result-card__full-card-link']")
hrefs = [el.get_attribute('href') for el in elements]
print(hrefs)
print(len(hrefs))
driver.quit()
I might have missed something, but it seems to work quite well.