I need to access only the products on pages 2 to 5 of the link below; the variable at the end of the link changes according to the page sequence.
driver.get(url)
classe = driver.find_elements(By.XPATH, "//*[@class='LinksShowcase_UrlContainer__kMj_n']/p")
pages = 1
for x in url:
    driver.get("https://br.ebay.com/b/Portable-Audio/15052/bn_1642614?_pgn=" + str(pages))
    sleep(2)
    for i in classe:
        #pages += 1
        sleep(0.5)
        links.append(i.text)
    print(links)
    sleep(2)
To get pages 2-5, you can iterate using the range() function:
for page in range(2, 6):
    driver.get("https://br.ebay.com/b/Portable-Audio/15052/bn_1642614?_pgn=" + str(page))
I am using the code below to try to scrape product data from 90 pages; however, the data from the first and last pages are missing from the list object when it completes. Due to the nature of the website I cannot use Scrapy or Beautiful Soup, so I am trying to navigate page by page with the Selenium WebDriver. I have tried adjusting number_of_pages to the actual number of pages + 1, which still skipped the first and last pages. I have also tried setting page_to_start_clicking to 0, which produces a timeout error. Unfortunately I cannot share more about the source because of the authentication. Thanks in advance for the help!
wait = WebDriverWait(driver, 20)
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#ResultsPerPageBottom > nav > span.next'))).click()  # next button
number_of_pages = 90        # PROBLEM: 1st & last pages missed
page_to_start_clicking = 1  # error if 0
# range set from 0; skips 1st and last page
for i in range(0, 90):
    time.sleep(2)
    for ele in driver.find_elements(By.CSS_SELECTOR, 'div.srp-item-body'):
        driver.execute_script("arguments[0].scrollIntoView(true);", ele)
        print(ele.text)
    wait.until(EC.element_to_be_clickable((By.LINK_TEXT, f"{page_to_start_clicking}"))).click()
    page_to_start_clicking = page_to_start_clicking + 1
This was the code from the solution described in the comments.
# Scrape & pagination
wait = WebDriverWait(driver, 20)
number_of_pages = 91
listings = []
for i in range(0, 91):
time.sleep(2)
for ele in driver.find_elements(By.CSS_SELECTOR, 'div.srp-item-body'):
driver.execute_script("arguments[0].scrollIntoView(true);", ele)
listings.append(ele.text)
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#ResultsPerPageBottom > nav > span.next'))).click()
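One caveat: on the very last page the next control may not be clickable, so the final wait.until(...).click() can raise a TimeoutException. A small guard, sketched with the same selectors as above, keeps the loop from dying there:
from selenium.common.exceptions import TimeoutException

for i in range(number_of_pages):
    time.sleep(2)
    for ele in driver.find_elements(By.CSS_SELECTOR, 'div.srp-item-body'):
        driver.execute_script("arguments[0].scrollIntoView(true);", ele)
        listings.append(ele.text)
    try:
        wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#ResultsPerPageBottom > nav > span.next'))).click()
    except TimeoutException:
        break  # no clickable "next" on the last page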
I want to build a recommendation system for webtoons, so I am collecting webtoon data. So far I have written code to scrape the URLs of the titles on the Kakao Webtoon page.
def extract_from_page(page_link):
    links = []
    driver = webdriver.Chrome()
    driver.get(page_link)
    elems = driver.find_elements_by_css_selector(".h-full.relative")
    for elem in elems:
        link = elem.get_attribute('href')
        if link:
            links.append({'id': int(link.split('/')[-1]), 'link': link})
    print(len(links))
    return links
This code works on the weekly pages (https://webtoon.kakao.com/original-webtoon, https://webtoon.kakao.com/original-novel).
However, on the page that shows finished toons (https://webtoon.kakao.com/original-webtoon?tab=complete), it only retrieves 13 URLs, for the 13 webtoons at the top of the page.
I found a similar post (web scraping gives only first 4 elements on a page) and added scrolling, but nothing changed.
I would appreciate it if you could tell me the cause and a solution.
Try like below.
driver.get("https://webtoon.kakao.com/original-webtoon?tab=complete")
wait = WebDriverWait(driver,30)
j = 1
for i in range(5):
# Wait for the elements to load/appear
wait.until(EC.presence_of_all_elements_located((By.XPATH, "//a[contains(#href,'content')]")))
# Get all the elements which contains href value
links = driver.find_elements(By.XPATH,"//a[contains(#href,'content')]")
# Iterate to print the links
for link in links:
print(f"{j} : {link.get_attribute('href')}")
j += 1
# Scroll to the last element of the list links
driver.execute_script("arguments[0].scrollIntoView(true);",links[len(links)-1])
Output:
1 : https://webtoon.kakao.com/content/%EB%B0%A4%EC%9D%98-%ED%96%A5/1532
2 : https://webtoon.kakao.com/content/%EB%B8%8C%EB%A0%88%EC%9D%B4%EC%BB%A42/596
3 : https://webtoon.kakao.com/content/%ED%86%A0%EC%9D%B4-%EC%BD%A4%ED%94%8C%EB%A0%89%EC%8A%A4/1683
...
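Because the loop above re-reads every anchor after each scroll, the same href can be printed more than once. If each URL is only needed once, a set-based variant of the same loop (a sketch, reusing the XPath above) avoids the duplicates:
seen = set()
for _ in range(5):
    wait.until(EC.presence_of_all_elements_located((By.XPATH, "//a[contains(@href,'content')]")))
    links = driver.find_elements(By.XPATH, "//a[contains(@href,'content')]")
    for link in links:
        href = link.get_attribute('href')
        if href not in seen:
            seen.add(href)
            print(href)
    # scroll the last anchor into view to trigger the next lazy-loaded batch
    driver.execute_script("arguments[0].scrollIntoView(true);", links[-1])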
I am trying to scrape product data for a given keyword/search_term, and so far I have managed to scrape all of the data from the first page to the last.
However, I want to change it so that I scrape just the first 100 or 150 products, which I'm not sure how to do.
I reckon I need an integer that keeps track of how many items I have scraped and stops when it reaches 100 or 150.
I know that I need to change something in the "for page in range(1, last_page)" loop, but what I've tried ended up giving me 100 copies of the same result, which isn't what I want.
def main(search_term):
    # RUN MAIN PROGRAM ROUTINE
    chromedriver = "path to chromedriver"
    driver = webdriver.Chrome(chromedriver)
    records = []
    url = get_url(search_term)
    driver.get(url)
    last_page = int(driver.find_element_by_xpath('(//div[@class="a-text-center"]/ul/li)[last()-1]').text) + 1
    # NUMBER OF PAGES TO CRAWL
    for page in range(1, last_page):
        driver.get(url.format(page))
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        results = soup.find_all('div', {'data-component-type': 's-search-result'})
        print(page)
        for item in results:
            record = extract_record(item)
            if record:
                records.append(record)
    driver.close()

# Run the main function given a keyword
main("make-up")
# leads to https://www.amazon.com/s?k=cosmetics&ref=nb_sb_noss
#main("iphone")
How would I go about changing it so that I can scrape the first 100, 150, or however many products I want?
You have to check whether the number of records has reached 100 and, if so, break out of the outer for loop as well:
for page in range(1, last_page):
    driver.get(url.format(page))
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    results = soup.find_all('div', {'data-component-type': 's-search-result'})
    print(page)
    find = False
    for item in results:
        record = extract_record(item)
        if record:
            records.append(record)
        if len(records) == 100:
            find = True
            break
    if find:
        break
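The same idea can be wrapped up so the limit is a parameter instead of a hard-coded 100. A sketch, assuming the extract_record() helper from the question:
def scrape_records(driver, url, last_page, max_records=100):
    records = []
    for page in range(1, last_page):
        driver.get(url.format(page))
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        for item in soup.find_all('div', {'data-component-type': 's-search-result'}):
            record = extract_record(item)
            if record:
                records.append(record)
            if len(records) >= max_records:
                return records  # stop as soon as the limit is reached
    return records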
I have been getting this odd behaviour: the print after the for loop returns a link (a good one), but when I print the link variables inside a "for link in links" loop it returns something different. This is my code:
links = []

def Start():
    driver = webdriver.Chrome(executable_path="C:/Users/Tera max/.wdm/drivers/chromedriver/win32/87.0.4280.88/chromedriver.exe")
    driver.get('https://instagram.com/')
    sleep(2)
    driver.maximize_window()
    sleep(1)
    driver.find_element_by_xpath('//*[@id="loginForm"]/div/div[3]/button').click()
    sleep(3)
    driver.find_element_by_xpath("//button[contains(text(), 'Not Now')]").click()
    driver.get('https://www.instagram.com/explore/tags/{}/'.format("messi"))
    sleep(2)
    links = driver.find_elements_by_tag_name('a')

    def condition(link):
        return '.com/p/' in link.get_attribute('href')

    valid_links = list(filter(condition, links))
    for i in range(5):
        link = valid_links[i].get_attribute('href')
        if link not in links:
            links.append(link)
    print(link)
    for link in links:
        print(link)
        driver.get(link)
Here is the output of printing the first link and then the second one:
#first
https://www.instagram.com/p/CKMR3lMAD8O/
#second
<selenium.webdriver.remote.webelement.WebElement (session="664522e3bb5f9a9527be40d5e34b79d6", element="4a0d1327-fd66-40ff-a622-55da864e9d14")>
Print links and you'll see that the first link you printed is the last element of links, while your second print shows the first element of links.
I've reduced your code to demonstrate it more clearly:
links = []
for i in range(5):
    link = i
    links.append(i)
print(link)

for link in links:
    print(link)
Output:
4 # < your first print
0 # The follow-up prints from the loop
1
2
3
4
You are reusing links without emptying it first. Look at this line:
links = driver.find_elements_by_tag_name('a')
and later at this line:
links.append(link)
So now links contains BOTH types of items: WebElements and href strings. Either make a new list or call links.clear() before your for i loop.
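In other words, keep the WebElements and the href strings in separate lists. A minimal sketch of that part of the function:
anchors = driver.find_elements_by_tag_name('a')   # WebElements
post_links = []                                    # plain href strings

valid_links = [a for a in anchors if '.com/p/' in a.get_attribute('href')]
for element in valid_links[:5]:
    href = element.get_attribute('href')
    if href not in post_links:
        post_links.append(href)

for href in post_links:
    print(href)
    driver.get(href)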
I am scraping a website that has multiple pages within one web page.
When I click page 2, the URL shown is http://www.worldhospitaldirectory.com/Germany/hospitals#page-2. I used this URL as the next navigation target, but it goes straight to http://www.worldhospitaldirectory.com/Germany/hospitals#page-1, which is the default page.
I don't know how to navigate to these sub-pages.
Any suggestions or code?
My code so far:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

driver = webdriver.Firefox()
driver.get('http://www.worldhospitaldirectory.com/Germany/hospitals')
url = []
pagenbr = 1
while pagenbr <= 43:
    current = driver.current_url
    driver.get(current)
    lks = driver.find_elements_by_xpath('//*[@href]')
    for ii in lks:
        link = ii.get_attribute('href')
        if '/info' in link:
            url.extend(link)
            print(link)
    print('page ' + str(pagenbr) + ' is done.')
    elm = driver.find_element_by_link_text('Next')
    driver.implicitly_wait(10)
    elm.click()
    pagenbr += 1
Try just clicking the appropriate pagination button, e.g.
driver.find_element_by_link_text('Next')  # to get the next page
or
driver.find_element_by_link_text('2')  # to get the second page
Get the button element and click it:
button_next = driver.find_element_by_xpath("//a[@class='page-link next']")
button_next.click()
I'll leave iterating over all the pages to you.
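One possible shape for that loop, as a rough sketch (it assumes a 'Next' link is present on every page except the last):
from selenium.common.exceptions import NoSuchElementException

while True:
    # ... scrape the /info links on the current page here ...
    try:
        driver.find_element_by_link_text('Next').click()
    except NoSuchElementException:
        break  # no Next link on the last page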
This worked for me
while pagenbr <= 3:
current = driver.current_url
print current
driver.get(current)
lks = driver.find_elements_by_xpath('//*[#href]')
for ii in lks:
link = ii.get_attribute('href')
if '/info' in link:
url.extend(link)
print (link)
print('page ' + str(pagenbr) + ' is done.')
elm = driver.find_element_by_link_text('Next')
driver.implicitly_wait(10)
elm.click()
driver.implicitly_wait(10)
lks = driver.find_elements_by_xpath('//*[#href]')
for ii in lks:
link = ii.get_attribute('href')
if '/info' in link:
url.extend(link)
print (link)
pagenbr += 1
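The scraping block appears twice in that loop; pulling it into a small helper keeps things readable. A sketch (note it uses url.append(link), since url.extend(link) would add the URL to the list one character at a time):
def collect_info_links(driver, url):
    # gather every href on the page that points at an /info detail page
    for el in driver.find_elements_by_xpath('//*[@href]'):
        link = el.get_attribute('href')
        if '/info' in link:
            url.append(link)
            print(link)
Each place the block appears can then become a single collect_info_links(driver, url) call.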