I need to keep following the next button (<1 2 3 ... 5>), but there is no href for it in the page source, and there is also an ellipsis. Any idea how to handle this? Here's my code:
def start_requests(self):
    urls = (
        (self.parse_2, 'https://www.forever21.com/us/shop/catalog/category/f21/sale'),
    )
    for cb, url in urls:
        yield scrapy.Request(url, callback=cb)

def parse_2(self, response):
    for product_item_forever in response.css('div.pi_container'):
        forever_item = {
            'forever-title': product_item_forever.css('p.p_name::text').extract_first(),
            'forever-regular-price': product_item_forever.css('span.p_old_price::text').extract_first(),
            'forever-sale-price': product_item_forever.css('span.p_sale.t_pink::text').extract_first(),
            'forever-photo-url': product_item_forever.css('img::attr(data-original)').extract_first(),
            'forever-description-url': product_item_forever.css('a.item_slider.product_link::attr(href)').extract_first(),
        }
        yield forever_item
Please help me, thank you.
It seems this pagination uses an additional request to an API.
So, there are two ways:
Use Splash/Selenium to render the pages, following the pattern QHarr suggested;
Make the same calls to the API yourself (a rough sketch follows below). Check the developer tools and you will find a POST request to https://www.forever21.com/us/shop/Catalog/GetProducts with all the proper params (they are too long, so I will not post the full list here).
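A minimal sketch of the second option with requests, assuming the captured request can simply be replayed; the payload keys below are hypothetical placeholders, since the real parameters have to be copied from the developer tools:

import requests

API_URL = 'https://www.forever21.com/us/shop/Catalog/GetProducts'

# The real payload must be copied from the captured POST request in the
# browser's developer tools (Network tab); the keys below are hypothetical
# placeholders for illustration only.
payload = {
    'pageNo': 1,      # hypothetical name for the page-number parameter
    'pageSize': 120,  # hypothetical name for the results-per-page parameter
}
headers = {'User-Agent': 'Mozilla/5.0'}

# If the captured request sends JSON use json=payload; if it sends form data, use data=payload.
response = requests.post(API_URL, json=payload, headers=headers)
if response.ok:
    print(response.json())  # inspect the structure to pick out the product fields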
The URL changes, so you can specify the page number and the results per page in the URL, e.g.
https://www.forever21.com/uk/shop/catalog/category/f21/sale/#pageno=2&pageSize=120&filter=price:0,250
As mentioned by @vezunchik and the OP's feedback, this approach requires Selenium/Splash to allow JS to run on the page. If you were going down that route you could just click the next button (.p_next) until you reach the last page, as it is easy to grab the last page number (.dot + .pageno) from the document.
I appreciate you are trying with Scrapy.
Demo of the idea with Selenium in case it helps.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

url_loop = 'https://www.forever21.com/uk/shop/catalog/category/f21/sale/#pageno={}&pageSize=120&filter=price:0,250'
url = 'https://www.forever21.com/uk/shop/catalog/category/f21/sale'

d = webdriver.Chrome()
d.get(url)
d.find_element_by_css_selector('[onclick="fnAcceptCookieUse()"]').click()  # get rid of cookies banner
items = WebDriverWait(d, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#products .p_item")))
d.find_element_by_css_selector('.selectedpagesize').click()
d.find_elements_by_css_selector('.pagesize')[-1].click()  # set page result count to 120
last_page = int(d.find_element_by_css_selector('.dot + .pageno').text)  # get last page

if last_page > 1:
    for page in range(2, last_page + 1):
        url = url_loop.format(page)
        d.get(url)
        try:
            d.find_element_by_css_selector('[type=reset]').click()  # reject offer pop-up
        except:
            pass
        # do something with page
        break  # delete later
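In case it helps, a rough sketch of what the "do something with page" placeholder above could look like, reusing the CSS classes from the Scrapy code in the question (it is an assumption on my part that the same classes appear in the rendered page):

# Runs inside the loop above, where d is the webdriver from the demo.
for container in d.find_elements_by_css_selector('div.pi_container'):
    item = {
        'title': container.find_element_by_css_selector('p.p_name').text,
        'regular-price': container.find_element_by_css_selector('span.p_old_price').text,
        'sale-price': container.find_element_by_css_selector('span.p_sale.t_pink').text,
        'photo-url': container.find_element_by_css_selector('img').get_attribute('data-original'),
        'description-url': container.find_element_by_css_selector('a.item_slider.product_link').get_attribute('href'),
    }
    print(item)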
I am scraping a website that renders dynamically with JavaScript. The URLs don't change when hitting the > button, so I have been looking at the inspector in the Network section, more specifically the "General" section for the "Request URL" and the "Request Method", as well as the "Form Data" section, looking for any sort of unique ID that could distinguish each successive page. However, when recording a log of clicking the > button from page to page, the "Form Data" seems to be the same each time (see images):
Currently my code doesn't incorporate this method because I can't see it helping until I can find a unique identifier in the "Form Data" section. However, I can show my code if helpful. In essence, it just pulls the first page of data over and over again in my while loop, even though I'm using a Selenium driver and calling driver.find_elements_by_xpath("xpath of > button").click() before trying to get the data with BeautifulSoup.
(Updated code see comments)
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import pandas as pd
from pandas import *

masters_list = []

def extract_info(html_source):
    # html_source will be the inner HTML of the table
    global lst
    soup = BeautifulSoup(html_source, 'html.parser')
    lst = soup.find('tbody').find_all('tr')[0]
    masters_list.append(lst)

chrome_driver_path = '/Users/Justin/Desktop/Python/chromedriver'
driver = webdriver.Chrome(executable_path=chrome_driver_path)
url = 'https://cryptoli.st/lists/fixed-supply'
driver.get(url)

loop = True
while loop:  # loop for extracting all 120 pages
    crypto_table = driver.find_element(By.ID, 'DataTables_Table_0').get_attribute('innerHTML')  # this is the crypto data table
    extract_info(crypto_table)
    paginate = driver.find_element(By.ID, "DataTables_Table_0_paginate")  # the table pagination
    pages_list = paginate.find_elements(By.TAG_NAME, 'li')
    # we click on the next arrow at the end, not on the 2, 3, ... anchor links
    next_page_link = pages_list[-1].find_element(By.TAG_NAME, 'a')
    # check whether there is a next page available
    if "disabled" in next_page_link.get_attribute('class'):
        loop = False
    pages_list[-1].click()  # if there is a next page available, click on it

df = pd.DataFrame(masters_list)
print(df)
df.to_csv("crypto_list.csv")
driver.quit()
I am using my own code to show how I am getting the table; I have added explanations as comments on the important lines.
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup

def extract_info(html_source):
    soup = BeautifulSoup(html_source, 'html.parser')  # html_source will be the inner HTML of the table
    lst = soup.find('tbody').find_all('tr')
    for i in lst:
        print(i.get('id'))  # printing just the id, because the id is set to the crypto name; you have to do more scraping to get more info

driver = webdriver.Chrome()
url = 'https://cryptoli.st/lists/fixed-supply'
driver.get(url)

loop = True
while loop:  # loop for extracting all 120 pages
    crypto_table = driver.find_element(By.ID, 'DataTables_Table_0').get_attribute('innerHTML')  # this is the crypto data table
    extract_info(crypto_table)
    paginate = driver.find_element(By.ID, "DataTables_Table_0_paginate")  # the table pagination
    pages_list = paginate.find_elements(By.TAG_NAME, 'li')
    next_page_link = pages_list[-1].find_element(By.TAG_NAME, 'a')  # we click on the next arrow at the end, not on the 2, 3, ... anchor links
    if "disabled" in next_page_link.get_attribute('class'):  # check whether a next page is available
        loop = False
    pages_list[-1].click()  # if there is a next page available, click on it
So the main answer to your question is: when you click on the button, Selenium updates the page, and then you can use driver.page_source to get the updated HTML. Sometimes (not this URL) a page can fire an AJAX request that takes some time, so you have to wait until Selenium has loaded the full page.
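For the waiting part, a minimal sketch of an explicit wait (using the table ID from your code) before reading page_source:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get('https://cryptoli.st/lists/fixed-supply')

# Wait up to 10 seconds for the data table to be present before reading the page source.
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.ID, 'DataTables_Table_0'))
)
html = driver.page_source  # now contains the rendered table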
I have watched a video at this link https://www.youtube.com/watch?v=EELySnTPeyw and this is the code (I have changed the XPath as it seems the website has changed).
import selenium.webdriver as webdriver

def get_results(search_term):
    url = 'https://www.startpage.com'
    browser = webdriver.Chrome(executable_path="D:\\webdrivers\\chromedriver.exe")
    browser.get(url)
    search_box = browser.find_element_by_id('q')
    search_box.send_keys(search_term)
    results = []  # collect the result links here
    try:
        links = browser.find_elements_by_xpath("//a[contains(@class, 'w-gl__result-title')]")
    except:
        links = browser.find_elements_by_xpath("//h3//a")
    print(links)
    for link in links:
        href = link.get_attribute('href')
        print(href)
        results.append(href)
    browser.close()

get_results('cat')
The code works well for opening the browser, navigating to the search box, and sending the keys, but the links come back as an empty list, even though when I manually search for the XPath in the developer tools it returns 10 results.
You need to add Keys.ENTER to your search. You weren't on the results page yet.
search_box.send_keys(search_term+Keys.ENTER)
Import
from selenium.webdriver.common.keys import Keys
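Put together, a minimal sketch of the corrected function with that one change applied (otherwise keeping the question's code as-is):

import selenium.webdriver as webdriver
from selenium.webdriver.common.keys import Keys

def get_results(search_term):
    browser = webdriver.Chrome(executable_path="D:\\webdrivers\\chromedriver.exe")
    browser.get('https://www.startpage.com')
    search_box = browser.find_element_by_id('q')
    search_box.send_keys(search_term + Keys.ENTER)  # ENTER submits the search and loads the results page
    # An explicit wait could be added here if the results load slowly.
    links = browser.find_elements_by_xpath("//a[contains(@class, 'w-gl__result-title')]")
    results = [link.get_attribute('href') for link in links]
    browser.close()
    return results

print(get_results('cat'))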
Outputs
https://en.wikipedia.org/wiki/Cat
https://www.cat.com/en_US.html
https://www.cat.com/
https://www.youtube.com/watch?v=cbP2N1BQdYc
https://icatcare.org/advice/thinking-of-getting-a-cat/
https://www.caterpillar.com/en/brands/cat.html
https://www.petfinder.com/cats/
https://www.catfootwear.com/US/en/home
https://www.aspca.org/pet-care/cat-care/general-cat-care
https://www.britannica.com/animal/cat
When I navigate to the below link and locate the pagination at the bottom of the page:
https://shop.nordstrom.com/c/sale-mens-clothing?origin=topnav&breadcrumb=Home%2FSale%2FMen%2FClothing&sort=Boosted
I am only able to scrape the first 4 or so pages before my script stops.
I have tried with XPath, CSS selectors, and the WebDriverWait options.
pages_remaining = True
page = 2  # starts at page 2 since page one is scraped already with the first loop

while pages_remaining:
    # scrape code
    try:
        wait = WebDriverWait(browser, 20)
        wait.until(EC.element_to_be_clickable((By.LINK_TEXT, str(page)))).click()
        print(browser.current_url)
        page += 1
    except TimeoutException:
        pages_remaining = False
Current Results from console:
https://shop.nordstrom.com/c/sale-mens-designer-clothing-accessories-shoes?breadcrumb=Home%2FSale%2FMen%2FDesigner&page=2&sort=Boosted
https://shop.nordstrom.com/c/sale-mens-designer-clothing-accessories-shoes?breadcrumb=Home%2FSale%2FMen%2FDesigner&page=3&sort=Boosted
https://shop.nordstrom.com/c/sale-mens-designer-clothing-accessories-shoes?breadcrumb=Home%2FSale%2FMen%2FDesigner&page=4&sort=Boosted
This solution is a BeautifulSoup one, because I am not too familiar with Selenium.
Try to create a new variable with your number of pages. As you can see, when you go to the next page the URL changes, so just manipulate the given URL. See my code example below.
from requests import get  # assuming the get() used here is from the requests library

# Define variable pages first
pages = [str(i) for i in range(1, 53)]  # 53 'cuz you have 52 pages

for page in pages:
    response = get("https://shop.nordstrom.com/c/sale-mens-clothing?origin=topnav&breadcrumb=Home%2FSale%2FMen%2FClothing&page=" + page + "&sort=Boosted")
    # Rest of your code
This snippet should do the job for the rest of the pages. Hope that helps, although this might not be exactly what you have been looking for.
If you have any questions just post below. ;)
Cheers.
You could loop through page numbers until no more results are shown, by just changing the URL:
from bs4 import BeautifulSoup
from selenium import webdriver

base_url = "https://m.shop.nordstrom.com/c/sale-mens-clothing?origin=topnav&breadcrumb=Home%2FSale%2FMen%2FClothing&page={}&sort=Boosted"

driver = webdriver.Chrome()
page = 1
soup = BeautifulSoup("", "html.parser")

# Will loop until there are no more results
while "Looks like we don’t have exactly what you’re looking for." not in soup.text:
    print(base_url.format(page))
    # Go to page
    driver.get(base_url.format(page))
    soup = BeautifulSoup(driver.page_source, "html.parser")
    ### your extracting code
    page += 1
How can I get all the products from all the pages in the subcategory? I attached the program. Right now my program only gets products from the first page. I would like to get all the products from that subcategory across all 400+ pages: go to the next page, extract all products, then go to the next page, and so on. I would appreciate any help.
# selenium imports
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import random

PROXY = "88.157.149.250:8080"

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--proxy-server=%s' % PROXY)

# //a[starts-with(@href, 'https://www.amazon.com/')]/@href
LINKS_XPATH = '//*[contains(@id,"result")]/div/div[3]/div[1]/a'

browser = webdriver.Chrome(chrome_options=chrome_options)
browser.get(
    'https://www.amazon.com/s/ref=lp_11444071011_nr_p_8_1/132-3636705-4291947?rh=n%3A3375251%2Cn%3A%213375301%2Cn%3A10971181011%2Cn%3A11444071011%2Cp_8%3A2229059011')

links = browser.find_elements_by_xpath(LINKS_XPATH)
for link in links:
    href = link.get_attribute('href')
    print(href)
As you want to get a huge amount of data, it's better to get it with direct HTTP requests instead of navigating to each page with Selenium...
Try to iterate through all the pages and scrape the required data as below:
import requests
from lxml import html

page_counter = 1
links = []

while True:
    headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:58.0) Gecko/20100101 Firefox/58.0"}
    url = "https://www.amazon.com/s/ref=sr_pg_{0}?rh=n%3A3375251%2Cn%3A!3375301%2Cn%3A10971181011%2Cn%3A11444071011%2Cp_8%3A2229059011&page={0}&ie=UTF8&qid=1517398836".format(page_counter)
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        source = html.fromstring(response.content)
        links.extend(source.xpath('//*[contains(@id,"result")]/div/div[3]/div[1]/a/@href'))
        page_counter += 1
    else:
        break

print(links)
P.S. Check this ticket to see how to use a proxy with the requests library.
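For reference, a minimal sketch of passing a proxy to requests (reusing the proxy address from the question's code):

import requests

proxies = {
    "http": "http://88.157.149.250:8080",
    "https": "http://88.157.149.250:8080",
}
headers = {"User-Agent": "Mozilla/5.0"}
response = requests.get("https://www.amazon.com/", headers=headers, proxies=proxies)
print(response.status_code)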
# selenium imports
import random
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import time

def list_all_items():
    # items = browser.find_elements_by_css_selector('.a-size-base.s-inline.s-access-title.a-text-normal')
    print("Start")
    item_list = []
    items = WebDriverWait(browser, 60).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".a-size-base.s-inline.s-access-title.a-text-normal")))
    print("items--->", items)
    if items:
        for item in items:
            print(item.text, "\n\n")
            item_list.append(item.text)
    # time.sleep(3)
    # next_button = WebDriverWait(browser, 60).until(EC.presence_of_element_located((By.ID, 'pagnNextString')))
    next_button = WebDriverWait(browser, 60).until(EC.element_to_be_clickable((By.ID, "pagnNextString")))
    print("next_button-->", next_button)
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    print("____________SCROLL_DONE___")
    next_button.click()
    print("Click_done")
    list_all_items()
    # next_button = browser.find_element_by_id('pagnNextString')
    # next_button.click()
    # ifpagnNextString

# https://www.amazon.com/s/ref=lp_11444071011_nr_p_8_1/132-3636705-4291947?rh=n%3A3375251%2Cn%3A%213375301%2Cn%3A10971181011%2Cn%3A11444071011%2Cp_8%3A2229059011
PROXY = "88.157.149.250:8080"

chrome_options = webdriver.ChromeOptions()
# chrome_options.add_argument('--proxy-server=%s' % PROXY)

# //a[starts-with(@href, 'https://www.amazon.com/')]/@href
LINKS_XPATH = '//*[contains(@id,"result")]/div/div[3]/div[1]/a'

browser = webdriver.Chrome(chrome_options=chrome_options)
browser.maximize_window()
browser.get('https://www.amazon.com/s/ref=lp_11444071011_nr_p_8_1/132-3636705-4291947?rh=n%3A3375251%2Cn%3A%213375301%2Cn%3A10971181011%2Cn%3A11444071011%2Cp_8%3A2229059011')
list_all_items()
I have made one method that prints the list of items from every page and calls itself recursively; at the end of the method it clicks on the next button. I did not add the break/exit condition; I believe you can manage that. The "list_all_items" method is the logic for doing what you required.
Also, uncomment the proxy part that I have commented out.
Let me break up this problem in a few steps, so you understand what needs to be done here.
First of all, you need to get all the products from a page.
Then, you need to get all the pages and repeat the first step on each and every page.
Now, I do not know Python, so I will try to do this in as generic a way as I can.
First, you need to create an int with value 0.
After that you need to get the number of pages. To do so, check:
numberOfPagesString = browser.find_element_by_xpath("//span[@class='pagnDisabled']").text
numberOfPages = int(numberOfPagesString)
i = 0
Then you need to create a loop. In the loop, you are going to increment the int you initialized to 0, up to the maximum number of pages.
So your loop, as long as the int is NOT equal to the number of pages, is going to click on the next page, get all the products, and do whatever you want it to do. This will result in something like:
while i < numberOfPages   **here, as long as the value of i is less than the number of pages, do this loop**
    **code to get all products on the page here**
    **click on the next page link**
    browser.find_element_by_id('pagnNextString').click
    i++   **here your i will become 1 after the first page, 2 after the second, etc.**
So to conclude: the first thing you are doing is determining how many pages there are.
Then you are going to create an int from the string you get back from the browser.
Then you create an int with value 0, which you are going to use to check, on every iteration of the loop, whether you have reached the maximum number of pages.
After that you are going to first get all the products from the page (if you do not do that, it is going to skip the first page).
And at last, it is going to click on the next page button.
To finish, your int i is incremented, so after every loop it increases by 1.
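Since the steps above are written in a language-agnostic way, here is a rough Python translation (a sketch that assumes the pagnDisabled span and pagnNextString button mentioned above exist, and reuses browser and LINKS_XPATH from the question's code):

# Read the last page number from the disabled pagination element.
numberOfPages = int(browser.find_element_by_xpath("//span[@class='pagnDisabled']").text)

i = 0
while i < numberOfPages:
    # Get all products on the current page (here: just print their links).
    for link in browser.find_elements_by_xpath(LINKS_XPATH):
        print(link.get_attribute('href'))
    # Click the next-page link, then move the counter forward.
    # (In practice you may need to wait for the new page to load before scraping again.)
    browser.find_element_by_id('pagnNextString').click()
    i += 1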
Hi, I'm new to Python and crawling. I've been researching and going over Stack Overflow and came up with Python + Selenium: open a WebDriver, open the URL, get the page source, and turn it into the data I need. However, I know there's a better approach (for example, scraping without Selenium, not having to scrape the page source, posting data to the ASP endpoint, etc.) and I hope I can get some help here for educational purposes.
Here's what I'd like to achieve.
Start:
http://www.asos.com/Women/New-In-Clothing/Cat/pgecategory.aspx?cid=2623
Obtain: product title, price, img, and its link
Next: go to the next page if there is one; if not, output the results
Before you go into my code, here is some background information. ASOS is a site that uses pagination, so this is about scraping through multiple pages. Also, I tried without Selenium by posting to http://www.asos.com/services/srvWebCategory.asmx/GetWebCategories
with this data:
{'cid':'2623', 'strQuery':"", 'strValues':'undefined', 'currentPage':'0',
'pageSize':'204','pageSort':'-1','countryId':'10085','maxResultCount':''}
but I don't get anything back.
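For reference, a rough sketch of how such a POST could be made with requests; the JSON content type is an assumption on my part (ASP.NET .asmx services typically expect it), so treat this as a sketch rather than a known-working call:

import requests

url = 'http://www.asos.com/services/srvWebCategory.asmx/GetWebCategories'
payload = {'cid': '2623', 'strQuery': "", 'strValues': 'undefined', 'currentPage': '0',
           'pageSize': '204', 'pageSort': '-1', 'countryId': '10085', 'maxResultCount': ''}
# Assumption: the service expects a JSON body with a JSON content type.
headers = {'Content-Type': 'application/json; charset=UTF-8',
           'User-Agent': 'Mozilla/5.0'}

response = requests.post(url, json=payload, headers=headers)
print(response.status_code)
print(response.text)  # inspect whether the service returns anything useful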
I know my approach is not good, I'd much appreciate any help/recommendation/approach/idea! Thanks!
import scrapy
import time
import logging
from random import randint
from selenium import webdriver
from asos.items import ASOSItem
from scrapy.selector import Selector
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

class ASOSSpider(scrapy.Spider):
    name = "asos"
    allowed_domains = ["asos.com"]
    start_urls = [
        "http://www.asos.com/Women/New-In-Clothing/Cat/pgecategory.aspx?cid=2623#/parentID=-1&pge=0&pgeSize=204&sort="
    ]

    def __init__(self):
        self.driver = webdriver.Firefox()

    def parse(self, response):
        self.driver.get(response.url)
        view_204 = self.driver.find_element_by_xpath("//div[@class='product-count-bottom']/a[@class='view-max-paged']")
        view_204.click()  # click and show 204 pictures
        time.sleep(5)  # wait till the 204 images are loaded; I've also tried an explicit wait, but I got timed out
        # element = WebDriverWait(self.driver, 8).until(EC.presence_of_element_located((By.XPATH, "category-controls bottom")))
        logging.debug("wait time has been reached! go CRAWL!")
        next = self.driver.find_element_by_xpath("//li[@class='page-skip']/a")
        pageSource = Selector(text=self.driver.page_source)  # load the page source instead; can't seem to crawl the page by just passing the regular request
        for sel in pageSource.xpath("//ul[@id='items']/li"):
            item = ASOSItem()
            item["product_title"] = sel.xpath("a[@class='desc']/text()").extract()
            item["product_link"] = sel.xpath("a[@class='desc']/@href").extract()
            item["product_price"] = sel.xpath("div/span[@class='price']/text()").extract()
            item["product_img"] = sel.xpath("div/a[@class='productImageLink']/img/@src").extract()
            yield item
        next.click()
        self.driver.close()