How to scrape and infinity scrolling page? - python

I was trying to scrape the men's coats and jackets category in next.co.uk and I realized that the page has the infinity scrolling page
# -*- coding: utf-8 -*-
import scrapy
from ..items import NextItem
class NewoneSpider(scrapy.Spider):
name = 'newOne'
allowed_domains = ['www.next.co.uk']
start_urls = [
'https://www.next.co.uk/shop/gender-newbornboys-gender-newbornunisex-gender-olderboys-gender-youngerboys-productaffiliation-coatsandjackets-0'
]
def parse(self, response):
items = NextItem();
global productCategory
global productSubCategory
products = response.css('.Details')
currentUrl = response.request.url
for product in products:
productCategory = 'Furniture'
productSubCategory = 'living Room'
productCountry = 'uk'
productSeller = 'John Lewis'
productLink = product.css('.TitleText::attr(href)').extract_first()
productTitle = product.css('.Desc::text').extract_first()
productImage = product.css('.Image img::attr(src)').extract_first()
productSalePrice = product.css('.Price a::text').extract_first()
items['productCategory'] = productCategory
items['productSubCategory'] = productSubCategory
items['productCountry'] = productCountry
items['productSeller'] = productSeller
items['productLink'] = productLink
items['productTitle'] = productTitle
items['productImage'] = productImage
items['productSalePrice'] = productSalePrice
yield items
I was able to scrape 28 items and I can see more than that on on the website which has infinite scrolling implementation.

When you scroll down the page sends XHR call to the server and asks for more data.
Example:
https://www.next.co.uk/shop/gender-newbornboys-gender-newbornunisex-gender-olderboys-gender-youngerboys-productaffiliation-coatsandjackets/isort-score-minprice-0-maxprice-30000-srt-24
Each request is almost the same but the last element in the url is growing up by 24:
srt-24
srt-48
srt-72
Now that you know how the "infinity" works, you can try and simulate it with code.
Example:
import requests
URL_TEMPLATE = 'https://www.next.co.uk/shop/gender-newbornboys-gender-newbornunisex-gender-olderboys-gender-youngerboys-productaffiliation-coatsandjackets/isort-score-minprice-0-maxprice-30000-srt-{}'
for step in range(24, 240, 24):
r = requests.get(URL_TEMPLATE.format(step))
if r.status_code == 200:
# TODO We have the data - lets parse it
pass

if i have two links i want to scrape from like
https://www.next.co.uk/shop/gender-newbornboys-gender-newbornunisex-gender-olderboys-gender-youngerboys-productaffiliation-coatsandjackets-0
and
https://www2.next.co.uk/shop/gender-men-productaffiliation-coatsandjackets-0
how would the code be like

Related

Navigate to new page in Scrapy with the same URL

I am writing a scrapy spider to scrape Rightmove, a property website. The issue I'm having is that the property search, which consists of several pages of different house listings, is all located under the same URL.
This means that the usual process of identifying the URL of the 'next' page doesn't work. Is there any way, using scrapy and not selenium (not efficient enough for the purpose) that I can navigate through the different pages? Please see my code and the source code of the relevant 'next page' button as the IMG below.
Thanks.
class listingsSpider(scrapy.Spider):
name = 'listings'
start_urls = ['https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=STATION%5E1712&maxPrice=500000&radius=0.5&sortType=10&propertyTypes=&mustHave=&dontShow=&furnishTypes=&keywords=']
def parse(self, response):
self.logger.info('This my first spider')
address = response.xpath('//*[#id="property-65695633"]/div/div/div[4]/div[1]/div[2]/a/address')
listings = response.xpath('//h2[#class="propertyCard-title"]')
for listing in listings:
yield{
'Listing': listing.get()
}
nextPage = response.xpath('//*[#id="l-container"]/div[3]/div/div/div/div[3]/button/div/svg/use')
nextPage = nextPage.get()
pageTest = response.css('div[class=pagination-button pagination-direction pagination-direction--next] svg a::attr(href)')
pageTest = pageTest.get()
if pageTest is not None:
pageTest = response.urljoin(pageTest)
yield scrapy.Request(pageTest,callback=self.parse)
```[![enter image description here][1]][1]
[1]: https://i.stack.imgur.com/1I1J1.png
Actually, it turns out that each page has a unique identifier in the web-link. For example, attach &index = 24, this sends you to the next page.
What you need to figure out is how to include that into the request url. Some may have several pages so we increment by +24 each time to go onto the next page. However, we could increment by +24 onto infinite, therefore we use the number of page results as a way to break. It's rather sneaky to notice at first sight! but pretty easy to overcome.
Here's a scraper that can go to these next pages as requested:
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.crawler import CrawlerProcess
from scrapy.loader import ItemLoader
import requests
from bs4 import BeautifulSoup
links= []
for i in range(0, 480, 24):
url = f'https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=STATION%5E1712&maxPrice=500000&radius=0.5&sortType=10&propertyTypes=&mustHave=&dontShow=&index={i}&furnishTypes=&keywords='
r = requests.get(url)
soup = BeautifulSoup(r.content, 'lxml')
ps1 = soup.find_all('span', {'class':'searchHeader-resultCount'})
for ps in ps1:
if int(ps.text.strip()) > i:
links.append(url)
else:
break
class ListingsItem(scrapy.Item):
address = Field(output_processor = TakeFirst())
listings = Field(output_processor = TakeFirst())
class listingsSpider(scrapy.Spider):
name = 'listings'
start_urls = links
def start_requests(self):
for url in self.start_urls:
yield scrapy.Request(
url,
callback = self.parse
)
def parse(self, response):
container = response.xpath('//div[#class="l-searchResults"]/div')
for sales in container:
l = ItemLoader(ListingsItem(), selector = sales)
l.add_xpath('address', '//address[#class="propertyCard-address"]/meta[#content]')
l.add_xpath('listings', '//h2[#class="propertyCard-title"]//text()[normalize-space()]')
yield l.load_item()
#self.logger.info('This my first spider')
#address = response.xpath('//*[#id="property-65695633"]/div/div/div[4]/div[1]/div[2]/a/address')
#listings = response.xpath('//h2[#class="propertyCard-title"]')
#for listing in listings:
# yield{
# 'Listing': listing.get()
# }
process = CrawlerProcess(
settings = {
'FEED_URI': 'rightmove.jl',
'FEED_FORMAT': 'jsonlines'
}
)
process.crawl(listingsSpider)
process.start()

My Scrapy code is just crawling(Debug:Crawled(200)) but not scraping any data

My Scrapy code is just crawling the links on the webpage but not Scraping any data.Actually,I'm trying to scrape some data about Coronavirus pandemic for my project(like country name,cities within that country and then no. of cases,casualties etc.).The output is Debug:Crawled(200) in the cmd. I'm trying to scrape it from worldometers website(Being a newbie to scrapy i don't know much and for output reference the image link is provided)
# -*- coding: utf-8 -*-
import scrapy
import logging
class CountriesSpider(scrapy.Spider):
name = 'countries'
allowed_domains = ['www.worldometers.info']
start_urls = ['http://www.worldometers.info/coronavirus/']
def parse(self, response):
countries = response.xpath("//td/a")
for country in countries:
country_name = country.xpath(".//text()").get()
country_link = country.xpath(".//#href").get()
#To access the country link
absolute_url = response.urljoin(country_link)
yield scrapy.Request(url = absolute_url,callback = self.parse_country) #Or do directly--> yield response.follow(url = country_link)
def parse_country(self,response):
rows = response.xpath("(//table[#class = 'table table-bordered table-hover table-responsive usa_table_countries dataTable no-footer'])[1]/tbody/tr")
for row in rows:
city = row.xpath(".//td[1]/text()").get()
cases = row.xpath(".//td[2]/text()").get()
deaths = row.xpath(".//td[4]/text()").get()
active_cases = row.xpath(".//td[6]/text()").get()
yield {
"City":city,
"Total_Number_of_Cases": cases,
"Deaths":deaths,
"Active_Cases":active_cases
}
enter image description here

Web Scraping all Urls from a website with Scrapy and Python

I am writing a web scraper to fetch a group of links
(located at tree.xpath('//div[#class="work_area_content"]/a/#href')
from a website and return the Title and Url of all the leafs sectioned by the leafs parent. I have two scrapers: one in python and one in Scrapy for Python. What is the purpose of callbacks in the Scrapy Request method? Should the information be in a multidimensional or single dimension list ( I believe multi-dimensional but it enhances complication)? Which of the below code is better? If the scraper code is better, how do I migrate the python code to the Scrapy code?
From what I understand from callbacks is that it passes a function's arguments to another function; however, if the callback refers to itself, the data gets overwritten and therefore lost, and you're unable to go back to the root data. Is this correct?
python:
url_storage = [ [ [ [] ] ] ]
page = requests.get('http://1.1.1.1:1234/TestSuites')
tree = html.fromstring(page.content)
urls = tree.xpath('//div[#class="work_area_content"]/a/#href').extract()
i = 0
j = 0
k = 0
for i, url in enumerate(urls):
absolute_url = "".join(['http://1.1.1.1:1234/', url])
url_storage[i][j][k].append(absolute_url)
print(url_storage)
#url_storage.insert(i, absolute_url)
page = requests.get(url_storage[i][j][k])
tree2 = html.fromstring(page.content)
urls2 = tree2.xpath('//div[#class="work_area_content"]/a/#href').extract()
for j, url2 in enumerate(urls2):
absolute_url = "".join(['http://1.1.1.1:1234/', url2])
url_storage[i][j][k].append(absolute_url)
page = requests.get(url_storage[i][j][k])
tree3 = html.fromstring(page.content)
urls3 = tree3.xpath('//div[#class="work_area_content"]/a/#href').extract()
for k, url3 in enumerate(urls3):
absolute_url = "".join(['http://1.1.1.1:1234/', url3])
url_storage[i][j][k].append(absolute_url)
page = requests.get(url_storage[i][j][k])
tree4 = html.fromstring(page.content)
urls3 = tree4.xpath('//div[#class="work_area_content"]/a/#href').extract()
title = tree4.xpath('//span[#class="page_title"]/text()').extract()
yield Request(url_storage[i][j][k], callback=self.end_page_parse_TS, meta={"Title": title, "URL": urls3 })
#yield Request(absolute_url, callback=self.end_page_parse_TC, meta={"Title": title, "URL": urls3 })
def end_page_parse_TS(self, response):
print(response.body)
url = response.meta.get('URL')
title = response.meta.get('Title')
yield{'URL': url, 'Title': title}
def end_page_parse_TC(self, response):
url = response.meta.get('URL')
title = response.meta.get('Title')
description = response.meta.get('Description')
description = response.xpath('//table[#class="wiki_table]/tbody[contains(/td/text(), "description")/parent').extract()
yield{'URL': url, 'Title': title, 'Description':description}
Scrapy:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractor import LinkExtractor
from scrapy.spiders import Rule, CrawlSpider
from datablogger_scraper.items import DatabloggerScraperItem
class DatabloggerSpider(CrawlSpider):
# The name of the spider
name = "datablogger"
# The domains that are allowed (links to other domains are skipped)
allowed_domains = ['http://1.1.1.1:1234/']
# The URLs to start with
start_urls = ['http://1.1.1.1:1234/TestSuites']
# This spider has one rule: extract all (unique and canonicalized) links, follow them and parse them using the parse_items method
rules = [
Rule(
LinkExtractor(
canonicalize=True,
unique=True
),
follow=True,
callback="parse_items"
)
]
# Method which starts the requests by visiting all URLs specified in start_urls
def start_requests(self):
for url in self.start_urls:
yield scrapy.Request(url, callback=self.parse, dont_filter=True)
# Method for parsing items
def parse_items(self, response):
# The list of items that are found on the particular page
items = []
# Only extract canonicalized and unique links (with respect to the current page)
links = LinkExtractor(canonicalize=True, unique=True).extract_links(response)
# Now go through all the found links
item = DatabloggerScraperItem()
item['url_from'] = response.url
for link in links:
item['url_to'] = link.url
items.append(item)
# Return all the found items
return items

Crawl iframe and page at the same time

I just wanted to know if it's possible to crawl a page on a website and extract data from this page and from an iframe in this page at the same time?
I'm using scrapy with python and I already know how to extract data from the iframe...
Thank you for your help!!
Thanks to your answer, I made this... But I don't know what to put instead of 'url'... Can you help me again please?
# -*- coding: utf-8 -*-
import scrapy
import re
import numbers
from fnac.items import FnacItem
from urllib.request import urlopen
# from scrapy.spiders import CrawlSpider, Rule
# from scrapy.linkextractors import LinkExtractor
from bs4 import BeautifulSoup
class Fnac(CrawlSpider): #scrapy.Spider
name = 'FnacCom'
allowed_domains = ['fnac.com']
start_urls = ['http://www.fnac.com/MORMANE/srefA5533119-3387-5EC4-82B6-AA61216BF599']
##### To extract links in order to run the spider in them
# rules = (
# Rule(LinkExtractor(allow=()), callback='parse'),
# )
def parse(self, response):
soup = BeautifulSoup(urlopen(response.url), "lxml")
iframexx = soup.find_all('iframe')
for iframe in iframexx:
yield scrapy.Request(iframe.attrs['src'],callback=self.parse2)
##### Main function
def parse1(self, response):
item1 = FnacItem()
nb_sales = response.xpath('//table[#summary="données détaillée du vendeur"]/tbody/tr/td/span/text()').extract()
country = response.xpath('//table[#summary="données détaillée du vendeur"]/tbody/tr/td/text()').extract()
yield scrapy.Request(url, meta={'item': item1}) #I don't know what to put instead of URL...
def parse2(self, response):
same_item = response.meta['item']
address = response.xpath('//div/p/text()').re(r'.*Adresse \: (.*)\n?.*')
email = response.xpath('//div/ul/li[contains(text(),"#")]/text()').extract()
name = response.xpath('//div/p[#class="customer-policy-label"]/text()').re(r'Infos sur la boutique \: ([a-zA-Z0-9]*)')
phone = response.xpath('//div/p/text()').re(r'.*Tél \: ([\d]*)\n?.*')
siret = response.xpath('//div/p/text()').re(r'.*Siret \: ([\d]*)\n?.*')
vat = response.xpath('//div/text()').re(r'.*TVA \: (.*)')
if (len(name) != 0):
item['name'] = ''.join(name).strip()
item['address'] = ''.join(address).strip()
item['phone'] = ''.join(phone).strip()
item['email'] = ''.join(email).strip()
item['nb_sales'] = ''.join(nb_sales).strip()
item['country'] = ''.join(country).strip()
item['vat'] = ''.join(vat).strip()
item['siret'] = ''.join(siret).strip()
return item
to combine information from different requests into a similar item, you have to use the meta parameter of the requests:
def parse1(self, response):
item1 = {
...
}
yield Request(url='another_url.com', meta={'item': item1}, callback=self.parse2)
def parse2(self, response):
same_item = response.meta['item']
# keep populating the item with the second response
...
yield same_item

Webcrawler multiple page iteration

I want to make the crawler go to the next page to extract data any help on what to do. I am a little lost on what to do. I tried scrapy but it is kinda complicated and bs4 is more convenient.
import bs4 as bs
import urllib.request
import pandas as pd
import re
source = urllib.request.urlopen('https://messageboards.webmd.com/').read()
soup = bs.BeautifulSoup(source,'lxml')
df = pd.DataFrame(columns = ['link'],data=[url.a.get('href') for url in soup.find_all('div',class_="link")])
lists=[]
for i in range(0,33):
link = (df.link.iloc[i])
source1 = urllib.request.urlopen(link).read()
soup1 = bs.BeautifulSoup(source1,'lxml')
for url1 in soup1.find_all('a',class_="next"):
next_link = soup1.find('a',href = True, text = re.compile("next"))
if next_link:
lists.append(link+url1.get('href'))
So it looks like you're storing hrefs in a list
for url1 in soup1.find_all('a',class_="next"):
next_link = soup1.find('a',href = True, text = re.compile("next"))
if next_link:
lists.append(link+url1.get('href'))
Now you actually have to do something with them. In this case I'm assuming you want to navigate to each href in your list.
for href in lists:
new_page = urllib.request.urlopen(href).read()
And then you can scrape whatever data you want out of new_page
I've got the same problem. Here is my code example for a page I crawled for exercise. I've chained multiple site requests to get detailed information.
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from capterra.items import CapterraItem
class CapterraCatSpider(CrawlSpider):
name = 'capterra_cat'
#allowed_domains = ['http://www.capterra.com/categories']
start_urls = ['http://www.capterra.com/categories']
# rules = (
# Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
# )
def parse(self, response):
#TEMP
for category in response.css('ol.browse-group-list'):
#Debug: only elements of one category
if category.css('a::text').extract_first() == 'Yoga Studio':
i = CapterraItem()
#Get link to detail page
i['cat_name'] = category.css('a::text').extract_first()
#join link to detail page with base url
i['cat_link'] = response.urljoin(category.css('a::attr(href)').extract_first())
cat_link = i['cat_link']
print cat_link
#call request to detail page and pass response to parse_details method with callback method
request = scrapy.Request(cat_link, callback=self.parse_details)
request.meta['item'] = i
yield request
def parse_details(self,response):
#Debug print
print 'DETAILS!'
#read your items from response meta
item = response.meta['item']
#iterate over listings
for detail in response.css('p.listing-description.milli'):
item['profile_link'] = response.urljoin(detail.css('a.spotlight-link::attr(href)').extract_first())
#call request to profile page to get more information for listing
request = scrapy.Request(item['profile_link'], callback=self.parse_profile)
#set your item to rquest metadata
request.meta['item'] = item
yield request
def parse_profile(self,response):
#Debug print
print 'PROFILE'
item = response.meta['item']
item['product_name'] = response.css('h1.beta.no-margin-bottom::text').extract_first()
item['who_uses_software'] = response.css('div.spotlight-target > p.epsilon > i::text').extract_first()
item['vendor_name'] = response.css('h2.spotlight-vendor-name > span::text').extract_first()
return item

Categories

Resources