Scrapy multiple next page - python

I want to scrape every next page. I've found a way to do it with scrapy shell but I don't know if my spider will iterate through every page or just the next one; I'm not too sure how to implement that.
import string

import scrapy
from scrapy import Request

alphabet = string.ascii_uppercase
each_link = '.' + alphabet
each_url = ["https://myanimelist.net/anime.php?letter={0}".format(i) for i in each_link]
#sub_page_of_url = [[str(url)+"&show{0}".format(i) for i in range(50, 2000, 50)] for url in each_url] #start/stop/steps
#full_url = each_url + sub_page_of_url

class AnimeScraper_Spider(scrapy.Spider):
    name = "Anime"

    def start_requests(self):
        for url in each_url:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        next_page_url = response.xpath(
            "//div[@class='bgColor1']//a[text()='Next']/@href").extract_first()
        for href in response.css('#content > div.normal_header.clearfix.pt16 > div > div > span > a:nth-child(1)'):
            url = response.urljoin(href.extract())
            yield Request(url, callback=self.parse_anime)
        yield Request(next_page_url, callback=self.parse)

    def parse_anime(self, response):
        for tr_sel in response.css('div.js-categories-seasonal tr ~ tr'):
            return {
                "title": tr_sel.css('a[id] strong::text').extract_first().strip(),
                "synopsis": tr_sel.css("div.pt4::text").extract_first(),
                "type_": tr_sel.css('td:nth-child(3)::text').extract_first().strip(),
                "episodes": tr_sel.css('td:nth-child(4)::text').extract_first().strip(),
                "rating": tr_sel.css('td:nth-child(5)::text').extract_first().strip()
            }

I think you're trying something too complicated; it should be as simple as:
Start from the main page
Identify all the pages that start with a particular letter
For each of these pages, take all the next links and repeat
It looks something like this:
import string

import scrapy
from scrapy import Request


class AnimeSpider(scrapy.Spider):
    name = "Anime"
    start_urls = ['https://myanimelist.net/anime.php']

    def parse(self, response):
        xp = "//div[@id='horiznav_nav']//li/a/@href"
        return (Request(url, callback=self.parse_anime_list_page) for url in response.xpath(xp).extract())

    def parse_anime_list_page(self, response):
        for tr_sel in response.css('div.js-categories-seasonal tr ~ tr'):
            yield {
                "title": tr_sel.css('a[id] strong::text').extract_first().strip(),
                "synopsis": tr_sel.css("div.pt4::text").extract_first(),
                "type_": tr_sel.css('td:nth-child(3)::text').extract_first().strip(),
                "episodes": tr_sel.css('td:nth-child(4)::text').extract_first().strip(),
                "rating": tr_sel.css('td:nth-child(5)::text').extract_first().strip(),
            }

        next_urls = response.xpath("//div[@class='spaceit']//a/@href").extract()
        for next_url in next_urls:
            yield Request(response.urljoin(next_url), callback=self.parse_anime_list_page)
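If you want to sanity-check the selectors without crawling every letter page, you can cap the run while testing. This is a minimal sketch, assuming you run the spider from a script; the page count and output filename are only example values, and Scrapy's built-in duplicate filter already keeps the overlapping "Next" links from being fetched twice:

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={
    'CLOSESPIDER_PAGECOUNT': 20,   # stop after 20 responses while testing
    'FEED_URI': 'anime.jl',        # example output path
    'FEED_FORMAT': 'jsonlines',
})
process.crawl(AnimeSpider)
process.start()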

Related

Extracting next page and setting a break

I'm trying to extract webpage data and want to take the next few pages as well, up to a limit that I can alter. However, I've tested whether I can at least extract the next few pages using Scrapy (I'm trying to figure this out in Scrapy in order to learn it), but it only returns the items within the first page.
How do I extract the next pages while setting a limit, e.g. 5 pages?
For example, here's what I have tried:
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.crawler import CrawlerProcess


class StatisticsItem(scrapy.Item):
    ebay_div = Field(output_processor=TakeFirst())
    url = Field(output_processor=TakeFirst())


class StatisticsSpider(scrapy.Spider):
    name = 'ebay'
    start_urls = ['https://www.ebay.com/b/Collectible-Card-Games-Accessories/2536/bn_1852210?rt=nc&LH_BIN=1' +
                  '&LH_PrefLoc=2&mag=1&_sop=16']

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url
            )

    def parse(self, response):
        all_cards = response.xpath('//div[@class="s-item__wrapper clearfix"]')
        for card in all_cards:
            name = card.xpath('.//h3/text()').get()  # get name of product
            price = card.xpath('.//span[@class="s-item__price"]//text()').get()  # price
            product_url = card.xpath('.//a[@class="s-item__link"]//@href').get()  # link to product
            # now do whatever you want, append to dictionary, yield as item...
            summary_data = {
                "Name": name,
                "Price": price,
                "URL": product_url
            }
            data = {'summary_data': summary_data}
            yield scrapy.Request(product_url, meta=data, callback=self.parse_product_details)

            # get the next page
            next_page_url = card.xpath('.//a[@class="pagination__next icon-link"]/@href').extract_first()
            # The last page does not have a valid url and ends with '#'
            if next_page_url == None or str(next_page_url).endswith("#"):
                self.log("eBay products collected successfully !!!")
            else:
                print('\n' + '-' * 30)
                print('Next page: {}'.format(next_page_url))
                yield scrapy.Request(next_page_url, callback=self.parse)

    def parse_product_details(self, response):
        # Get the summary data
        data = response.meta['summary_data']
        data['location'] = response.xpath('//span[@itemprop="availableAtOrFrom"]/text()').extract_first()
        yield data


process = CrawlerProcess(
    settings={
        'FEED_URI': 'collectible_cards.json',
        'FEED_FORMAT': 'jsonlines'
    }
)

process.crawl(StatisticsSpider)
process.start()
You can try it like this: first build the URLs (one per page), then let start_requests work through them:
start_urls = ["https://www.ebay.com/b/Collectible-Card-Games-Accessories/2536/bn_1852210?LH_BIN=1&LH_PrefLoc=2&mag=1&rt=nc&_pgn={}&_sop=16".format(i) for i in range(1,5)]

How can I scrape subpages and merge them with page info?

I use Scrapy to parse a page. The page has subpages (categories) from which I also need to get information and combine it all into one item (perhaps saving the information from the additional pages as JSON), which I then add to a CSV. I've tried different options, such as:
requests = scrapy.Request(url, meta={'meta_item': item}, callback=self.parse_category)
Or
yield scrapy.Request(url, meta={'meta_item': item}, callback=self.parse_category)
But neither method works the way I want it to.
For example, I take pages from https://www.webscorer.com/findraces?pg=results (example: https://www.webscorer.com/seriesresult?seriesid=211565 ) and get information from each of them. After that, I need to get additional information from the category pages (example: https://www.webscorer.com/seriesresult?seriesid=211565&gender=F ) and put all of it in the CSV. My code now:
class WebscorerSpider(scrapy.Spider):
    name = 'webscorer'
    allowed_domains = ['webscorer.com']

    def start_requests(self):
        url = f'https://www.webscorer.com/findraces?pg=results'
        yield scrapy.Request(url, callback=self.parse_page)

    def parse_page(self, response, **kwargs):
        for href in response.css('table.results-table tbody tr a::attr("href")').extract():
            url = response.urljoin(href)
            url = 'https://www.webscorer.com/seriesresult?seriesid=211565'
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response: Response, **kwargs):
        latlong_match = re.search('lat=(.*)&lng=(.*)', response.css('span#FSrc::text').get())

        item = dict()
        for href in response.css('table.category-table .category-name').css('a::attr("href")').extract():
            url = response.urljoin(href)
            # requests = scrapy.Request(url, meta={'meta_item': item}, callback=self.parse_category)
            yield scrapy.Request(url, meta={'meta_item': item}, callback=self.parse_category)

        yield WebscorerEvent(name=response.css('h1.race-name::text').get(),
                             source_url=response.request.url,
                             sport_discipline=response.css('td.spec+td').css('strong::text').get(),
                             description=response.css('span.regnotes span::text').get(),
                             hero_image=response.css('p.associated-race-pic img::attr(src)').get(),
                             start_date=parse_webscorer_date(response.css('p.race-date::text').get()),
                             location={
                                 "link": f"https://www.google.com/maps/search/?api=1&query={latlong_match.group(1)},{latlong_match.group(2)}",
                                 "description": response.css('td.spec:contains("Location:")+td strong::text').get()})

    def parse_category(self, response, **kwargs):
        item = response.meta['meta_item']
        # print(item)
        item['winner'] = response.css('table.results-table .r-racername span::text').get()
        return item
You yield the WebscorerEvent straight away, so you have already "dropped" the item before getting the data you need from the next page.
You could do something like this:
def parse(self, response: Response, **kwargs):
    latlong_match = re.search('lat=(.*)&lng=(.*)', response.css('span#FSrc::text').get())

    item = {
        "name": response.css('h1.race-name::text').get(),
        "source_url": response.request.url,
        "sport_discipline": response.css('td.spec+td').css('strong::text').get(),
        "description": response.css('span.regnotes span::text').get(),
        "hero_image": response.css('p.associated-race-pic img::attr(src)').get(),
        "start_date": parse_webscorer_date(response.css('p.race-date::text').get()),
        "location": {
            "link": f"https://www.google.com/maps/search/?api=1&query={latlong_match.group(1)},{latlong_match.group(2)}",
            "description": response.css('td.spec:contains("Location:")+td strong::text').get()
        }
    }

    for href in response.css('table.category-table .category-name').css('a::attr("href")').extract():
        url = response.urljoin(href)
        yield scrapy.Request(url, meta={'meta_item': item}, callback=self.parse_category)

def parse_category(self, response, **kwargs):
    item = response.meta['meta_item']
    item['winner'] = response.css('table.results-table .r-racername span::text').get()
    yield WebscorerEvent(item)
That way you only yield the item at the end, with all the data it needs.
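As an aside, and not something the answer requires: on Scrapy 1.7+ the same hand-off can be done with cb_kwargs instead of meta, which keeps your own data separate from Scrapy's request metadata. A minimal sketch of the two methods above, with the item-building elided:

def parse(self, response, **kwargs):
    item = {}  # build the same fields as in the answer above

    for href in response.css('table.category-table .category-name').css('a::attr("href")').extract():
        # dict(item) hands each category request its own copy of the shared fields
        yield scrapy.Request(response.urljoin(href),
                             cb_kwargs={'item': dict(item)},
                             callback=self.parse_category)

def parse_category(self, response, item, **kwargs):
    item['winner'] = response.css('table.results-table .r-racername span::text').get()
    yield WebscorerEvent(item)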

Scraping multiple pages with multiple start_urls

I want to scrape the details, which are present in JSON form, using Scrapy. There are multiple start_urls, and each start_url has multiple pages to scrape. I just can't work out the logic of how to do it.
import scrapy
from scrapy.http import Request

BASE_URL = ["https://www.change.org/api-proxy/-/tags/animals-19/petitions?offset={}&limit=8&show_promoted_cards=true",
            "https://www.change.org/api-proxy/-/tags/civic/petitions?offset={}&limit=8&show_promoted_cards=true",
            "https://www.change.org/api-proxy/-/tags/human-rights-en-in/petitions?offset={}&limit=8&show_promoted_cards=true",
            "https://www.change.org/api-proxy/-/tags/child-rights-2/petitions?offset={}&limit=8&show_promoted_cards=true",
            "https://www.change.org/api-proxy/-/tags/health-9/petitions?offset={}&limit=8&show_promoted_cards=true",
            "https://www.change.org/api-proxy/-/tags/environment-18/petitions?offset={}&limit=8&show_promoted_cards=true",
            "https://www.change.org/api-proxy/-/tags/education-en-in/petitions?offset={}&limit=8&show_promoted_cards=true",
            "https://www.change.org/api-proxy/-/tags/women-s-rights-13/petitions?offset={}&limit=8&show_promoted_cards=true"
            ]


class ChangeSpider(scrapy.Spider):
    name = 'change'

    def start_requests(self):
        for i in range(len(BASE_URL)):
            yield Request(BASE_URL[i], callback=self.parse)

    pageNumber = 11

    def parse(self, response):
        data = response.json()
        for item in range(len(data['items'])):
            yield {
                "petition_id": data['items'][item]['petition']['id'],
            }

        next_page = "https://www.change.org/api-proxy/-/tags/animals-19/petitions?offset=" + str(ChangeSpider.pageNumber) + "&limit=8&show_promoted_cards=true"
        if data['last_page'] == False:
            ChangeSpider.pageNumber += 1
            yield response.follow(next_page, callback=self.parse)
Try it like this:
import scrapy
from scrapy.http import Request


class ChangeSpider(scrapy.Spider):
    name = 'change'

    start_urls = ["https://www.change.org/api-proxy/-/tags/animals-19/petitions?offset={}&limit=8&show_promoted_cards=true",
                  "https://www.change.org/api-proxy/-/tags/civic/petitions?offset={}&limit=8&show_promoted_cards=true",
                  "https://www.change.org/api-proxy/-/tags/human-rights-en-in/petitions?offset={}&limit=8&show_promoted_cards=true",
                  "https://www.change.org/api-proxy/-/tags/child-rights-2/petitions?offset={}&limit=8&show_promoted_cards=true",
                  "https://www.change.org/api-proxy/-/tags/health-9/petitions?offset={}&limit=8&show_promoted_cards=true",
                  "https://www.change.org/api-proxy/-/tags/environment-18/petitions?offset={}&limit=8&show_promoted_cards=true",
                  "https://www.change.org/api-proxy/-/tags/education-en-in/petitions?offset={}&limit=8&show_promoted_cards=true",
                  "https://www.change.org/api-proxy/-/tags/women-s-rights-13/petitions?offset={}&limit=8&show_promoted_cards=true"
                  ]

    pageNumber = 11

    def parse(self, response):
        data = response.json()
        for item in range(len(data['items'])):
            yield {
                "petition_id": data['items'][item]['petition']['id'],
            }

        next_page = "https://www.change.org/api-proxy/-/tags/animals-19/petitions?offset=" + str(ChangeSpider.pageNumber) + "&limit=8&show_promoted_cards=true"
        if data['last_page'] == False:
            ChangeSpider.pageNumber += 1
            yield response.follow(next_page, callback=self.parse)
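Note that the answer above shares one pageNumber counter across all tags and always builds the next URL from the animals-19 endpoint. If each tag should paginate independently, one possible sketch (an assumption about how the API paginates, not code from the question; only two of the eight tag URLs are shown) derives the next request from the URL that just responded, bumping its offset query parameter until last_page is true:

import scrapy
from w3lib.url import add_or_replace_parameter, url_query_parameter


class ChangeSpider(scrapy.Spider):
    name = 'change'
    # same tag URLs as above, each starting at offset=0
    start_urls = ["https://www.change.org/api-proxy/-/tags/animals-19/petitions?offset=0&limit=8&show_promoted_cards=true",
                  "https://www.change.org/api-proxy/-/tags/civic/petitions?offset=0&limit=8&show_promoted_cards=true"]

    def parse(self, response):
        data = response.json()
        for entry in data['items']:
            yield {"petition_id": entry['petition']['id']}

        if not data['last_page']:
            # advance this tag's own offset by the number of items just returned
            offset = int(url_query_parameter(response.url, 'offset', '0')) + len(data['items'])
            yield response.follow(add_or_replace_parameter(response.url, 'offset', str(offset)),
                                  callback=self.parse)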

how to scrape the URL on Scrapy Following Links

I am confused about how to scrape the URL itself when following links in Scrapy.
I'm crawling this page here:
import scrapy
from ..items import SkripsiItem


class SkripsiSpiderSpider(scrapy.Spider):
    name = 'skripsi'
    start_urls = ['https://nasional.sindonews.com/topic/9695/pemilu-2019/']

    def parse(self, response):
        for href in response.css('.lnk-t a::attr(href)'):
            yield response.follow(href, self.parse_author)

        for href in response.css('.newpaging li:nth-child(4) a::attr(href)'):
            yield response.follow(href, self.parse)

    def parse_author(self, response):
        items = SkripsiItem()

        def extract_with_css(query):
            return response.css(query).get(default='').strip()

        content = response.xpath(".//div[@class='vidy-embed']/descendant::text()").extract()

        items['title'] = extract_with_css('h1::text'),
        items['author'] = extract_with_css('.author a::text'),
        items['time'] = extract_with_css('time::text'),
        items['imagelink'] = extract_with_css('.article img::attr(src)'),
        items['content'] = ''.join(content),

        yield items
How do I scrape every URL that is visited when following the links, i.e. the ones selected by .lnk-t a::attr(href) in the code above?
Save items['url'] = response.url in the parse_author function.
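In context that is a one-line addition inside parse_author, just before the yield (assuming SkripsiItem declares, or you add, a url field):

        # the address of the article page that was followed from .lnk-t a::attr(href)
        items['url'] = response.url
        yield items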

Getting to the next page using scrapy

I am trying to make a web scraper, but I'm unable to get the link of the next page. I have tried some combinations but none of them work. The tutorial on scrapy.org covers a simpler layout, so it doesn't solve my problem.
The site I'm scraping has the following layout:
<nav class="nav_class">
<a class="class_1" href="1.html">
<a class="class_2" href="2.html">
<a class="class_3" href="3.html">
I want to get the 3.html link using CSS selectors.
import scrapy


class MySpider(scrapy.Spider):
    name = "flip_spider"

    def start_requests(self):
        urls = [
            "https://www.flipkart.com/mobiles/pr?sid=tyy%2C4io&p%5B%5D=facets.processor_brand%255B%255D%3DSnapdragon&p%5B%5D=facets.serviceability%5B%5D%3Dfalse&p%5B%5D=facets.offer_type%255B%255D%3DExchange%2BOffer&otracker=clp_banner_1_10.bannerX3.BANNER_mobile-phones-store_HPUGCU9BYBF6&fm=neo%2Fmerchandising&iid=M_934db066-154e-4074-a4b1-96f56a0af28e_6.HPUGCU9BYBF6&ppt=HomePage&ppn=Home&ssid=85m4yqvgzk0000001558978084715&page=1",
        ]
        for url in urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        # page_id=response.url.split("=")[-1]
        phone_details = response.css("div._1-2Iqu.row")
        for ph in phone_details:
            phone = ph.css("div._3wU53n::text").get()
            rating = ph.css("div.hGSR34::text").get()
            price = ph.css("div._1vC4OE._2rQ-NK::text").get()
            yield {
                "name": phone,
                "rating": rating,
                "price": price,
            }

        final = "https://www.flipkart.com/mobiles/pr?sid=tyy%2C4io&p%5B%5D=facets.processor_brand%255B%255D%3DSnapdragon&p%5B%5D=facets.serviceability%5B%5D%3Dfalse&p%5B%5D=facets.offer_type%255B%255D%3DExchange%2BOffer&otracker=clp_banner_1_10.bannerX3.BANNER_mobile-phones-store_HPUGCU9BYBF6&fm=neo%2Fmerchandising&iid=M_934db066-154e-4074-a4b1-96f56a0af28e_6.HPUGCU9BYBF6&ppt=HomePage&ppn=Home&ssid=85m4yqvgzk0000001558978084715&page=6"
        next_page_id = response.css("nav._1ypTlJ a._3fVaIS::attr(href)").get()
        # ^This is the line I need help with
        if next_page_id is not final:
            next_page = response.urljoin(next_page_id)
            yield scrapy.Request(next_page, callback=self.parse)
It only scrapes the first page and then stops
Change your code to this and it will work:
next_page_id = response.css("nav._1ypTlJ a._3fVaIS::attr(href)").get()
if next_page_id:
    next_page = response.urljoin(next_page_id)
    yield scrapy.Request(next_page, callback=self.parse)
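The original condition, if next_page_id is not final, compares object identity rather than equality, so it is effectively always true; checking that next_page_id is non-empty is what stops the crawl once the last page has no next link. If you also want a hard cap on how many pages to follow, a small sketch (parsing the page number out of the query string; the cap of 6 is just an example for this URL scheme) could be:

from urllib.parse import urlparse, parse_qs

next_page_id = response.css("nav._1ypTlJ a._3fVaIS::attr(href)").get()
if next_page_id:
    next_page = response.urljoin(next_page_id)
    # read the page number from the query string and stop once the cap is reached
    page_number = int(parse_qs(urlparse(next_page).query).get("page", ["1"])[0])
    if page_number <= 6:
        yield scrapy.Request(next_page, callback=self.parse)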
