Scraping multiple pages with multiple start_urls - python

I want to scrape details that are returned in JSON form using scrapy. There are multiple start_urls, and each start_url has multiple pages to scrape. I just can't work out the logic for how to do so.
import scrapy
from scrapy.http import Request

BASE_URL = ["https://www.change.org/api-proxy/-/tags/animals-19/petitions?offset={}&limit=8&show_promoted_cards=true",
            "https://www.change.org/api-proxy/-/tags/civic/petitions?offset={}&limit=8&show_promoted_cards=true",
            "https://www.change.org/api-proxy/-/tags/human-rights-en-in/petitions?offset={}&limit=8&show_promoted_cards=true",
            "https://www.change.org/api-proxy/-/tags/child-rights-2/petitions?offset={}&limit=8&show_promoted_cards=true",
            "https://www.change.org/api-proxy/-/tags/health-9/petitions?offset={}&limit=8&show_promoted_cards=true",
            "https://www.change.org/api-proxy/-/tags/environment-18/petitions?offset={}&limit=8&show_promoted_cards=true",
            "https://www.change.org/api-proxy/-/tags/education-en-in/petitions?offset={}&limit=8&show_promoted_cards=true",
            "https://www.change.org/api-proxy/-/tags/women-s-rights-13/petitions?offset={}&limit=8&show_promoted_cards=true"
            ]

class ChangeSpider(scrapy.Spider):
    name = 'change'

    def start_requests(self):
        for i in range(len(BASE_URL)):
            yield Request(BASE_URL[i], callback=self.parse)

    pageNumber = 11

    def parse(self, response):
        data = response.json()
        for item in range(len(data['items'])):
            yield {
                "petition_id": data['items'][item]['petition']['id'],
            }

        next_page = "https://www.change.org/api-proxy/-/tags/animals-19/petitions?offset=" + str(ChangeSpider.pageNumber) + "&limit=8&show_promoted_cards=true"
        if data['last_page'] == False:
            ChangeSpider.pageNumber += 1
            yield response.follow(next_page, callback=self.parse)

Try like this:
import scrapy
from scrapy.http import Request

class ChangeSpider(scrapy.Spider):
    name = 'change'

    start_urls = ["https://www.change.org/api-proxy/-/tags/animals-19/petitions?offset={}&limit=8&show_promoted_cards=true",
                  "https://www.change.org/api-proxy/-/tags/civic/petitions?offset={}&limit=8&show_promoted_cards=true",
                  "https://www.change.org/api-proxy/-/tags/human-rights-en-in/petitions?offset={}&limit=8&show_promoted_cards=true",
                  "https://www.change.org/api-proxy/-/tags/child-rights-2/petitions?offset={}&limit=8&show_promoted_cards=true",
                  "https://www.change.org/api-proxy/-/tags/health-9/petitions?offset={}&limit=8&show_promoted_cards=true",
                  "https://www.change.org/api-proxy/-/tags/environment-18/petitions?offset={}&limit=8&show_promoted_cards=true",
                  "https://www.change.org/api-proxy/-/tags/education-en-in/petitions?offset={}&limit=8&show_promoted_cards=true",
                  "https://www.change.org/api-proxy/-/tags/women-s-rights-13/petitions?offset={}&limit=8&show_promoted_cards=true"
                  ]

    pageNumber = 11

    def parse(self, response):
        data = response.json()
        for item in range(len(data['items'])):
            yield {
                "petition_id": data['items'][item]['petition']['id'],
            }

        next_page = "https://www.change.org/api-proxy/-/tags/animals-19/petitions?offset=" + str(ChangeSpider.pageNumber) + "&limit=8&show_promoted_cards=true"
        if data['last_page'] == False:
            ChangeSpider.pageNumber += 1
            yield response.follow(next_page, callback=self.parse)
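Note that the answer above still hard-codes the animals-19 URL in next_page and shares a single pageNumber counter across all categories, so the tags do not paginate independently. As a minimal sketch, not the answerer's code, of how each tag can track its own offset (it assumes the API's offset/limit parameters and last_page flag behave as the question's JSON suggests, and fills the offset placeholder with 0 for the first page):

import scrapy
from urllib.parse import urlparse, parse_qs, urlencode

class ChangeSpider(scrapy.Spider):
    name = 'change'
    # Same tag endpoints as in the question; offset/limit are advanced per response.
    start_urls = [
        "https://www.change.org/api-proxy/-/tags/animals-19/petitions?offset=0&limit=8&show_promoted_cards=true",
        "https://www.change.org/api-proxy/-/tags/civic/petitions?offset=0&limit=8&show_promoted_cards=true",
        # ... remaining tag URLs from the question ...
    ]

    def parse(self, response):
        data = response.json()
        for entry in data['items']:
            yield {"petition_id": entry['petition']['id']}

        # Build the next page for *this* tag from the current request's URL,
        # so each category paginates on its own.
        if not data.get('last_page'):
            parts = urlparse(response.url)
            query = parse_qs(parts.query)
            offset = int(query.get('offset', ['0'])[0])
            limit = int(query.get('limit', ['8'])[0])
            query['offset'] = [str(offset + limit)]
            next_url = parts._replace(query=urlencode(query, doseq=True)).geturl()
            yield response.follow(next_url, callback=self.parse)

Because the next offset is derived from response.url, every tag advances independently and no class-level counter is needed.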

Related

Python scrapy returns incomplete data

I am creating a scraper for web data scraping.
There are 58 pages and each page has 12 products.
The result should be 58 x 12 = 696 product titles, but it returns data for only 404 products. Here is my code:
import scrapy
from fundrazr.items import FundrazrItem
from datetime import datetime
import re

class Fundrazr(scrapy.Spider):
    name = "my_scraper"

    # First Start Url
    start_urls = ["https://perfumehut.com.pk/shop/"]

    npages = 57
    # This mimics getting the pages using the next button.
    for i in range(2, npages + 1):
        start_urls.append("https://perfumehut.com.pk/shop/page/" + str(i) + "")

    def parse(self, response):
        for href in response.xpath("//h3[contains(@class, 'product-title')]/a/@href"):
            # add the scheme, eg http://
            url = "" + href.extract()
            yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        item = FundrazrItem()
        # Getting Campaign Title
        item['campaignTitle'] = response.xpath("//h1[contains(@class, 'entry-title')]/text()").extract()
        yield item
It's a WooCommerce website. The first page is
https://perfumehut.com.pk/shop/
and the other pages follow the pagination pattern
https://perfumehut.com.pk/shop/page/2/
https://perfumehut.com.pk/shop/page/3/
and so on, up to 58.
I want to know what I did wrong when generating the npages URLs.
Regards
import scrapy
from fundrazr.items import FundrazrItem
from datetime import datetime
import re

class Fundrazr(scrapy.Spider):
    name = "my_scraper"

    # First Start Url
    start_urls = ["https://perfumehut.com.pk/shop/"]

    def parse(self, response):
        data = FundrazrItem()
        for item in response.xpath("//div[contains(@class, 'products elements-grid ')]/div[contains(@class, 'product-grid-item product ')]/h3/a"):
            data['campaignTitle'] = item.xpath("./text()").extract_first()
            yield data

        next_page = response.xpath("//ul[@class='page-numbers']/li[last()]/a/@href").extract_first()
        if next_page is not None:
            yield scrapy.Request(next_page, callback=self.parse)
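The snippet above only collects titles from the listing pages. If the per-product detail pages from the original spider are still needed, a rough sketch combining next-page following with the detail requests might look like this (selectors are copied from the two snippets above; whether the last li in ul.page-numbers is always the "next" arrow is an assumption about the theme's markup):

import scrapy
from fundrazr.items import FundrazrItem

class Fundrazr(scrapy.Spider):
    name = "my_scraper"
    start_urls = ["https://perfumehut.com.pk/shop/"]

    def parse(self, response):
        # Follow every product on the current listing page.
        for href in response.xpath("//h3[contains(@class, 'product-title')]/a/@href").extract():
            yield scrapy.Request(response.urljoin(href), callback=self.parse_dir_contents)

        # Then follow the "next" pagination link instead of pre-building 58 URLs.
        next_page = response.xpath("//ul[@class='page-numbers']/li[last()]/a/@href").extract_first()
        if next_page is not None:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)

    def parse_dir_contents(self, response):
        item = FundrazrItem()
        item['campaignTitle'] = response.xpath("//h1[contains(@class, 'entry-title')]/text()").extract_first()
        yield item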

Access multiple pages with pagination in Scrapy

I have URLs with multiple pages. I try to paginate to extract data from these URLs, but it works only once (just one next_page). What's wrong?
import json
import scrapy
import re
import pkgutil
from scrapy.loader import ItemLoader
from rzc_spider.items import AnnonceItem

class AnnonceSpider(scrapy.Spider):
    name = 'rzc_results'

    def __init__(self, *args, **kwargs):
        data_file = pkgutil.get_data("rzc_spider", "json/input/test_tt.json")
        self.data = json.loads(data_file)

    def start_requests(self):
        for item in self.data:
            request = scrapy.Request(item['rzc_url'], callback=self.parse)
            request.meta['item'] = item
            yield request

    def parse(self, response):
        item = response.meta['item']
        item['results'] = []
        item["car_number"] = response.css(
            "h2.sub::text").extract_first()
        for caritem in response.css("div.ad > div[itemtype='https://schema.org/Vehicle']"):
            data = AnnonceItem()
            # model
            data["model"] = caritem.css(
                "em.title::text").extract_first()
            item['results'].append(data)
        yield item

        next_page = response.css(
            'a.link::attr(href)').extract_first()
        if next_page is not None:
            url_pagination = 'https://www.websiteexample.com' + next_page
            meta = {'item': response.meta['item']}
            yield scrapy.Request(url=url_pagination, callback=self.parse, meta=meta)

    # ban proxies reaction
    def response_is_ban(self, request, response):
        return b'banned' in response.body

    def exception_is_ban(self, request, exception):
        return None
The JSON file with the URL (a sample in this case):
[{
"rzc_url": "https://www.websiteexample.com/model"
}]
Check the URL: sites sometimes set traps so that one next_page link is an absolute URL and another is relative. Instead of concatenating a hard-coded domain with next_page, use urljoin. Import it and rewrite the request:

from urllib.parse import urljoin

yield scrapy.Request(urljoin(response.url, next_page), callback=self.parse, meta=meta)
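In context, the pagination tail of parse() would then look something like this (a sketch; the a.link selector and the meta passing are unchanged from the question):

from urllib.parse import urljoin

# ...inside parse(), after yielding the item...
next_page = response.css('a.link::attr(href)').extract_first()
if next_page is not None:
    # urljoin handles both absolute and relative hrefs,
    # so mixed link formats no longer break pagination.
    yield scrapy.Request(
        url=urljoin(response.url, next_page),
        callback=self.parse,
        meta={'item': response.meta['item']},
    )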

Scrapy multiple next page

I want to scrape every next page. I've found a way to do it with the scrapy shell, but I don't know whether my spider will iterate through every page or just the next one, and I'm not sure how to implement that.
import string
import scrapy
from scrapy import Request

alphabet = string.ascii_uppercase
each_link = '.' + alphabet
each_url = ["https://myanimelist.net/anime.php?letter={0}".format(i) for i in each_link]
#sub_page_of_url = [[str(url)+"&show{0}".format(i) for i in range(50, 2000, 50)] for url in each_url] #start/stop/steps
#full_url = each_url + sub_page_of_url

class AnimeScraper_Spider(scrapy.Spider):
    name = "Anime"

    def start_requests(self):
        for url in each_url:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        next_page_url = response.xpath(
            "//div[@class='bgColor1']//a[text()='Next']/@href").extract_first()
        for href in response.css('#content > div.normal_header.clearfix.pt16 > div > div > span > a:nth-child(1)'):
            url = response.urljoin(href.extract())
            yield Request(url, callback=self.parse_anime)
        yield Request(next_page_url, callback=self.parse)

    def parse_anime(self, response):
        for tr_sel in response.css('div.js-categories-seasonal tr ~ tr'):
            return {
                "title": tr_sel.css('a[id] strong::text').extract_first().strip(),
                "synopsis": tr_sel.css("div.pt4::text").extract_first(),
                "type_": tr_sel.css('td:nth-child(3)::text').extract_first().strip(),
                "episodes": tr_sel.css('td:nth-child(4)::text').extract_first().strip(),
                "rating": tr_sel.css('td:nth-child(5)::text').extract_first().strip()
            }
I think you're trying something too complicated; it should be as simple as:
Start from the main page
Identify all the pages that start with a particular letter
For each of these pages, take all the next links and repeat
It looks something like this:
import string
import scrapy
from scrapy import Request

class AnimeSpider(scrapy.Spider):
    name = "Anime"
    start_urls = ['https://myanimelist.net/anime.php']

    def parse(self, response):
        xp = "//div[@id='horiznav_nav']//li/a/@href"
        return (Request(url, callback=self.parse_anime_list_page) for url in response.xpath(xp).extract())

    def parse_anime_list_page(self, response):
        for tr_sel in response.css('div.js-categories-seasonal tr ~ tr'):
            yield {
                "title": tr_sel.css('a[id] strong::text').extract_first().strip(),
                "synopsis": tr_sel.css("div.pt4::text").extract_first(),
                "type_": tr_sel.css('td:nth-child(3)::text').extract_first().strip(),
                "episodes": tr_sel.css('td:nth-child(4)::text').extract_first().strip(),
                "rating": tr_sel.css('td:nth-child(5)::text').extract_first().strip(),
            }

        next_urls = response.xpath("//div[@class='spaceit']//a/@href").extract()
        for next_url in next_urls:
            yield Request(response.urljoin(next_url), callback=self.parse_anime_list_page)
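If you would rather keep the letter-based start_urls from the question, the minimal fix inside the question's AnimeScraper_Spider is to guard the next-page request so the spider never yields a Request with a None URL; Scrapy's built-in duplicate filter then drops "Next" links that several pages share. A sketch of such a parse() (selectors taken from the question, with ::attr(href) added so the href itself is extracted rather than the whole <a> element):

    def parse(self, response):
        # Queue every anime link found on this listing page.
        for href in response.css('#content > div.normal_header.clearfix.pt16 > div > div > span > a:nth-child(1)::attr(href)').extract():
            yield scrapy.Request(response.urljoin(href), callback=self.parse_anime)

        # Follow "Next" only when it exists; Request(None) would raise an error.
        next_page_url = response.xpath("//div[@class='bgColor1']//a[text()='Next']/@href").extract_first()
        if next_page_url:
            yield scrapy.Request(response.urljoin(next_page_url), callback=self.parse)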

What's wrong with this scrapy spider? It scrapes only the last URL

In the parse() method the spider crawls 4 URLs and then passes them to parse_dir_contents() to scrape some data, but only the 4th URL is actually scraped. I don't understand why it is not scraping the other 3 URLs.
import scrapy
from v_one.items import VOneItem
import json

class linkedin(scrapy.Spider):
    name = "linkedin"
    allowed_domains = ["linkedin.com"]
    start_urls = [
        "https://in.linkedin.com/directory/people-s-1-2-4/",
    ]

    def parse(self, response):
        for href in response.xpath('//*[@id="seo-dir"]/div/div/div/ul/li/a/@href'):
            url = response.urljoin(href.extract())
            print("________________" + url)
            yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        for sel in response.xpath('//*[@id="profile"]'):
            url = response.url
            print("____________" + url)
            item = VOneItem()
            item['name'] = sel.xpath('//*[@id="name"]/text()').extract()
            item['headline'] = sel.xpath('//*[@id="topcard"]/div/div/div/p/span/text()').extract()
            item['current'] = sel.xpath('//*[@id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/span/text()').extract()
            item['education'] = sel.xpath('//*[@id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/a/text()').extract()
            item['link'] = url
            yield item
By inspecting the pages, I think there is no need for the for loop in the parse_dir_contents function. Make the function like this:
    def parse_dir_contents(self, response):
        item = VOneItem()
        item['name'] = response.xpath('//*[@id="name"]/text()').extract()
        item['headline'] = response.xpath('//*[@id="topcard"]/div/div/div/p/span/text()').extract()
        item['current'] = response.xpath('//*[@id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/span/text()').extract()
        item['education'] = response.xpath('//*[@id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/a/text()').extract()
        item['link'] = response.url
        return item
And check if this solves your issue.

python web recursive scraping error

I am trying to scrape multiple pages. It's structured like this:
--> Page 1 - Scrape links
-------> Page 2 - Scrape more links (some pages contain pagination) and data
------------> Page 3 - Scrape the data
It returns only 18 items, but there are 127 pages (in the 2nd step) with 18 items per page. It also does not return author and author_link in the item.
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor as lext
from scrapy.selector import Selector
from scrapy.http import Request
from ror.items import RorItem

class RorSpiderSpider(CrawlSpider):
    name = "ror_spider"
    allowed_domains = ["example.com"]
    start_urls = (
        'http://www.example.com/',
    )
    rules = [
        Rule(lext(allow=("http://www.example.com/$"), restrict_xpaths=('//a[@class="nextpostslink"]',)), callback='parse', follow=True),
    ]

    def parse(self, response):
        links = Selector(response).xpath('//ul[@id="nav"]/li')
        for link in links:
            item = RorItem()
            item['menu_link'] = link.xpath('a/@href').extract()[0]
            item['menu_title'] = link.xpath('a/text()').extract()[0]
            if "http" not in item['menu_link']:
                item['menu_link'] = "http://www.reviewofreligions.org" + ''.join(item['menu_link'])
                yield Request(url=item['menu_link'], meta={'item': item}, callback=self.parse_articles)
            else:
                yield Request(url=item['menu_link'], meta={'item': item}, callback=self.parse_articles)

    def parse_articles(self, response):
        sel = Selector(response)
        item = response.meta['item']
        if "articles" in item['menu_link']:
            item['link_cat'] = item['menu_title']
            pg = 1
            maxPgs = 124
            while pg <= 124:
                item['article_pg_link'] = item['menu_link'] + "page/" + str(pg) + "/"
                article_links = sel.xpath('//div[@id="rightcol"]/div[@class="articlebox"]')
                for art_link in article_links:
                    item['article_link'] = art_link.xpath('a[@class="title "]/@href').extract()[0]
                    item['article_title'] = art_link.xpath('a[@class="title "]/text()').extract()[0].replace('\n\t\t\t\t', '').replace('\t\t\t\t', '')
                    # article_txt_1 = art_link.xpath('text()').extract()[1].replace('\n \n\t\t\t\t', '').replace('\t\t\t\t', '').replace('\n \n', '')
                    # article_txt_2 = art_link.xpath('text()').extract()[2].replace('\n \n\t\t\t\t', '') if art_link.xpath('text()').extract()[2] else ''
                    # item['article_txt'] = article_txt_1 + '\n'.join(article_txt_2).replace('\n\n\n \n\n\n \n \n \n \n\n\n\t\n\t\n\t', '')
                    yield Request(url=item['article_link'], meta={'item': item}, callback=self.article_page)
                pg += 1

    def article_page(self, response):
        select = Selector(response)
        item = response.meta['item']
        item['author'] = select.xpath('//div[@id="author"]/a/text()').extract()
        item['author_link'] = select.xpath('//div[@id="author"]/a/@href').extract()
        return item
What is wrong in the code?
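There is no accepted answer here, but one probable culprit stands out in parse_articles(): the while loop builds item['article_pg_link'] for pages 1 to 124 yet never requests those URLs, so every iteration re-reads the listing page that was already downloaded, and only its 18 article links ever reach article_page(). A second risk is that one mutable item is shared by every yielded request, so later responses can overwrite fields set by earlier ones. A hedged sketch of a restructured parse_articles (parse_article_list is a new helper name introduced here; the page/<n>/ URL pattern and XPaths are taken from the question):

    def parse_articles(self, response):
        item = response.meta['item']
        if "articles" in item['menu_link']:
            item['link_cat'] = item['menu_title']
            # Issue one request per listing page instead of re-reading the same response.
            for pg in range(1, 125):
                page_url = item['menu_link'] + "page/" + str(pg) + "/"
                yield Request(url=page_url, meta={'item': dict(item)}, callback=self.parse_article_list)

    def parse_article_list(self, response):
        base = response.meta['item']
        for art_link in response.xpath('//div[@id="rightcol"]/div[@class="articlebox"]'):
            # Copy the item so parallel responses don't overwrite each other's fields.
            item = dict(base)
            item['article_link'] = art_link.xpath('a[@class="title "]/@href').extract_first()
            item['article_title'] = art_link.xpath('a[@class="title "]/text()').extract_first()
            if item['article_link']:
                yield Request(url=item['article_link'], meta={'item': item}, callback=self.article_page)

With each article carrying its own copy of the item, article_page() can set author and author_link without clobbering other articles' data.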
