I am trying to understand how scrapy works and want to know how to stop the spider once a condition is met. I am using the scrapy tutorial to show that once the author name Pablo Neruda is scraped then the spider should not continue on to the next page. It can finish scraping the page just do not go onto the next page. Any help would be appreciated.
import scrapy
class AuthorSpider(scrapy.Spider):
name = 'aq1'
start_urls = ['http://quotes.toscrape.com/']
stop_page = 0
def parse(self, response):
author_page_links = response.css('.author + a')
yield from response.follow_all(author_page_links, self.parse_author)
if AuthorSpider.stop_page == 0:
pagination_links = response.css('li.next a')
yield from response.follow_all(pagination_links, self.parse)
else:
pagination_links = " "
yield from response.follow_all(pagination_links, self.parse)
def parse_author(self, response):
def extract_with_css(query):
return response.css(query).get(default='').strip()
yield {
'Name': extract_with_css('h3.author-title::text'),
}
if extract_with_css('h3.author-title::text') == "Pablo Neruda":
AuthorSpider.stop_page = 1
Related
I'm trying to create a spider from the results of this page
When I call the runspider function on the scraped results I get zero crawled pages returned.
There are further errors that I do not understand. I'm a beginner and this project is new to me.
My code it is:
`
import scrapy
class SusSpider(scrapy.Spider):
name = 'susManagement'
allowed_domains = ['in.gov.br/']
start_urls = ['https://www.in.gov.br/consulta/-/buscar/dou?q=*&s=do1&s=doe&exactDate=personalizado&sortType=0&delta=20&publishFrom=01%2F10%2F2021&publishTo=31%2F12%2F2021&orgPrin=Minist%C3%A9rio+da+Sa%C3%BAde']
def parse(self, response):
gates = response.xpath("//div[#class='resultado']//h5[#class='title-marker']//a")
for gate in gates:
gate_number = gate.xpath(".//text()").get()
link_gate = gate.xpath(".//#href").get()
yield response.follow(url=link_gate, callback=self.parse_text,
meta={'gate_name': gate_number})
next_page = response.xpath("//div//ul/li[#class='page-item active']/button")
if next_page:
next_page = response.urljoin(next_page)
yield scrapy.Request(url=next_page, callback=self.parse)
def parse_text(self, response):
portaria = response.request.meta['gate_name']
num_portaria = response.xpath("*//section//div//p[#class='identifica']/text()").re('.*')
texto = response.xpath("//section//div//p[#class='texto-dou']/text()").re('.*')
ementa = response.xpath("//article//div//p[#class='ementa']/text()").re('.*')
rest_texto = texto - ementa - num_portaria
yield {
'port_name': portaria,
'numero_port': num_portaria,
'classified': ementa,
'texto_integral': rest_texto
}
`
I have tried to change the sequence of my parse functions. By clicking on each file after its finished.
At the moment I want to create a csv file with the documents of the output and separate them by columns.
I want to scrape the details present in json form using scrapy. They are multiple start_urls and each start_url have multiple pages to scrape with. I am just not able to get the logic of how to do so.
import scrapy
from scrapy.http import Request
BASE_URL = ["https://www.change.org/api-proxy/-/tags/animals-19/petitions?offset={}&limit=8&show_promoted_cards=true",
"https://www.change.org/api-proxy/-/tags/civic/petitions?offset={}&limit=8&show_promoted_cards=true",
"https://www.change.org/api-proxy/-/tags/human-rights-en-in/petitions?offset={}&limit=8&show_promoted_cards=true",
"https://www.change.org/api-proxy/-/tags/child-rights-2/petitions?offset={}&limit=8&show_promoted_cards=true",
"https://www.change.org/api-proxy/-/tags/health-9/petitions?offset={}&limit=8&show_promoted_cards=true",
"https://www.change.org/api-proxy/-/tags/environment-18/petitions?offset={}&limit=8&show_promoted_cards=true",
"https://www.change.org/api-proxy/-/tags/education-en-in/petitions?offset={}&limit=8&show_promoted_cards=true",
"https://www.change.org/api-proxy/-/tags/women-s-rights-13/petitions?offset={}&limit=8&show_promoted_cards=true"
]
class ChangeSpider(scrapy.Spider):
name = 'change'
def start_requests(self):
for i in range(len(BASE_URL)):
yield Request(BASE_URL[i], callback = self.parse)
pageNumber = 11
def parse(self, response):
data = response.json()
for item in range(len(data['items'])):
yield {
"petition_id": data['items'][item]['petition']['id'],
}
next_page = "https://www.change.org/api-proxy/-/tags/animals-19/petitions?offset=" + str(ChangeSpider.pageNumber) + "&limit=8&show_promoted_cards=true"
if data['last_page'] == False:
ChangeSpider.pageNumber += 1
yield response.follow(next_page, callback=self.parse)
Try like this:
import scrapy
from scrapy.http import Request
class ChangeSpider(scrapy.Spider):
name = 'change'
start_urls = ["https://www.change.org/api-proxy/-/tags/animals-19/petitions?offset={}&limit=8&show_promoted_cards=true",
"https://www.change.org/api-proxy/-/tags/civic/petitions?offset={}&limit=8&show_promoted_cards=true",
"https://www.change.org/api-proxy/-/tags/human-rights-en-in/petitions?offset={}&limit=8&show_promoted_cards=true",
"https://www.change.org/api-proxy/-/tags/child-rights-2/petitions?offset={}&limit=8&show_promoted_cards=true",
"https://www.change.org/api-proxy/-/tags/health-9/petitions?offset={}&limit=8&show_promoted_cards=true",
"https://www.change.org/api-proxy/-/tags/environment-18/petitions?offset={}&limit=8&show_promoted_cards=true",
"https://www.change.org/api-proxy/-/tags/education-en-in/petitions?offset={}&limit=8&show_promoted_cards=true",
"https://www.change.org/api-proxy/-/tags/women-s-rights-13/petitions?offset={}&limit=8&show_promoted_cards=true"
]
pageNumber = 11
def parse(self, response):
data = response.json()
for item in range(len(data['items'])):
yield {
"petition_id": data['items'][item]['petition']['id'],
}
next_page = "https://www.change.org/api-proxy/-/tags/animals-19/petitions?offset=" + str(ChangeSpider.pageNumber) + "&limit=8&show_promoted_cards=true"
if data['last_page'] == False:
ChangeSpider.pageNumber += 1
yield response.follow(next_page, callback=self.parse)
I want to scrape every next page. I've found a way to do it with scrapy shell but I don't know if my spider will iterate through every page or just the next one; I'm not too sure how to implement that.
alphabet = string.ascii_uppercase
each_link = '.' + alphabet
each_url = ["https://myanimelist.net/anime.php?letter={0}".format(i) for i in each_link]
#sub_page_of_url = [[str(url)+"&show{0}".format(i) for i in range(50, 2000, 50)] for url in each_url] #start/stop/steps
#full_url = each_url + sub_page_of_url
class AnimeScraper_Spider(scrapy.Spider):
name = "Anime"
def start_requests(self):
for url in each_url:
yield scrapy.Request(url=url, callback= self.parse)
def parse(self, response):
next_page_url = response.xpath(
"//div[#class='bgColor1']//a[text()='Next']/#href").extract_first()
for href in response.css('#content > div.normal_header.clearfix.pt16 > div > div > span > a:nth-child(1)') :
url = response.urljoin(href.extract())
yield Request(url, callback = self.parse_anime)
yield Request(next_page_url, callback=self.parse)
def parse_anime(self, response):
for tr_sel in response.css('div.js-categories-seasonal tr ~ tr'):
return {
"title" : tr_sel.css('a[id] strong::text').extract_first().strip(),
"synopsis" : tr_sel.css("div.pt4::text").extract_first(),
"type_" : tr_sel.css('td:nth-child(3)::text').extract_first().strip(),
"episodes" : tr_sel.css('td:nth-child(4)::text').extract_first().strip(),
"rating" : tr_sel.css('td:nth-child(5)::text').extract_first().strip()
}
I think that you're trying something too complicated, it should be as simple as:
Start from the main page
Identify all the pages that start with a particular letter
For each of these pages, take all the next links and repeat
It looks something like that:
import string
import scrapy
from scrapy import Request
class AnimeSpider(scrapy.Spider):
name = "Anime"
start_urls = ['https://myanimelist.net/anime.php']
def parse(self, response):
xp = "//div[#id='horiznav_nav']//li/a/#href"
return (Request(url, callback=self.parse_anime_list_page) for url in response.xpath(xp).extract())
def parse_anime_list_page(self, response):
for tr_sel in response.css('div.js-categories-seasonal tr ~ tr'):
yield {
"title": tr_sel.css('a[id] strong::text').extract_first().strip(),
"synopsis": tr_sel.css("div.pt4::text").extract_first(),
"type_": tr_sel.css('td:nth-child(3)::text').extract_first().strip(),
"episodes": tr_sel.css('td:nth-child(4)::text').extract_first().strip(),
"rating": tr_sel.css('td:nth-child(5)::text').extract_first().strip(),
}
next_urls = response.xpath("//div[#class='spaceit']//a/#href").extract()
for next_url in next_urls:
yield Request(response.urljoin(next_url), callback=self.parse_anime_list_page)
Let's imagine I have a webpage like this.
counter.php
if(isset($_GET['count'])){
$count = intval($_GET['count']);
$previous = $count - 1;
$next = $count + 1;
?>
< Previous
Current: <?php echo $count;?>
Next >
<?
}
?>
This is an "infinite" website because you can just keep clicking next to go to the next page (the counter will just increase) or previous etc.
However, if I wanted to crawl this page and follow the links using scrapy like this, scrapy will never stop crawling.
Example spider:
urls = []
class TestSpider(CrawlSpider):
name = 'test'
allowed_domains = ['example.com']
start_urls = ['http://example.com/counter?count=1']
rules = (
Rule(LinkExtractor(), callback='parse_item', follow=True),
)
def parse_item(self, response):
urls.append(response.url)
What kind of mechanism can I use to determine if indeed I am stuck in an infinite website and need to break out of it?
You can always break out if the page does not have ITEMS on that page, or do not have NEXT PAGE button, that means pagination has ended
class TestSpider(CrawlSpider):
name = 'test'
allowed_domains = ['example.com']
def start_requests(self):
page = 1
yield Request("http://example.com/counter?page=%s" % (page), meta={"page": page}, callback=self.parse_item)
def parse_item(self, response):
#METHOD 1: check if items availble on this page
items = response.css("li.items")
if items:
#Now go to next page
page = int(response.meta['page']) + 1
yield Request("http://example.com/counter?page=%s" % (page), meta={"page": page}, callback=self.parse_item)
else:
logging.info("%s was last page" % response.url)
#METHOD 2: check if this page has NEXT PAGE button, most websites has that
nextPage = response.css("a.nextpage")
if nextPage:
#Now go to next page
page = int(response.meta['page']) + 1
yield Request("http://example.com/counter?page=%s" % (page), meta={"page": page}, callback=self.parse_item)
else:
logging.info("%s was last page" % response.url)
You don't have to use Rule in the scrapy. You can first parse the page by page and then iterates all items in each page. Or you can collect all item links in the each page.
For example:
urls = []
class TestSpider(CrawlSpider):
name = 'test'
allowed_domains = ['example.com']
start_urls = ['http://example.com/counter?count=1']
def parse(self, response):
links = response.xpath('//a[#class="item"]/#href').extract()
for link in links:
yield Request(link, self.parse_item)
# you can insert the item 's url here, so you dont have to yield to parse_item
# urls.append(link)
url, pg = response.url.split("=")# you can break infinite loop here
if int(pg) <= 10: #We loop by page #10
yield Request(url + "=" + str(int(pg) + 1), self.parse)
def parse_item(self, response):
urls.append(response.url)
I am making a project in which I have used scrapy to scrape items from web sites, but the problem is, the xpaths of the 1st 2 pages of that site is different from the xpaths of the other pages.
As the result my spider just scrapes the items from first two pages and just simply crawls over the other pages.
How can I make my spider also scrape the items of the pages too??
I am also including my spider here so that u can see through my spider if needed.
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from project2.items import Project2Item
from scrapy.http import Request
class ProjectSpider(BaseSpider):
name = "project2spider"
allowed_domains = ["http://directory.thesun.co.uk/"]
current_page_no = 1
start_urls = [
'http://directory.thesun.co.uk/find/uk/computer-repair'
]
def get_next_url(self, fired_url):
if '/page/' in fired_url:
url, page_no = fired_url.rsplit('/page/', 1)
else:
if self.current_page_no != 1:
#end of scroll
return
self.current_page_no += 1
return "http://directory.thesun.co.uk/find/uk/computer-repair/page/%s" % self.current_page_no
# the parse procedure, and here is the codes which declares which field to scrape.
def parse(self, response):
fired_url = response.url
hxs = HtmlXPathSelector(response)
sites = hxs.select('//div[#class="abTbl "]')
for site in sites:
item = Project2Item()
item['Catogory'] = site.select('span[#class="icListBusType"]/text()').extract()
item['Bussiness_name'] = site.select('a/#title').extract()
item['Description'] = site.select('span[last()]/text()').extract()
item['Number'] = site.select('span[#class="searchInfoLabel"]/span/#id').extract()
item['Web_url'] = site.select('span[#class="searchInfoLabel"]/a/#href').extract()
item['adress_name'] = site.select('span[#class="searchInfoLabel"]/span/text()').extract()
item['Photo_name'] = site.select('img/#alt').extract()
item['Photo_path'] = site.select('img/#src').extract()
#items.append(item)
yield item
next_url = self.get_next_url(fired_url)
if next_url:
yield Request(next_url, self.parse, dont_filter=True)
for other pages I need to use this: sites = hxs.select('//div[#class="icListItem"]')
How can I include this in my spider so that it can scrape items form other pages too..
At present its just scraping 1st two pages and simply crawls over other pages.
What did you try so far?
One solution would be using an index-like parameter passed as a meta data when calling for the next page. Something like:
def parse(self, response):
hxs = HtmlXPathSelector(response)
2nd_xpath = False
try:
if response.meta['index'] > 1:
2nd_xpath = True
index = response.meta['index']
except KeyError:
index = 0
sites = (hxs.select('//div[#class="icListItem"]') if 2nd_xpath
else hxs.select('//div[#class="abTbl "]'))
...
request = Request(next_url, self.parse, dont_filter=True)
request.meta['index'] = index + 1
yield request
That code sure as hell can be improved but you get the idea.