Scrape some child links and then return to the main scraping - python

I am trying to scrape a site with div elements iteratively: for each div element I want to scrape some data from it, then follow its child links and scrape more data from them.
Here is the code of quote.py
import scrapy
from ..items import QuotesItem

class QuoteSpider(scrapy.Spider):
    name = 'quote'
    baseurl = 'http://quotes.toscrape.com'
    start_urls = [baseurl]

    def parse(self, response):
        all_div_quotes = response.css('.quote')
        for quote in all_div_quotes:
            item = QuotesItem()
            title = quote.css('.text::text').extract()
            author = quote.css('.author::text').extract()
            tags = quote.css('.tag::text').extract()
            author_details_url = self.baseurl + quote.css('.author+ a::attr(href)').extract_first()
            item['title'] = title
            item['author'] = author
            item['tags'] = tags
            request = scrapy.Request(author_details_url,
                                     callback=self.author_born,
                                     meta={'item': item, 'next_url': author_details_url})
            yield request

    def author_born(self, response):
        item = response.meta['item']
        next_url = response.meta['next_url']
        author_born = response.css('.author-born-date::text').extract()
        item['author_born'] = author_born
        yield scrapy.Request(next_url, callback=self.author_birthplace,
                             meta={'item': item})

    def author_birthplace(self, response):
        item = response.meta['item']
        author_birthplace = response.css('.author-born-location::text').extract()
        item['author_birthplace'] = author_birthplace
        yield item
Here is the code of items.py
import scrapy

class QuotesItem(scrapy.Item):
    title = scrapy.Field()
    author = scrapy.Field()
    tags = scrapy.Field()
    author_born = scrapy.Field()
    author_birthplace = scrapy.Field()
I ran the command scrapy crawl quote -o data.json. There was no error message, but data.json was empty. I was expecting to get all the data in its corresponding fields.
Can you please help me?

Take a closer look at your logs and you'll be able to find messages like this:
DEBUG: Filtered duplicate request: <GET http://quotes.toscrape.com/author/Albert-Einstein>
Scrapy automatically manages duplicates and tries not to visit one URL twice (for obvious reasons).
In your case you can add dont_filter=True to your requests and you will see something like this:
2019-07-15 19:33:15 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://quotes.toscrape.com/author/Steve-Martin/> (referer: http://quotes.toscrape.com/author/Steve-Martin/)
2019-07-15 19:33:15 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://quotes.toscrape.com/author/Albert-Einstein/> (referer: http://quotes.toscrape.com/author/Albert-Einstein/)
2019-07-15 19:33:15 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://quotes.toscrape.com/author/Marilyn-Monroe/> (referer: http://quotes.toscrape.com/author/Marilyn-Monroe/)
2019-07-15 19:33:15 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://quotes.toscrape.com/author/J-K-Rowling/> (referer: http://quotes.toscrape.com/author/J-K-Rowling/)
2019-07-15 19:33:15 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://quotes.toscrape.com/author/Eleanor-Roosevelt/> (referer: http://quotes.toscrape.com/author/Eleanor-Roosevelt/)
2019-07-15 19:33:15 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://quotes.toscrape.com/author/Andre-Gide/> (referer: http://quotes.toscrape.com/author/Andre-Gide/)
2019-07-15 19:33:15 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://quotes.toscrape.com/author/Thomas-A-Edison/> (referer: http://quotes.toscrape.com/author/Thomas-A-Edison/)
2019-07-15 19:33:15 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://quotes.toscrape.com/author/Jane-Austen/> (referer: http://quotes.toscrape.com/author/Jane-Austen/)
Which looks a bit strange indeed, because the page yields a request to itself.
Overall you could end up with something like this:
import scrapy

class QuoteSpider(scrapy.Spider):
    name = 'quote'
    baseurl = 'http://quotes.toscrape.com'
    start_urls = [baseurl]

    def parse(self, response):
        all_div_quotes = response.css('.quote')
        for quote in all_div_quotes:
            item = dict()
            title = quote.css('.text::text').extract()
            author = quote.css('.author::text').extract()
            tags = quote.css('.tag::text').extract()
            author_details_url = self.baseurl + quote.css('.author+ a::attr(href)').extract_first()
            item['title'] = title
            item['author'] = author
            item['tags'] = tags
            print(item)
            # dont_filter=True in case we get two quotes by the same author.
            # This is not optimal though. A better decision would be to save author data to self.storage
            # and only visit new author info pages when needed, otherwise take the info from the saved dict.
            request = scrapy.Request(author_details_url,
                                     callback=self.author_info,
                                     meta={'item': item},
                                     dont_filter=True)
            yield request

    def author_info(self, response):
        item = response.meta['item']
        author_born = response.css('.author-born-date::text').extract()
        author_birthplace = response.css('.author-born-location::text').extract()
        item['author_born'] = author_born
        item['author_birthplace'] = author_birthplace
        yield item
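The caching idea from the comments above could look roughly like this (a minimal, untested sketch; the self.storage dict and the duplicate handling are my assumptions, not part of the original answer):
import scrapy

class QuoteSpider(scrapy.Spider):
    name = 'quote'
    baseurl = 'http://quotes.toscrape.com'
    start_urls = [baseurl]
    storage = {}  # author page URL -> {'author_born': ..., 'author_birthplace': ...}

    def parse(self, response):
        for quote in response.css('.quote'):
            item = {
                'title': quote.css('.text::text').extract(),
                'author': quote.css('.author::text').extract(),
                'tags': quote.css('.tag::text').extract(),
            }
            author_details_url = self.baseurl + quote.css('.author+ a::attr(href)').extract_first()
            if author_details_url in self.storage:
                # Author already fetched earlier: reuse the cached details, no extra request.
                item.update(self.storage[author_details_url])
                yield item
            else:
                # dont_filter=True is still needed in case the same author's page is
                # requested again before the first response has come back.
                yield scrapy.Request(author_details_url,
                                     callback=self.author_info,
                                     meta={'item': item, 'url': author_details_url},
                                     dont_filter=True)

    def author_info(self, response):
        item = response.meta['item']
        details = {
            'author_born': response.css('.author-born-date::text').extract(),
            'author_birthplace': response.css('.author-born-location::text').extract(),
        }
        self.storage[response.meta['url']] = details
        item.update(details)
        yield item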

Related

Web scraping pagination with scrapy

I'm trying web scraping with scrapy, but I got a "duplicates" warning and can't jump to the next page.
How can I scrape all pages with pagination?
example site: teknosa.com
scraping url: https://www.teknosa.com/bilgisayar-tablet-c-116
pagination structure: ?s=%3Arelevance&page=0 (1,2,3,4,5, and more..)
My pagination code:
next_page = soup.find('button', {'title': 'Daha Fazla Ürün Gör'})['data-gotohref']
if next_page is not None:
    next_page = response.urljoin(next_page)
    yield scrapy.Request(next_page, callback=self.parse)
You can build the pagination into start_urls and increase or decrease the range of page numbers.
import scrapy
from scrapy.crawler import CrawlerProcess

class CarsSpider(scrapy.Spider):
    name = 'car'
    start_urls = ['https://www.teknosa.com/bilgisayar-tablet-c-116?s=%3Arelevance&page=' + str(x) for x in range(1, 11)]

    def parse(self, response):
        print(response.url)

if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(CarsSpider)
    process.start()
Output:
https://www.teknosa.com/bilgisayar-tablet-c-116?s=%3Arelevance&page=1
2022-05-01 08:55:06 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.teknosa.com/bilgisayar-tablet-c-116?s=%3Arelevance&page=2> (referer: None)
https://www.teknosa.com/bilgisayar-tablet-c-116?s=%3Arelevance&page=2
2022-05-01 08:55:06 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.teknosa.com/bilgisayar-tablet-c-116?s=%3Arelevance&page=5> (referer: None)
https://www.teknosa.com/bilgisayar-tablet-c-116?s=%3Arelevance&page=5
2022-05-01 08:55:06 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.teknosa.com/bilgisayar-tablet-c-116?s=%3Arelevance&page=6> (referer: None)
https://www.teknosa.com/bilgisayar-tablet-c-116?s=%3Arelevance&page=6
2022-05-01 08:55:07 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.teknosa.com/bilgisayar-tablet-c-116?s=%3Arelevance&page=7> (referer: None)
https://www.teknosa.com/bilgisayar-tablet-c-116?s=%3Arelevance&page=7
2022-05-01 08:55:07 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.teknosa.com/bilgisayar-tablet-c-116?s=%3Arelevance&page=3> (referer: None)
https://www.teknosa.com/bilgisayar-tablet-c-116?s=%3Arelevance&page=3
2022-05-01 08:55:07 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.teknosa.com/bilgisayar-tablet-c-116?s=%3Arelevance&page=4> (referer: None)
https://www.teknosa.com/bilgisayar-tablet-c-116?s=%3Arelevance&page=4
2022-05-01 08:55:07 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.teknosa.com/bilgisayar-tablet-c-116?s=%3Arelevance&page=8> (referer: None)
https://www.teknosa.com/bilgisayar-tablet-c-116?s=%3Arelevance&page=8
2022-05-01 08:55:08 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.teknosa.com/bilgisayar-tablet-c-116?s=%3Arelevance&page=9> (referer: None)
https://www.teknosa.com/bilgisayar-tablet-c-116?s=%3Arelevance&page=9
2022-05-01 08:55:08 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.teknosa.com/bilgisayar-tablet-c-116?s=%3Arelevance&page=10> (referer: None)
https://www.teknosa.com/bilgisayar-tablet-c-116?s=%3Arelevance&page=10
Multiple URLs, pagination using a for loop
import scrapy

class CarsSpider(scrapy.Spider):
    name = 'car'

    def start_requests(self):
        urls = ['url_1', 'url_2', 'url_3', ...]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        ...
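If you'd rather not hard-code the page range, here is a minimal sketch that keeps requesting the next page until one comes back without products (untested; the div.prd selector and the title attribute are assumptions to adjust to the real markup, and cb_kwargs needs a recent Scrapy version):
import scrapy

class CarsSpider(scrapy.Spider):
    name = 'car'
    base_url = 'https://www.teknosa.com/bilgisayar-tablet-c-116?s=%3Arelevance&page={}'
    start_urls = [base_url.format(1)]

    def parse(self, response, page=1):
        products = response.css('div.prd')  # hypothetical product-card selector
        if not products:
            return  # no products on this page -> assume we went past the last page
        for product in products:
            yield {'name': product.attrib.get('title')}  # hypothetical field
        yield scrapy.Request(self.base_url.format(page + 1),
                             callback=self.parse,
                             cb_kwargs={'page': page + 1})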

Scrapy stops scraping but continues to crawl

I’m trying to scrape different information from several pages of a website.
Until the sixteenth page, everything works: the pages are crawled and scraped and the information is stored in my database; however, after the sixteenth page, it stops scraping but continues to crawl.
I checked the website and there are more than 470 pages with information. The HTML tags are the same, so I don't understand why it stopped scraping.
Python:
def url_lister():
    url_list = []
    page_count = 1
    while page_count < 480:
        url = 'https://www.active.com/running?page=%s' % page_count
        url_list.append(url)
        page_count += 1
    return url_list

class ListeCourse_level1(scrapy.Spider):
    name = 'ListeCAP_ACTIVE'
    allowed_domains = ['www.active.com']
    start_urls = url_lister()

    def parse(self, response):
        selector = Selector(response)
        for uneCourse in response.xpath('//*[@id="lpf-tabs2-a"]/article/div/div/div/a[@itemprop="url"]'):
            loader = ItemLoader(ActiveItem(), selector=uneCourse)
            loader.add_xpath('nom_evenement', './/div[2]/div/h5[@itemprop="name"]/text()')
            loader.default_input_processor = MapCompose(string)
            loader.default_output_processor = Join()
            yield loader.load_item()
        pass
The shell:
2018-01-23 17:22:29 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.active.com/running?page=15>
{'nom_evenement': 'Enniscrone 10k run & 5k run/walk'}
2018-01-23 17:22:33 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.active.com/running?page=16> (referer: None)
--------------------------------------------------
SCRAPING DES ELEMENTS EVENTS
--------------------------------------------------
2018-01-23 17:22:34 [scrapy.extensions.logstats] INFO: Crawled 17 pages (at 17 pages/min), scraped 155 items (at 155 items/min)
2018-01-23 17:22:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.active.com/running?page=17> (referer: None)
--------------------------------------------------
SCRAPING DES ELEMENTS EVENTS
--------------------------------------------------
2018-01-23 17:22:40 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.active.com/running?page=18> (referer: None)
--------------------------------------------------
SCRAPING DES ELEMENTS EVENTS
--------------------------------------------------
2018-01-23 17:22:43 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.active.com/running?page=19> (referer: None)
This is probably caused by the fact that there are only 17 pages with the content you are looking for, while you instruct Scrapy to visit all 480 pages of the form https://www.active.com/running?page=NNN. A better approach is to check on each page you visit whether there is a next page, and only in that case yield a Request to the next page.
So, I would refactor your code to something like (not tested):
class ListeCourse_level1(scrapy.Spider):
    name = 'ListeCAP_ACTIVE'
    allowed_domains = ['www.active.com']
    base_url = 'https://www.active.com/running'
    start_urls = [base_url]

    def parse(self, response):
        selector = Selector(response)
        for uneCourse in response.xpath('//*[@id="lpf-tabs2-a"]/article/div/div/div/a[@itemprop="url"]'):
            loader = ItemLoader(ActiveItem(), selector=uneCourse)
            loader.add_xpath('nom_evenement', './/div[2]/div/h5[@itemprop="name"]/text()')
            loader.default_input_processor = MapCompose(string)
            loader.default_output_processor = Join()
            yield loader.load_item()
        # check for next page link
        if response.xpath('//a[contains(@class, "next-page")]'):
            next_page = response.meta.get('page_number', 1) + 1
            next_page_url = '{}?page={}'.format(self.base_url, next_page)
            yield scrapy.Request(next_page_url, callback=self.parse, meta={'page_number': next_page})
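As a small alternative (an assumption on my part, not part of the answer above): if the next-page anchor exposes a usable href, you can follow the link itself instead of rebuilding the URL from a counter:
        # Inside parse(), after yielding the loaded items:
        next_href = response.xpath('//a[contains(@class, "next-page")]/@href').get()
        if next_href:
            yield response.follow(next_href, callback=self.parse)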

Scrapy Crawled (302) Status, how to handle

import scrapy

class Pttscrapper2Spider(scrapy.Spider):
    name = 'PTTscrapper2'
    allowed_domains = ['https://www.ptt.cc']
    start_urls = ['https://www.ptt.cc/bbs/HatePolitics/index.html/']
    handle_httpstatus_list = [400, 302]

    def parse(self, response):
        urls = response.css('div.r-ent > div.title > a::attr(href)').extract()
        for thread_url in urls:
            url = response.urljoin(thread_url)
            yield scrapy.Request(url=url, callback=self.parse_details)
        next_page_url = response.css('a.wide:nth-child(2)::attr(href)').extract_first()
        if next_page_url:
            next_page_url = response.urljoin(next_page_url)
            yield scrapy.Request(url=next_page_url, callback=self.parse)

    def parse_details(self, response):
        yield {
            'title': response.xpath('//head/title/text()').extract(),
            'stance': response.xpath('//*[@id="main-content"]/div[@class="push"]/span[1]/text()').extract(),
            'userid': response.xpath('//*[@id="main-content"]/div[@class="push"]/span[2]/text()').extract(),
            'comment': response.xpath('//*[@id="main-content"]/div[@class="push"]/span[3]/text()').extract(),
            'time_of_post': response.xpath('//*[@id="main-content"]/div[@class="push"]/span[4]/text()').extract(),
        }
I've been using the above spider to try and crawl a website, but when I run the spider, I get these messages:
2017-10-05 23:14:27 [scrapy.core.engine] INFO: Spider opened
2017-10-05 23:14:27 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2017-10-05 23:14:27 [scrapy.extensions.telnet] DEBUG: Telnet console listening on 127.0.0.1:6023
2017-10-05 23:14:28 [scrapy.downloadermiddlewares.cookies] DEBUG: Received cookies from: <302 https://www.ptt.cc/bbs/HatePolitics/index.html/> Set-Cookie: __cfduid=d3ca57dcab04acfaf256438a57c547e4a1507216462; expires=Fri, 05-Oct-18 15:14:22 GMT; path=/; domain=.ptt.cc; HttpOnly
2017-10-05 23:14:28 [scrapy.core.engine] DEBUG: Crawled (302) <GET https://www.ptt.cc/bbs/HatePolitics/index.html/> (referer: None)
2017-10-05 23:14:28 [scrapy.core.engine] INFO: Closing spider (finished)
What I've been thinking is that my spider can't seem to access the sub-forums from the index. I've tested that the selectors point to the correct locations and that response.urljoin creates the correct absolute URL, but the spider still can't access the sub-forums on a page. It would be great if someone could tell me why the spider is unable to access the links!
There are two issues with your scraper. In start_urls you added a trailing slash to index.html/, which is wrong. Also, allowed_domains takes domain names, not URLs.
Change the starting code to the below and it will work:
class Pttscrapper2Spider(scrapy.Spider):
    name = 'PTTscrapper2'
    allowed_domains = ['www.ptt.cc']
    start_urls = ['https://www.ptt.cc/bbs/HatePolitics/index.html']
Logs from the run
2017-10-06 13:16:15 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.ptt.cc/bbs/HatePolitics/M.1507268600.A.57C.html> (referer: https://www.ptt.cc/bbs/HatePolitics/index.html)
2017-10-06 13:16:15 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.ptt.cc/bbs/HatePolitics/M.1507268600.A.57C.html>
{'title': ['[黑特] 先刪文,洪慈庸和高潞那個到底撤案了沒? - 看板 HatePolitics - 批踢踢實業坊'], 'stance': ['推 ', '→ ', '噓 ', '→ ', '→ '], 'userid': ['ABA0525', 'gerund', 'AGODFATHER', 'laman45', 'victoryman'], 'comment': [': 垃圾不分藍綠黃', ': 垃圾靠弟傭 中華民國內最沒資格當立委的爛貨', ': 說什麼東西你個板啊', ': 有確定再說', ': 看起來應該是撤了'], 'time_of_post': ['10/06 13:43\n', '10/06 13:50\n', '10/06 13:57\n', '10/06 13:59\n', ' 10/06 15:27\n']}
2017-10-06 13:16:15 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.ptt.cc/bbs/HatePolitics/M.1507275599.A.657.html> (referer: https://www.ptt.cc/bbs/HatePolitics/index.html)
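One extra detail, based on Scrapy's documented redirect handling rather than on the answer above: because the spider lists 302 in handle_httpstatus_list, the redirect middleware hands the 302 response straight to parse() instead of following it, which is why the original run found no links and closed immediately. If you want redirects to be followed automatically, drop 302 from that list:
    handle_httpstatus_list = [400]  # let RedirectMiddleware follow 302s instead of passing them to parse()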

Scrapy CrawlSpider doesn't listen to deny rules

I searched for similar issues on Stack Overflow and other Q&A sites, but I could not find a proper answer to my problem.
I have written the following spider to crawl nautilusconcept.com. The category structure of the site is quite bad. Because of it, I had to apply rules that parse all links with a callback, and I determine which URLs should actually be parsed with an if statement inside the parse_item method. Anyway, the spider doesn't listen to my deny rules and still tries to crawl links containing (?brw...).
Here is my spider:
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from vitrinbot.items import ProductItem
from vitrinbot.base import utils
import hashlib

removeCurrency = utils.removeCurrency
getCurrency = utils.getCurrency

class NautilusSpider(CrawlSpider):
    name = 'nautilus'
    allowed_domains = ['nautilusconcept.com']
    start_urls = ['http://www.nautilusconcept.com/']
    xml_filename = 'nautilus-%d.xml'
    xpaths = {
        'category': '//tr[@class="KategoriYazdirTabloTr"]//a/text()',
        'title': '//h1[@class="UrunBilgisiUrunAdi"]/text()',
        'price': '//hemenalfiyat/text()',
        'images': '//td[@class="UrunBilgisiUrunResimSlaytTd"]//div/a/@href',
        'description': '//td[@class="UrunBilgisiUrunBilgiIcerikTd"]//*/text()',
        'currency': '//*[@id="UrunBilgisiUrunFiyatiDiv"]/text()',
        'check_page': '//div[@class="ayrinti"]'
    }

    rules = (
        Rule(
            LinkExtractor(allow=('com/[\w_]+',),
                          deny=('asp$',
                                'login\.asp'
                                'hakkimizda\.asp',
                                'musteri_hizmetleri\.asp',
                                'iletisim_formu\.asp',
                                'yardim\.asp',
                                'sepet\.asp',
                                'catinfo\.asp\?brw',
                                ),
                          ),
            callback='parse_item',
            follow=True
        ),
    )

    def parse_item(self, response):
        i = ProductItem()
        sl = Selector(response=response)
        if not sl.xpath(self.xpaths['check_page']):
            return i
        i['id'] = hashlib.md5(response.url.encode('utf-8')).hexdigest()
        i['url'] = response.url
        i['category'] = " > ".join(sl.xpath(self.xpaths['category']).extract()[1:-1])
        i['title'] = sl.xpath(self.xpaths['title']).extract()[0].strip()
        i['special_price'] = i['price'] = sl.xpath(self.xpaths['price']).extract()[0].strip().replace(',', '.')
        images = []
        for img in sl.xpath(self.xpaths['images']).extract():
            images.append("http://www.nautilusconcept.com/" + img)
        i['images'] = images
        i['description'] = (" ".join(sl.xpath(self.xpaths['description']).extract())).strip()
        i['brand'] = "Nautilus"
        i['expire_timestamp'] = i['sizes'] = i['colors'] = ''
        i['currency'] = sl.xpath(self.xpaths['currency']).extract()[0].strip()
        return i
Here is a piece of the Scrapy log:
2014-07-22 17:39:31+0300 [nautilus] DEBUG: Crawled (200) <GET http://www.nautilusconcept.com/catinfo.asp?brw=0&cid=64&direction=&kactane=100&mrk=1&offset=-1&order=&src=&typ=> (referer: http://www.nautilusconcept.com/catinfo.asp?brw=0&cid=64&direction=&kactane=100&mrk=1&offset=&offset=&order=&src=&stock=1)
2014-07-22 17:39:31+0300 [nautilus] DEBUG: Crawled (200) <GET http://www.nautilusconcept.com/catinfo.asp?brw=0&chkBeden=&chkMarka=&chkRenk=&cid=64&direction=1&kactane=100&mrk=1&offset=-1&order=prc&src=&stock=1&typ=> (referer: http://www.nautilusconcept.com/catinfo.asp?brw=0&cid=64&direction=&kactane=100&mrk=1&offset=&offset=&order=&src=&stock=1)
2014-07-22 17:39:32+0300 [nautilus] DEBUG: Crawled (200) <GET http://www.nautilusconcept.com/catinfo.asp?brw=0&chkBeden=&chkMarka=&chkRenk=&cid=64&direction=1&kactane=100&mrk=1&offset=-1&order=name&src=&stock=1&typ=> (referer: http://www.nautilusconcept.com/catinfo.asp?brw=0&cid=64&direction=&kactane=100&mrk=1&offset=&offset=&order=&src=&stock=1)
2014-07-22 17:39:32+0300 [nautilus] DEBUG: Crawled (200) <GET http://www.nautilusconcept.com/catinfo.asp?brw=&chkBeden=&chkMarka=&chkRenk=&cid=64&direction=2&kactane=100&mrk=1&offset=-1&order=prc&src=&stock=1&typ=7> (referer: http://www.nautilusconcept.com/catinfo.asp?brw=&cid=64&direction=1&kactane=100&mrk=1&offset=-1&order=prc&src=&stock=1&typ=7)
2014-07-22 17:39:32+0300 [nautilus] DEBUG: Crawled (200) <GET http://www.nautilusconcept.com/catinfo.asp?brw=&chkBeden=&chkMarka=&chkRenk=&cid=64&direction=2&kactane=100&mrk=1&offset=-1&order=name&src=&stock=1&typ=7> (referer: http://www.nautilusconcept.com/catinfo.asp?brw=&cid=64&direction=1&kactane=100&mrk=1&offset=-1&order=prc&src=&stock=1&typ=7)
2014-07-22 17:39:33+0300 [nautilus] DEBUG: Crawled (200) <GET http://www.nautilusconcept.com/catinfo.asp?brw=0&chkBeden=&chkMarka=&chkRenk=&cid=64&cmp=&direction=1&grp=&kactane=100&model=&mrk=1&offset=-1&order=prc&src=&stock=1&typ=7> (referer: http://www.nautilusconcept.com/catinfo.asp?brw=&cid=64&direction=1&kactane=100&mrk=1&offset=-1&order=prc&src=&stock=1&typ=7)
2014-07-22 17:39:33+0300 [nautilus] DEBUG: Crawled (200) <GET http://www.nautilusconcept.com/catinfo.asp?brw=1&chkBeden=&chkMarka=&chkRenk=&cid=64&cmp=&direction=1&grp=&kactane=100&model=&mrk=1&offset=-1&order=prc&src=&stock=1&typ=7> (referer: http://www.nautilusconcept.com/catinfo.asp?brw=&cid=64&direction=1&kactane=100&mrk=1&offset=-1&order=prc&src=&stock=1&typ=7)
2014-07-22 17:39:33+0300 [nautilus] DEBUG: Crawled (200) <GET http://www.nautilusconcept.com/catinfo.asp?brw=1&cid=64&direction=1&kactane=100&mrk=1&offset=-1&order=name&src=&typ=7> (referer: http://www.nautilusconcept.com/catinfo.asp?brw=1&chkBeden=&chkMarka=&chkRenk=&cid=64&cmp=&direction=1&grp=&kactane=100&model=&mrk=1&offset=-1&order=name&src=&stock=1&typ=7)
The spider also crawls the proper pages, but it must not try to crawl links that contain (catinfo.asp?brw...).
I'm using Scrapy==0.24.2 and Python 2.7.6.
It's a canonicalization "issue". By default, LinkExtractor returns canonicalized URLs, but the regexes from deny and allow are applied before canonicalization.
I suggest you use these rules:
rules = (
    Rule(
        LinkExtractor(allow=('com/[\w_]+',),
                      deny=('asp$',
                            'login\.asp',
                            'hakkimizda\.asp',
                            'musteri_hizmetleri\.asp',
                            'iletisim_formu\.asp',
                            'yardim\.asp',
                            'sepet\.asp',
                            'catinfo\.asp\?.*brw',
                            ),
                      ),
        callback='parse_item',
        follow=True
    ),
)
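For illustration, here is a quick check of both deny patterns against a hypothetical raw link, as it might appear in the page source before LinkExtractor canonicalizes it and moves brw to the front of the query string:
import re

raw_url = 'http://www.nautilusconcept.com/catinfo.asp?cid=64&direction=1&brw=1'  # hypothetical raw link

print(bool(re.search(r'catinfo\.asp\?brw', raw_url)))    # False -> the original deny pattern lets it through
print(bool(re.search(r'catinfo\.asp\?.*brw', raw_url)))  # True  -> the broadened pattern filters it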

How do I make recursive scraping work?

My goal is to scrape a list of URLs and headlines from a site, as part of a larger project, which is what drove me to learn Scrapy. As it stands, using BaseSpider to scrape the first page of a given date (the format is /archive/date/) works fine. However, trying to use CrawlSpider (working off some tutorials) to scrape each sequential page of a given date isn't working, and I'm not sure why. I've tried a number of solutions.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from physurlfetch.items import PhysurlfetchItem
from scrapy.http import Request

class PhysURLSpider(CrawlSpider):
    date = raw_input("Please iput a date in the format M-DD-YYYY: ")
    name = "PhysURLCrawlSpider"
    allowed_domains = "phys.org"
    start_url_str = ("http://phys.org/archive/%s/") % (date)
    start_urls = [start_url_str]

    rules = (
        Rule(SgmlLinkExtractor(allow=("\d\.html",)),
             callback="parse_items", follow=True),
    )

    #def parse_start_url(self, response):
    #    request = Request(start_urls, callback=self.parse_items)

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.select("//article[@class='news-box news-detail-box  clearfix']/h4")
        items = []
        for titles in titles:
            item = PhysurlfetchItem()
            item["title"] = titles.select("a/text()").extract()
            item["link"] = titles.select("a/@href").extract()
            items.append(item)
        return items
Currently I have parse_start_url commented out because it was failing with the method I was using to build start_urls (with the varying string). Running this currently jumps straight to page 2 of a given day without grabbing any data from page 1, and then stops (no page 2 data, no page 3).
When I ran your spider locally (using scrapy runspider yourspider.py) I got this console output:
2014-01-10 13:30:19+0100 [PhysURLCrawlSpider] DEBUG: Crawled (200) <GET http://phys.org/archive/5-12-2013/> (referer: None)
2014-01-10 13:30:19+0100 [PhysURLCrawlSpider] DEBUG: Filtered offsite request to 'phys.org': <GET http://phys.org/archive/5-12-2013/page2.html>
2014-01-10 13:30:19+0100 [PhysURLCrawlSpider] INFO: Closing spider (finished)
You can see Scrapy is filtering an offsite request. In fact, allowed_domains should be a list of domains, so if you change it to allowed_domains = ["phys.org"] you get further:
2014-01-10 13:32:00+0100 [PhysURLCrawlSpider] DEBUG: Crawled (200) <GET http://phys.org/archive/5-12-2013/> (referer: None)
2014-01-10 13:32:00+0100 [PhysURLCrawlSpider] DEBUG: Crawled (200) <GET http://phys.org/archive/5-12-2013/page2.html> (referer: http://phys.org/archive/5-12-2013/)
2014-01-10 13:32:00+0100 [PhysURLCrawlSpider] DEBUG: Filtered duplicate request: <GET http://phys.org/archive/5-12-2013/page3.html> - no more duplicates will be shown (see DUPEFILTER_CLASS)
2014-01-10 13:32:01+0100 [PhysURLCrawlSpider] DEBUG: Crawled (200) <GET http://phys.org/archive/5-12-2013/page8.html> (referer: http://phys.org/archive/5-12-2013/)
2014-01-10 13:32:01+0100 [PhysURLCrawlSpider] DEBUG: Crawled (200) <GET http://phys.org/archive/5-12-2013/page6.html> (referer: http://phys.org/archive/5-12-2013/)
2014-01-10 13:32:01+0100 [PhysURLCrawlSpider] DEBUG: Redirecting (301) to <GET http://phys.org/archive/5-12-2013/> from <GET http://phys.org/archive/5-12-2013/page1.html>
2014-01-10 13:32:01+0100 [PhysURLCrawlSpider] DEBUG: Crawled (200) <GET http://phys.org/archive/5-12-2013/page4.html> (referer: http://phys.org/archive/5-12-2013/)
2014-01-10 13:32:01+0100 [PhysURLCrawlSpider] DEBUG: Crawled (200) <GET http://phys.org/archive/5-12-2013/page7.html> (referer: http://phys.org/archive/5-12-2013/)
2014-01-10 13:32:01+0100 [PhysURLCrawlSpider] DEBUG: Crawled (200) <GET http://phys.org/archive/5-12-2013/page5.html> (referer: http://phys.org/archive/5-12-2013/)
2014-01-10 13:32:01+0100 [PhysURLCrawlSpider] DEBUG: Crawled (200) <GET http://phys.org/archive/5-12-2013/page3.html> (referer: http://phys.org/archive/5-12-2013/)
2014-01-10 13:32:01+0100 [PhysURLCrawlSpider] DEBUG: Crawled (200) <GET http://phys.org/archive/5-12-2013/> (referer: http://phys.org/archive/5-12-2013/page2.html)
2014-01-10 13:32:01+0100 [PhysURLCrawlSpider] INFO: Closing spider (finished)
But the spider is not picking up any items. It may or may not be a typo, but your XPath expression for titles should probably be //article[@class='news-box news-detail-box clearfix']/h4, i.e. without the extra whitespace before clearfix.
As a final note, if you use the latest Scrapy version (from version 0.20.0 onwards), you'll be able to use CSS selectors, which can be more readable than XPath when selecting elements with multiple classes:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from physurlfetch.items import PhysurlfetchItem
from scrapy.http import Request

class PhysURLSpider(CrawlSpider):
    date = raw_input("Please iput a date in the format M-DD-YYYY: ")
    name = "PhysURLCrawlSpider"
    allowed_domains = ["phys.org"]
    start_url_str = ("http://phys.org/archive/%s/") % (date)
    start_urls = [start_url_str]

    rules = (
        Rule(SgmlLinkExtractor(allow=("\d\.html",)),
             callback="parse_items", follow=True),
    )

    #def parse_start_url(self, response):
    #    request = Request(start_urls, callback=self.parse_items)

    def parse_items(self, response):
        selector = Selector(response)
        # selecting only using "news-detail-box" class
        # you could use "article.news-box.news-detail-box.clearfix > h4"
        titles = selector.css("article.news-detail-box > h4")
        items = []
        for titles in titles:
            item = PhysurlfetchItem()
            item["title"] = titles.xpath("a/text()").extract()
            item["link"] = titles.xpath("a/@href").extract()
            items.append(item)
        self.log("%d items in %s" % (len(items), response.url))
        return items
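For completeness, on newer Scrapy versions the same fields can also be pulled out with CSS pseudo-selectors and yielded as plain dicts (a small sketch, not part of the original answer):
    def parse_items(self, response):
        for title in response.css("article.news-detail-box > h4"):
            yield {
                'title': title.css("a::text").extract(),
                'link': title.css("a::attr(href)").extract(),
            }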
