I am using Scrapy (0.22) to crawl one site. I need to do three things:
I need the category and subcategory of the images
I need to download the images and store them locally
I need to store the category, subcategory and image URL in Mongo
But now I am blocked. I use 'pipelines' to download the images, but my code does not work: it does not download the pictures to local storage.
Also, since I want to store the information in Mongo, can anyone give me some suggestions on the Mongo collection ("table") structure?
My code is as follows:
settings.py
BOT_NAME = 'tutorial'
SPIDER_MODULES = ['tutorial.spiders']
NEWSPIDER_MODULE = 'tutorial.spiders'
ITEM_PIPELINES = {'tutorial.pipelines.TutorialPipeline': 1}
IMAGES_STORE = '/ttt'
items.py
from scrapy.item import Item, Field
class TutorialItem(Item):
    # define the fields for your item here like:
    # name = Field()
    catname = Field()
    caturl = Field()
    image_urls = Field()
    images = Field()
pipelines.py
from scrapy.contrib.pipeline.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy.http import Request
from pprint import pprint as pp
class TutorialPipeline(object):
    # def get_media_requests(self, item, info):
    #     for image_url in item['image_urls']:
    #         yield Request(image_url)

    # def process_item(self, item, spider):
    #     print '**********************===================*******************'
    #     return item
    #     pp(item)
    #     pass

    def get_media_requests(self, item, info):
        # pass
        pp('**********************===================*******************')
        # yield Request(item['image_urls'])
        for image_url in item['image_urls']:
            # pass
            # print image_url
            yield Request(image_url)
spider.py
import scrapy
import os
from pprint import pprint as pp
from scrapy import log
from scrapy.http import Request
from scrapy.selector import Selector
from scrapy.spider import Spider
from tutorial.items import TutorialItem
class BaiduSpider(scrapy.spider.Spider):
    name = 'baidu'
    start_urls = [
        # 'http://www.dmoz.org/Computers/Programming/Languages/Python/Books/'
        'http://giphy.com/categories'
    ]
    domain = 'http://giphy.com'

    def parse(self, response):
        selector = Selector(response)
        topCategorys = selector.xpath('//div[@id="None-list"]/a')
        # pp(topCategorys)
        items = []
        for tc in topCategorys:
            item = TutorialItem()
            item['catname'] = tc.xpath('./text()').extract()[0]
            item['caturl'] = tc.xpath('./@href').extract()[0]
            if item['catname'] == u'ALL':
                continue
            reqUrl = self.domain + '/' + item['caturl']
            # pp(reqUrl)
            yield Request(url=reqUrl, meta={'caturl': reqUrl}, callback=self.getSecondCategory)

    def getSecondCategory(self, response):
        selector = Selector(response)
        # pp(response.meta['caturl'])
        # pp('*****************=================**************')
        secondCategorys = selector.xpath('//div[@class="grid_9 omega featured-category-tags"]/div/a')
        # pp(secondCategorys)
        items = []
        for sc in secondCategorys:
            item = TutorialItem()
            item['catname'] = sc.xpath('./div/h4/text()').extract()[0]
            item['caturl'] = sc.xpath('./@href').extract()[0]
            items.append(item)
            reqUrl = self.domain + item['caturl']
            # pp(items)
            # pp(item)
            # pp(reqUrl)
            yield Request(url=reqUrl, meta={'caturl': reqUrl}, callback=self.getImages)

    def getImages(self, response):
        selector = Selector(response)
        # pp(response.meta['caturl'])
        # pp('*****************=================**************')
        # images = selector.xpath('//ul[@class="gifs freeform grid_12"]/div[position()=3]')
        images = selector.xpath('//*[contains(@class, "hoverable-gif")]')
        # images = selector.xpath('//ul[@class="gifs freeform grid_12"]//div[@class="hoverable-gif"]')
        # pp(len(images))
        items = []
        for image in images:
            item = TutorialItem()
            item['image_urls'] = image.xpath('./a/figure/img/@src').extract()[0]
            # item['imgName'] = image.xpath('./a/figure/img/@alt').extract()[0]
            items.append(item)
            # pp(item)
        # pp(items)
        # pp('==============************==============')
        # pp(items)
        # items = [{'images': "hello world"}]
        return items
In addition, there are no errors in the output; it just looks like the following:
2014-12-21 13:49:56+0800 [scrapy] INFO: Enabled item pipelines: TutorialPipeline
2014-12-21 13:49:56+0800 [baidu] INFO: Spider opened
2014-12-21 13:49:56+0800 [baidu] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2014-12-21 13:49:56+0800 [scrapy] DEBUG: Telnet console listening on 0.0.0.0:6023
2014-12-21 13:49:56+0800 [scrapy] DEBUG: Web service listening on 0.0.0.0:6080
2014-12-21 13:50:07+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com/categories> (referer: None)
2014-12-21 13:50:08+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/science/> (referer: http://giphy.com/categories)
2014-12-21 13:50:08+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/sports/> (referer: http://giphy.com/categories)
2014-12-21 13:50:08+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/news-politics/> (referer: http://giphy.com/categories)
2014-12-21 13:50:09+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/transportation/> (referer: http://giphy.com/categories)
2014-12-21 13:50:09+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/interests/> (referer: http://giphy.com/categories)
2014-12-21 13:50:09+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/memes/> (referer: http://giphy.com/categories)
2014-12-21 13:50:09+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/tv/> (referer: http://giphy.com/categories)
2014-12-21 13:50:09+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/gaming/> (referer: http://giphy.com/categories)
2014-12-21 13:50:10+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/nature/> (referer: http://giphy.com/categories)
2014-12-21 13:50:10+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/emotions/> (referer: http://giphy.com/categories)
2014-12-21 13:50:10+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/movies/> (referer: http://giphy.com/categories)
2014-12-21 13:50:10+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/holiday/> (referer: http://giphy.com/categories)
2014-12-21 13:50:11+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/reactions/> (referer: http://giphy.com/categories)
2014-12-21 13:50:11+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/music/> (referer: http://giphy.com/categories)
2014-12-21 13:50:11+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com//categories/decades/> (referer: http://giphy.com/categories)
2014-12-21 13:50:12+0800 [baidu] DEBUG: Crawled (200) <GET http://giphy.com/search/the-colbert-report/> (referer: http://giphy.com//categories/news-politics/)
2014-12-21 13:50:12+0800 [baidu] DEBUG: Scraped from <200 http://giphy.com/search/the-colbert-report/>
{'image_urls': u'http://media1.giphy.com/media/2BDLDXFaEiuBy/200_s.gif'}
2014-12-21 13:50:12+0800 [baidu] DEBUG: Scraped from <200 http://giphy.com/search/the-colbert-report/>
{'image_urls': u'http://media2.giphy.com/media/WisjAI5QGgsrC/200_s.gif'}
2014-12-21 13:50:12+0800 [baidu] DEBUG: Scraped from <200 http://giphy.com/search/the-colbert-report/>
{'image_urls': u'http://media3.giphy.com/media/ZgDGEMihlZXCo/200_s.gif'}
.............
As far as I see it, there is no need for you to override the ImagesPipeline, because you are not modifying its behavior. But, since you are doing it, you should do it properly.
When overriding ImagesPipeline, two methods should be overridden:
get_media_requests(item, info) should return a Request for every URL in image_urls. This part you have done correctly.
item_completed(results, item, info) is called when all image requests for a single item have completed (either finished downloading, or failed for some reason). From the official documentation:
The item_completed() method must return the output that will be sent
to subsequent item pipeline stages, so you must return (or drop) the
item, as you would in any pipeline.
So, to make your custom images pipeline work, you need to override the item_completed() method, like this:
def item_completed(self, results, item, info):
    image_paths = [x['path'] for ok, x in results if ok]
    if not image_paths:
        raise DropItem("Item contains no images")
    item['image_paths'] = image_paths
    return item
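Putting the two overrides together, the whole pipeline could look roughly like this. This is an untested sketch, not code from the question: note that it subclasses ImagesPipeline rather than object (so the actual download machinery is inherited), that item['image_urls'] is expected to be a list of URLs, and that storing image_paths assumes an image_paths = Field() is added to TutorialItem.
from scrapy.contrib.pipeline.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy.http import Request

class TutorialPipeline(ImagesPipeline):
    # subclassing ImagesPipeline keeps its download machinery; we only
    # customize which requests are made and what happens afterwards

    def get_media_requests(self, item, info):
        # one download request per collected image URL
        for image_url in item['image_urls']:
            yield Request(image_url)

    def item_completed(self, results, item, info):
        # keep only the successfully downloaded files
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        # assumes an image_paths = Field() is added to TutorialItem
        item['image_paths'] = image_paths
        return item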
Further on, there are other issues in your code that keep it from working as expected:
You are not actually creating any useful items.
If you take a look at your parse() and getSecondCategory() functions, you will notice that you are neither returning nor yielding any items. Although you seem to have prepared the items list, which you obviously wanted to use to store your items, it is never used to actually pass the items further down the processing path. At one point, you just yield a Request for the next page, and when the function finishes, your items are deleted.
You are not using the caturl info that you are passing via the meta dictionary. You are passing this info both in parse() and getSecondCategory(), but you never collect it in the callback function. Thus it is also being ignored.
So, the only thing that is basically going to work is the images pipeline, if you fix it as I already suggested. In order to fix these issues in your code, follow the guidelines below (please keep in mind that this is not tested, it is just a guideline for your consideration):
def parse(self, response):
    selector = Selector(response)
    topCategorys = selector.xpath('//div[@id="None-list"]/a')
    for tc in topCategorys:
        # no need to create the item just yet,
        # only get the category and the url so we can
        # continue the work in our callback
        catname = tc.xpath('./text()').extract()[0]
        caturl = tc.xpath('./@href').extract()[0]
        if catname == u'ALL':
            continue
        reqUrl = self.domain + '/' + caturl
        # pass the category name in the meta so we can retrieve it
        # from the response in the callback function
        yield Request(url=reqUrl, meta={'catname': catname},
                      callback=self.getSecondCategory)

def getSecondCategory(self, response):
    selector = Selector(response)
    secondCategorys = selector.xpath('//div[@class="grid_9 omega featured-category-tags"]/div/a')
    # retrieve the category name from the response
    # meta dictionary, which was copied from our request
    catname = response.meta['catname']
    for sc in secondCategorys:
        # still no need to create the item,
        # since we are just trying to get to
        # the subcategory
        subcatname = sc.xpath('./div/h4/text()').extract()[0]
        subcaturl = sc.xpath('./@href').extract()[0]
        reqUrl = self.domain + '/' + subcaturl
        # this time pass both the category and the subcategory
        # so we can read them both in the callback function
        yield Request(url=reqUrl, meta={'catname': catname, 'subcatname': subcatname},
                      callback=self.getImages)

def getImages(self, response):
    selector = Selector(response)
    # retrieve the category and subcategory name
    catname = response.meta['catname']
    subcatname = response.meta['subcatname']
    images = selector.xpath('//*[contains(@class, "hoverable-gif")]')
    for image in images:
        # now could be a good time to create the items
        item = TutorialItem()
        # fill the item's category information. You can concatenate
        # the category and subcategory if you like, or you can
        # add another field in your TutorialItem called subcatname
        item['catname'] = catname + ":" + subcatname
        # or alternatively:
        # item['catname'] = catname
        # item['subcatname'] = subcatname
        # image_urls must be a list of URLs for the images pipeline,
        # so keep the list from extract() instead of taking element [0]
        item['image_urls'] = image.xpath('./a/figure/img/@src').extract()
        # no need to store the items in a list to return
        # later, we can just yield the items as they are created
        yield item
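As for the Mongo part of the question, which is not addressed above: a simple structure is one document per scraped item, carrying the category, subcategory, image URL(s) and local path(s). Below is a rough, untested sketch of an extra pipeline stage using pymongo; the MongoPipeline class, database and collection names are assumptions, not existing project code. It would be registered in ITEM_PIPELINES with a number higher than the images pipeline, so image_paths is already filled in when it runs.
import pymongo

class MongoPipeline(object):
    # writes one MongoDB document per scraped item

    def open_spider(self, spider):
        self.client = pymongo.MongoClient('localhost', 27017)
        self.db = self.client['tutorial']

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        self.db['images'].insert_one({   # use .insert(...) on pymongo 2.x
            'catname': item.get('catname'),
            'subcatname': item.get('subcatname'),
            'image_urls': item.get('image_urls'),
            'image_paths': item.get('image_paths'),
        })
        return item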
I got the error shown in the log below ("HTTP status code is not handled or not allowed", a 501). I searched through SO and found only 403 or 404 errors discussed.
Here is some of what I tried to make it work:
edited the user_agent to my own string
checked Reddit's robots.txt, but there is no disallow matching my crawl parameters
tried to tweak the URL and domains (didn't work)
ran scrapy shell: same error, but no explanation
the website works in my browser on my laptop
checked HTTP 501: it refers to "not implemented", i.e. the server does not support the functionality
And the error log:
2019-04-07 17:34:00 [scrapy.middleware] INFO: Enabled item pipelines:
[]
2019-04-07 17:34:00 [scrapy.core.engine] INFO: Spider opened
2019-04-07 17:34:00 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2019-04-07 17:34:00 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2019-04-07 17:34:00 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://www.reddit.com/robots.txt> from <GET http://www.reddit.com/robots.txt>
2019-04-07 17:34:00 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.reddit.com/robots.txt> (referer: None)
2019-04-07 17:34:00 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://www.reddit.com/r/gameofthrones//> from <GET http://www.reddit.com/r/gameofthrones//>
2019-04-07 17:34:01 [scrapy.core.engine] DEBUG: Crawled (501) <GET https://www.reddit.com/r/gameofthrones//> (referer: None)
2019-04-07 17:34:01 [scrapy.spidermiddlewares.httperror] INFO: Ignoring response **<501 https://www.reddit.com/r/gameofthrones//>: HTTP status code is not handled or not allowed**
2019-04-07 17:34:01 [scrapy.core.engine] INFO: Closing spider (finished)
Code:
import scrapy

class RedditbotSpider(scrapy.Spider):
    name = "redditgetter"
    allowed_domains = ['reddit.com']
    start_urls = ['http://www.reddit.com/r/gameofthrones/']
    custom_settings = {
        'FEED_URI': 'tmp/redditdata.csv'
    }

    def parse(self, response):
        titles = response.css('.title.may-blank::text').extract()
        votes = response.css('.score.unvoted::text').extract()
        times = response.css('time::attr(title)').extract()
        for item in zip(titles, votes, times):
            scraped_info = {
                'title': item[0],
                'vote': item[1],
                'created_at': item[2],
            }
            yield scraped_info
UPDATE: the new log after the indentation and the "//" were corrected.
2019-04-07 23:00:44 [scrapy.core.engine] INFO: Spider opened
2019-04-07 23:00:44 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2019-04-07 23:00:44 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2019-04-07 23:00:44 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://www.reddit.com/robots.txt> from <GET http://www.reddit.com/robots.txt>
2019-04-07 23:00:45 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.reddit.com/robots.txt> (referer: None)
2019-04-07 23:00:45 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://www.reddit.com/r/gameofthrones/> from <GET http://www.reddit.com/r/gameofthrones/>
2019-04-07 23:00:47 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.reddit.com/r/gameofthrones/> (referer: None)
2019-04-07 23:00:47 [scrapy.core.engine] INFO: Closing spider (finished)
2019-04-07 23:00:47 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 1287,
'downloader/request_count': 4,
'downloader/request_method_count/GET': 4,
'downloader/response_bytes': 101351,
'downloader/response_count': 4,
'downloader/response_status_count/200': 2,
'downloader/response_status_count/301': 2,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2019, 4, 7, 15, 0, 47, 452284),
'log_count/DEBUG': 4,
'log_count/INFO': 9,
'memusage/max': 50294784,
'memusage/startup': 50290688,
'response_received_count': 2,
'robotstxt/request_count': 1,
'robotstxt/response_count': 1,
'robotstxt/response_status_count/200': 1,
'scheduler/dequeued': 2,
'scheduler/dequeued/memory': 2,
'scheduler/enqueued': 2,
'scheduler/enqueued/memory': 2,
'start_time': datetime.datetime(2019, 4, 7, 15, 0, 44, 502445)}
2019-04-07 23:00:47 [scrapy.core.engine] INFO: Spider closed (finished)
This issue has nothing to do with your code implementation, it's just a typo in the URL. I was able to reproduce the issue (501 status code) by sending a GET request to https://www.reddit.com/r/gameofthrones// via https://reqbin.com/u4xxuehu
You simply need to remove the extra slash at the end of the URL to receive a 200 (OK) status: https://reqbin.com/43le52w4
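If you want to guard against this kind of typo in general, you can normalize the start URLs before the crawl begins; a small sketch (not required for the fix itself):
# collapse any accidental trailing slashes down to exactly one
raw_urls = ['http://www.reddit.com/r/gameofthrones//']
start_urls = [u.rstrip('/') + '/' for u in raw_urls]
# start_urls is now ['http://www.reddit.com/r/gameofthrones/']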
Edit regarding reddit scraping: your code doesn't have an error; it's successfully getting a response of 101351 bytes (you can check this yourself by printing response.body in the parse() method): 'downloader/response_bytes': 101351.
The problem is in the way that you're parsing the response. You're using CSS selectors (e.g. response.css('.title.may-blank::text')) that aren't returning anything because there are no elements with such classes in the HTML (check the source of the web page in the browser and look for the elements that you're trying to select). In fact, you'll notice that Reddit takes anti-crawling measures by assigning obscure classes to HTML elements.
However, upon inspecting the source code, it turns out that the end of the page contains JSON data wrapped in <script id="data"></script>, which includes information about posts (title, upvotes, etc). The information that you want to extract is structured as follows:
posts
└── models
└── postId
├── title
├── score
└── created
You can simply load and parse the JSON data to get the desired fields. Here's a working implementation of the parse() method:
def parse(self, response):
    # requires "import json" and "import re" at the top of the spider file
    # get the contents inside <script id="data"></script>
    data = response.css('#data::text').get()
    # remove anything before { and after } to get valid JSON
    data = re.findall(r"{.*}", str(data), re.MULTILINE)[0]
    jsonresponse = json.loads(data)
    titles = []
    votes = []
    times = []
    for post in jsonresponse["posts"]["models"]:
        titles.append(jsonresponse["posts"]["models"][post]["title"])
        votes.append(jsonresponse["posts"]["models"][post]["score"])
        times.append(jsonresponse["posts"]["models"][post]["created"])
    for item in zip(titles, votes, times):
        scraped_info = {
            'title': item[0],
            'vote': item[1],
            'created_at': item[2],
        }
        yield scraped_info
Sample output:
[scrapy.core.scraper] DEBUG: Scraped from <200 https://www.reddit.com/r/gameofthrones/>
{'title': '[NO SPOILERS] GoT this viewing party invite today. What do you think?', 'vote': 133, 'created_at': 1554610745000}
Demo: https://repl.it/#glhr/55557800
I am trying to get volume-weighted average prices for stocks from the moneycontrol.com website. The parse function runs without any issues, but the parse_links function is not getting called. Am I missing something here?
# -*- coding: utf-8 -*-
import scrapy

class MoneycontrolSpider(scrapy.Spider):
    name = "moneycontrol"
    allowed_domains = ["https://www.moneycontrol.com"]
    start_urls = ["https://www.moneycontrol.com/india/stockpricequote"]

    def parse(self, response):
        for link in response.css('td.last > a::attr(href)').extract():
            if link:
                yield scrapy.Request(link, callback=self.parse_links, method='GET')

    def parse_links(self, response):
        VWAP = response.xpath('//*[@id="n_vwap_val"]/text()').extract_first()
        print(VWAP)
        with open('quotes.txt', 'a+') as f:
            f.write('VWAP: {}'.format(VWAP) + '\n')
If you read the log output, the error becomes obvious.
2018-09-08 19:52:38 [py.warnings] WARNING: c:\program files\python37\lib\site-packages\scrapy\spidermiddlewares\offsite.py:59: URLWarning: allowed_domains accepts only domains, not URLs. Ignoring URL entry https://www.moneycontrol.com in allowed_domains.
warnings.warn("allowed_domains accepts only domains, not URLs. Ignoring URL entry %s in allowed_domains." % domain, URLWarning)
2018-09-08 19:52:38 [scrapy.extensions.telnet] DEBUG: Telnet console listening on 127.0.0.1:6023
2018-09-08 19:52:39 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.moneycontrol.com/india/stockpricequote> (referer: None)
2018-09-08 19:52:40 [scrapy.spidermiddlewares.offsite] DEBUG: Filtered offsite request to 'www.moneycontrol.com': <GET http://www.moneycontrol.com/india/stockpricequote/chemicals/aartiindustries/AI45>
So just fix your allowed_domains, and you should be fine:
allowed_domains = ["moneycontrol.com"]
I read the documentation on the SitemapSpider class over here: https://scrapy.readthedocs.io/en/latest/topics/spiders.html#sitemapspider
Here's my code:
import scrapy
from scrapy.http import Request

class CurrentHarvestSpider(scrapy.spiders.SitemapSpider):
    name = "newegg"
    allowed_domains = ["newegg.com"]
    sitemap_urls = ['http://www.newegg.com/Siteindex_USA.xml']
    # if I comment this out, then the parse function should be called by default for every link, but it doesn't
    sitemap_rules = [('/Product', 'parse_product_url'), ('product', 'parse_product_url')]
    sitemap_follow = ['/newegg_sitemap_product', '/Product']

    def parse(self, response):
        with open("/home/dan/debug/newegg_crawler.log", "a") as log:
            log.write("logging from parse " + response.url)
        self.this_function_does_not_exist()
        yield Request(response.url, callback=self.some_callback)

    def some_callback(self, response):
        with open("/home/dan/debug/newegg_crawler.log", "a") as log:
            log.write("logging from some_callback " + response.url)
        self.this_function_does_not_exist()

    def parse_product_url(self, response):
        with open("/home/dan/debug/newegg_crawler.log", "a") as log:
            log.write("logging from parse_product_url" + response.url)
        self.this_function_does_not_exist()
This can be run successfully with scrapy installed.
Run pip install scrapy to get scrapy and execute with scrapy crawl newegg from the working directory.
My question is, why aren't any of these callbacks being called? The documentation claims that the callback defined in sitemap_rules should be called. If I comment it out, then parse() should be called by default, but it still doesn't get called. Are the docs just 100% wrong? I'm checking the log file that I set up, and nothing is being written. I've even set the permissions on the file to 777. Also, I'm calling a non-existent function, which should cause an error and prove that the functions are not being called, but no error occurs. What am I doing wrong?
When I run your spider, this is what I get on the console:
$ scrapy runspider op.py
2016-11-09 21:34:51 [scrapy] INFO: Scrapy 1.2.1 started (bot: scrapybot)
(...)
2016-11-09 21:34:51 [scrapy] INFO: Spider opened
2016-11-09 21:34:51 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2016-11-09 21:34:51 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6023
2016-11-09 21:34:51 [scrapy] DEBUG: Crawled (200) <GET http://www.newegg.com/Siteindex_USA.xml> (referer: None)
2016-11-09 21:34:53 [scrapy] DEBUG: Crawled (200) <GET http://www.newegg.com/Sitemap/USA/newegg_sitemap_product01.xml.gz> (referer: http://www.newegg.com/Siteindex_USA.xml)
2016-11-09 21:34:53 [scrapy] ERROR: Spider error processing <GET http://www.newegg.com/Sitemap/USA/newegg_sitemap_product01.xml.gz> (referer: http://www.newegg.com/Siteindex_USA.xml)
Traceback (most recent call last):
File "/home/paul/.virtualenvs/scrapy12/local/lib/python2.7/site-packages/scrapy/utils/defer.py", line 102, in iter_errback
yield next(it)
File "/home/paul/.virtualenvs/scrapy12/local/lib/python2.7/site-packages/scrapy/spidermiddlewares/offsite.py", line 29, in process_spider_output
for x in result:
File "/home/paul/.virtualenvs/scrapy12/local/lib/python2.7/site-packages/scrapy/spidermiddlewares/referer.py", line 22, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/home/paul/.virtualenvs/scrapy12/local/lib/python2.7/site-packages/scrapy/spidermiddlewares/urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "/home/paul/.virtualenvs/scrapy12/local/lib/python2.7/site-packages/scrapy/spidermiddlewares/depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "/home/paul/.virtualenvs/scrapy12/local/lib/python2.7/site-packages/scrapy/spiders/sitemap.py", line 44, in _parse_sitemap
s = Sitemap(body)
File "/home/paul/.virtualenvs/scrapy12/local/lib/python2.7/site-packages/scrapy/utils/sitemap.py", line 17, in __init__
rt = self._root.tag
AttributeError: 'NoneType' object has no attribute 'tag'
You've probably noticed the AttributeError exception.
So scrapy is saying it has trouble parsing the sitemap response body.
And if scrapy cannot understand the sitemap content, it cannot parse content as XML, hence cannot follow any <loc> URL and will therefore not call any callback since it found nothing.
So you've actually found a bug in scrapy (thanks for reporting): https://github.com/scrapy/scrapy/issues/2389
As for the bug itself,
The different sub-sitemaps, e.g. http://www.newegg.com/Sitemap/USA/newegg_sitemap_store01.xml.gz, are sent "on the wire" as gzipped .gz files (gzipped twice -- so the HTTP response needs to be gunzipped twice) to be parsed as XML correctly.
Scrapy does not handle this case, hence the exception printed out.
Here's a basic sitemap spider that tries to double-gunzip responses:
from scrapy.utils.gz import gunzip
import scrapy

class CurrentHarvestSpider(scrapy.spiders.SitemapSpider):
    name = "newegg"
    allowed_domains = ["newegg.com"]
    sitemap_urls = ['http://www.newegg.com/Siteindex_USA.xml']

    def parse(self, response):
        self.logger.info('parsing %r' % response.url)

    def _get_sitemap_body(self, response):
        body = super(CurrentHarvestSpider, self)._get_sitemap_body(response)
        self.logger.debug("body[:32]: %r" % body[:32])
        try:
            body_unzipped_again = gunzip(body)
            self.logger.debug("body_unzipped_again[:32]: %r" % body_unzipped_again[:100])
            return body_unzipped_again
        except:
            pass
        return body
And these are the logs showing that newegg's .xml.gz sitemaps indeed need gunzipping twice:
$ scrapy runspider spider.py
2016-11-09 13:10:56 [scrapy] INFO: Scrapy 1.2.1 started (bot: scrapybot)
(...)
2016-11-09 13:10:56 [scrapy] INFO: Spider opened
2016-11-09 13:10:56 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2016-11-09 13:10:56 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6023
2016-11-09 13:10:57 [scrapy] DEBUG: Crawled (200) <GET http://www.newegg.com/Siteindex_USA.xml> (referer: None)
2016-11-09 13:10:57 [newegg] DEBUG: body[:32]: '\xef\xbb\xbf<?xml version="1.0" encoding='
2016-11-09 13:10:57 [scrapy] DEBUG: Crawled (200) <GET http://www.newegg.com/Sitemap/USA/newegg_sitemap_store01.xml.gz> (referer: http://www.newegg.com/Siteindex_USA.xml)
2016-11-09 13:10:57 [newegg] DEBUG: body[:32]: '\x1f\x8b\x08\x08\xda\xef\x1eX\x00\x0bnewegg_sitemap_store01'
2016-11-09 13:10:57 [newegg] DEBUG: body_unzipped_again[:32]: '\xef\xbb\xbf<?xml version="1.0" encoding="utf-8"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"'
2016-11-09 13:10:57 [scrapy] DEBUG: Filtered duplicate request: <GET http://www.newegg.com/Hubs/SubCategory/ID-26> - no more duplicates will be shown (see DUPEFILTER_DEBUG to show all duplicates)
2016-11-09 13:10:59 [scrapy] DEBUG: Crawled (200) <GET http://www.newegg.com/Sitemap/USA/newegg_sitemap_product15.xml.gz> (referer: http://www.newegg.com/Siteindex_USA.xml)
2016-11-09 13:10:59 [newegg] DEBUG: body[:32]: '\x1f\x8b\x08\x08\xe3\xfa\x1eX\x00\x0bnewegg_sitemap_product'
2016-11-09 13:10:59 [newegg] DEBUG: body_unzipped_again[:32]: '\xef\xbb\xbf<?xml version="1.0" encoding="utf-8"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"'
(...)
2016-11-09 13:11:02 [scrapy] DEBUG: Crawled (200) <GET http://www.newegg.com/Product/Product.aspx?Item=9SIA04Y0766512> (referer: http://www.newegg.com/Sitemap/USA/newegg_sitemap_product15.xml.gz)
(...)
2016-11-09 13:11:02 [newegg] INFO: parsing 'http://www.newegg.com/Product/Product.aspx?Item=9SIA04Y0766512'
(...)
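If you would rather avoid the bare try/except in the sketch above, an alternative (equally untested) is to check for the gzip magic bytes before attempting the second decompression:
def _get_sitemap_body(self, response):
    body = super(CurrentHarvestSpider, self)._get_sitemap_body(response)
    # a gzip stream starts with the magic bytes 1f 8b; if they are still
    # present, the body we got back is compressed a second time
    if body and body[:2] == b'\x1f\x8b':
        body = gunzip(body)
    return body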
For some reason scrapy is parsing data from URLs matched by my deny rules:
I'm getting parsed data from URLs containing /browse/, /search/ and /ip/.
I'm not sure where this is going wrong.
Please advise, thanks! Please find my code below:
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from wallspider.items import Website

class mydomainSpider(CrawlSpider):
    name = "tp"
    allowed_domains = ["www.mydomain.com"]
    start_urls = ["http://www.mydomain.com",]

    """/tp/ page type to crawl"""
    rules = (
        Rule(SgmlLinkExtractor(
            allow=('/tp/', ),
            deny=(
                'browse/',
                'browse-ng.do?',
                'search-ng.do?',
                'facet=',
                'ip/',
                'page/'
                'search/',
                '/[1-9]$',
                '(bti=)[1-9]+(?:\.[1-9]*)?',
                '(sort_by=)[a-zA-Z]',
                '(sort_by=)[1-9]+(?:\.[1-9]*)?',
                '(ic=32_)[1-9]+(?:\.[1-9]*)?',
                '(ic=60_)[0-9]+(?:\.[0-9]*)?',
                '(search_sort=)[1-9]+(?:\.[1-9]*)?',
            ),
        ), callback="parse_items", follow=True),
    )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//html')
        items = []
        for site in sites:
            item = Website()
            item['referer'] = response.request.headers.get('Referer')
            item['url'] = response.url
            item['title'] = site.xpath('/html/head/title/text()').extract()
            item['description'] = site.select('//meta[@name="Description"]/@content').extract()
            items.append(item)
        return items
Part of my console log; it is grabbing /ip/ pages:
2013-12-11 11:21:43-0800 [tp] DEBUG: Crawled (200) <GET http://www.mydomain.com/ip/1104329> (referer: http://www.mydomain.com/tp/john-duigan)
2013-12-11 11:21:43-0800 [tp] DEBUG: Scraped from <200 http://www.mydomain.com/ip/1104329>
{'description': [u'Shop Low Prices on: Molly (Widescreen) : Movies'],
'referer': 'http://www.mydomain.com/tp/john-duigan',
'title': [u'Molly (Widescreen): Movies : mydomain.com '],
'url': 'http://www.mydomain.com/ip/1104329'}
2013-12-11 11:21:43-0800 [tp] DEBUG: Redirecting (302) to <GET http://www.mydomain.com/ip/17371019> from <GET http://www.mydomain.com/tp/jon-furmanski>
2013-12-11 11:21:43-0800 [tp] DEBUG: Redirecting (302) to <GET http://www.mydomain.com/ip/17371019> from <GET http://www.mydomain.com/tp/taylor-byrd>
2013-12-11 11:21:43-0800 [tp] DEBUG: Redirecting (302) to <GET http://www.mydomain.com/ip/17371019> from <GET http://www.mydomain.com/tp/greg-byers>
2013-12-11 11:21:43-0800 [tp] DEBUG: Redirecting (302) to <GET http://www.mydomain.com/ip/17371019> from <GET http://www.mydomain.com/tp/tom-bowker>
2013-12-11 11:21:43-0800 [tp] DEBUG: Crawled (200) <GET http://www.mydomain.com/ip/21152221> (referer: http://www.mydomain.com/tp/peter-levin)
2013-12-11 11:21:43-0800 [tp] DEBUG: Scraped from <200 http://www.mydomain.com/ip/21152221>
{'description': [u'Shop Low Prices on: Marva Collins Story (1981) : Video on Demand by VUDU'],
'referer': 'http://www.mydomain.com/tp/peter-levin',
'title': [u'Marva Collins Story (1981): Video on Demand by VUDU : mydomain.com '],
'url': 'http://www.mydomain.com/ip/21152221'}
The rules of your SgmlLinkExtractor apply when extracting links from pages. And in your case, some of your .../tp/... requests are being redirected to .../ip/... pages.
Redirecting (302) to <GET http://www.mydomain.com/ip/17371019> from <GET http://www.mydomain.com/tp/tom-bowker>
allow and deny patterns do not apply to URLs after redirections.
You could disable following redirections altogether by setting REDIRECT_ENABLED to False (see RedirectMiddleware).
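Sketched for reference (the second option is an addition, not part of the original answer):
# option 1: in settings.py, do not follow redirects at all
REDIRECT_ENABLED = False

# option 2: keep redirects, but drop redirected /ip/ pages in the callback
def parse_items(self, response):
    if '/ip/' in response.url:
        # reached through a 302 from a /tp/ link; skip it
        return []
    # ... build items as before for genuine /tp/ pages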
I found out what was wrong: the pages were redirecting to a page type that was in my deny rules. Thank you for all your help! I appreciate it!
I have the following class methods in a scrapy spider. parse_category yields a Request object that has a callback to parse_product. Sometimes a category page redirects to a product page, so here I detect whether a category page is in fact a product page. If it is, I just call the parse_product method. But for some reason it does not call the method.
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    anchors = hxs.select('//div[@id="panelMfr"]/div/ul/li[position() != last()]/a')
    for anchor in anchors[2:3]:
        url = anchor.select('@href').extract().pop()
        cat = anchor.select('text()').extract().pop().strip()
        yield Request(urljoin(get_base_url(response), url), callback=self.parse_category, meta={"category": cat})

def parse_category(self, response):
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)

    # check if its a redirected product page
    if hxs.select(self.product_name_xpath):
        self.log("Category-To-Product Redirection")
        self.parse_product(response)  # <<---- This line is not called.
        self.log("Product Parsed")
        return

    products_xpath = '//div[@class="productName"]/a/@href'
    products = hxs.select(products_xpath).extract()
    for url in products:
        yield Request(urljoin(base_url, url), callback=self.parse_product, meta={"category": response.meta['category']})

    next_page = hxs.select('//table[@class="nav-back"]/tr/td/span/a[contains(text(), "Next")]/text()').extract()
    if next_page:
        url = next_page[0]
        yield Request(urljoin(base_url, url), callback=self.parse_category, meta={"category": response.meta['category']})

def parse_product(self, response):
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    self.log("Inside parse_product")
In the log I see that Category-To-Product Redirection and Product Parsed are printed, but Inside parse_product is missing. What did I do wrong here?
2013-07-12 21:31:34+0100 [example.com] DEBUG: Crawled (200) <GET http://www.example.com/category.aspx> (referer: None)
2013-07-12 21:31:34+0100 [example.com] DEBUG: Redirecting (302) to <GET http://www.example.com/productinfo.aspx?catref=AM6901> from <GET http://www.example.com/products/Inks-Toners/Apple>
2013-07-12 21:31:35+0100 [example.com] DEBUG: Crawled (200) <GET http://www.example.com/productinfo.aspx?catref=AM6901> (referer: http://www.example.com/category.aspx)
2013-07-12 21:31:35+0100 [example.com] DEBUG: Category-To-Product Redirection
2013-07-12 21:31:35+0100 [example.com] DEBUG: Product Parsed
2013-07-12 21:31:35+0100 [example.com] INFO: Closing spider (finished)
2013-07-12 21:31:35+0100 [-] ERROR: ERROR:root:SPIDER CLOSED: No. of products: 0
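No answer is included for this last question; one hedged guess, based only on the excerpt above (parse_product may be truncated here): if the full parse_product contains yield statements, then calling self.parse_product(response) only creates a generator object and never runs its body, which would explain the missing log line. The usual pattern in that case is to re-yield its results from parse_category, roughly:
# inside parse_category, replacing the direct call
if hxs.select(self.product_name_xpath):
    self.log("Category-To-Product Redirection")
    for result in self.parse_product(response) or []:
        yield result  # actually runs parse_product, even when it is a generator
    self.log("Product Parsed")
    return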