I added restrict_xpaths rules to my scrapy spider and now it immediately fails with:
2015-03-16 15:46:53+0000 [tsr] ERROR: Spider error processing <GET http://www.thestudentroom.co.uk/forumdisplay.php?f=143>
Traceback (most recent call last):
File "/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/twisted/internet/base.py", line 800, in runUntilCurrent
call.func(*call.args, **call.kw)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/twisted/internet/task.py", line 602, in _tick
taskObj._oneWorkUnit()
File "/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/twisted/internet/task.py", line 479, in _oneWorkUnit
result = self._iterator.next()
File "/Library/Python/2.7/site-packages/scrapy/utils/defer.py", line 57, in <genexpr>
work = (callable(elem, *args, **named) for elem in iterable)
--- <exception caught here> ---
File "/Library/Python/2.7/site-packages/scrapy/utils/defer.py", line 96, in iter_errback
yield next(it)
File "/Library/Python/2.7/site-packages/scrapy/contrib/spidermiddleware/offsite.py", line 26, in process_spider_output
for x in result:
File "/Library/Python/2.7/site-packages/scrapy/contrib/spidermiddleware/referer.py", line 22, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/Library/Python/2.7/site-packages/scrapy/contrib/spidermiddleware/urllength.py", line 33, in <genexpr>
return (r for r in result or () if _filter(r))
File "/Library/Python/2.7/site-packages/scrapy/contrib/spidermiddleware/depth.py", line 50, in <genexpr>
return (r for r in result or () if _filter(r))
File "/Library/Python/2.7/site-packages/scrapy/contrib/spiders/crawl.py", line 73, in _parse_response
for request_or_item in self._requests_to_follow(response):
File "/Library/Python/2.7/site-packages/scrapy/contrib/spiders/crawl.py", line 52, in _requests_to_follow
links = [l for l in rule.link_extractor.extract_links(response) if l not in seen]
File "/Library/Python/2.7/site-packages/scrapy/contrib/linkextractors/lxmlhtml.py", line 107, in extract_links
links = self._extract_links(doc, response.url, response.encoding, base_url)
File "/Library/Python/2.7/site-packages/scrapy/linkextractor.py", line 94, in _extract_links
return self.link_extractor._extract_links(*args, **kwargs)
File "/Library/Python/2.7/site-packages/scrapy/contrib/linkextractors/lxmlhtml.py", line 50, in _extract_links
for el, attr, attr_val in self._iter_links(selector._root):
**File "/Library/Python/2.7/site-packages/scrapy/contrib/linkextractors/lxmlhtml.py", line 38, in _iter_links
for el in document.iter(etree.Element):
exceptions.AttributeError: 'str' object has no attribute 'iter'**
I cannot understand why this error is happening.
Here is my short Spider:
import scrapy
from tutorial.items import DmozItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
class TsrSpider(CrawlSpider):
name = 'tsr'
allowed_domains = ['thestudentroom.co.uk']
start_urls = ['http://www.thestudentroom.co.uk/forumdisplay.php?f=143']
download_delay = 4
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:35.0) Gecko/20100101 Firefox/35.0'
rules = (
Rule(
LinkExtractor(
allow=('forumdisplay\.php\?f=143\&page=\d',),
restrict_xpaths=("//li[#class='pager-page_numbers']/a/#href",))),
Rule(
LinkExtractor(
allow=('showthread\.php\?t=\d+\&page=\d+',),
restrict_xpaths=("//li[#class='pager-page_numbers']/a/#href",)),
callback='parse_link'),
Rule(
LinkExtractor(
allow=('showthread\.php\?t=\d+',),
restrict_xpaths=("//tr[#class='thread unread ']",)),
callback='parse_link'),
)
def parse_link(self, response):
# Iterate over posts.
for sel in response.xpath("//li[#class='post threadpost old ']"):
rating = sel.xpath(
"div[#class='post-footer']//span[#class='score']/text()").extract()
if not rating:
rating = 0
else:
rating = rating[0]
item = DmozItem()
item['post'] = sel.xpath(
"div[#class='post-content']/blockquote[#class='postcontent restore']/text()").extract()
item['link'] = response.url
item['topic'] = response.xpath(
"//div[#class='forum-header section-header']/h1/span/text()").extract()
item['rating'] = rating
yield item
source: http://pastebin.com/YXdWvPgX
Can someone help me out? Where is the mistake? I've been searching for days!?
The problem is that restrict_xpaths should point to elements (either the links themselves or regions containing the links), not to attributes:
rules = [
Rule(LinkExtractor(allow='forumdisplay\.php\?f=143\&page=\d',
restrict_xpaths="//li[#class='pager-page_numbers']/a")),
Rule(LinkExtractor(allow='showthread\.php\?t=\d+\&page=\d+',
restrict_xpaths="//li[#class='pager-page_numbers']/a"),
callback='parse_link'),
Rule(LinkExtractor(allow='showthread\.php\?t=\d+',
restrict_xpaths="//tr[#class='thread unread ']"),
callback='parse_link'),
]
Tested (worked for me).
FYI, Scrapy defines restrict_xpaths as "expressions pointing to regions":
restrict_xpaths (str or list) – is a XPath (or list of XPath’s) which
defines regions inside the response where links should be extracted
from. If given, only the text selected by those XPath will be scanned
for links. See examples below.
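As a quick sanity check, here is a hedged sketch of how this can be verified from scrapy shell (same page and class names as in the question; the variable names are made up):

from scrapy.contrib.linkextractors import LinkExtractor
# start with: scrapy shell "http://www.thestudentroom.co.uk/forumdisplay.php?f=143"
# so that `response` is available
le = LinkExtractor(
    allow=(r'forumdisplay\.php\?f=143&page=\d',),
    restrict_xpaths=("//li[@class='pager-page_numbers']/a",))  # an element, not @href
print(le.extract_links(response))
# Pointing restrict_xpaths at .../a/@href hands lxml plain strings instead of
# elements, which is exactly what raises "'str' object has no attribute 'iter'".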
Related
Intention / expected behaviour
Return the text of the links from page: https://www.bezrealitky.cz/vypis/nabidka-prodej/byt/praha
In CSV format and in the shell.
Error
I get a KeyError: 'title', even though I have defined the key in the item.py itemloader.
Full Traceback
Traceback (most recent call last):
File "C:\Users\phili\Anaconda3\envs\py35\lib\site-packages\scrapy\utils\defer.py", line 102, in iter_errback
yield next(it)
File "C:\Users\phili\Anaconda3\envs\py35\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 29, in process_spider_output
for x in result:
File "C:\Users\phili\Anaconda3\envs\py35\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 22, in <genexpr>
return (_set_referer(r) for r in result or ())
File "C:\Users\phili\Anaconda3\envs\py35\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "C:\Users\phili\Anaconda3\envs\py35\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "C:\Users\phili\Documents\Python Scripts\Scrapy Spiders\bezrealitky\bezrealitky\spiders\bezrealitky_spider.py", line 33, in parse
yield loader.load_item()
File "C:\Users\phili\Anaconda3\envs\py35\lib\site-packages\scrapy\loader\__init__.py", line 115, in load_item
value = self.get_output_value(field_name)
File "C:\Users\phili\Anaconda3\envs\py35\lib\site-packages\scrapy\loader\__init__.py", line 122, in get_output_value
proc = self.get_output_processor(field_name)
File "C:\Users\phili\Anaconda3\envs\py35\lib\site-packages\scrapy\loader\__init__.py", line 144, in get_output_processor
self.default_output_processor)
File "C:\Users\phili\Anaconda3\envs\py35\lib\site-packages\scrapy\loader\__init__.py", line 154, in _get_item_field_attr
value = self.item.fields[field_name].get(key, default)
KeyError: 'title'
Spider.py
def parse(self, response):
for records in response.xpath('//*[starts-with(@class,"record")]'):
loader = BaseItemLoader(selector=records)
loader.add_xpath('title', './/div[@class="details"]/h2/a[@href]/text()')
yield loader.load_item()
Item.py - Itemloader
class BaseItemLoader(ItemLoader):
title_in = MapCompose(unidecode)
Conclusion
I am a bit at a loss, as I think I followed the Scrapy manual: I defined the item loader and the field's input processor via "title_in", but when I yield the loaded item I get the KeyError. I checked in the shell that the XPath returns the text I want, so at least that part works. Hoping to get some help!
Even if you use an ItemLoader, you still have to define an Item class first and then pass it to the item loader, either by declaring it as the loader's property:
class CustomItemLoader(ItemLoader):
default_item_class = MyItem
or by passing an instance of it to the loader's constructor:
l = CustomItemLoader(item=MyItem())
Otherwise the item loader knows nothing about the item and its fields.
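A minimal, self-contained sketch of that wiring (the item name MyItem is hypothetical, and str.strip merely stands in for the unidecode processor used in the question):

import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst

class MyItem(scrapy.Item):
    # every field the loader touches must be declared here, otherwise
    # load_item() raises KeyError for that field name
    title = scrapy.Field()

class BaseItemLoader(ItemLoader):
    default_item_class = MyItem            # the loader now knows which item to build
    default_output_processor = TakeFirst()
    title_in = MapCompose(str.strip)       # placeholder for MapCompose(unidecode)

# inside the spider callback:
#   loader = BaseItemLoader(selector=records)
#   loader.add_xpath('title', './/div[@class="details"]/h2/a[@href]/text()')
#   yield loader.load_item()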
I do not quite get why I keep getting an IndexError that says "list index out of range" here. Is it because the list of links is empty? And what is the correct way to build the link here?
class POSpider(CrawlSpider):
name = 'po'
start_urls = ['https://www.poets.org/poetsorg/poems']
allowed_domains = ['poets.org/poetsorg/poems']
def parse(self, response):
items=[]
l=response.xpath('//*[#class="themes"]//a//#href').extract()
theme_ids=[]
for item in l:
theme_ids.append(item[855:1412])
theme_urls=[]
for tid in theme_ids:
theme_urls.append('https://www.poets.org/poetsorg/poems?field_occasion_tid=All&field_poem_themes_tid=' + tid)
for link in theme_urls:
request=scrapy.Request(link,callback=self.parse_layer2,dont_filter=True)
yield request
def parse_layer2(self,response):
items=[]
p=response.xpath('//*[#id="block-views-poems-poems-block-all"]/div/div//tbody//td[2]//#href')[-1].extract()
poem_urls=[]
for item in p:
poem_urls.append(item)
for link in poem_urls:
request=scrapy.Request(link,callback=self.parse_layer3,dont_filter=True)
yield request
def parse_layer3(self,response):
items=[]
poems=response.xpath('//*[#id="poem-content"]/div[2]/div/div').extract()
for poem in poems:
item=PoetryItem()
s=poem.xpath('*/p/text()').extract()
t=strip_list(s)
t=t.encode('ascii','replace').lower()+'\r\n'
item['poem']=t
items.append(item)
return items
and this is what I keep getting as a result:
Traceback (most recent call last):
File "//anaconda/lib/python2.7/site-packages/scrapy/utils/defer.py", line 102, in iter_errback
yield next(it)
File "//anaconda/lib/python2.7/site-packages/scrapy/spidermiddlewares/offsite.py", line 29, in process_spider_output
for x in result:
File "//anaconda/lib/python2.7/site-packages/scrapy/spidermiddlewares/referer.py", line 22, in <genexpr>
return (_set_referer(r) for r in result or ())
File "//anaconda/lib/python2.7/site-packages/scrapy/spidermiddlewares/urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "//anaconda/lib/python2.7/site-packages/scrapy/spidermiddlewares/depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "/Users/chinouhane/Desktop/Org/Org/spiders/PO_spider.py", line 37, in parse_layer2
p=response.xpath('//*[@id="block-views-poems-poems-block-all"]/div/div//tbody//td[2]//@href')[-1].extract()
File "//anaconda/lib/python2.7/site-packages/parsel/selector.py", line 56, in __getitem__
o = super(SelectorList, self).__getitem__(pos)
IndexError: list index out of range
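The failing line can be reproduced in isolation: an XPath that matches nothing returns an empty SelectorList, and indexing it with [-1] is exactly what raises the IndexError. A hypothetical minimal example (not the site's real markup):

from parsel import Selector

sel = Selector(text='<html><body><p>no links here</p></body></html>')
empty = sel.xpath('//td[2]//@href')      # matches nothing in this document
print(len(empty))                        # 0
# empty[-1]                              # IndexError: list index out of range

# a guarded version extracts first and only indexes a non-empty list
hrefs = empty.extract()
last_href = hrefs[-1] if hrefs else None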
I've been using scrapy for over a year now with a script that someone else wrote for me. It was working great for over a year, until 6-8 weeks ago when it started giving me the following error whenever I try to download. Does anyone have any ideas?
I am running this on Ubuntu 14.04 LTS.
Command: scrapy crawl googleplay
2015-08-30 13:10:31-0400 [googleplay] ERROR: Spider error processing <GET https://accounts.google.com/ServiceLogin?continue=https%3A%2F%2Fplay.google.com%2Fstore%2Fapps%2Fcategory%2FGAME&followup=https%3A%2F%2Fplay.google.com%2Fstore%2Fapps%2Fcategory%2FGAME&passive=1209600&service=googleplay>
Traceback (most recent call last):
File "/usr/lib/python2.7/dist-packages/twisted/internet/base.py", line 800, in runUntilCurrent
call.func(*call.args, **call.kw)
File "/usr/lib/python2.7/dist-packages/twisted/internet/task.py", line 595, in _tick
taskObj._oneWorkUnit()
File "/usr/lib/python2.7/dist-packages/twisted/internet/task.py", line 472, in _oneWorkUnit
result = self._iterator.next()
File "/usr/lib/pymodules/python2.7/scrapy/utils/defer.py", line 57, in <genexpr>
work = (callable(elem, *args, **named) for elem in iterable)
--- <exception caught here> ---
File "/usr/lib/pymodules/python2.7/scrapy/utils/defer.py", line 96, in iter_errback
yield next(it)
File "/usr/lib/pymodules/python2.7/scrapy/contrib/spidermiddleware/offsite.py", line 23, in process_spider_output
for x in result:
File "/usr/lib/pymodules/python2.7/scrapy/contrib/spidermiddleware/referer.py", line 22, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/usr/lib/pymodules/python2.7/scrapy/contrib/spidermiddleware/urllength.py", line 33, in <genexpr>
return (r for r in result or () if _filter(r))
File "/usr/lib/pymodules/python2.7/scrapy/contrib/spidermiddleware/depth.py", line 50, in <genexpr>
return (r for r in result or () if _filter(r))
File "/usr/lib/pymodules/python2.7/scrapy/contrib/spiders/crawl.py", line 73, in _parse_response
for request_or_item in self._requests_to_follow(response):
File "/usr/lib/pymodules/python2.7/scrapy/contrib/spiders/crawl.py", line 52, in _requests_to_follow
links = [l for l in rule.link_extractor.extract_links(response) if l not in seen]
File "/usr/lib/pymodules/python2.7/scrapy/contrib/linkextractors/sgml.py", line 129, in extract_links
links = self._extract_links(body, response.url, response.encoding, base_url)
File "/usr/lib/pymodules/python2.7/scrapy/contrib/linkextractors/sgml.py", line 29, in _extract_links
self.feed(response_text)
File "/usr/lib/python2.7/sgmllib.py", line 104, in feed
self.goahead(0)
File "/usr/lib/python2.7/sgmllib.py", line 174, in goahead
k = self.parse_declaration(i)
File "/usr/lib/python2.7/markupbase.py", line 98, in parse_declaration
decltype, j = self._scan_name(j, i)
File "/usr/lib/python2.7/markupbase.py", line 392, in _scan_name
% rawdata[declstartpos:declstartpos+20])
File "/usr/lib/python2.7/sgmllib.py", line 111, in error
raise SGMLParseError(message)
sgmllib.SGMLParseError: expected name token at '<!\\\\])/g,"\\\\$1").rep'
Here is my GooglePlay spider (after the update), along with the error message I am now receiving:
import string
import requests
from scrapy import log
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.htmlparser import HtmlParserLinkExtractor
from scrapy.selector import Selector
from scrapy.http import Request
from scraper.items import ApkItem
from play import parse_app
class GooglePlaySpider(CrawlSpider):
name = 'googleplay'
start_urls = [
'https://play.google.com/store/apps'
]
rules = (
Rule(SgmlLinkExtractor(allow=('/store/apps$', )), callback='parse_category_group', follow=True),
Rule(SgmlLinkExtractor(allow=('/store/apps/category/.*', )), callback='parse_category', follow=True),
Rule(SgmlLinkExtractor(allow=('/store/search\?.*', )), callback='parse_search', follow=True),
)
def parse_category_group(self, response):
sel = Selector(response)
category_groups = sel.xpath('//div[@class="padded-content3 app-home-nav"]')
for category_group in category_groups:
category_group_name = category_group.xpath('h2/a/text()').extract()
categories = category_group.xpath('ul/li')
for category in categories:
category_name = category.xpath('a/text()').extract()
category_url = category.xpath('a/@href').extract()[0]
chars = string.ascii_uppercase + string.digits
for x in chars:
yield Request('https://play.google.com/store/search?q=' + x + '&c=apps', callback=self.parse_search)
for x in chars:
for y in chars:
yield Request('https://play.google.com/store/search?q=' + x + y + '&c=apps', callback=self.parse_search)
for x in chars:
for y in chars:
for z in chars:
yield Request('https://play.google.com/store/search?q=' + x + y + z + '&c=apps', callback=self.parse_search)
return
def parse_category(self, response):
base_path = response.url.split('?')[0]
if '/collection/' in response.url:
sel = Selector(response)
apps = sel.xpath('//a[#class="title"]')
has_app = False
for app in apps:
has_app = True
app_name = app.xpath('text()').extract()
app_url = app.xpath('@href').extract()
yield Request('https://play.google.com' + app_url[0], meta={'come_from': self.name}, callback=parse_app)
if has_app:
m = re.match(r'(.*)\?start=(\d+)&num=24', response.url)
if m is None:
start_number = 24
else:
start_number = int(m.group(2)) + 24
yield Request(base_path + '?start=' + str(start_number) + '&num=24', callback=self.parse_category)
return
def parse_search(self, response):
m = re.match(r'(.*)&start=(\d+)&num=24', response.url)
if m is None:
base_path = response.url
start_number = 24
else:
start_number = int(m.group(2)) + 24
base_path = m.group(1)
sel = Selector(response)
apps = sel.xpath('//a[contains(@href,"/store/apps/details")]')
has_app = False
for app in apps:
has_app = True
app_url = app.xpath('@href').extract()
yield Request('https://play.google.com' + app_url[0], meta={'come_from': self.name}, callback=parse_app)
if has_app:
yield Request(base_path + '&start=' + str(start_number) + '&num=24', callback=self.parse_search)
return
Error:
Traceback (most recent call last):
File "/usr/bin/scrapy", line 4, in <module>
execute()
File "/usr/lib/pymodules/python2.7/scrapy/cmdline.py", line 143, in execute
_run_print_help(parser, _run_command, cmd, args, opts)
File "/usr/lib/pymodules/python2.7/scrapy/cmdline.py", line 89, in _run_print_help
func(*a, **kw)
File "/usr/lib/pymodules/python2.7/scrapy/cmdline.py", line 150, in _run_command
cmd.run(args, opts)
File "/usr/lib/pymodules/python2.7/scrapy/commands/crawl.py", line 47, in run
crawler = self.crawler_process.create_crawler()
File "/usr/lib/pymodules/python2.7/scrapy/crawler.py", line 87, in create_crawler
self.crawlers[name] = Crawler(self.settings)
File "/usr/lib/pymodules/python2.7/scrapy/crawler.py", line 25, in __init__
self.spiders = spman_cls.from_crawler(self)
File "/usr/lib/pymodules/python2.7/scrapy/spidermanager.py", line 35, in from_crawler
sm = cls.from_settings(crawler.settings)
File "/usr/lib/pymodules/python2.7/scrapy/spidermanager.py", line 31, in from_settings
return cls(settings.getlist('SPIDER_MODULES'))
File "/usr/lib/pymodules/python2.7/scrapy/spidermanager.py", line 22, in __init__
for module in walk_modules(name):
File "/usr/lib/pymodules/python2.7/scrapy/utils/misc.py", line 68, in walk_modules
submod = import_module(fullpath)
File "/usr/lib/python2.7/importlib/__init__.py", line 37, in import_module
__import__(name)
File "/home/darwin/ProjectKrutz/scraper/scraper/spiders/googlePlaySpider.py", line 12, in <module>
class GooglePlaySpider(CrawlSpider):
File "/home/darwin/ProjectKrutz/scraper/scraper/spiders/googlePlaySpider.py", line 18, in GooglePlaySpider
Rule(SgmlLinkExtractor(allow=('/store/apps$', )), callback='parse_category_group', follow=True),
NameError: name 'SgmlLinkExtractor' is not defined
The problem is that SgmlLinkExtractor has problems with comments, and the error message tells you that it hit one it could not parse: the "<!" sequence (apparently inside a JavaScript snippet on the page).
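A hypothetical minimal reproduction (the feed string below is made up; it just contains a "<!" that is not a well-formed comment or declaration, like the snippet quoted in the error):

import sgmllib  # Python 2 only

class Probe(sgmllib.SGMLParser):
    pass

p = Probe()
p.feed('<html><!\\])/g,"x").rep</html>')
# raises sgmllib.SGMLParseError: expected name token at ...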
So the solution would be to change the spider you are using and replace the SgmlLinkExtractor with either
from scrapy.contrib.linkextractors.htmlparser import HtmlParserLinkExtractor
or
from scrapy.contrib.linkextractors.lxmlhtml import LxmlParserLinkExtractor
Naturally, these are only the import statements; you also have to change the Rule definitions that use the link extractor so they use one of these extractor classes.
Without seeing the code I cannot give you more specific advice on what to change.
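As a rough sketch of what that change could look like, assuming a Scrapy version that ships the lxml-based extractor (the Rule-compatible public class in scrapy.contrib.linkextractors.lxmlhtml is LxmlLinkExtractor), the rules from the spider above would become:

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor

rules = (
    Rule(LxmlLinkExtractor(allow=('/store/apps$', )), callback='parse_category_group', follow=True),
    Rule(LxmlLinkExtractor(allow=('/store/apps/category/.*', )), callback='parse_category', follow=True),
    Rule(LxmlLinkExtractor(allow=('/store/search\?.*', )), callback='parse_search', follow=True),
)

This would also resolve the NameError in the updated spider, since the rules would then reference a class that is actually imported.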
I'm getting an error: "Unhasable type: list" in the rules which i have defined to extract next-button link.
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider,Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from walmart_sample.items import WalmartSampleItem
class MySpider(CrawlSpider):
name = "my_spider"
domain = ['Apparel']
keyword = 'Bags'
departments = {"All Departments": "0", "Apparel": "5438", "Auto": "91083", "Baby": "5427", "Beauty": "1085666","Books": "3920", "Electronics": "3944", "Gifts": "1094765", "Grocery": "976759", "Health": "976760","Home": "4044", "Home Improvement": "1072864", "Jwelery": "3891", "Movies": "4096", "Music": "4104","Party": "2637", "Patio": "5428", "Pets": "5440", "Pharmacy": "5431", "Photo Center": "5426","Sports": "4125", "Toys": "4171", "Video Games": "2636"}
allowed_domains = ['walmart.com']
denied_domains = ['reviews.walmart.com','facebook.com','twitter.com']
rules = (Rule(SgmlLinkExtractor(allow=("http://www.walmart.com/search/search-ng.do?tab_value=all&search_query=%s&search_constraint=%s&Find=Find&pref_store=1801&ss=false&ic=16_\d*2&_mm=" %(keyword,departments.get(domain))),),restrict_xpaths=('//li[#class="btn-nextResults"]'),callback='parse',follow=True),)
def start_requests(self):
for domains in self.domain:
if domains in self.departments:
url = 'http://www.walmart.com/search/search-ng.do?search_query=%s&ic=16_0&Find=Find&search_constraint=%s' % (self.keyword, self.departments.get(domains))
yield Request(url)
def parse(self, response):
hxs = HtmlXPathSelector(response)
links = hxs.select('//a[#class="prodLink ListItemLink"]/#href')
last = hxs.select('//a[#class="SPPagNoLink jump next"]').extract()
if last is None:
for link in links:
href = link.extract()
yield Request('http://www.walmart.com/' + href, self.parse_data)
else:
print "<<<<<Last Page>>>>>>"
def parse_data(self, response):
hxs = HtmlXPathSelector(response)
items=[]
walmart=WalmartSampleItem()
walmart['Title']=hxs.select('//h1[@class="productTitle"]/text()').extract()
walmart['Price']=hxs.select('//span[@class="bigPriceText1"]/text()').extract()+hxs.select('//span[@class="smallPriceText1"]/text()').extract()
walmart['Availability']=hxs.select('//span[@id="STORE_AVAIL"]/text()').extract()
walmart['Description']=hxs.select('//span[@class="ql-details-short-desc"]/p/text()').extract()
#walmart['Avg_Rating']=
#walmart['Detailed_Rating']=
items.append(walmart)
return items
Traceback (most recent call last):
File "/usr/bin/scrapy", line 4, in <module>
execute()
File "/usr/lib/pymodules/python2.7/scrapy/cmdline.py", line 143, in execute
_run_print_help(parser, _run_command, cmd, args, opts)
File "/usr/lib/pymodules/python2.7/scrapy/cmdline.py", line 89, in _run_print_help
func(*a, **kw)
File "/usr/lib/pymodules/python2.7/scrapy/cmdline.py", line 150, in _run_command
cmd.run(args, opts)
File "/usr/lib/pymodules/python2.7/scrapy/commands/crawl.py", line 47, in run
crawler = self.crawler_process.create_crawler()
File "/usr/lib/pymodules/python2.7/scrapy/crawler.py", line 87, in create_crawler
self.crawlers[name] = Crawler(self.settings)
File "/usr/lib/pymodules/python2.7/scrapy/crawler.py", line 25, in __init__
self.spiders = spman_cls.from_crawler(self)
File "/usr/lib/pymodules/python2.7/scrapy/spidermanager.py", line 35, in from_crawler
sm = cls.from_settings(crawler.settings)
File "/usr/lib/pymodules/python2.7/scrapy/spidermanager.py", line 31, in from_settings
return cls(settings.getlist('SPIDER_MODULES'))
File "/usr/lib/pymodules/python2.7/scrapy/spidermanager.py", line 22, in __init__
for module in walk_modules(name):
File "/usr/lib/pymodules/python2.7/scrapy/utils/misc.py", line 68, in walk_modules
submod = import_module(fullpath)
File "/usr/lib/python2.7/importlib/__init__.py", line 37, in import_module
__import__(name)
File "/home/vivek/mywork/walmart_sample/walmart_sample/spiders/test.py", line 8, in <module>
class MySpider(CrawlSpider):
File "/home/vivek/mywork/walmart_sample/walmart_sample/spiders/test.py", line 15, in MySpider
rules = (Rule(SgmlLinkExtractor(allow=("http://www.walmart.com/search/search-ng.do?tab_value=all&search_query=%s&search_constraint=%s&Find=Find&pref_store=1801&ss=false&ic=16_\d*2&_mm=" %(keyword,departments.get(domain))),),restrict_xpaths=('//li[@class="btn-nextResults"]'),callback='parse',follow=True),)
TypeError: unhashable type: 'list'
The problematic bit is this:
departments.get(domain)
domain is a list and thus you need to specify which individual item in the list you want to use. In this case using domain[0] fixes the problem, and your rule becomes:
rules = (Rule(SgmlLinkExtractor(allow=("http://www.walmart.com/search/search-ng.do?tab_value=all&search_query=%s&search_constraint=%s&Find=Find&pref_store=1801&ss=false&ic=16_\d*2&_mm=" %(keyword,departments.get(domain[0]))),),restrict_xpaths=('//li[@class="btn-nextResults"]'),callback='parse',follow=True),)
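A tiny illustration of why the original form blows up (the values are copied from the spider above; the print is only for demonstration):

departments = {"Apparel": "5438"}
domain = ['Apparel']

# departments.get(domain)           # TypeError: unhashable type: 'list'
print(departments.get(domain[0]))   # '5438' -- index into the list first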
I'm new to Python and Scrapy. After setting restrict_xpaths to "//table[@class="lista"]" I received the following traceback. What's strange is that with other XPath rules the crawler works properly.
Traceback (most recent call last):
File "/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/twisted/internet/base.py", line 800, in runUntilCurrent
call.func(*call.args, **call.kw)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/twisted/internet/task.py", line 602, in _tick
taskObj._oneWorkUnit()
File "/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/twisted/internet/task.py", line 479, in _oneWorkUnit
result = self._iterator.next()
File "/Library/Python/2.7/site-packages/scrapy/utils/defer.py", line 57, in <genexpr>
work = (callable(elem, *args, **named) for elem in iterable)
--- <exception caught here> ---
File "/Library/Python/2.7/site-packages/scrapy/utils/defer.py", line 96, in iter_errback
yield it.next()
File "/Library/Python/2.7/site-packages/scrapy/contrib/spidermiddleware/offsite.py", line 23, in process_spider_output
for x in result:
File "/Library/Python/2.7/site-packages/scrapy/contrib/spidermiddleware/referer.py", line 22, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/Library/Python/2.7/site-packages/scrapy/contrib/spidermiddleware/urllength.py", line 33, in <genexpr>
return (r for r in result or () if _filter(r))
File "/Library/Python/2.7/site-packages/scrapy/contrib/spidermiddleware/depth.py", line 50, in <genexpr>
return (r for r in result or () if _filter(r))
File "/Library/Python/2.7/site-packages/scrapy/contrib/spiders/crawl.py", line 73, in _parse_response
for request_or_item in self._requests_to_follow(response):
File "/Library/Python/2.7/site-packages/scrapy/contrib/spiders/crawl.py", line 52, in _requests_to_follow
links = [l for l in rule.link_extractor.extract_links(response) if l not in seen]
File "/Library/Python/2.7/site-packages/scrapy/contrib/linkextractors/sgml.py", line 124, in extract_links
).encode(response.encoding)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/encodings/iso8859_2.py", line 12, in encode
return codecs.charmap_encode(input,errors,encoding_table)
exceptions.UnicodeEncodeError: 'charmap' codec can't encode character u'\xbb' in position 686: character maps to <undefined>
Here is my MySpider class.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from ds_crawl.items import DsCrawlItem
class MySpider(CrawlSpider):
name = 'inside'
allowed_domains = ['wroclaw.dlastudenta.pl']
start_urls = ['http://wroclaw.dlastudenta.pl/stancje/']
rules = (
Rule(SgmlLinkExtractor(allow=('show_stancja'), restrict_xpaths=('//table[@class="lista"]')), callback='parse_item', follow= True),)
def parse_item(self, response):
hxs = HtmlXPathSelector(response)
titles = hxs.select("//p[#class='bbtext intextAd']")
for titles in titles:
item = DsCrawlItem()
item['content'] = titles.select("text()").extract()
print item
Any explanation of this error and help will be appreciated. Thank you.
That's a bug caused by the web page using the "&raquo;" entity, which lxml translates to the unicode character u'\xbb'. When you use the restrict_xpaths argument, the link extractor encodes the extracted content back to the original encoding, iso8859-2, and that fails because u'\xbb' is not a valid character in that encoding.
This simple line reproduces the exception:
>>> u'\xbb'.encode('iso8859-2')
...
UnicodeEncodeError: 'charmap' codec can't encode character u'\xbb' in position 0: character maps to <undefined>
A workaround for this is to force UTF-8 for all responses. This can be done with a simple downloader middleware:
# file: myproject/middlewares.py
class ForceUTF8Response(object):
"""A downloader middleware to force UTF-8 encoding for all responses."""
encoding = 'utf-8'
def process_response(self, request, response, spider):
# Note: in Scrapy <1.0, use response.body_as_unicode() instead of response.text.
new_body = response.text.encode(self.encoding)
return response.replace(body=new_body, encoding=self.encoding)
In your settings:
DOWNLOADER_MIDDLEWARES = {
'myproject.middlewares.ForceUTF8Response': 100,
}