i'm new to python and scrapy. After setting restrict_xpaths settings to "//table[#class="lista"]" I've received following traceback. What's strange, by using other xpath rule the crawler works properly.
Traceback (most recent call last):
File "/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/twisted/internet/base.py", line 800, in runUntilCurrent
call.func(*call.args, **call.kw)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/twisted/internet/task.py", line 602, in _tick
taskObj._oneWorkUnit()
File "/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/twisted/internet/task.py", line 479, in _oneWorkUnit
result = self._iterator.next()
File "/Library/Python/2.7/site-packages/scrapy/utils/defer.py", line 57, in <genexpr>
work = (callable(elem, *args, **named) for elem in iterable)
--- <exception caught here> ---
File "/Library/Python/2.7/site-packages/scrapy/utils/defer.py", line 96, in iter_errback
yield it.next()
File "/Library/Python/2.7/site-packages/scrapy/contrib/spidermiddleware/offsite.py", line 23, in process_spider_output
for x in result:
File "/Library/Python/2.7/site-packages/scrapy/contrib/spidermiddleware/referer.py", line 22, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/Library/Python/2.7/site-packages/scrapy/contrib/spidermiddleware/urllength.py", line 33, in <genexpr>
return (r for r in result or () if _filter(r))
File "/Library/Python/2.7/site-packages/scrapy/contrib/spidermiddleware/depth.py", line 50, in <genexpr>
return (r for r in result or () if _filter(r))
File "/Library/Python/2.7/site-packages/scrapy/contrib/spiders/crawl.py", line 73, in _parse_response
for request_or_item in self._requests_to_follow(response):
File "/Library/Python/2.7/site-packages/scrapy/contrib/spiders/crawl.py", line 52, in _requests_to_follow
links = [l for l in rule.link_extractor.extract_links(response) if l not in seen]
File "/Library/Python/2.7/site-packages/scrapy/contrib/linkextractors/sgml.py", line 124, in extract_links
).encode(response.encoding)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/encodings/iso8859_2.py", line 12, in encode
return codecs.charmap_encode(input,errors,encoding_table)
exceptions.UnicodeEncodeError: 'charmap' codec can't encode character u'\xbb' in position 686: character maps to <undefined>
Here is MySpider Class.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from ds_crawl.items import DsCrawlItem
class MySpider(CrawlSpider):
name = 'inside'
allowed_domains = ['wroclaw.dlastudenta.pl']
start_urls = ['http://wroclaw.dlastudenta.pl/stancje/']
rules = (
Rule(SgmlLinkExtractor(allow=('show_stancja'), restrict_xpaths=('//table[#class="lista"]')), callback='parse_item', follow= True),)
def parse_item(self, response):
hxs = HtmlXPathSelector(response)
titles = hxs.select("//p[#class='bbtext intextAd']")
for titles in titles:
item = DsCrawlItem()
item['content'] = titles.select("text()").extract()
print item
Any explanation of this error and help will be appreciated. Thank you.
That's a bug caused by the web page using the » entity which is translated by lxml to the unicode character \xbb and when you use the restrict_xpaths argument the link extractors encodes the content to the original encoding iso8859-2 which fails because \xbb is not valid character in that encoding.
This simple line reproduces the exception:
>>> u'\xbb'.encode('iso8859-2')
...
UnicodeEncodeError: 'charmap' codec can't encode character u'\xbb' in position 0: character maps to <undefined>
A workaround for this can be forcing to use utf8 for all responses. This can be done by a simple downloader middleware:
# file: myproject/middlewares.py
class ForceUTF8Response(object):
"""A downloader middleware to force UTF-8 encoding for all responses."""
encoding = 'utf-8'
def process_response(self, request, response, spider):
# Note: Use response.body_as_unicode() instead of response.text in in Scrapy <1.0.
new_body = response.text.encode(self.encoding)
return response.replace(body=new_body, encoding=self.encoding)
In your settings:
DOWNLOADER_MIDDLEWARES = {
'myproject.middlewares.ForceUTF8Response': 100,
}
Related
Intention / expected behaviour
Return the text of the links from page: https://www.bezrealitky.cz/vypis/nabidka-prodej/byt/praha
In CSV format and in the shell.
Error
I get a KeyError: 'title', even though I have defined the key in the item.py itemloader.
Full Traceback
Traceback (most recent call last):
File "C:\Users\phili\Anaconda3\envs\py35\lib\site-packages\scrapy\utils\defer.py", line 102, in iter_errback
yield next(it)
File "C:\Users\phili\Anaconda3\envs\py35\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 29, in process_spider_output
for x in result:
File "C:\Users\phili\Anaconda3\envs\py35\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 22, in <genexpr>
return (_set_referer(r) for r in result or ())
File "C:\Users\phili\Anaconda3\envs\py35\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "C:\Users\phili\Anaconda3\envs\py35\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "C:\Users\phili\Documents\Python Scripts\Scrapy Spiders\bezrealitky\bezrealitky\spiders\bezrealitky_spider.py", line 33, in parse
yield loader.load_item()
File "C:\Users\phili\Anaconda3\envs\py35\lib\site-packages\scrapy\loader\__init__.py", line 115, in load_item
value = self.get_output_value(field_name)
File "C:\Users\phili\Anaconda3\envs\py35\lib\site-packages\scrapy\loader\__init__.py", line 122, in get_output_value
proc = self.get_output_processor(field_name)
File "C:\Users\phili\Anaconda3\envs\py35\lib\site-packages\scrapy\loader\__init__.py", line 144, in get_output_processor
self.default_output_processor)
File "C:\Users\phili\Anaconda3\envs\py35\lib\site-packages\scrapy\loader\__init__.py", line 154, in _get_item_field_attr
value = self.item.fields[field_name].get(key, default)
KeyError: 'title'
Spider.py
def parse(self, response):
for records in response.xpath('//*[starts-with(#class,"record")]'):
loader = BaseItemLoader(selector=records)
loader.add_xpath('title', './/div[#class="details"]/h2/a[#href]/text()')
yield loader.load_item()
Item.py - Itemloader
class BaseItemLoader(ItemLoader):
title_in = MapCompose(unidecode)
Conclusion
I am a bit at a loss, as I think I followed the Scrapy manual and defined the item loader and the key by "title_in", but then when I yield the value to it I get the KeyError. I check in the shell that the Xpath provides the text I want, so at least that is working. Hoping to get some help!
Even if you use ItemLoader you should define Item class first and then pass it to the item loader either defining it as loader's property:
class CustomItemLoader(ItemLoader):
default_item_class = MyItem
or passing its instance to loader's constructor:
l = CustomItemLoader(item=Item())
otherwise item loader knows nothing about the item and its fields.
I added restrict_xpaths rules to my scrapy spider and now it immediately fails with:
2015-03-16 15:46:53+0000 [tsr] ERROR: Spider error processing <GET http://www.thestudentroom.co.uk/forumdisplay.php?f=143>
Traceback (most recent call last):
File "/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/twisted/internet/base.py", line 800, in runUntilCurrent
call.func(*call.args, **call.kw)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/twisted/internet/task.py", line 602, in _tick
taskObj._oneWorkUnit()
File "/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/twisted/internet/task.py", line 479, in _oneWorkUnit
result = self._iterator.next()
File "/Library/Python/2.7/site-packages/scrapy/utils/defer.py", line 57, in <genexpr>
work = (callable(elem, *args, **named) for elem in iterable)
--- <exception caught here> ---
File "/Library/Python/2.7/site-packages/scrapy/utils/defer.py", line 96, in iter_errback
yield next(it)
File "/Library/Python/2.7/site-packages/scrapy/contrib/spidermiddleware/offsite.py", line 26, in process_spider_output
for x in result:
File "/Library/Python/2.7/site-packages/scrapy/contrib/spidermiddleware/referer.py", line 22, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/Library/Python/2.7/site-packages/scrapy/contrib/spidermiddleware/urllength.py", line 33, in <genexpr>
return (r for r in result or () if _filter(r))
File "/Library/Python/2.7/site-packages/scrapy/contrib/spidermiddleware/depth.py", line 50, in <genexpr>
return (r for r in result or () if _filter(r))
File "/Library/Python/2.7/site-packages/scrapy/contrib/spiders/crawl.py", line 73, in _parse_response
for request_or_item in self._requests_to_follow(response):
File "/Library/Python/2.7/site-packages/scrapy/contrib/spiders/crawl.py", line 52, in _requests_to_follow
links = [l for l in rule.link_extractor.extract_links(response) if l not in seen]
File "/Library/Python/2.7/site-packages/scrapy/contrib/linkextractors/lxmlhtml.py", line 107, in extract_links
links = self._extract_links(doc, response.url, response.encoding, base_url)
File "/Library/Python/2.7/site-packages/scrapy/linkextractor.py", line 94, in _extract_links
return self.link_extractor._extract_links(*args, **kwargs)
File "/Library/Python/2.7/site-packages/scrapy/contrib/linkextractors/lxmlhtml.py", line 50, in _extract_links
for el, attr, attr_val in self._iter_links(selector._root):
**File "/Library/Python/2.7/site-packages/scrapy/contrib/linkextractors/lxmlhtml.py", line 38, in _iter_links
for el in document.iter(etree.Element):
exceptions.AttributeError: 'str' object has no attribute 'iter'**
I cannot understand why this error is happening.
Here is my short Spider:
import scrapy
from tutorial.items import DmozItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
class TsrSpider(CrawlSpider):
name = 'tsr'
allowed_domains = ['thestudentroom.co.uk']
start_urls = ['http://www.thestudentroom.co.uk/forumdisplay.php?f=143']
download_delay = 4
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:35.0) Gecko/20100101 Firefox/35.0'
rules = (
Rule(
LinkExtractor(
allow=('forumdisplay\.php\?f=143\&page=\d',),
restrict_xpaths=("//li[#class='pager-page_numbers']/a/#href",))),
Rule(
LinkExtractor(
allow=('showthread\.php\?t=\d+\&page=\d+',),
restrict_xpaths=("//li[#class='pager-page_numbers']/a/#href",)),
callback='parse_link'),
Rule(
LinkExtractor(
allow=('showthread\.php\?t=\d+',),
restrict_xpaths=("//tr[#class='thread unread ']",)),
callback='parse_link'),
)
def parse_link(self, response):
# Iterate over posts.
for sel in response.xpath("//li[#class='post threadpost old ']"):
rating = sel.xpath(
"div[#class='post-footer']//span[#class='score']/text()").extract()
if not rating:
rating = 0
else:
rating = rating[0]
item = DmozItem()
item['post'] = sel.xpath(
"div[#class='post-content']/blockquote[#class='postcontent restore']/text()").extract()
item['link'] = response.url
item['topic'] = response.xpath(
"//div[#class='forum-header section-header']/h1/span/text()").extract()
item['rating'] = rating
yield item
source: http://pastebin.com/YXdWvPgX
Can someone help me out? Where is the mistake? I've been searching for days!?
The problem is that restrict_xpaths should point to elements - either the links directly or containers containing links, not attributes:
rules = [
Rule(LinkExtractor(allow='forumdisplay\.php\?f=143\&page=\d',
restrict_xpaths="//li[#class='pager-page_numbers']/a")),
Rule(LinkExtractor(allow='showthread\.php\?t=\d+\&page=\d+',
restrict_xpaths="//li[#class='pager-page_numbers']/a"),
callback='parse_link'),
Rule(LinkExtractor(allow='showthread\.php\?t=\d+',
restrict_xpaths="//tr[#class='thread unread ']"),
callback='parse_link'),
]
Tested (worked for me).
FYI, Scrapy defines restrict_xpaths as "expressions pointing to regions":
restrict_xpaths (str or list) – is a XPath (or list of XPath’s) which
defines regions inside the response where links should be extracted
from. If given, only the text selected by those XPath will be scanned
for links. See examples below.
I am using scrapy with python and I have this code in a python item pipline
def process_item(self, item, spider):
import pdb; pdb.set_trace()
ID = str(uuid.uuid5(uuid.NAMESPACE_DNS, item['link']))
I got this error :
Traceback (most recent call last):
File "C:\Python27\lib\site-packages\scrapy-0.20.2-py2.7.egg\scrapy\mid
dleware.py", line 62, in _process_chain
return process_chain(self.methods[methodname], obj, *args)
File "C:\Python27\lib\site-packages\scrapy-0.20.2-py2.7.egg\scrapy\uti
ls\defer.py", line 65, in process_chain
d.callback(input)
File "C:\Python27\lib\site-packages\twisted\internet\defer.py", line 3
82, in callback
self._startRunCallbacks(result)
File "C:\Python27\lib\site-packages\twisted\internet\defer.py", line 4
90, in _startRunCallbacks
self._runCallbacks()
--- <exception caught here> ---
File "C:\Python27\lib\site-packages\twisted\internet\defer.py", line 5
77, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "General_Spider_code_version_2\pipelines.py", line 7, in process_
item
ID = str(uuid.uuid5(uuid.NAMESPACE_DNS, item['link']))
File "C:\Python27\lib\uuid.py", line 549, in uuid5
hash = sha1(namespace.bytes + name).digest()
exceptions.UnicodeDecodeError: 'ascii' codec can't decode byte 0xa7 in p
osition 1: ordinal not in range(128)
I tried to debug the item['link']
and this is the result
-> ID = str(uuid.uuid5(uuid.NAMESPACE_DNS, item['link']))
(Pdb) item['link']
u'http://dubai.dubizzle.com/property-for-rent/residential/apartmentflat/2014/4/6
/palm-jumeirah-abu-keibal-3-br-maid-partial-2/?back=ZHViYWkuZHViaXp6bGUuY29tL3By
b3BlcnR5LWZvci1yZW50L3Jlc2lkZW50aWFsL2FwYXJ0bWVudGZsYXQv&pos=1'
(Pdb)
as you see the item['link'] is unicode
Edit1
when I change the item['link'] to any other attribute like item['date'] the code works perfectly
Encode the unicode string into byte string with .encode('utf-8') and it should work:
str(uuid.uuid5(uuid.NAMESPACE_DNS, item['link'].encode('utf-8')))
I'm getting an error: "Unhasable type: list" in the rules which i have defined to extract next-button link.
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider,Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from walmart_sample.items import WalmartSampleItem
class MySpider(CrawlSpider):
name = "my_spider"
domain = ['Apparel']
keyword = 'Bags'
departments = {"All Departments": "0", "Apparel": "5438", "Auto": "91083", "Baby": "5427", "Beauty": "1085666","Books": "3920", "Electronics": "3944", "Gifts": "1094765", "Grocery": "976759", "Health": "976760","Home": "4044", "Home Improvement": "1072864", "Jwelery": "3891", "Movies": "4096", "Music": "4104","Party": "2637", "Patio": "5428", "Pets": "5440", "Pharmacy": "5431", "Photo Center": "5426","Sports": "4125", "Toys": "4171", "Video Games": "2636"}
allowed_domains = ['walmart.com']
denied_domains = ['reviews.walmart.com','facebook.com','twitter.com']
rules = (Rule(SgmlLinkExtractor(allow=("http://www.walmart.com/search/search-ng.do?tab_value=all&search_query=%s&search_constraint=%s&Find=Find&pref_store=1801&ss=false&ic=16_\d*2&_mm=" %(keyword,departments.get(domain))),),restrict_xpaths=('//li[#class="btn-nextResults"]'),callback='parse',follow=True),)
def start_requests(self):
for domains in self.domain:
if domains in self.departments:
url = 'http://www.walmart.com/search/search-ng.do?search_query=%s&ic=16_0&Find=Find&search_constraint=%s' % (self.keyword, self.departments.get(domains))
yield Request(url)
def parse(self, response):
hxs = HtmlXPathSelector(response)
links = hxs.select('//a[#class="prodLink ListItemLink"]/#href')
last = hxs.select('//a[#class="SPPagNoLink jump next"]').extract()
if last is None:
for link in links:
href = link.extract()
yield Request('http://www.walmart.com/' + href, self.parse_data)
else:
print "<<<<<Last Page>>>>>>"
def parse_data(self, response):
hxs = HtmlXPathSelector(response)
items=[]
walmart=WalmartSampleItem()
walmart['Title']=hxs.select('//h1[#class="productTitle"]/text()').extract()
walmart['Price']=hxs.select('//span[#class="bigPriceText1"]/text()').extract()+hxs.select('//span[#class="smallPriceText1"]/text()').extract()
walmart['Availability']=hxs.select('//span[#id="STORE_AVAIL"]/text()').extract()
walmart['Description']=hxs.select('//span[#class="ql-details-short-desc"]/p/text()').extract()
#walmart['Avg_Rating']=
#walmart['Detailed_Rating']=
items.append(walmart)
return items
Traceback (most recent call last):
File "/usr/bin/scrapy", line 4, in <module>
execute()
File "/usr/lib/pymodules/python2.7/scrapy/cmdline.py", line 143, in execute
_run_print_help(parser, _run_command, cmd, args, opts)
File "/usr/lib/pymodules/python2.7/scrapy/cmdline.py", line 89, in _run_print_help
func(*a, **kw)
File "/usr/lib/pymodules/python2.7/scrapy/cmdline.py", line 150, in _run_command
cmd.run(args, opts)
File "/usr/lib/pymodules/python2.7/scrapy/commands/crawl.py", line 47, in run
crawler = self.crawler_process.create_crawler()
File "/usr/lib/pymodules/python2.7/scrapy/crawler.py", line 87, in create_crawler
self.crawlers[name] = Crawler(self.settings)
File "/usr/lib/pymodules/python2.7/scrapy/crawler.py", line 25, in __init__
self.spiders = spman_cls.from_crawler(self)
File "/usr/lib/pymodules/python2.7/scrapy/spidermanager.py", line 35, in from_crawler
sm = cls.from_settings(crawler.settings)
File "/usr/lib/pymodules/python2.7/scrapy/spidermanager.py", line 31, in from_settings
return cls(settings.getlist('SPIDER_MODULES'))
File "/usr/lib/pymodules/python2.7/scrapy/spidermanager.py", line 22, in __init__
for module in walk_modules(name):
File "/usr/lib/pymodules/python2.7/scrapy/utils/misc.py", line 68, in walk_modules
submod = import_module(fullpath)
File "/usr/lib/python2.7/importlib/__init__.py", line 37, in import_module
__import__(name)
File "/home/vivek/mywork/walmart_sample/walmart_sample/spiders/test.py", line 8, in <module>
class MySpider(CrawlSpider):
File "/home/vivek/mywork/walmart_sample/walmart_sample/spiders/test.py", line 15, in MySpider
rules = (Rule(SgmlLinkExtractor(allow=("http://www.walmart.com/search/search-ng.do?tab_value=all&search_query=%s&search_constraint=%s&Find=Find&pref_store=1801&ss=false&ic=16_\d*2&_mm=" %(keyword,departments.get(domain))),),restrict_xpaths=('//li[#class="btn-nextResults"]'),callback='parse',follow=True),)
TypeError: unhashable type: 'list'
The problematic bit is this:
departments.get(domain)
domain is a list and thus you need to specify which individual item in the list you want to use. In this case using domain[0] fixes the problem, and your rule becomes:
rules = (Rule(SgmlLinkExtractor(allow=("http://www.walmart.com/search/search-ng.do?tab_value=all&search_query=%s&search_constraint=%s&Find=Find&pref_store=1801&ss=false&ic=16_\d*2&_mm=" %(keyword,departments.get(domain[0]))),),restrict_xpaths=('//li[#class="btn-nextResults"]'),callback='parse',follow=True),)
I have the following code in my parse_item callback:
sel = Selector(response)
item['name'] = sel.xpath('//div[#class="productDescriptionBlock"]/h2/text()').extract()[0]
return item
But I get UnicodeEncodeError:
exceptions.UnicodeEncodeError: 'charmap' codec can't encode character u'\uff01' in position 271761: character maps to <undefined>
I also tried adding .encode('utf-8') but still get the same error.
Traceback (most recent call last):
File "/home/scraper/.fakeroot/lib/python2.7/site-packages/twisted/internet/base.py", line 824, in runUntilCurrent
call.func(*call.args, **call.kw)
File "/home/scraper/.fakeroot/lib/python2.7/site-packages/twisted/internet/task.py", line 638, in _tick
taskObj._oneWorkUnit()
File "/home/scraper/.fakeroot/lib/python2.7/site-packages/twisted/internet/task.py", line 484, in _oneWorkUnit
result = next(self._iterator)
File "/home/scraper/.fakeroot/lib/python2.7/site-packages/scrapy/utils/defer.py", line 57, in <genexpr>
work = (callable(elem, *args, **named) for elem in iterable)
--- <exception caught here> ---
File "/home/scraper/.fakeroot/lib/python2.7/site-packages/scrapy/utils/defer.py", line 96, in iter_errback
yield next(it)
File "/home/scraper/.fakeroot/lib/python2.7/site-packages/scrapy/contrib/spidermiddleware/offsite.py", line 23, in process_spider_output
for x in result:
File "/home/scraper/.fakeroot/lib/python2.7/site-packages/scrapy/contrib/spidermiddleware/referer.py", line 22, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/home/scraper/.fakeroot/lib/python2.7/site-packages/scrapy/contrib/spidermiddleware/urllength.py", line 33, in <genexpr>
return (r for r in result or () if _filter(r))
File "/home/scraper/.fakeroot/lib/python2.7/site-packages/scrapy/contrib/spidermiddleware/depth.py", line 50, in <genexpr>
return (r for r in result or () if _filter(r))
File "/home/scraper/.fakeroot/lib/python2.7/site-packages/scrapy/contrib/spiders/crawl.py", line 73, in _parse_response
for request_or_item in self._requests_to_follow(response):
File "/home/scraper/.fakeroot/lib/python2.7/site-packages/scrapy/contrib/spiders/crawl.py", line 52, in _requests_to_follow
links = [l for l in rule.link_extractor.extract_links(response) if l not in seen]
File "/home/scraper/.fakeroot/lib/python2.7/site-packages/scrapy/contrib/linkextractors/sgml.py", line 124, in extract_links
).encode(response.encoding)
File "/home/scraper/.fakeroot/lib/python2.7/encodings/cp1252.py", line 12, in encode
return codecs.charmap_encode(input,errors,encoding_table)
exceptions.UnicodeEncodeError: 'charmap' codec can't encode character u'\x99' in position 349751: character maps to <undefined>
I've seen this before. If I'm not wrong, you are using the restrict_xpaths parameter in your rule's link extractor.
Possible solutions are:
Avoid to use restrict_xpaths for that particular site. This happens because the page content contains characters not defined in the declared encoding.
Identify the invalid characters and replace them before the rule acts on it. This can be tricky, though.
Use the middleware in this answer to re-encode the response into its declared encoding: UnicodeEncodeError after setting restrict_xpaths settings