Intention / expected behaviour
Return the text of the links from the page https://www.bezrealitky.cz/vypis/nabidka-prodej/byt/praha, in CSV format, from the shell.
Error
I get a KeyError: 'title', even though I have defined the key in the item loader in item.py.
Full Traceback
Traceback (most recent call last):
File "C:\Users\phili\Anaconda3\envs\py35\lib\site-packages\scrapy\utils\defer.py", line 102, in iter_errback
yield next(it)
File "C:\Users\phili\Anaconda3\envs\py35\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 29, in process_spider_output
for x in result:
File "C:\Users\phili\Anaconda3\envs\py35\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 22, in <genexpr>
return (_set_referer(r) for r in result or ())
File "C:\Users\phili\Anaconda3\envs\py35\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "C:\Users\phili\Anaconda3\envs\py35\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "C:\Users\phili\Documents\Python Scripts\Scrapy Spiders\bezrealitky\bezrealitky\spiders\bezrealitky_spider.py", line 33, in parse
yield loader.load_item()
File "C:\Users\phili\Anaconda3\envs\py35\lib\site-packages\scrapy\loader\__init__.py", line 115, in load_item
value = self.get_output_value(field_name)
File "C:\Users\phili\Anaconda3\envs\py35\lib\site-packages\scrapy\loader\__init__.py", line 122, in get_output_value
proc = self.get_output_processor(field_name)
File "C:\Users\phili\Anaconda3\envs\py35\lib\site-packages\scrapy\loader\__init__.py", line 144, in get_output_processor
self.default_output_processor)
File "C:\Users\phili\Anaconda3\envs\py35\lib\site-packages\scrapy\loader\__init__.py", line 154, in _get_item_field_attr
value = self.item.fields[field_name].get(key, default)
KeyError: 'title'
Spider.py
def parse(self, response):
    for records in response.xpath('//*[starts-with(@class, "record")]'):
        loader = BaseItemLoader(selector=records)
        loader.add_xpath('title', './/div[@class="details"]/h2/a[@href]/text()')
        yield loader.load_item()
Item.py - Itemloader
class BaseItemLoader(ItemLoader):
    title_in = MapCompose(unidecode)
Conclusion
I am a bit at a loss: I think I followed the Scrapy manual and defined the item loader with the "title_in" processor, but when I yield the item I get the KeyError. I have checked in the shell that the XPath returns the text I want, so at least that part works. Hoping to get some help!
Even when you use an ItemLoader, you must define an Item class first and then pass it to the loader, either by setting it as the loader's default_item_class property:
class CustomItemLoader(ItemLoader):
    default_item_class = MyItem
or by passing an instance of it to the loader's constructor:
l = CustomItemLoader(item=MyItem())
Otherwise the item loader knows nothing about the item and its fields.
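For the question above, a minimal item.py that avoids the KeyError might look like this (a sketch: only the title field comes from the question; the MyItem name and the TakeFirst output processor are illustrative):

import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst
from unidecode import unidecode

class MyItem(scrapy.Item):
    # Declaring the field here is what prevents KeyError: 'title'.
    title = scrapy.Field()

class BaseItemLoader(ItemLoader):
    default_item_class = MyItem          # tell the loader which item to build
    title_in = MapCompose(unidecode)     # input processor from the question
    title_out = TakeFirst()              # illustrative: keep the first value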
Related
I have a list in a spider class and I need to initialize it. This is what the code looks like:
class Myspider(SitemapSpider):
    name = 'spidername'
    sitemap_urls = [
        'https://www.arabam.com/sitemap/otomobil_13.xml']
    sitemap_rules = [
        ('/otomobil/', 'parse'),
    ]
    custom_settings = {
        'FEED_FORMAT': 'csv',
        'FEED_URI': "arabam_" + str(datetime.today().strftime('%d%m%y')) + '.csv',
    }
    crawled = []
    new_links = 0

    def parse(self, response):
        if self.new_links > 3:
            with open("URLs", "wb") as f:
                pickle.dump(self.crawled, f)
            self.new_links = 0
        for td in response.xpath("/html/body/div[3]/div[6]/div[4]/div/div[2]/table/tbody/tr/td[4]/div/a"):
            if link[0] not in self.crawled:
                self.crawled.append(link[0])
#################################some code
process = CrawlerProcess({})
Myspider.crawled = []
Myspider.crawled.append("hi")

try:
    with open("URLs", "rb") as openfile:
        while True:
            try:
                Myspider.crawled = pickle.load(openfile)
            except EOFError:
                break
except:
    with open("URLs", "wb") as f:
        pickle.dump("", f)

print(Myspider.crawled)
process.crawl(Myspider, Myspider.crawled)
process.start()  # the script will block here until the crawling is finished
It keeps throwing the following exception:
Traceback (most recent call last):
File "C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\envs\web_scraping\lib\site-packages\twisted\internet\defer.py", line 151, in maybeDeferred
result = f(*args, **kw)
File "C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\envs\web_scraping\lib\site-packages\pydispatch\robustapply.py", line 55, in robustApply
return receiver(*arguments, **named)
File "C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\envs\web_scraping\lib\site-packages\scrapy\extensions\feedexport.py", line 262, in item_scraped
slot = self.slot
AttributeError: 'FeedExporter' object has no attribute 'slot'
According to some resources, it is caused by this:
Traceback (most recent call last):
File "C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\envs\web_scraping\lib\site-packages\twisted\internet\defer.py", line 151, in maybeDeferred
result = f(*args, **kw)
File "C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\envs\web_scraping\lib\site-packages\pydispatch\robustapply.py", line 55, in robustApply
return receiver(*arguments, **named)
File "C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\envs\web_scraping\lib\site-packages\scrapy\extensions\feedexport.py", line 232, in open_spider
uri = self.urifmt % self._get_uri_params(spider)
File "C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\envs\web_scraping\lib\site-packages\scrapy\extensions\feedexport.py", line 313, in _get_uri_params
params[k] = getattr(spider, k)
File "C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\envs\web_scraping\lib\site-packages\scrapy\spiders\__init__.py", line 36, in logger
logger = logging.getLogger(self.name)
File "C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\envs\web_scraping\lib\logging\__init__.py", line 1845, in getLogger
return Logger.manager.getLogger(name)
File "C:\Users\fatima.arshad\AppData\Local\Continuum\anaconda2\envs\web_scraping\lib\logging\__init__.py", line 1174, in getLogger
raise TypeError('A logger name must be a string')
TypeError: A logger name must be a string
How do I pass the list to the spider, or is there a way to initialize it only once for a Scrapy spider?
The list contains all the URLs that have been crawled, and it is pickled to disk. When the code starts, it loads this list and crawls a link only if it is not already in the list.
You need to pass the list of URLs using the spider attribute name, which is crawled in your case.
According to the docs, if you don't override the __init__ method of the spider, all arguments passed to the spider class are mapped to spider attributes. So in order to override the crawled attribute, you need to use that exact argument name.
Something like this:
process = CrawlerProcess()

crawled_urls = []
try:
    with open("URLs", "rb") as openfile:
        while True:
            try:
                crawled_urls = pickle.load(openfile)
            except EOFError:
                break
except:
    with open("URLs", "wb") as f:
        pickle.dump("", f)

print(crawled_urls)
process.crawl(Myspider, crawled=crawled_urls)
process.start()  # the script will block here until the crawling is finished
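If you prefer to be explicit, an alternative is to override __init__ yourself (a minimal sketch, not from the original post):

class Myspider(SitemapSpider):
    name = 'spidername'

    def __init__(self, crawled=None, *args, **kwargs):
        super(Myspider, self).__init__(*args, **kwargs)
        # Use the list passed via process.crawl(..., crawled=...),
        # or start with an empty one if nothing was given.
        self.crawled = crawled if crawled is not None else []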
I do not quite get why I keep getting an IndexError that says "list index out of range" here. Is it because the list of links is empty? And what is the right way to build the links here?
class POSpider(CrawlSpider):
    name = 'po'
    start_urls = ['https://www.poets.org/poetsorg/poems']
    allowed_domains = ['poets.org/poetsorg/poems']

    def parse(self, response):
        items = []
        l = response.xpath('//*[@class="themes"]//a//@href').extract()
        theme_ids = []
        for item in l:
            theme_ids.append(item[855:1412])
        theme_urls = []
        for tid in theme_ids:
            theme_urls.append('https://www.poets.org/poetsorg/poems?field_occasion_tid=All&field_poem_themes_tid=' + tid)
        for link in theme_urls:
            request = scrapy.Request(link, callback=self.parse_layer2, dont_filter=True)
            yield request

    def parse_layer2(self, response):
        items = []
        p = response.xpath('//*[@id="block-views-poems-poems-block-all"]/div/div//tbody//td[2]//@href')[-1].extract()
        poem_urls = []
        for item in p:
            poem_urls.append(item)
        for link in poem_urls:
            request = scrapy.Request(link, callback=self.parse_layer3, dont_filter=True)
            yield request

    def parse_layer3(self, response):
        items = []
        poems = response.xpath('//*[@id="poem-content"]/div[2]/div/div').extract()
        for poem in poems:
            item = PoetryItem()
            s = poem.xpath('*/p/text()').extract()
            t = strip_list(s)
            t = t.encode('ascii', 'replace').lower() + '\r\n'
            item['poem'] = t
            items.append(item)
        return items
and this is what I keep getting as a result.
Traceback (most recent call last):
File "//anaconda/lib/python2.7/site-packages/scrapy/utils/defer.py", line 102, in iter_errback
yield next(it)
File "//anaconda/lib/python2.7/site-packages/scrapy/spidermiddlewares/offsite.py", line 29, in process_spider_output
for x in result:
File "//anaconda/lib/python2.7/site-packages/scrapy/spidermiddlewares/referer.py", line 22, in <genexpr>
return (_set_referer(r) for r in result or ())
File "//anaconda/lib/python2.7/site-packages/scrapy/spidermiddlewares/urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "//anaconda/lib/python2.7/site-packages/scrapy/spidermiddlewares/depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "/Users/chinouhane/Desktop/Org/Org/spiders/PO_spider.py", line 37, in parse_layer2
p=response.xpath('//*[@id="block-views-poems-poems-block-all"]/div/div//tbody//td[2]//@href')[-1].extract()
File "//anaconda/lib/python2.7/site-packages/parsel/selector.py", line 56, in __getitem__
o = super(SelectorList, self).__getitem__(pos)
IndexError: list index out of range
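For what it's worth, the failing line in parse_layer2 is consistent with the XPath simply matching nothing: indexing an empty SelectorList raises exactly this IndexError. A minimal illustration using parsel, the selector library shown in the traceback (the markup here is hypothetical):

from parsel import Selector

sel = Selector(text='<html><body><p>no table here</p></body></html>')
matches = sel.xpath('//tbody//td[2]//@href')  # matches nothing in this markup
print(len(matches))  # 0
last = matches[-1]   # raises IndexError: list index out of range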
I'm working with Scrapy. I want to rotate proxies on a per-request basis, getting each proxy from an API I have that returns a single proxy. My plan is to make a request to the API, get a proxy, then use it to set the proxy, based on:
http://stackoverflow.com/questions/39430454/making-request-to-api-from-within-scrapy-function
I have the following:
class ContactSpider(Spider):
    name = "contact"

    def parse(self, response):
        ....
        PR = Request(
            'my_api',
            headers=self.headers,
            meta={'newrequest': Request(url_to_scrape, headers=self.headers)},
            callback=self.parse_PR
        )
        yield PR

    def parse_PR(self, response):
        newrequest = response.meta['newrequest']
        proxy_data = response.body
        newrequest.meta['proxy'] = 'http://' + proxy_data
        newrequest.replace(url='http://ipinfo.io/ip')  # TESTING
        newrequest.replace(callback=self.form_output)  # TESTING
        yield newrequest

    def form_output(self, response):
        open_in_browser(response)
but I'm getting:
Traceback (most recent call last):
File "C:\twisted\internet\defer.py", line 1126, in _inlineCallbacks
result = result.throwExceptionIntoGenerator(g)
File "C:\twisted\python\failure.py", line 389, in throwExceptionIntoGenerator
return g.throw(self.type, self.value, self.tb)
File "C:\scrapy\core\downloader\middleware.py", line 43, in process_request
defer.returnValue((yield download_func(request=request,spider=spider)))
File "C:\scrapy\utils\defer.py", line 45, in mustbe_deferred
result = f(*args, **kw)
File "C:\scrapy\core\downloader\handlers\__init__.py", line 65, in download_request
return handler.download_request(request, spider)
File "C:\scrapy\core\downloader\handlers\http11.py", line 60, in download_request
return agent.download_request(request)
File "C:\scrapy\core\downloader\handlers\http11.py", line 255, in download_request
agent = self._get_agent(request, timeout)
File "C:\scrapy\core\downloader\handlers\http11.py", line 235, in _get_agent
_, _, proxyHost, proxyPort, proxyParams = _parse(proxy)
File "C:\scrapy\core\downloader\webclient.py", line 37, in _parse
return _parsed_url_args(parsed)
File "C:\scrapy\core\downloader\webclient.py", line 20, in _parsed_url_args
host = b(parsed.hostname)
File "C:\scrapy\core\downloader\webclient.py", line 17, in <lambda>
b = lambda s: to_bytes(s, encoding='ascii')
File "C:\scrapy\utils\python.py", line 117, in to_bytes
'object, got %s' % type(text).__name__)
TypeError: to_bytes must receive a unicode, str or bytes object, got NoneType
What am I doing wrong?
The traceback suggests Scrapy has encountered a request object whose url is None, where a string is expected.
These two lines in your code:
newrequest.replace(url = 'http://ipinfo.io/ip') #TESTING
newrequest.replace(callback= self.form_output) #TESTING
will not work as expected, since the Request.replace method returns a new instance instead of modifying the original request in place.
You would need something like this:
newrequest = newrequest.replace(url='http://ipinfo.io/ip')  # TESTING
newrequest = newrequest.replace(callback=self.form_output)  # TESTING
or simply:
newrequest = newrequest.replace(
url='http://ipinfo.io/ip',
callback=self.form_output
)
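Putting it together, the callback from the question might end up like this (a sketch; decoding response.body assumes the API returns a bare host:port string):

def parse_PR(self, response):
    newrequest = response.meta['newrequest']
    # Assumption: the proxy API returns a plain "host:port" body.
    proxy_data = response.body.decode('utf-8').strip()
    newrequest.meta['proxy'] = 'http://' + proxy_data
    # replace() returns a new Request, so rebind the name to keep the changes.
    newrequest = newrequest.replace(
        url='http://ipinfo.io/ip',   # TESTING
        callback=self.form_output,   # TESTING
    )
    yield newrequest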
I added restrict_xpaths rules to my Scrapy spider and now it immediately fails with:
2015-03-16 15:46:53+0000 [tsr] ERROR: Spider error processing <GET http://www.thestudentroom.co.uk/forumdisplay.php?f=143>
Traceback (most recent call last):
File "/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/twisted/internet/base.py", line 800, in runUntilCurrent
call.func(*call.args, **call.kw)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/twisted/internet/task.py", line 602, in _tick
taskObj._oneWorkUnit()
File "/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/twisted/internet/task.py", line 479, in _oneWorkUnit
result = self._iterator.next()
File "/Library/Python/2.7/site-packages/scrapy/utils/defer.py", line 57, in <genexpr>
work = (callable(elem, *args, **named) for elem in iterable)
--- <exception caught here> ---
File "/Library/Python/2.7/site-packages/scrapy/utils/defer.py", line 96, in iter_errback
yield next(it)
File "/Library/Python/2.7/site-packages/scrapy/contrib/spidermiddleware/offsite.py", line 26, in process_spider_output
for x in result:
File "/Library/Python/2.7/site-packages/scrapy/contrib/spidermiddleware/referer.py", line 22, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/Library/Python/2.7/site-packages/scrapy/contrib/spidermiddleware/urllength.py", line 33, in <genexpr>
return (r for r in result or () if _filter(r))
File "/Library/Python/2.7/site-packages/scrapy/contrib/spidermiddleware/depth.py", line 50, in <genexpr>
return (r for r in result or () if _filter(r))
File "/Library/Python/2.7/site-packages/scrapy/contrib/spiders/crawl.py", line 73, in _parse_response
for request_or_item in self._requests_to_follow(response):
File "/Library/Python/2.7/site-packages/scrapy/contrib/spiders/crawl.py", line 52, in _requests_to_follow
links = [l for l in rule.link_extractor.extract_links(response) if l not in seen]
File "/Library/Python/2.7/site-packages/scrapy/contrib/linkextractors/lxmlhtml.py", line 107, in extract_links
links = self._extract_links(doc, response.url, response.encoding, base_url)
File "/Library/Python/2.7/site-packages/scrapy/linkextractor.py", line 94, in _extract_links
return self.link_extractor._extract_links(*args, **kwargs)
File "/Library/Python/2.7/site-packages/scrapy/contrib/linkextractors/lxmlhtml.py", line 50, in _extract_links
for el, attr, attr_val in self._iter_links(selector._root):
File "/Library/Python/2.7/site-packages/scrapy/contrib/linkextractors/lxmlhtml.py", line 38, in _iter_links
for el in document.iter(etree.Element):
exceptions.AttributeError: 'str' object has no attribute 'iter'
I cannot understand why this error is happening.
Here is my short Spider:
import scrapy
from tutorial.items import DmozItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor

class TsrSpider(CrawlSpider):
    name = 'tsr'
    allowed_domains = ['thestudentroom.co.uk']
    start_urls = ['http://www.thestudentroom.co.uk/forumdisplay.php?f=143']

    download_delay = 4
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:35.0) Gecko/20100101 Firefox/35.0'

    rules = (
        Rule(
            LinkExtractor(
                allow=('forumdisplay\.php\?f=143\&page=\d',),
                restrict_xpaths=("//li[@class='pager-page_numbers']/a/@href",))),
        Rule(
            LinkExtractor(
                allow=('showthread\.php\?t=\d+\&page=\d+',),
                restrict_xpaths=("//li[@class='pager-page_numbers']/a/@href",)),
            callback='parse_link'),
        Rule(
            LinkExtractor(
                allow=('showthread\.php\?t=\d+',),
                restrict_xpaths=("//tr[@class='thread unread ']",)),
            callback='parse_link'),
    )

    def parse_link(self, response):
        # Iterate over posts.
        for sel in response.xpath("//li[@class='post threadpost old ']"):
            rating = sel.xpath(
                "div[@class='post-footer']//span[@class='score']/text()").extract()
            if not rating:
                rating = 0
            else:
                rating = rating[0]
            item = DmozItem()
            item['post'] = sel.xpath(
                "div[@class='post-content']/blockquote[@class='postcontent restore']/text()").extract()
            item['link'] = response.url
            item['topic'] = response.xpath(
                "//div[@class='forum-header section-header']/h1/span/text()").extract()
            item['rating'] = rating
            yield item
source: http://pastebin.com/YXdWvPgX
Can someone help me out? Where is the mistake? I've been searching for days!
The problem is that restrict_xpaths should point to elements (either the links themselves or containers holding links), not to attributes:
rules = [
    Rule(LinkExtractor(allow='forumdisplay\.php\?f=143\&page=\d',
                       restrict_xpaths="//li[@class='pager-page_numbers']/a")),
    Rule(LinkExtractor(allow='showthread\.php\?t=\d+\&page=\d+',
                       restrict_xpaths="//li[@class='pager-page_numbers']/a"),
         callback='parse_link'),
    Rule(LinkExtractor(allow='showthread\.php\?t=\d+',
                       restrict_xpaths="//tr[@class='thread unread ']"),
         callback='parse_link'),
]
Tested (worked for me).
FYI, Scrapy defines restrict_xpaths as "expressions pointing to regions":
restrict_xpaths (str or list) – is a XPath (or list of XPath’s) which
defines regions inside the response where links should be extracted
from. If given, only the text selected by those XPath will be scanned
for links. See examples below.
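If in doubt, you can sanity-check an extractor interactively in scrapy shell before wiring it into a rule (a sketch; response is the object the shell provides after fetching the start URL):

# Inside: scrapy shell "http://www.thestudentroom.co.uk/forumdisplay.php?f=143"
from scrapy.contrib.linkextractors import LinkExtractor

le = LinkExtractor(allow=('forumdisplay\.php\?f=143\&page=\d',),
                   restrict_xpaths="//li[@class='pager-page_numbers']/a")
links = le.extract_links(response)   # returns Link objects, not selectors
print(len(links), [l.url for l in links][:3])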
I'm new to Python and Scrapy. After setting restrict_xpaths to '//table[@class="lista"]' I received the following traceback. What's strange is that the crawler works properly with other XPath rules.
Traceback (most recent call last):
File "/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/twisted/internet/base.py", line 800, in runUntilCurrent
call.func(*call.args, **call.kw)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/twisted/internet/task.py", line 602, in _tick
taskObj._oneWorkUnit()
File "/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/twisted/internet/task.py", line 479, in _oneWorkUnit
result = self._iterator.next()
File "/Library/Python/2.7/site-packages/scrapy/utils/defer.py", line 57, in <genexpr>
work = (callable(elem, *args, **named) for elem in iterable)
--- <exception caught here> ---
File "/Library/Python/2.7/site-packages/scrapy/utils/defer.py", line 96, in iter_errback
yield it.next()
File "/Library/Python/2.7/site-packages/scrapy/contrib/spidermiddleware/offsite.py", line 23, in process_spider_output
for x in result:
File "/Library/Python/2.7/site-packages/scrapy/contrib/spidermiddleware/referer.py", line 22, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/Library/Python/2.7/site-packages/scrapy/contrib/spidermiddleware/urllength.py", line 33, in <genexpr>
return (r for r in result or () if _filter(r))
File "/Library/Python/2.7/site-packages/scrapy/contrib/spidermiddleware/depth.py", line 50, in <genexpr>
return (r for r in result or () if _filter(r))
File "/Library/Python/2.7/site-packages/scrapy/contrib/spiders/crawl.py", line 73, in _parse_response
for request_or_item in self._requests_to_follow(response):
File "/Library/Python/2.7/site-packages/scrapy/contrib/spiders/crawl.py", line 52, in _requests_to_follow
links = [l for l in rule.link_extractor.extract_links(response) if l not in seen]
File "/Library/Python/2.7/site-packages/scrapy/contrib/linkextractors/sgml.py", line 124, in extract_links
).encode(response.encoding)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/encodings/iso8859_2.py", line 12, in encode
return codecs.charmap_encode(input,errors,encoding_table)
exceptions.UnicodeEncodeError: 'charmap' codec can't encode character u'\xbb' in position 686: character maps to <undefined>
Here is the MySpider class.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from ds_crawl.items import DsCrawlItem

class MySpider(CrawlSpider):
    name = 'inside'
    allowed_domains = ['wroclaw.dlastudenta.pl']
    start_urls = ['http://wroclaw.dlastudenta.pl/stancje/']

    rules = (
        Rule(SgmlLinkExtractor(allow=('show_stancja'), restrict_xpaths=('//table[@class="lista"]')),
             callback='parse_item', follow=True),)

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.select("//p[@class='bbtext intextAd']")
        for title in titles:  # renamed from the shadowing "for titles in titles"
            item = DsCrawlItem()
            item['content'] = title.select("text()").extract()
            print item
Any explanation of this error and help will be appreciated. Thank you.
That's a bug caused by the web page using the » entity (&raquo;), which lxml translates to the Unicode character u'\xbb'. When the restrict_xpaths argument is used, the link extractor encodes the content back to the original encoding, iso8859-2, and that fails because \xbb is not a valid character in that encoding.
This simple line reproduces the exception:
>>> u'\xbb'.encode('iso8859-2')
...
UnicodeEncodeError: 'charmap' codec can't encode character u'\xbb' in position 0: character maps to <undefined>
A workaround is to force UTF-8 for all responses. This can be done with a simple downloader middleware:
# file: myproject/middlewares.py
class ForceUTF8Response(object):
    """A downloader middleware to force UTF-8 encoding for all responses."""
    encoding = 'utf-8'

    def process_response(self, request, response, spider):
        # Note: use response.body_as_unicode() instead of response.text in Scrapy <1.0.
        new_body = response.text.encode(self.encoding)
        return response.replace(body=new_body, encoding=self.encoding)
In your settings:
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.ForceUTF8Response': 100,
}
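Since the spider, and therefore the link extractor, only sees responses after the downloader middlewares have run, every response now reports UTF-8 and the failing iso8859-2 round-trip no longer happens.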