I've been using Scrapy for over a year with a script that someone else wrote for me. It worked fine until 6-8 weeks ago, when it started giving me the following error whenever I try to download. Does anyone have any ideas?
I am running this on Ubuntu 14.04 LTS.
Command: scrapy crawl googleplay
2015-08-30 13:10:31-0400 [googleplay] ERROR: Spider error processing <GET https://accounts.google.com/ServiceLogin?continue=https%3A%2F%2Fplay.google.com%2Fstore%2Fapps%2Fcategory%2FGAME&followup=https%3A%2F%2Fplay.google.com%2Fstore%2Fapps%2Fcategory%2FGAME&passive=1209600&service=googleplay>
Traceback (most recent call last):
File "/usr/lib/python2.7/dist-packages/twisted/internet/base.py", line 800, in runUntilCurrent
call.func(*call.args, **call.kw)
File "/usr/lib/python2.7/dist-packages/twisted/internet/task.py", line 595, in _tick
taskObj._oneWorkUnit()
File "/usr/lib/python2.7/dist-packages/twisted/internet/task.py", line 472, in _oneWorkUnit
result = self._iterator.next()
File "/usr/lib/pymodules/python2.7/scrapy/utils/defer.py", line 57, in <genexpr>
work = (callable(elem, *args, **named) for elem in iterable)
--- <exception caught here> ---
File "/usr/lib/pymodules/python2.7/scrapy/utils/defer.py", line 96, in iter_errback
yield next(it)
File "/usr/lib/pymodules/python2.7/scrapy/contrib/spidermiddleware/offsite.py", line 23, in process_spider_output
for x in result:
File "/usr/lib/pymodules/python2.7/scrapy/contrib/spidermiddleware/referer.py", line 22, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/usr/lib/pymodules/python2.7/scrapy/contrib/spidermiddleware/urllength.py", line 33, in <genexpr>
return (r for r in result or () if _filter(r))
File "/usr/lib/pymodules/python2.7/scrapy/contrib/spidermiddleware/depth.py", line 50, in <genexpr>
return (r for r in result or () if _filter(r))
File "/usr/lib/pymodules/python2.7/scrapy/contrib/spiders/crawl.py", line 73, in _parse_response
for request_or_item in self._requests_to_follow(response):
File "/usr/lib/pymodules/python2.7/scrapy/contrib/spiders/crawl.py", line 52, in _requests_to_follow
links = [l for l in rule.link_extractor.extract_links(response) if l not in seen]
File "/usr/lib/pymodules/python2.7/scrapy/contrib/linkextractors/sgml.py", line 129, in extract_links
links = self._extract_links(body, response.url, response.encoding, base_url)
File "/usr/lib/pymodules/python2.7/scrapy/contrib/linkextractors/sgml.py", line 29, in _extract_links
self.feed(response_text)
File "/usr/lib/python2.7/sgmllib.py", line 104, in feed
self.goahead(0)
File "/usr/lib/python2.7/sgmllib.py", line 174, in goahead
k = self.parse_declaration(i)
File "/usr/lib/python2.7/markupbase.py", line 98, in parse_declaration
decltype, j = self._scan_name(j, i)
File "/usr/lib/python2.7/markupbase.py", line 392, in _scan_name
% rawdata[declstartpos:declstartpos+20])
File "/usr/lib/python2.7/sgmllib.py", line 111, in error
raise SGMLParseError(message)
sgmllib.SGMLParseError: expected name token at '<!\\\\])/g,"\\\\$1").rep'
Here is my GooglePlay spider (after the update), along with the error message I am now receiving:
import re
import string
import requests
from scrapy import log
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.htmlparser import HtmlParserLinkExtractor
from scrapy.selector import Selector
from scrapy.http import Request
from scraper.items import ApkItem
from play import parse_app
class GooglePlaySpider(CrawlSpider):
name = 'googleplay'
start_urls = [
'https://play.google.com/store/apps'
]
rules = (
Rule(SgmlLinkExtractor(allow=('/store/apps$', )), callback='parse_category_group', follow=True),
Rule(SgmlLinkExtractor(allow=('/store/apps/category/.*', )), callback='parse_category', follow=True),
Rule(SgmlLinkExtractor(allow=('/store/search\?.*', )), callback='parse_search', follow=True),
)
def parse_category_group(self, response):
sel = Selector(response)
category_groups = sel.xpath('//div[@class="padded-content3 app-home-nav"]')
for category_group in category_groups:
category_group_name = category_group.xpath('h2/a/text()').extract()
categories = category_group.xpath('ul/li')
for category in categories:
category_name = category.xpath('a/text()').extract()
category_url = category.xpath('a/@href').extract()[0]
chars = string.ascii_uppercase + string.digits
for x in chars:
yield Request('https://play.google.com/store/search?q=' + x + '&c=apps', callback=self.parse_search)
for x in chars:
for y in chars:
yield Request('https://play.google.com/store/search?q=' + x + y + '&c=apps', callback=self.parse_search)
for x in chars:
for y in chars:
for z in chars:
yield Request('https://play.google.com/store/search?q=' + x + y + z + '&c=apps', callback=self.parse_search)
return
def parse_category(self, response):
base_path = response.url.split('?')[0]
if '/collection/' in response.url:
sel = Selector(response)
apps = sel.xpath('//a[@class="title"]')
has_app = False
for app in apps:
has_app = True
app_name = app.xpath('text()').extract()
app_url = app.xpath('@href').extract()
yield Request('https://play.google.com' + app_url[0], meta={'come_from': self.name}, callback=parse_app)
if has_app:
m = re.match(r'(.*)\?start=(\d+)&num=24', response.url)
if m is None:
start_number = 24
else:
start_number = int(m.group(2)) + 24
yield Request(base_path + '?start=' + str(start_number) + '&num=24', callback=self.parse_category)
return
def parse_search(self, response):
m = re.match(r'(.*)&start=(\d+)&num=24', response.url)
if m is None:
base_path = response.url
start_number = 24
else:
start_number = int(m.group(2)) + 24
base_path = m.group(1)
sel = Selector(response)
apps = sel.xpath('//a[contains(@href,"/store/apps/details")]')
has_app = False
for app in apps:
has_app = True
app_url = app.xpath('@href').extract()
yield Request('https://play.google.com' + app_url[0], meta={'come_from': self.name}, callback=parse_app)
if has_app:
yield Request(base_path + '&start=' + str(start_number) + '&num=24', callback=self.parse_search)
return
**** Error ****
Traceback (most recent call last):
File "/usr/bin/scrapy", line 4, in <module>
execute()
File "/usr/lib/pymodules/python2.7/scrapy/cmdline.py", line 143, in execute
_run_print_help(parser, _run_command, cmd, args, opts)
File "/usr/lib/pymodules/python2.7/scrapy/cmdline.py", line 89, in _run_print_help
func(*a, **kw)
File "/usr/lib/pymodules/python2.7/scrapy/cmdline.py", line 150, in _run_command
cmd.run(args, opts)
File "/usr/lib/pymodules/python2.7/scrapy/commands/crawl.py", line 47, in run
crawler = self.crawler_process.create_crawler()
File "/usr/lib/pymodules/python2.7/scrapy/crawler.py", line 87, in create_crawler
self.crawlers[name] = Crawler(self.settings)
File "/usr/lib/pymodules/python2.7/scrapy/crawler.py", line 25, in __init__
self.spiders = spman_cls.from_crawler(self)
File "/usr/lib/pymodules/python2.7/scrapy/spidermanager.py", line 35, in from_crawler
sm = cls.from_settings(crawler.settings)
File "/usr/lib/pymodules/python2.7/scrapy/spidermanager.py", line 31, in from_settings
return cls(settings.getlist('SPIDER_MODULES'))
File "/usr/lib/pymodules/python2.7/scrapy/spidermanager.py", line 22, in __init__
for module in walk_modules(name):
File "/usr/lib/pymodules/python2.7/scrapy/utils/misc.py", line 68, in walk_modules
submod = import_module(fullpath)
File "/usr/lib/python2.7/importlib/__init__.py", line 37, in import_module
__import__(name)
File "/home/darwin/ProjectKrutz/scraper/scraper/spiders/googlePlaySpider.py", line 12, in <module>
class GooglePlaySpider(CrawlSpider):
File "/home/darwin/ProjectKrutz/scraper/scraper/spiders/googlePlaySpider.py", line 18, in GooglePlaySpider
Rule(SgmlLinkExtractor(allow=('/store/apps$', )), callback='parse_category_group', follow=True),
NameError: name 'SgmlLinkExtractor' is not defined
The problem is that SgmlLinkExtractor does not cope well with comments, and the error message tells you that it tripped over one: <!.
So the solution is to change your spider and replace the SgmlLinkExtractor with either
from scrapy.contrib.linkextractors.htmlparser import HtmlParserLinkExtractor
or
from scrapy.contrib.linkextractors.lxmlhtml import LxmlParserLinkExtractor
Naturally, these are only the import statements; you also have to change the Rule definitions that use the link extractor so they use one of these classes instead.
Without seeing the exact code I could not say precisely where, but in the spider posted above the change belongs in the rules tuple.
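For example, on Scrapy 0.24 or newer the lxml-backed LinkExtractor accepts the same allow patterns as SgmlLinkExtractor, so the rules could be rewritten roughly like this (a sketch, not tested against the Play Store pages, and using LinkExtractor rather than the lower-level parser classes named above):
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor  # lxml-based on Scrapy 0.24+

class GooglePlaySpider(CrawlSpider):
    name = 'googleplay'
    start_urls = ['https://play.google.com/store/apps']

    rules = (
        # Same allow patterns as before; only the extractor class changes.
        Rule(LinkExtractor(allow=('/store/apps$', )), callback='parse_category_group', follow=True),
        Rule(LinkExtractor(allow=('/store/apps/category/.*', )), callback='parse_category', follow=True),
        Rule(LinkExtractor(allow=('/store/search\?.*', )), callback='parse_search', follow=True),
    )

    # parse_category_group, parse_category and parse_search stay exactly as they are.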
Related
I am trying to get the last row of the plate column and append new data after it, but it gives a corrupt-file error even though Scrapy itself is working properly.
I guess the error is due to the lines below, where I first open a pandas ExcelWriter object and then use a DataFrame to get the last row.
with pd.ExcelWriter('output_res.xlsx', mode='r+',if_sheet_exists='overlay') as writer:
df_last=pd.DataFrame('output_res.xlsx')
lastRow=df_last['plate'].iget(-1)
df_output = pd.DataFrame(itemList)
df_output.to_excel(writer, sheet_name='result', index=False, header=True,startrow=lastRow)
I also suspect that the variable lastRow ends up unassigned, which is why it does not pass a value to the to_excel method.
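For what it is worth, the lookup those lines are aiming at would normally read something like the sketch below (assuming output_res.xlsx already exists with a plate column in a sheet named result): pd.DataFrame('output_res.xlsx') only wraps the literal string, and Series.iget has been removed from pandas, so read_excel plus len/.iloc is the usual pattern.
import pandas as pd

# Sketch: read the existing workbook instead of wrapping the filename string.
df_last = pd.read_excel('output_res.xlsx', sheet_name='result')  # assumes the sheet is named 'result'
last_row = len(df_last)                  # number of data rows already written (header excluded)
last_plate = df_last['plate'].iloc[-1]   # last value in the plate column
Here is my full spider and the traceback: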
import scrapy
from scrapy.crawler import CrawlerProcess
import pandas as pd
class plateScraper(scrapy.Spider):
name = 'scrapePlate'
allowed_domains = ['dvlaregistrations.dvla.gov.uk']
def start_requests(self):
df=pd.read_excel('data.xlsx')
columnA_values=df['PLATE']
for row in columnA_values:
global plate_num_xlsx
plate_num_xlsx=row
base_url = f"https://dvlaregistrations.dvla.gov.uk/search/results.html?search={plate_num_xlsx}&action=index&pricefrom=0&priceto=&prefixmatches=&currentmatches=&limitprefix=&limitcurrent=&limitauction=&searched=true&openoption=&language=en&prefix2=Search&super=&super_pricefrom=&super_priceto="
url=base_url
yield scrapy.Request(url)
def parse(self, response):
itemList=[]
for row in response.css('div.resultsstrip'):
plate = row.css('a::text').get()
price = row.css('p::text').get()
if plate_num_xlsx==plate.replace(" ","").strip():
item= {"plate": plate.strip(), "price": price.strip()}
itemList.append(item)
yield item
else:
item = {"plate": plate.strip(), "price": "-"}
itemList.append(item)
yield item
with pd.ExcelWriter('output_res.xlsx', mode='r+',if_sheet_exists='overlay') as writer:
df_last=pd.DataFrame('output_res.xlsx')
lastRow=df_last['plate'].iget(-1)
df_output = pd.DataFrame(itemList)
df_output.to_excel(writer, sheet_name='result', index=False, header=True,startrow=lastRow)
process = CrawlerProcess()
process.crawl(plateScraper)
process.start()
gives an error
Traceback (most recent call last):
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\utils\defer.py", line 240, in iter_errback
yield next(it)
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\utils\python.py", line 338, in __next__
return next(self.data)
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\utils\python.py", line 338, in __next__
return next(self.data)
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\core\spidermw.py", line 79, in process_sync
for r in iterable:
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 29, in <genexpr>
return (r for r in result or () if self._filter(r, spider))
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\core\spidermw.py", line 79, in process_sync
for r in iterable:
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 336, in <genexpr>
return (self._set_referer(r, response) for r in result or ())
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\core\spidermw.py", line 79, in process_sync
for r in iterable:
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 28, in <genexpr>
return (r for r in result or () if self._filter(r, spider))
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\core\spidermw.py", line 79, in process_sync
for r in iterable:
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 32, in <genexpr>
return (r for r in result or () if self._filter(r, response, spider))
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\core\spidermw.py", line 79, in process_sync
for r in iterable:
File "C:\pythonPro\w_crawl\SimonDarak\scrpy_00.py", line 33, in parse
with pd.ExcelWriter('output_res.xlsx', mode='a',if_sheet_exists='overlay') as writer:
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\excel\_openpyxl.py", line 73, in __init__
self._book = load_workbook(self._handles.handle, **engine_kwargs)
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\openpyxl\reader\excel.py", line 317, in load_workbook
reader.read()
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\openpyxl\reader\excel.py", line 282, in read
self.read_worksheets()
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\openpyxl\reader\excel.py", line 228, in read_worksheets
ws_parser.bind_all()
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\openpyxl\worksheet\_reader.py", line 448, in bind_all
self.bind_cells()
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\openpyxl\worksheet\_reader.py", line 351, in bind_cells
for idx, row in self.parser.parse():
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\openpyxl\worksheet\_reader.py", line 144, in parse
for _, element in it:
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\xml\etree\ElementTree.py", line 1255, in iterator
data = source.read(16 * 1024)
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\zipfile.py", line 925, in read
data = self._read1(n)
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\zipfile.py", line 1015, in _read1
self._update_crc(data)
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\zipfile.py", line 943, in _update_crc
raise BadZipFile("Bad CRC-32 for file %r" % self.name)
zipfile.BadZipFile: Bad CRC-32 for file 'xl/worksheets/sheet1.xml'
Process finished with exit code -1
I took the list out of the parse method and defined it outside the class:
itemList=[]
def parse(self, response):
for row in response.css('div.resultsstrip'):
plate = row.css('a::text').get()
price = row.css('p::text').get()
a = plate.replace(" ", "").strip()
print(plate_num_xlsx,a,a == plate_num_xlsx)
if plate_num_xlsx==plate.replace(" ","").strip():
item= {"plate": plate.strip(), "price": price.strip()}
itemList.append(item)
yield item
else:
item = {"plate": plate_num_xlsx, "price": "-"}
itemList.append(item)
yield item
with pd.ExcelWriter('output_res.xlsx', mode='a', if_sheet_exists='overlay') as writer:  # mode must be 'w' or 'a'; if_sheet_exists requires 'a'
df_output = pd.DataFrame(itemList)
df_output.to_excel(writer, sheet_name='result', index=False, header=True)
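Another way to keep the workbook intact is to collect the rows during the crawl and write them a single time after the crawl has finished; a minimal sketch, reusing the itemList, filename and sheet name from the code above:
import pandas as pd
from scrapy.crawler import CrawlerProcess

itemList = []  # filled by plateScraper.parse while the crawl runs

process = CrawlerProcess()
process.crawl(plateScraper)  # plateScraper is the spider class defined above
process.start()              # blocks until the spider has finished

# One write at the end, so the xlsx file is never re-opened mid-crawl.
df_output = pd.DataFrame(itemList)
df_output.to_excel('output_res.xlsx', sheet_name='result', index=False, header=True)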
I'm trying to get Google Cloud Storage working with a Scrapy Cloud + Crawlera project so that I can save the text files I download. When I run my script I get an error that seems to be related to my Google permissions not being set up properly.
Error:
Traceback (most recent call last):
File "/usr/local/lib/python3.7/site-packages/scrapy/pipelines/media.py", line 68, in from_crawler
pipe = cls.from_settings(crawler.settings)
File "/usr/local/lib/python3.7/site-packages/scrapy/pipelines/files.py", line 325, in from_settings
return cls(store_uri, settings=settings)
File "/usr/local/lib/python3.7/site-packages/scrapy/pipelines/files.py", line 289, in __init__
self.store = self._get_store(store_uri)
File "/usr/local/lib/python3.7/site-packages/scrapy/pipelines/files.py", line 333, in _get_store
return store_cls(uri)
File "/usr/local/lib/python3.7/site-packages/scrapy/pipelines/files.py", line 217, in __init__
client = storage.Client(project=self.GCS_PROJECT_ID)
File "/app/python/lib/python3.7/site-packages/google/cloud/storage/client.py", line 82, in __init__
project=project, credentials=credentials, _http=_http
File "/app/python/lib/python3.7/site-packages/google/cloud/client.py", line 228, in __init__
Client.__init__(self, credentials=credentials, _http=_http)
File "/app/python/lib/python3.7/site-packages/google/cloud/client.py", line 133, in __init__
credentials, _ = google.auth.default()
File "/app/python/lib/python3.7/site-packages/google/auth/_default.py", line 305, in default
credentials, project_id = checker()
File "/app/python/lib/python3.7/site-packages/google/auth/_default.py", line 165, in _get_explicit_environ_credentials
os.environ[environment_vars.CREDENTIALS])
File "/app/python/lib/python3.7/site-packages/google/auth/_default.py", line 102, in _load_credentials_from_file
credential_type = info.get('type')
AttributeError: 'str' object has no attribute 'get'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.7/site-packages/twisted/internet/defer.py", line 1418, in _inlineCallbacks
result = g.send(result)
File "/usr/local/lib/python3.7/site-packages/scrapy/crawler.py", line 80, in crawl
self.engine = self._create_engine()
File "/usr/local/lib/python3.7/site-packages/scrapy/crawler.py", line 105, in _create_engine
return ExecutionEngine(self, lambda _: self.stop())
File "/usr/local/lib/python3.7/site-packages/scrapy/core/engine.py", line 70, in __init__
self.scraper = Scraper(crawler)
File "/usr/local/lib/python3.7/site-packages/scrapy/core/scraper.py", line 71, in __init__
self.itemproc = itemproc_cls.from_crawler(crawler)
File "/usr/local/lib/python3.7/site-packages/scrapy/middleware.py", line 53, in from_crawler
return cls.from_settings(crawler.settings, crawler)
File "/usr/local/lib/python3.7/site-packages/scrapy/middleware.py", line 35, in from_settings
mw = create_instance(mwcls, settings, crawler)
File "/usr/local/lib/python3.7/site-packages/scrapy/utils/misc.py", line 140, in create_instance
return objcls.from_crawler(crawler, *args, **kwargs)
File "/usr/local/lib/python3.7/site-packages/scrapy/pipelines/media.py", line 70, in from_crawler
pipe = cls()
TypeError: __init__() missing 1 required positional argument: 'store_uri'
__init__.py where I create the credentials file:
# Code from https://medium.com/@rutger_93697/i-thought-this-solution-was-somewhat-complex-3e8bc91f83f8
import os
import json
import pkgutil
import logging
path = "{}/google-cloud-storage-credentials.json".format(os.getcwd())
credentials_content = '<escaped JSON data>'
with open(path, "w") as text_file:
text_file.write(json.dumps(credentials_content))
logging.warning("Path to credentials: %s" % path)
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = path
settings.py:
BOT_NAME = 'get_case_urls'
SPIDER_MODULES = ['get_case_urls.spiders']
NEWSPIDER_MODULE = 'get_case_urls.spiders'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Crawlera
DOWNLOADER_MIDDLEWARES = {'scrapy_crawlera.CrawleraMiddleware': 300}
CRAWLERA_ENABLED = True
CRAWLERA_APIKEY = '<crawlera-api-key>'
CONCURRENT_REQUESTS = 32
CONCURRENT_REQUESTS_PER_DOMAIN = 32
AUTOTHROTTLE_ENABLED = False
DOWNLOAD_TIMEOUT = 600
ITEM_PIPELINES = {
'scrapy.pipelines.files.FilesPipeline': 500
}
FILES_STORE = 'gs://<name-of-my-gcs-project>'
IMAGES_STORE = 'gs://<name-of-my-gcs-project>'
GCS_PROJECT_ID = "<id-of-my-gcs-project>"
After looking at the code for _load_credentials_from_file it seems to me that I had not saved the JSON to a text file correctly: in __init__.py, rather than having text_file.write(json.dumps(credentials_content)), I should have had text_file.write(credentials_content) or text_file.write(json.dumps(json.loads(credentials_content))).
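In other words, the corrected __init__.py would look roughly like this (the credentials string itself stays the same escaped JSON blob as before; the only change is how it is written out):
import os
import json
import logging

path = "{}/google-cloud-storage-credentials.json".format(os.getcwd())
credentials_content = '<escaped JSON data>'

# Write the JSON document itself, not a JSON-encoded string containing it.
# Round-tripping through json.loads() also validates that the blob is well-formed.
with open(path, "w") as text_file:
    text_file.write(json.dumps(json.loads(credentials_content)))

logging.warning("Path to credentials: %s" % path)
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = path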
I am working on a college project and I need to make the code below work with SOCKS4 instead of Tor/SOCKS5. I have tried changing SOCKS5Agent to SOCKS4Agent, but then I receive an error (shown below my code).
Original code: https://stackoverflow.com/a/33944924/11219616
My code:
import scrapy.core.downloader.handlers.http11 as handler
from twisted.internet import reactor
from txsocksx.http import SOCKS4Agent
from twisted.internet.endpoints import TCP4ClientEndpoint
from scrapy.core.downloader.webclient import _parse
class TorScrapyAgent(handler.ScrapyAgent):
_Agent = SOCKS4Agent
def _get_agent(self, request, timeout):
proxy = request.meta.get('proxy')
if proxy:
proxy_scheme, _, proxy_host, proxy_port, _ = _parse(proxy)
if proxy_scheme == 'socks4':
endpoint = TCP4ClientEndpoint(reactor, proxy_host, proxy_port)
return self._Agent(reactor, proxyEndpoint=endpoint)
return super(TorScrapyAgent, self)._get_agent(request, timeout)
class TorHTTPDownloadHandler(handler.HTTP11DownloadHandler):
def download_request(self, request, spider):
agent = TorScrapyAgent(contextFactory=self._contextFactory, pool=self._pool,
maxsize=getattr(spider, 'download_maxsize', self._default_maxsize),
warnsize=getattr(spider, 'download_warnsize', self._default_warnsize))
return agent.download_request(request)
I get the error:
Traceback (most recent call last):
File "C:\Python27\lib\site-packages\twisted\internet\defer.py", line 1416, in _inlineCallbacks
result = result.throwExceptionIntoGenerator(g)
File "C:\Python27\lib\site-packages\twisted\python\failure.py", line 491, in throwExceptionIntoGenerator
return g.throw(self.type, self.value, self.tb)
File "C:\Python27\lib\site-packages\scrapy\core\downloader\middleware.py", line 43, in process_request
defer.returnValue((yield download_func(request=request,spider=spider)))
File "C:\Python27\lib\site-packages\ometa\protocol.py", line 53, in dataReceived
self._parser.receive(data)
File "C:\Python27\lib\site-packages\ometa\tube.py", line 41, in receive
status = self._interp.receive(data)
File "C:\Python27\lib\site-packages\ometa\interp.py", line 48, in receive
for x in self.next:
File "C:\Python27\lib\site-packages\ometa\interp.py", line 177, in apply
for x in self._apply(f, ruleName, argvals):
File "C:\Python27\lib\site-packages\ometa\interp.py", line 110, in _apply
for x in rule():
File "C:\Python27\lib\site-packages\ometa\interp.py", line 256, in parse_Or
for x in self._eval(subexpr):
File "C:\Python27\lib\site-packages\ometa\interp.py", line 241, in parse_And
for x in self._eval(subexpr):
File "C:\Python27\lib\site-packages\ometa\interp.py", line 440, in parse_Action
val = eval(expr.data, self.globals, self._localsStack[-1])
File "<string>", line 1, in <module>
File "C:\Python27\lib\site-packages\txsocksx\client.py", line 276, in serverResponse
raise e.socks4ErrorMap.get(status)()
RequestRejectedOrFailed
I added restrict_xpaths rules to my scrapy spider and now it immediately fails with:
2015-03-16 15:46:53+0000 [tsr] ERROR: Spider error processing <GET http://www.thestudentroom.co.uk/forumdisplay.php?f=143>
Traceback (most recent call last):
File "/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/twisted/internet/base.py", line 800, in runUntilCurrent
call.func(*call.args, **call.kw)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/twisted/internet/task.py", line 602, in _tick
taskObj._oneWorkUnit()
File "/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/twisted/internet/task.py", line 479, in _oneWorkUnit
result = self._iterator.next()
File "/Library/Python/2.7/site-packages/scrapy/utils/defer.py", line 57, in <genexpr>
work = (callable(elem, *args, **named) for elem in iterable)
--- <exception caught here> ---
File "/Library/Python/2.7/site-packages/scrapy/utils/defer.py", line 96, in iter_errback
yield next(it)
File "/Library/Python/2.7/site-packages/scrapy/contrib/spidermiddleware/offsite.py", line 26, in process_spider_output
for x in result:
File "/Library/Python/2.7/site-packages/scrapy/contrib/spidermiddleware/referer.py", line 22, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/Library/Python/2.7/site-packages/scrapy/contrib/spidermiddleware/urllength.py", line 33, in <genexpr>
return (r for r in result or () if _filter(r))
File "/Library/Python/2.7/site-packages/scrapy/contrib/spidermiddleware/depth.py", line 50, in <genexpr>
return (r for r in result or () if _filter(r))
File "/Library/Python/2.7/site-packages/scrapy/contrib/spiders/crawl.py", line 73, in _parse_response
for request_or_item in self._requests_to_follow(response):
File "/Library/Python/2.7/site-packages/scrapy/contrib/spiders/crawl.py", line 52, in _requests_to_follow
links = [l for l in rule.link_extractor.extract_links(response) if l not in seen]
File "/Library/Python/2.7/site-packages/scrapy/contrib/linkextractors/lxmlhtml.py", line 107, in extract_links
links = self._extract_links(doc, response.url, response.encoding, base_url)
File "/Library/Python/2.7/site-packages/scrapy/linkextractor.py", line 94, in _extract_links
return self.link_extractor._extract_links(*args, **kwargs)
File "/Library/Python/2.7/site-packages/scrapy/contrib/linkextractors/lxmlhtml.py", line 50, in _extract_links
for el, attr, attr_val in self._iter_links(selector._root):
**File "/Library/Python/2.7/site-packages/scrapy/contrib/linkextractors/lxmlhtml.py", line 38, in _iter_links
for el in document.iter(etree.Element):
exceptions.AttributeError: 'str' object has no attribute 'iter'**
I cannot understand why this error is happening.
Here is my short Spider:
import scrapy
from tutorial.items import DmozItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
class TsrSpider(CrawlSpider):
name = 'tsr'
allowed_domains = ['thestudentroom.co.uk']
start_urls = ['http://www.thestudentroom.co.uk/forumdisplay.php?f=143']
download_delay = 4
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:35.0) Gecko/20100101 Firefox/35.0'
rules = (
Rule(
LinkExtractor(
allow=('forumdisplay\.php\?f=143\&page=\d',),
restrict_xpaths=("//li[@class='pager-page_numbers']/a/@href",))),
Rule(
LinkExtractor(
allow=('showthread\.php\?t=\d+\&page=\d+',),
restrict_xpaths=("//li[@class='pager-page_numbers']/a/@href",)),
callback='parse_link'),
Rule(
LinkExtractor(
allow=('showthread\.php\?t=\d+',),
restrict_xpaths=("//tr[@class='thread unread ']",)),
callback='parse_link'),
)
def parse_link(self, response):
# Iterate over posts.
for sel in response.xpath("//li[@class='post threadpost old ']"):
rating = sel.xpath(
"div[@class='post-footer']//span[@class='score']/text()").extract()
if not rating:
rating = 0
else:
rating = rating[0]
item = DmozItem()
item['post'] = sel.xpath(
"div[@class='post-content']/blockquote[@class='postcontent restore']/text()").extract()
item['link'] = response.url
item['topic'] = response.xpath(
"//div[@class='forum-header section-header']/h1/span/text()").extract()
item['rating'] = rating
yield item
source: http://pastebin.com/YXdWvPgX
Can someone help me out? Where is the mistake? I've been searching for days!
The problem is that restrict_xpaths should point to elements (either the links themselves or containers holding the links), not to attributes:
rules = [
Rule(LinkExtractor(allow='forumdisplay\.php\?f=143\&page=\d',
restrict_xpaths="//li[@class='pager-page_numbers']/a")),
Rule(LinkExtractor(allow='showthread\.php\?t=\d+\&page=\d+',
restrict_xpaths="//li[@class='pager-page_numbers']/a"),
callback='parse_link'),
Rule(LinkExtractor(allow='showthread\.php\?t=\d+',
restrict_xpaths="//tr[@class='thread unread ']"),
callback='parse_link'),
]
Tested (worked for me).
FYI, Scrapy defines restrict_xpaths as "expressions pointing to regions":
restrict_xpaths (str or list) – is a XPath (or list of XPath’s) which
defines regions inside the response where links should be extracted
from. If given, only the text selected by those XPath will be scanned
for links. See examples below.
I'm getting an error, "Unhashable type: list", in the rules I have defined to extract the next-button link.
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider,Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from walmart_sample.items import WalmartSampleItem
class MySpider(CrawlSpider):
name = "my_spider"
domain = ['Apparel']
keyword = 'Bags'
departments = {"All Departments": "0", "Apparel": "5438", "Auto": "91083", "Baby": "5427", "Beauty": "1085666","Books": "3920", "Electronics": "3944", "Gifts": "1094765", "Grocery": "976759", "Health": "976760","Home": "4044", "Home Improvement": "1072864", "Jwelery": "3891", "Movies": "4096", "Music": "4104","Party": "2637", "Patio": "5428", "Pets": "5440", "Pharmacy": "5431", "Photo Center": "5426","Sports": "4125", "Toys": "4171", "Video Games": "2636"}
allowed_domains = ['walmart.com']
denied_domains = ['reviews.walmart.com','facebook.com','twitter.com']
rules = (Rule(SgmlLinkExtractor(allow=("http://www.walmart.com/search/search-ng.do?tab_value=all&search_query=%s&search_constraint=%s&Find=Find&pref_store=1801&ss=false&ic=16_\d*2&_mm=" %(keyword,departments.get(domain))),),restrict_xpaths=('//li[@class="btn-nextResults"]'),callback='parse',follow=True),)
def start_requests(self):
for domains in self.domain:
if domains in self.departments:
url = 'http://www.walmart.com/search/search-ng.do?search_query=%s&ic=16_0&Find=Find&search_constraint=%s' % (self.keyword, self.departments.get(domains))
yield Request(url)
def parse(self, response):
hxs = HtmlXPathSelector(response)
links = hxs.select('//a[@class="prodLink ListItemLink"]/@href')
last = hxs.select('//a[@class="SPPagNoLink jump next"]').extract()
if last is None:
for link in links:
href = link.extract()
yield Request('http://www.walmart.com/' + href, self.parse_data)
else:
print "<<<<<Last Page>>>>>>"
def parse_data(self, response):
hxs = HtmlXPathSelector(response)
items=[]
walmart=WalmartSampleItem()
walmart['Title']=hxs.select('//h1[@class="productTitle"]/text()').extract()
walmart['Price']=hxs.select('//span[@class="bigPriceText1"]/text()').extract()+hxs.select('//span[@class="smallPriceText1"]/text()').extract()
walmart['Availability']=hxs.select('//span[@id="STORE_AVAIL"]/text()').extract()
walmart['Description']=hxs.select('//span[@class="ql-details-short-desc"]/p/text()').extract()
#walmart['Avg_Rating']=
#walmart['Detailed_Rating']=
items.append(walmart)
return items
Traceback (most recent call last):
File "/usr/bin/scrapy", line 4, in <module>
execute()
File "/usr/lib/pymodules/python2.7/scrapy/cmdline.py", line 143, in execute
_run_print_help(parser, _run_command, cmd, args, opts)
File "/usr/lib/pymodules/python2.7/scrapy/cmdline.py", line 89, in _run_print_help
func(*a, **kw)
File "/usr/lib/pymodules/python2.7/scrapy/cmdline.py", line 150, in _run_command
cmd.run(args, opts)
File "/usr/lib/pymodules/python2.7/scrapy/commands/crawl.py", line 47, in run
crawler = self.crawler_process.create_crawler()
File "/usr/lib/pymodules/python2.7/scrapy/crawler.py", line 87, in create_crawler
self.crawlers[name] = Crawler(self.settings)
File "/usr/lib/pymodules/python2.7/scrapy/crawler.py", line 25, in __init__
self.spiders = spman_cls.from_crawler(self)
File "/usr/lib/pymodules/python2.7/scrapy/spidermanager.py", line 35, in from_crawler
sm = cls.from_settings(crawler.settings)
File "/usr/lib/pymodules/python2.7/scrapy/spidermanager.py", line 31, in from_settings
return cls(settings.getlist('SPIDER_MODULES'))
File "/usr/lib/pymodules/python2.7/scrapy/spidermanager.py", line 22, in __init__
for module in walk_modules(name):
File "/usr/lib/pymodules/python2.7/scrapy/utils/misc.py", line 68, in walk_modules
submod = import_module(fullpath)
File "/usr/lib/python2.7/importlib/__init__.py", line 37, in import_module
__import__(name)
File "/home/vivek/mywork/walmart_sample/walmart_sample/spiders/test.py", line 8, in <module>
class MySpider(CrawlSpider):
File "/home/vivek/mywork/walmart_sample/walmart_sample/spiders/test.py", line 15, in MySpider
rules = (Rule(SgmlLinkExtractor(allow=("http://www.walmart.com/search/search-ng.do?tab_value=all&search_query=%s&search_constraint=%s&Find=Find&pref_store=1801&ss=false&ic=16_\d*2&_mm=" %(keyword,departments.get(domain))),),restrict_xpaths=('//li[@class="btn-nextResults"]'),callback='parse',follow=True),)
TypeError: unhashable type: 'list'
The problematic bit is this:
departments.get(domain)
domain is a list, and a list is not hashable, so it cannot be used as a dictionary key (which is why departments.get(domain) raises "unhashable type"). You need to specify which individual item in the list you want to use; in this case domain[0] fixes the problem, and your rule becomes:
rules = (Rule(SgmlLinkExtractor(allow=("http://www.walmart.com/search/search-ng.do?tab_value=all&search_query=%s&search_constraint=%s&Find=Find&pref_store=1801&ss=false&ic=16_\d*2&_mm=" %(keyword,departments.get(domain[0]))),),restrict_xpaths=('//li[@class="btn-nextResults"]'),callback='parse',follow=True),)