I have a multi-level Scrapy spider that works locally, but returns GeneratorExit on every request when run on Scrapy Cloud.
Here are the parse methods:
def parse(self, response):
results = list(response.css(".list-group li a::attr(href)"))
for c in results:
meta = {}
for key in response.meta.keys():
meta[key] = response.meta[key]
yield response.follow(c,
callback=self.parse_category,
meta=meta,
errback=self.errback_httpbin)
def parse_category(self, response):
category_results = list(response.css(
".item a.link-unstyled::attr(href)"))
category = response.css(".active [itemprop='title']")
for r in category_results:
meta = {}
for key in response.meta.keys():
meta[key] = response.meta[key]
meta["category"] = category
yield response.follow(r, callback=self.parse_item,
meta=meta,
errback=self.errback_httpbin)
def errback_httpbin(self, failure):
# log all failures
self.logger.error(repr(failure))
Here's the traceback:
Traceback (most recent call last):
File "/usr/local/lib/python3.6/site-packages/scrapy/utils/defer.py", line 102, in iter_errback
yield next(it)
GeneratorExit
[stderr] Exception ignored in: <generator object iter_errback at 0x7fdea937a9e8>
File "/usr/local/lib/python3.6/site-packages/twisted/internet/base.py", line 1243, in run
self.mainLoop()
File "/usr/local/lib/python3.6/site-packages/twisted/internet/base.py", line 1252, in mainLoop
self.runUntilCurrent()
File "/usr/local/lib/python3.6/site-packages/twisted/internet/base.py", line 878, in runUntilCurrent
call.func(*call.args, **call.kw)
File "/usr/local/lib/python3.6/site-packages/twisted/internet/task.py", line 671, in _tick
taskObj._oneWorkUnit()
--- <exception caught here> ---
File "/usr/local/lib/python3.6/site-packages/twisted/internet/task.py", line 517, in _oneWorkUnit
result = next(self._iterator)
File "/usr/local/lib/python3.6/site-packages/scrapy/utils/defer.py", line 63, in <genexpr>
work = (callable(elem, *args, **named) for elem in iterable)
File "/usr/local/lib/python3.6/site-packages/scrapy/core/scraper.py", line 183, in _process_spidermw_output
self.crawler.engine.crawl(request=output, spider=spider)
File "/usr/local/lib/python3.6/site-packages/scrapy/core/engine.py", line 210, in crawl
self.schedule(request, spider)
File "/usr/local/lib/python3.6/site-packages/scrapy/core/engine.py", line 216, in schedule
if not self.slot.scheduler.enqueue_request(request):
File "/usr/local/lib/python3.6/site-packages/scrapy/core/scheduler.py", line 57, in enqueue_request
dqok = self._dqpush(request)
File "/usr/local/lib/python3.6/site-packages/scrapy/core/scheduler.py", line 86, in _dqpush
self.dqs.push(reqd, -request.priority)
File "/usr/local/lib/python3.6/site-packages/queuelib/pqueue.py", line 35, in push
q.push(obj) # this may fail (eg. serialization error)
File "/usr/local/lib/python3.6/site-packages/scrapy/squeues.py", line 15, in push
s = serialize(obj)
File "/usr/local/lib/python3.6/site-packages/scrapy/squeues.py", line 27, in _pickle_serialize
return pickle.dumps(obj, protocol=2)
builtins.TypeError: can't pickle HtmlElement objects
I set an errback, but it doesn't provide any error details. I also pass meta explicitly in every request, but it makes no difference. Am I missing something?
Update:
It seems the error is specific to multi-level spiders. For now, I have rewritten this one with a single parse method.
One of the differences between running a job locally and on Scrapy Cloud is that the JOBDIR setting is enabled, which makes Scrapy serialize requests into a disk queue instead of a memory one.
When serializing to disk, pickling fails because your request.meta dict contains a SelectorList (assigned in the line category = response.css(".active [itemprop='title']")), and its selectors wrap instances of lxml.html.HtmlElement, which cannot be pickled (a limitation of lxml, not of Scrapy). Hence the TypeError: can't pickle HtmlElement objects.
There is a merged pull request that addresses this issue. It does not make such requests picklable; instead, it tells the scheduler not to try serializing these requests to disk, so they go to the memory queue instead.
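In practice, the simplest fix is to store a plain (picklable) string in meta instead of the SelectorList. A sketch of parse_category under that assumption (the ::text pseudo-element and the extract_first() call are my additions, not the original code):

def parse_category(self, response):
    category_results = response.css(".item a.link-unstyled::attr(href)")
    # Extract a plain string: str pickles fine, a SelectorList does not.
    category = response.css(".active [itemprop='title']::text").extract_first()
    for r in category_results:
        meta = dict(response.meta)  # shallow copy of the parent meta
        meta["category"] = category
        yield response.follow(r, callback=self.parse_item,
                              meta=meta,
                              errback=self.errback_httpbin)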
Related
I am keeping a naive connection pool in a Python dictionary. For reference, I am using asyncio within Sanic, if that matters.
Not often, but at times, I get this error:
Traceback (most recent call last):
File "/Users/Documents/venv/lib/python3.6/site-packages/sanic/app.py", line 556, in handle_request
response = await response
File "/usr/local/Cellar/python/3.6.4_4/Frameworks/Python.framework/Versions/3.6/lib/python3.6/asyncio/coroutines.py", line 110, in __next__
return self.gen.send(None)
File "/Users/Documents/Project/<proj>/<dir>/devices/services.py", line 181, in dev_execute_cmd
return HTTPResponse(output, content_type='application/json')
File "/usr/local/Cellar/python/3.6.4_4/Frameworks/Python.framework/Versions/3.6/lib/python3.6/asyncio/coroutines.py", line 110, in __next__
return self.gen.send(None)
File "/Users/Documents/Project/<proj>/<dir>/devices/services.py", line 132, in dev_execute_cmd
uid, last_cmd, mode)
File "/usr/local/Cellar/python/3.6.4_4/Frameworks/Python.framework/Versions/3.6/lib/python3.6/asyncio/coroutines.py", line 110, in __next__
return self.gen.send(None)
File "/Users/Documents/Project/<proj>/<dir>/devices/managers.py", line 260, in async_dev_execute_cmd
# return False if max connections have been exceeded
File "/usr/local/Cellar/python/3.6.4_4/Frameworks/Python.framework/Versions/3.6/lib/python3.6/asyncio/coroutines.py", line 110, in __next__
return self.gen.send(None)
File "/Users/Documents/Project/<proj>/<dir>/devices/managers.py", line 122, in async_open_connection
ip_addr_conn_count = self.per_dev_conn_count.get(device.ip_addr, 0) + 1
KeyError: '10.32.255.80'
My question is: how is a KeyError possible when using .get()? In what scenarios can this happen? One thing I have noticed is that this error only occurs, albeit rarely, when I'm running concurrent requests.
In my understanding, asyncio uses an event loop, so it schedules tasks and pauses them while they wait. So in my mind, two or more concurrent requests should never really hit the same dictionary at the same time.
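For what it's worth, a plain built-in dict's .get with a default can never raise KeyError for a missing key, so something else must be happening on that line: a stale traceback line number, device.ip_addr itself raising, or per_dev_conn_count not being a plain dict. A minimal illustration of the last case (StrictPool is a hypothetical stand-in):

# A plain dict's .get never raises KeyError for a missing key:
per_dev_conn_count = {}
assert per_dev_conn_count.get('10.32.255.80', 0) + 1 == 1

# But a mapping subclass that overrides .get can raise exactly this way:
class StrictPool(dict):
    def get(self, key, default=None):
        return self[key]  # ignores the default -> KeyError on a miss

StrictPool().get('10.32.255.80', 0)  # KeyError: '10.32.255.80'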
Thanks in advance!
I'm trying to get the GET parameters from the URL. I have it working in my __init__.py file, but in a different file it's not working.
I tried using with app.app_context():, but I am still getting the same error.
def log_entry(entity, type, entity_id, data, error):
with app.app_context():
zip_id = request.args.get('id')
RuntimeError: working outside of request context
Any suggestions?
Additional Info:
This is using the Flask web framework, which is set up as a service (an API).
Example URL the user would hit http://website.com/api/endpoint?id=1
As mentioned above, `zip_id = request.args.get('id')` works fine in the main file, but this code is in runners.py (just another file containing function definitions).
Full traceback:
Debugging middleware caught exception in streamed response at a point where response headers were already sent.
Traceback (most recent call last):
File "/Users/ereeve/.virtualenvs/pi-automation-api/lib/python2.7/site-packages/werkzeug/wsgi.py", line 703, in __next__
return self._next()
File "/Users/ereeve/.virtualenvs/pi-automation-api/lib/python2.7/site-packages/werkzeug/wrappers.py", line 81, in _iter_encoded
for item in iterable:
File "/Users/ereeve/Documents/TechSol/pi-automation-api/automation_api/runners.py", line 341, in create_agencies
log_entry("test", "created", 1, "{'data':'hey'}", "")
File "/Users/ereeve/Documents/TechSol/pi-automation-api/automation_api/runners.py", line 315, in log_entry
zip_id = request.args.get('id')
File "/Users/ereeve/.virtualenvs/pi-automation-api/lib/python2.7/site-packages/werkzeug/local.py", line 343, in __getattr__
return getattr(self._get_current_object(), name)
File "/Users/ereeve/.virtualenvs/pi-automation-api/lib/python2.7/site-packages/werkzeug/local.py", line 302, in _get_current_object
return self.__local()
File "/Users/ereeve/.virtualenvs/pi-automation-api/lib/python2.7/site-packages/flask/globals.py", line 20, in _lookup_req_object
raise RuntimeError('working outside of request context')
RuntimeError: working outside of request context
The function in the same file that calls log_entry:
def create_agencies(country_code, DB, session):
document = DB.find_one({'rb_account_id': RB_COUNTRIES_new[country_code]['rb_account_id']})
t2 = new_t2(session)
log_entry("test", "created", 1, "{'data':'hey'}", "")
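One way to avoid touching request outside of a request context (a sketch, not from the original post; the route and the zip_id parameter are my assumptions) is to read request.args inside the view, where the context is guaranteed to be active, and hand the value to log_entry as a plain argument:

from flask import Flask, request

app = Flask(__name__)  # in the real project this lives in __init__.py

def log_entry(entity, type, entity_id, data, error, zip_id=None):
    # No `request` access here; the value arrives as an ordinary argument.
    print("%s %s %s %s %s id=%s" % (entity, type, entity_id, data, error, zip_id))

@app.route('/api/endpoint')
def endpoint():
    # Inside a view the request context is active, so this is safe.
    zip_id = request.args.get('id')
    log_entry("test", "created", 1, "{'data':'hey'}", "", zip_id=zip_id)
    return 'ok'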
I'm working on an application in which a server and a client are being created; the ServerAPI uses SimpleXMLRPCServer and the ClientAPI uses xmlrpclib. The client is initialized with:
class w_Client:
def __init__(self, ServerIP, ServerPort, ClientIP):
self.conn = xmlrpclib.ServerProxy("http://" + ServerIP + ":" + str(ServerPort))
self.ClientIP = ClientIP
Upon a button being pressed in the application, an XML specification file is created and passed through:
def Create(self, XMLstring):
return self.conn.Create(XMLstring, self.ClientIP)
I've already checked to make sure that the XMLstring is valid XML; however, when I press the button, I get the following error:
Traceback (most recent call last):
File "/home/app/UI/MainWindow.py", line 461, in compile
xmlFile = compiler.compile()
File "/home/app/Core/Compiler.py", line 75, in compile
self.compile_top()
File "/home/app/Core/Compiler.py", line 354, in compile_top
status = mainWidgets["w_client"].Create(xmlString)
File "/home/app/Wireless/ClientAPI.py", line 12, in Create
return self.conn.Create(XMLstring, self.ClientIP)
File "/usr/lib/python2.7/xmlrpclib.py", line 1233, in __call__
return self.__send(self.__name, args)
File "/usr/lib/python2.7/xmlrpclib.py", line 1591, in __request
verbose=self.__verbose
File "/usr/lib/python2.7/xmlrpclib.py", line 1273, in request
return self.single_request(host, handler, request_body, verbose)
File "/usr/lib/python2.7/xmlrpclib.py", line 1306, in single_request
return self.parse_response(response)
File "/usr/lib/python2.7/xmlrpclib.py", line 1482, in parse_response
return u.close()
File "/usr/lib/python2.7/xmlrpclib.py", line 794, in close
raise Fault(**self._stack[0])
xmlrpclib.Fault: <Fault 1: "<type 'exceptions.TypeError'>:'NoneType' object has no attribute '__getitem__'">
I've also made sure that the ClientIP is passed correctly. Otherwise, I'm not entirely sure what's going on or how to even go about fixing it.
<type 'exceptions.TypeError'>:'NoneType' object has no attribute '__getitem__'
This exception may have been generated by the xmlrpc method you were calling (i.e. server side).
I suggest that you add verbose=True to your instantiation of the server proxy:
xmlrpclib.ServerProxy("http://" + ServerIP + ":" + str(ServerPort), verbose=True)
This will allow you to see what you're sending and receiving.
It seems the method you're calling is expecting a dict.
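For reference, that Fault string matches what Python 2 raises when server-side code subscripts a value that turned out to be None, e.g. a lookup that found nothing:

# Minimal Python 2 reproduction of the error wrapped in the Fault above:
spec = None      # e.g. the result of a lookup inside the Create handler
spec['status']   # TypeError: 'NoneType' object has no attribute '__getitem__'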
I recently updated my version of Django from 1.2.5 to 1.7. Once done, all new transactions on my app were working as expected. However, whenever I try to access a pickled object, I get the error
EncodeError: 'QuerySet' object has no attribute '_prefetch_related_lookups'
Here is the error thrown
'QuerySet' object has no attribute '_prefetch_related_lookups'
Traceback (most recent call last):
File "/foo/bar/gateway/baseGateway.py", line 108, in queueMessage
eng.processMessage(msgRow)
File "/foo/bar/engine/processor.py", line 101, in processMessage
tasks.deliverMessage.apply_async(args=[foo, bar], queue='message-deliver')
File "/opt/bitnami/python/lib/python2.7/site-packages/celery/app/task.py", line 555, in apply_async
**dict(self._get_exec_options(), **options)
File "/opt/bitnami/python/lib/python2.7/site-packages/celery/app/base.py", line 353, in send_task
reply_to=reply_to or self.oid, **options
File "/opt/bitnami/python/lib/python2.7/site-packages/celery/app/amqp.py", line 305, in publish_task
**kwargs
File "/opt/bitnami/python/lib/python2.7/site-packages/kombu/messaging.py", line 161, in publish
compression, headers)
File "/opt/bitnami/python/lib/python2.7/site-packages/kombu/messaging.py", line 237, in _prepare
body) = dumps(body, serializer=serializer)
File "/opt/bitnami/python/lib/python2.7/site-packages/kombu/serialization.py", line 164, in dumps
payload = encoder(data)
File "/opt/bitnami/python/lib/python2.7/contextlib.py", line 35, in __exit__
self.gen.throw(type, value, traceback)
File "/opt/bitnami/python/lib/python2.7/site-packages/kombu/serialization.py", line 59, in _reraise_errors
reraise(wrapper, wrapper(exc), sys.exc_info()[2])
File "/opt/bitnami/python/lib/python2.7/site-packages/kombu/serialization.py", line 55, in _reraise_errors
yield
File "/opt/bitnami/python/lib/python2.7/site-packages/kombu/serialization.py", line 164, in dumps
payload = encoder(data)
File "/opt/bitnami/python/lib/python2.7/site-packages/kombu/serialization.py", line 356, in pickle_dumps
return dumper(obj, protocol=pickle_protocol)
File "/opt/bitnami/python/lib/python2.7/site-packages/django/db/models/query.py", line 113, in __reduce__
return super(QuerySet, self).__reduce__()
File "/opt/bitnami/python/lib/python2.7/copy_reg.py", line 84, in _reduce_ex
dict = getstate()
File "/opt/bitnami/python/lib/python2.7/site-packages/django/db/models/query.py", line 91, in __getstate__
self._fetch_all()
File "/opt/bitnami/python/lib/python2.7/site-packages/django/db/models/query.py", line 967, in _fetch_all
if self._prefetch_related_lookups and not self._prefetch_done:
EncodeError: 'QuerySet' object has no attribute '_prefetch_related_lookups'
Following some of the solutions offered online and by Django (here and here), I cleared the sessions table in Django, to no avail; the error still persists. I use memcache in my application too, and I cleared that as well. I also use Celery.
Anyone know how to fix this?
I was seeing a related issue when trying to change a queryset on a model form within a view. The error was:
'NoneType' object has no attribute '_prefetch_related_lookups'
forms.py
class S1Form(forms.ModelForm):
library = forms.ModelChoiceField(
queryset = Library.objects.all(),
to_field_name = 'title',
required = True,
widget = forms.Select(
attrs = {
'class': 'custom-select'}
),
disabled = False,
empty_label = 'Select a library'
)
views.py
class FilteredSpectraSearchListView(SingleTableMixin, FilterView):
...
def get_context_data(self, **kwargs):
context = super().get_context_data(**kwargs)
context['sform'] = S1Form()
context['sform'].fields['library'].queryset = None
if <something>:
context['sform'].fields['library'].queryset = <...>
elif <something-else>:
context['sform'].fields['library'].queryset = <...>
return context
The goal was to have an empty queryset initially, which is later changed based on a few conditional statements. The problem was that the conditional "<something>" was not firing, and None remained the queryset. The solution was simply to provide an empty queryset rather than None for this case:
...
context['sform'].fields['library'].queryset = Library.objects.none()
if <something>:
context['sform'].fields['library'].queryset = <...>
elif <something-else>:
context['sform'].fields['library'].queryset = <...>
...
Googling turns up these results:
resetting sessions app fixed the issue (at least for the moment...)
https://code.djangoproject.com/ticket/18674
The problem is the sessions; I had to delete them all for it to work. Additionally, I changed the SECRET_KEY setting so that all existing sessions no longer validate.
http://jj.isgeek.net/2013/04/django-queryset-object-has-no-attribute-_prefetch_related_lookups/
You have some serialized data from Django < 1.4, and Kombu tries to deserialize it in your current Django version.
I don't know where Kombu saves its serialized data, but that's where you should look. You should either delete the stale data, or if you need to keep the data, manually change it to match your current Django version.
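To see why such stale pickles break, note that unpickling restores only the instance's __dict__ and never calls __init__, so attributes that newer class code expects (Django added _prefetch_related_lookups to QuerySet in 1.4) are simply missing. A self-contained sketch of the mechanism (QuerySet here is a stand-in, not Django's class):

import pickle

class QuerySet(object):          # stand-in for the old (pre-1.4) class
    def __init__(self):
        self.query = 'SELECT ...'

data = pickle.dumps(QuerySet())  # the "stale" payload, e.g. queued by Kombu

# Simulate the upgrade: same class name, but new code expects an
# attribute the old pickle never stored.
def _fetch_all(self):
    return self._prefetch_related_lookups

QuerySet._fetch_all = _fetch_all

obj = pickle.loads(data)         # restores __dict__ only; __init__ not called
obj._fetch_all()                 # AttributeError: no _prefetch_related_lookups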
CORE
The server part, Core, is responsible for registering modules and for the interaction between them. Core runs as a ThreadedServer. CoreService handles module registration; when a module registers, I store its connection so I can use it later. A module calls a Core function when it needs to invoke another module. But using the stored connections does not work: execution goes into an infinite loop.
class CoreService(rpyc.Service):
__modules = {}
def exposed_register_module(self, module_name):
if module_name in self.__modules:
return False
self.__modules[module_name] = self._conn
return True
def exposed_execute_query_module(self, module_name, attribute_name, args):
        # <-- THIS is the problematic call
if module_name in self.__modules:
self.__modules[module_name].root
# return None
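Presumably the intent of exposed_execute_query_module is something like the following sketch (the getattr call and the argument handling are my assumptions, not the original code). Note that calling back into a module synchronously from inside a request handler is exactly the kind of situation where rpyc can end up spinning in sync_request, as in the traceback below:

def exposed_execute_query_module(self, module_name, attribute_name, args):
    # Look up the stored connection and invoke the named attribute on
    # the remote module's root object.
    if module_name in self.__modules:
        remote_root = self.__modules[module_name].root
        return getattr(remote_root, attribute_name)(*args)
    return None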
Running the test
When I run the test, it hangs in a loop; after interrupting it with Ctrl+C, I get the following output:
^CTraceback (most recent call last):
File "/home/kpv/perseus/control-lib/perseus_control_lib/module.py", line 67, in __getattr__
return self.__core_connector.root.execute_query_module(self.__proxy_module_name, name, args)
File "/usr/local/lib/python2.7/dist-packages/rpyc/core/netref.py", line 196, in __call__
return syncreq(_self, consts.HANDLE_CALL, args, kwargs)
File "/usr/local/lib/python2.7/dist-packages/rpyc/core/netref.py", line 71, in syncreq
return conn.sync_request(handler, oid, *args)
File "/usr/local/lib/python2.7/dist-packages/rpyc/core/protocol.py", line 438, in sync_request
self.serve(0.1)
File "/usr/local/lib/python2.7/dist-packages/rpyc/core/protocol.py", line 387, in serve
data = self._recv(timeout, wait_for_lock = True)
File "/usr/local/lib/python2.7/dist-packages/rpyc/core/protocol.py", line 344, in _recv
if self._channel.poll(timeout):
File "/usr/local/lib/python2.7/dist-packages/rpyc/core/channel.py", line 43, in poll
return self.stream.poll(timeout)
File "/usr/local/lib/python2.7/dist-packages/rpyc/core/stream.py", line 41, in poll
rl, _, _ = select([self], [], [], timeout)
KeyboardInterrupt