Make callback to function - python

I want to make a callback to spider_opened with the values given in either deferred1, deferred2. This should pause and then immediately unpause the spiders. However, I am getting a fairly simple python issue:
TypeError: spider_opened() takes 1 positional argument but 2 were given
It turns out that when I send multiple callbacks/errbacks, Python interprets this as passing additional parameters back to the function. If I comment out one of these, then I do not get the error. Therefore, how do I properly implement this?
Here's my example scraper:
import scrapy
from scrapy.utils import reactor
from scrapy import signals
import logging
from twisted.internet import defer
logger = logging.getLogger(__name__)
class TestSpider(scrapy.Spider):
    """Spider from the question: pause/unpause the engine from a DeferredList callback.

    The code is kept as posted (indentation and the ``@classmethod``
    decorator restored) so that it still reproduces the reported
    ``TypeError``; see the notes in ``spider_later`` and ``spider_opened``.
    """

    name = 'pause_test'
    start_urls = [f'http://quotes.toscrape.com/page/{i}/' for i in range(1, 5)]
    custom_settings = {
        'DOWNLOAD_DELAY': 1
    }

    def __init__(self, stats, pause):
        # ``pause`` is the crawler itself (see from_crawler); its ``engine``
        # attribute is what gets paused/unpaused in spider_opened.
        self.stats = stats
        self.pause = pause

    @classmethod
    def from_crawler(cls, crawler):
        """Build the spider and hook two of its methods up to crawler signals."""
        stat = cls(crawler.stats, crawler)
        crawler.signals.connect(stat.spider_later, signals.response_downloaded)
        crawler.signals.connect(stat.spider_opened, signals.spider_opened)
        return stat

    def spider_opened(self):
        """Pause on each successful result, unpause on each failed one.

        NOTE: this method accepts no argument besides ``self``, and it
        iterates over ``self`` rather than over a DeferredList result.
        """
        for (success, value) in self:
            if success:
                print(value)
                self.pause.engine.pause()
                print('Success:', value)
            else:
                self.pause.engine.unpause()
                print('Failure:', value.getErrorMessage())

    def spider_later(self):
        """Fire two Deferreds and route their combined result to spider_opened."""
        # Create two deferreds.
        deferred1 = defer.Deferred()
        deferred2 = defer.Deferred()
        # Pack them into a DeferredList
        dl = defer.DeferredList([deferred1, deferred2], consumeErrors=True)
        # Add our callback.
        # NOTE: a Deferred passes its result to the callback as a positional
        # argument, while spider_opened only takes ``self`` -- this is the
        # call that produces "takes 1 positional argument but 2 were given".
        dl.addCallback(self.spider_opened)
        # Fire our two deferreds with various values.
        deferred1.callback(True)
        deferred2.errback(Exception('bang!'))

    def parse(self, response):
        """Log the URL of every downloaded page."""
        logger.info("Urls passed to: %s", response.url)

Related

Update ObservableGauge in Open Telemetry Python

I am using opentelemetry-api 1.14 and opentelemetry-sdk 1.14. I know how to create and use Counter and ObservableGauge instruments. However, I need to update and set the gauge throughout my application in a similar manner to how a counter can use its add method. I have working code below but in this working code the gauge is static at 9.
import time
""" API is the interface that you should interact with."""
from opentelemetry import metrics
"""
SDK is the implementation. Only access SDK during initialization, startup, and shutdown.
"""
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader, ConsoleMetricExporter
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.resources import Resource
def initialize():
    """Install a global MeterProvider that exports to the console.

    The single reader wraps a ConsoleMetricExporter and is configured with
    ``export_interval_millis=5000``.
    """
    resource = Resource(attributes={"service.name": "otel-test"})
    # Console Exporter
    console_reader = PeriodicExportingMetricReader(
        ConsoleMetricExporter(), export_interval_millis=5000
    )
    provider = MeterProvider(metric_readers=[console_reader], resource=resource)
    metrics.set_meter_provider(provider)
# Install the provider before asking the API for it.
initialize()
provider = metrics.get_meter_provider()
meter = provider.get_meter("my-demo-meter")
simple_counter = meter.create_counter("simple_counter", description="simply increments each loop")


# Async Gauge
def observable_gauge_func(options):
    """Gauge callback: always yields one Observation with the constant value 9."""
    yield metrics.Observation(9, {})


simple_gauge = meter.create_observable_gauge("simple_gauge", [observable_gauge_func])
# How can I update simple_gauge in main
def main():
    """Loop forever: print the loop count, bump the counter, sleep 5 seconds."""
    loop_counter = 0
    while True:
        print(loop_counter)
        loop_counter += 1
        simple_counter.add(1)
        # How can I update simple_gauge here?
        time.sleep(5)


main()
I'm not sure if this is the best pattern for implementing an ObservableGauge instrument, but this is the method I used to implement it for my application given the requirements described in my question (i.e. update the gauge in the main function). It's worth providing given how few examples of ObservableGauge instrumentation there are online.
import time
import random
""" API is the interface that you should interact with."""
from opentelemetry import metrics
"""
SDK is the implementation. Only access SDK during initialization, startup, and shutdown.
"""
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader, ConsoleMetricExporter
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.resources import Resource
def initialize():
    """Install a global MeterProvider that exports to the console.

    The single reader wraps a ConsoleMetricExporter and is configured with
    ``export_interval_millis=5000``.
    """
    resource = Resource(attributes={"service.name": "otel-test"})
    # Console Exporter
    console_reader = PeriodicExportingMetricReader(
        ConsoleMetricExporter(), export_interval_millis=5000
    )
    provider = MeterProvider(metric_readers=[console_reader], resource=resource)
    metrics.set_meter_provider(provider)
# Install the provider first, then fetch it back through the API and create
# the meter and the counter instrument used by main().
initialize()
provider = metrics.get_meter_provider()
meter = provider.get_meter("my-demo-meter")
simple_counter = meter.create_counter("simple_counter", description="simply increments each loop")
def create_simple_gauge(signal):
    """Register an observable gauge that reports ``signal``'s current value.

    Args:
        signal: object providing ``get_current_value()`` and ``attribute``.

    Returns:
        The instrument returned by ``meter.create_observable_gauge``.
    """
    # Async Gauge
    def observable_gauge_func(options):
        # Closes over ``signal`` and reads it each time the callback runs,
        # so updating the signal changes what the gauge reports.
        yield metrics.Observation(
            signal.get_current_value(), {"simple_attribute": signal.attribute}
        )

    simple_gauge = meter.create_observable_gauge("simple_gauge", [observable_gauge_func])
    return simple_gauge
class Signal:
    """Mutable holder for the value an observable gauge reports.

    Args:
        attribute: label value attached to the gauge's observations.
        current_value: initial reading. Defaults to 0 so that
            ``get_current_value()`` is safe to call before the first
            ``set_current_value()``; previously that raised AttributeError.
    """

    def __init__(self, attribute, current_value=0):
        self.attribute = attribute
        self.current_value = current_value

    def set_current_value(self, i):
        """Store ``i`` as the value the next observation will report."""
        self.current_value = i

    def get_current_value(self):
        """Return the most recently stored value."""
        return self.current_value
# The gauge is updated indirectly: main() mutates the Signal that the gauge
# callback reads.
def main():
    """Loop forever, pushing a random value into the gauge's signal every 5 seconds."""
    loop_counter = 0
    simple_signal = Signal("simple_attribute")
    create_simple_gauge(simple_signal)
    while True:
        print(loop_counter)
        loop_counter += 1
        simple_counter.add(1)
        # Local renamed from ``randint`` so it no longer shadows random.randint.
        new_value = random.randint(0, 5)
        print(new_value)
        simple_signal.set_current_value(new_value)
        time.sleep(5)


main()

Looping deferLater with timer to choose schedule

I am working on a more complex example however I think this is a simplified version. The function should pause for 1 second, and given a delay to fire the function, we loop this call by intervals with a start value and stop the scheduler afterwards.
from twisted.internet import reactor
import time
from twisted.internet import task
class timer:
    """Question code: a pausable timer driven by Twisted's ``task`` helpers.

    Kept as posted (indentation restored) so it still reproduces the
    reported ``TypeError: 'timer' object is not callable``.
    """

    def __init__(self, *args):
        self._paused = True
        self._unpaused = False

    def sleep(self):
        """Block for one second while the timer is marked as paused."""
        if self._paused:
            print(f"You have paused for this many seconds: {1}s")
            time.sleep(1)

    def scheduler(self, delay=0, *args):
        """Schedule a delayed call and store the resulting Deferred in ``_paused``."""
        if self._paused:
            from twisted.internet import reactor
            # NOTE: the third argument of deferLater is the callable to run.
            # ``self`` is passed here and this class defines no ``__call__``,
            # which is the source of the reported TypeError.
            self._paused = task.deferLater(reactor, delay, self, *args)
if __name__ == '__main__':
    pause_timer = timer()
    # Call scheduler() every 5 seconds, then stop the reactor after 10 seconds.
    timer_list = task.LoopingCall(pause_timer.scheduler)
    timer_list.start(5)
    reactor.callLater(10, reactor.stop)
    reactor.run()
However, I get this error:
builtins.TypeError: 'timer' object is not callable
I will also include the more complex example I am working with here:
import scrapy
from scrapy.utils import reactor
from scrapy import signals
import logging
logger = logging.getLogger(__name__)
class TestSpider(scrapy.Spider):
    """Question code: spider that tries to schedule an engine pause when it opens."""

    name = 'pause'
    start_urls = [f'http://quotes.toscrape.com/page/{i}/' for i in range(1, 11)]
    custom_settings = {
        'DOWNLOAD_DELAY': 1
    }

    def __init__(self, stats, pause):
        # ``pause`` is the crawler itself (see from_crawler).
        self.stats = stats
        self.pause = pause

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Build the spider and connect spider_opened to the matching signal."""
        stat = cls(crawler.stats, crawler)
        crawler.signals.connect(stat.spider_opened, signals.spider_opened)
        return stat

    def spider_opened(self):
        """Wrap ``engine.pause`` in a CallLaterOnce and schedule it with delay 20."""
        reactor.CallLaterOnce(self.pause.engine.pause).schedule(20)

    def parse(self, response):
        """Log the URL of every downloaded page."""
        logger.info("Urls passed to: %s", response.url)
The class for callLaterOnce is defined by (I updated the scheduler in the reactor.py module):
class CallLaterOnce:
    """Schedule a function to be called in the next reactor loop, but only if
    it hasn't been already scheduled since the last time it ran.

    This is the question author's modified version, kept as posted with the
    indentation restored.
    """

    def __init__(self, func, *a, **kw):
        self._func = func
        self._a = a
        self._kw = kw
        self._call = None

    def schedule(self, delay=0):
        """Create the delayed call once, then start and immediately stop a LoopingCall.

        NOTE(review): ``self`` is handed to deferLater as the callable, but
        this class defines no ``__call__``; and ``self._call`` (the value
        returned by deferLater) is what LoopingCall is asked to invoke.
        Both look unintended -- confirm against the upstream implementation.
        """
        from twisted.internet import reactor
        if self._call is None:
            self._call = task.deferLater(reactor, 0, self)
        scheduler = task.LoopingCall(self._call)
        scheduler.start(delay)
        scheduler.stop()

Tornado gen.sleep add delay

I'm trying to add a delay between requests in an asynchronous way.
When I use Tornado gen.sleep(x) my function (launch) doesn't get executed.
If I remove yield from yield gen.sleep(1.0), function is called, but no delay is added.
How to add delay between requests in my for loop? I need to control Request per second to external API.
If I use time.sleep the response is delayed after all requests are completed.
I tried adding the @gen.engine decorator to the launch function, with no result.
Code:
import collections
import tornado.httpclient
class BacklogClient(object):
    """Queue HTTP requests, keeping at most MAX_CONCURRENT_REQUESTS in flight.

    Requests are held in a FIFO backlog; each completed request frees a slot
    and triggers another attempt to drain the backlog.
    """

    MAX_CONCURRENT_REQUESTS = 20

    def __init__(self, ioloop):
        self.ioloop = ioloop
        self.client = tornado.httpclient.AsyncHTTPClient(
            max_clients=self.MAX_CONCURRENT_REQUESTS
        )
        self.client.configure(None, defaults=dict(connect_timeout=20, request_timeout=30))
        self.backlog = collections.deque()
        self.concurrent_requests = 0

    def __get_callback(self, function):
        """Wrap ``function`` so that completion frees a slot and drains the backlog."""
        def wrapped(*args, **kwargs):
            self.concurrent_requests -= 1
            self.try_run_request()
            # fetch() allows callback=None; calling None here used to raise
            # TypeError on every completed request without a callback.
            if function is None:
                return None
            return function(*args, **kwargs)
        return wrapped

    def try_run_request(self):
        """Start queued requests while there is backlog and spare capacity."""
        while self.backlog and self.concurrent_requests < self.MAX_CONCURRENT_REQUESTS:
            request, callback = self.backlog.popleft()
            self.client.fetch(request, callback=callback)
            self.concurrent_requests += 1

    def fetch(self, request, callback=None):
        """Queue ``request``; ``callback`` (optional) receives the response."""
        wrapped = self.__get_callback(callback)
        self.backlog.append((request, wrapped))
        self.try_run_request()
import time
from tornado import ioloop, httpclient, gen
class TornadoBacklog:
    """Question code: fetch a fixed list of URLs through BacklogClient.

    Kept as posted (indentation restored, print statements written as
    function calls) so it still shows the behaviour being asked about.
    """

    def __init__(self):
        self.queue = 0
        self.debug = 1
        self.toProcess = [
            'http://google.com',
            'http://yahoo.com',
            'http://nytimes.com',
            'http://msn.com',
            'http://cnn.com',
            'http://twitter.com',
            'http://facebook.com',
        ]

    def handle_request(self, response):
        """Print the response code; stop the loop once nothing is queued or in flight."""
        print(response.code)
        if not self.backlog.backlog and self.backlog.concurrent_requests == 0:
            ioloop.IOLoop.instance().stop()

    def launch(self):
        """Queue one request per URL with a one-second pause between them.

        NOTE: because of the ``yield`` this is a generator function with no
        coroutine decorator, so calling it only creates a generator object
        and none of the body runs.
        """
        self.ioloop = ioloop.IOLoop.current()
        self.backlog = BacklogClient(self.ioloop)
        for item in self.toProcess:
            yield gen.sleep(1.0)
            print(item)
            self.backlog.fetch(
                httpclient.HTTPRequest(
                    item,
                    method='GET',
                    headers=None,
                ),
                self.handle_request
            )
        self.ioloop.start()
def main():
    """Run the scraper and report the elapsed wall-clock time."""
    start_time = time.time()
    scraper = TornadoBacklog()
    scraper.launch()
    elapsed_time = time.time() - start_time
    print('Process took %f seconds processed %d items.' % (elapsed_time, len(scraper.toProcess)))


if __name__ == "__main__":
    main()
Reference: https://github.com/tornadoweb/tornado/issues/1400
Tornado coroutines have two components:
They contain "yield" statements
They are decorated with "gen.coroutine"
Use the "coroutine" decorator on your "launch" function:
@gen.coroutine
def launch(self):
Run a Tornado coroutine from start to finish like this:
tornado.ioloop.IOLoop.current().run_sync(launch)
Remove the call to "ioloop.start" from your "launch" function: the loop runs the "launch" function, not vice-versa.

How to pass an object from callback to errback (twisted)?

I have a callback chain with an errback at the end. If any of the callbacks fail, I need to pass an object to be used on errBack.
How can I pass an object from callback to the errback?
The following code exemplifies what I want to do:
from twisted.internet.defer import FAILURE
from twisted.internet import defer
class CodMsg(object):
    """Pair an error code with its message."""

    def __init__(self, code, msg):
        self.code = code
        self.msg = msg
class Resource(object):
#classmethod
def checkCondition(cls, result):
if result == "error":
cdm = CodMsg(1, 'Error 1')
raise FAILURE, cdm
else:
return "ok"
#classmethod
def erBackTst (cls, result):
####### How to get the value of cdm here? ######## <<<===
print 'Error:'
print result
return result
# Build a callback -> errback chain and fire it with the value that makes
# checkCondition take its failure branch.
d = defer.Deferred()
d.addCallback(Resource.checkCondition)
d.addErrback(Resource.erBackTst)
d.callback("error")
print(d.result)
In this case you can just raise an exception that contains all the info you need.
For example:
from twisted.internet import defer
class MyCustomException(Exception):
    """Exception carrying a human-readable message and a numeric code.

    Args:
        msg: text stored as ``message`` (also becomes ``str(exc)``).
        code: value stored as ``code``.
    """

    def __init__(self, msg, code):
        # Initialise the base class so str(exc), exc.args and tracebacks
        # show the message instead of being empty.
        super(MyCustomException, self).__init__(msg)
        self.code = code
        self.message = msg
def callback(result):
    """Print the incoming result, then fail with a MyCustomException."""
    print(result)
    raise MyCustomException('Message', 23)
def errback(failure):
    """Print the message and code carried by the exception inside ``failure``."""
    # failure.value is an exception instance that you raised in callback
    exc = failure.value
    print(exc.message)
    print(exc.code)
# Chain the callback and the errback, then fire the Deferred: callback()
# raises, so the errback receives the resulting failure.
d = defer.Deferred()
d.addCallback(callback)
d.addErrback(errback)
d.callback("error")
Also, for a better understanding of Deferreds and async programming, you can read this nice Twisted tutorial: http://krondo.com/an-introduction-to-asynchronous-programming-and-twisted/.
Its examples use a slightly outdated Twisted version, but it is still an excellent resource for starting to learn Twisted.

Twisted Python getPage

I tried to get support on this but I am TOTALLY confused.
Here's my code:
from twisted.internet import reactor
from twisted.web.client import getPage
from twisted.web.error import Error
from twisted.internet.defer import DeferredList
from sys import argv
class GrabPage:
    """Question code: fetch ``page`` with getPage, optionally sending credentials."""

    def __init__(self, page):
        self.page = page

    def start(self, *args):
        """Issue the request and attach the callbacks.

        Pass no arguments for an unauthenticated request, or exactly two
        strings that are joined with a space into the Authorization header.
        Nothing is returned.

        Raises:
            Exception: if any other number of arguments is given.
        """
        if args == ():
            # We apparently don't need authentication for this
            d1 = getPage(self.page)
        else:
            if len(args) == 2:
                # We have our login information
                d1 = getPage(self.page, headers={"Authorization": " ".join(args)})
            else:
                raise Exception('Missing parameters')
        d1.addCallback(self.pageCallback)
        dl = DeferredList([d1])
        d1.addErrback(self.errorHandler)
        dl.addCallback(self.listCallback)

    def errorHandler(self, result):
        # Bad thingy!
        pass

    def pageCallback(self, result):
        """Pass the page body through unchanged."""
        return result

    def listCallback(self, result):
        """Print the DeferredList result."""
        print(result)
a = GrabPage('http://www.google.com')
# start() has no return statement, so ``data`` is None here; the reactor is
# also never run in this snippet.
data = a.start() # Not the HTML
I wish to get the HTML out which is given to pageCallback when start() is called. This has been a pita for me. Ty! And sorry for my sucky coding.
You're missing the basics of how Twisted operates. It all revolves around the reactor, which you're never even running. Think of the reactor like this:
(source: krondo.com)
Until you start the reactor, by setting up deferreds all you're doing is chaining them with no events from which to fire.
I recommend you give the Twisted Intro by Dave Peticolas a read. It's quick and it really gives you all the missing information that the Twisted documentation doesn't.
Anyway, here is the most basic possible usage example of getPage:
from twisted.web.client import getPage
from twisted.internet import reactor
url = 'http://aol.com'


def print_and_stop(output):
    """Print the fetched page, then stop the reactor if it is running."""
    print(output)
    if reactor.running:
        reactor.stop()
if __name__ == '__main__':
    print('fetching', url)
    # getPage returns a Deferred; nothing fires until the reactor runs.
    d = getPage(url)
    d.addCallback(print_and_stop)
    reactor.run()
Since getPage returns a deferred, I'm adding the callback print_and_stop to the deferred chain. After that, I start the reactor. The reactor fires getPage, which then fires print_and_stop which prints the data from aol.com and then stops the reactor.
Edit to show a working example of OP's code:
class GrabPage:
    """Fetch ``page`` with getPage and keep the body on ``self.data``."""

    def __init__(self, page):
        self.page = page
        ########### I added this:
        # Stays None until pageCallback runs, i.e. also when the request fails.
        self.data = None

    def start(self, *args):
        """Issue the request and attach the callbacks.

        Pass no arguments for an unauthenticated request, or exactly two
        strings that are joined with a space into the Authorization header.
        Nothing is returned; read ``self.data`` after the reactor has stopped.

        Raises:
            Exception: if any other number of arguments is given.
        """
        if args == ():
            # We apparently don't need authentication for this
            d1 = getPage(self.page)
        elif len(args) == 2:
            # We have our login information
            d1 = getPage(self.page, headers={"Authorization": " ".join(args)})
        else:
            raise Exception('Missing parameters')
        d1.addCallback(self.pageCallback)
        dl = DeferredList([d1])
        d1.addErrback(self.errorHandler)
        dl.addCallback(self.listCallback)

    def errorHandler(self, result):
        # Bad thingy! Deliberately ignored; ``self.data`` is left as None.
        pass

    def pageCallback(self, result):
        """Store the page body and pass it on to the next callback."""
        ########### I added this, to hold the data:
        self.data = result
        return result

    def listCallback(self, result):
        """Print the DeferredList result, then stop the reactor."""
        print(result)
        # Added for effect:
        if reactor.running:
            reactor.stop()
a = GrabPage('http://google.com')
########### Just call it without assigning to data
#data = a.start() # Not the HTML
a.start()
########### I added this:
if not reactor.running:
    reactor.run()
########### Reference the data attribute from the class
data = a.data
print('------REACTOR STOPPED------')
print('')
if data is None:
    # The request failed: pageCallback never ran, so there is nothing to
    # slice (data[:100] would raise TypeError on None).
    print('------a.data is None (request failed)------')
else:
    ########### First 100 characters of a.data:
    print('------a.data[:100]------')
    print(data[:100])

Categories

Resources