What is the best way to make an asynchronous call appear synchronous? Eg, something like this, but how do I coordinate the calling thread and the async reply thread? In java I might use a CountdownLatch() with a timeout, but I can't find a definite solution for Python
def getDataFromAsyncSource():
asyncService.subscribe(callback=functionToCallbackTo)
# wait for data
return dataReturned
def functionToCallbackTo(data):
dataReturned = data
There is a module you can use
import concurrent.futures
Check this post for sample code and module download link: Concurrent Tasks Execution in Python
You can put executor results in future, then get them, here is the sample code from http://pypi.python.org:
import concurrent.futures
import urllib.request
URLS = ['http://www.foxnews.com/',
'http://www.cnn.com/',
'http://europe.wsj.com/',
'http://www.bbc.co.uk/',
'http://some-made-up-domain.com/']
def load_url(url, timeout):
return urllib.request.urlopen(url, timeout=timeout).read()
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
future_to_url = dict((executor.submit(load_url, url, 60), url)
for url in URLS)
for future in concurrent.futures.as_completed(future_to_url):
url = future_to_url[future]
if future.exception() is not None:
print('%r generated an exception: %s' % (url,future.exception()))
else:
print('%r page is %d bytes' % (url, len(future.result())))
A common solution would be the usage of a synchronized Queue and passing it to the callback function. See http://docs.python.org/library/queue.html.
So for your example this could look like (I'm just guessing the API to pass additional arguments to the callback function):
from Queue import Queue
def async_call():
q = Queue()
asyncService.subscribe(callback=callback, args=(q,))
data = q.get()
return data
def callback(data, q):
q.put(data)
This is a solution using the threading module internally so it might not work depending on your async library.
Related
I have some HTML pages that I am trying to extract the text from using asynchronous web requests through aiohttp and asyncio, after extracting them I save the files locally. I am using BeautifulSoup(under extract_text()), to process the text from the response and extract the relevant text within the HTML page(exclude the code, etc.) but facing an issue where my synchronous version of the script is faster than my asynchronous + multiprocessing.
As I understand, using the BeautifulSoup function causes the main event loop to block within parse(), so based on these two StackOverflow questions[0, 1], I figured the best thing to do was to run the extract_text() within its own process(as its a CPU task) which should prevent the event loop from blocking.
This results in the script taking 1.5x times longer than the synchronous version(with no multiprocessing).
To confirm that this was not an issue with my implementation of the asynchronous code, I removed the use of the extract_text() and instead saved the raw text from the response object. Doing this resulted in my asynchronous code being much faster, showcasing that the issue is purely from the extract_text() being run on a separate process.
Am I missing some important detail here?
import asyncio
from asyncio import Semaphore
import json
import logging
from pathlib import Path
from typing import List, Optional
import aiofiles
from aiohttp import ClientSession
import aiohttp
from bs4 import BeautifulSoup
import concurrent.futures
import functools
def extract_text(raw_text: str) -> str:
return " ".join(BeautifulSoup(raw_text, "html.parser").stripped_strings)
async def fetch_text(
url: str,
session: ClientSession,
semaphore: Semaphore,
**kwargs: dict,
) -> str:
async with semaphore:
response = await session.request(method="GET", url=url, **kwargs)
response.raise_for_status()
logging.info("Got response [%s] for URL: %s", response.status, url)
text = await response.text(encoding="utf-8")
return text
async def parse(
url: str,
session: ClientSession,
semaphore: Semaphore,
**kwargs,
) -> Optional[str]:
try:
text = await fetch_text(
url=url,
session=session,
semaphore=semaphore,
**kwargs,
)
except (
aiohttp.ClientError,
aiohttp.http_exceptions.HttpProcessingError,
) as e:
logging.error(
"aiohttp exception for %s [%s]: %s",
url,
getattr(e, "status", None),
getattr(e, "message", None),
)
except Exception as e:
logging.exception(
"Non-aiohttp exception occured: %s",
getattr(e, "__dict__", None),
)
else:
loop = asyncio.get_running_loop()
with concurrent.futures.ProcessPoolExecutor() as pool:
extract_text_ = functools.partial(extract_text, text)
text = await loop.run_in_executor(pool, extract_text_)
logging.info("Found text for %s", url)
return text
async def process_file(
url: dict,
session: ClientSession,
semaphore: Semaphore,
**kwargs: dict,
) -> None:
category = url.get("category")
link = url.get("link")
if category and link:
text = await parse(
url=f"{URL}/{link}",
session=session,
semaphore=semaphore,
**kwargs,
)
if text:
save_path = await get_save_path(
link=link,
category=category,
)
await write_file(html_text=text, path=save_path)
else:
logging.warning("Text for %s not found, skipping it...", link)
async def process_files(
html_files: List[dict],
semaphore: Semaphore,
) -> None:
async with ClientSession() as session:
tasks = [
process_file(
url=file,
session=session,
semaphore=semaphore,
)
for file in html_files
]
await asyncio.gather(*tasks)
async def write_file(
html_text: str,
path: Path,
) -> None:
# Write to file using aiofiles
...
async def get_save_path(link: str, category: str) -> Path:
# return path to save
...
async def main_async(
num_files: Optional[int],
semaphore_count: int,
) -> None:
html_files = # get all the files to process
semaphore = Semaphore(semaphore_count)
await process_files(
html_files=html_files,
semaphore=semaphore,
)
if __name__ == "__main__":
NUM_FILES = # passed through CLI args
SEMAPHORE_COUNT = # passed through CLI args
asyncio.run(
main_async(
num_files=NUM_FILES,
semaphore_count=SEMAPHORE_COUNT,
)
)
SnakeViz charts across 1000 samples
Async version with extract_text and multiprocessing
Async version without extract_text
Sync version with extract_text(notice how the html_parser from BeautifulSoup takes up the majority of the time here)
Sync version without extract_text
Here is roughly what your asynchronous program does:
Launch num_files parse() tasks concurrently
Each parse() task creates its own ProcessPoolExecutor and asynchronously awaits for extract_text (which is executed in the previously created process pool).
This is suboptimal for several reasons:
It creates num_files process pools, which are expensive to create and takes memory
Each pool is only used for one single operation, which is counterproductive: as many concurrent operations as possible should be submitted to a given pool
You are creating a new ProcessPoolExecutor each time the parse() function is called. You could try to instantiate it once (as a global for instance, of passed through a function argument):
from concurrent.futures import ProcessPoolExecutor
async def parse(loop, executor, ...):
...
text = await loop.run_in_executor(executor, extract_text)
# and then in `process_file` (or `process_files`):
async def process_file(...):
...
loop = asyncio.get_running_loop()
with ProcessPoolExecutor() as executor:
...
await process(loop, executor, ...)
I benchmarked the overhead of creating a ProcessPoolExecutor on my old MacBook Air 2015 and it shows that it is quite slow (almost 100 ms for pool creation, opening, submit and shutdown):
from time import perf_counter
from concurrent.futures import ProcessPoolExecutor
def main_1():
"""Pool crated once"""
reps = 100
t1 = perf_counter()
with ProcessPoolExecutor() as executor:
for _ in range(reps):
executor.submit(lambda: None)
t2 = perf_counter()
print(f"{(t2 - t1) / reps * 1_000} ms") # 2 ms/it
def main_2():
"""Pool created at each iteration"""
reps = 100
t1 = perf_counter()
for _ in range(reps):
with ProcessPoolExecutor() as executor:
executor.submit(lambda: None)
t2 = perf_counter()
print(f"{(t2 - t1) / reps * 1_000} ms") # 100 ms/it
if __name__ == "__main__":
main_1()
main_2()
You may again hoist it up in the process_files function, which avoid recreating the pool for each file.
Also, try to inspect more closely your first SnakeViz chart in order to know what exactly in process.py:submit is taking that much time.
One last thing, be careful of the semantics of using a context manager on an executor:
from concurrent.futures import ProcessPoolExecutor
with ProcessPoolExecutor() as executor:
for i in range(100):
executor.submit(some_work, i)
Not only this creates and executor and submit work to it but it also waits for all work to finish before exiting the with statement.
I'm working on a project that parses data from a lot of websites. Most of my code is done, so i'm looking forward to use asyncio in order to eliminate that I/O waiting, but still i wanted to test how threading would work, better or worse. To do that, i wrote some simple code to make requests to 100 websites. Btw i'm using requests_html library for that, fortunately it supports asynchronous requests as well.
asyncio code looks like:
import requests
import time
from requests_html import AsyncHTMLSession
aio_session = AsyncHTMLSession()
urls = [...] # 100 urls
async def fetch(url):
try:
response = await aio_session.get(url, timeout=5)
status = 200
except requests.exceptions.ConnectionError:
status = 404
except requests.exceptions.ReadTimeout:
status = 408
if status == 200:
return {
'url': url,
'status': status,
'html': response.html
}
return {
'url': url,
'status': status
}
def extract_html(urls):
tasks = []
for url in urls:
tasks.append(lambda url=url: fetch(url))
websites = aio_session.run(*tasks)
return websites
if __name__ == "__main__":
start_time = time.time()
websites = extract_html(urls)
print(time.time() - start_time)
Execution time (multiple tests):
13.466366291046143
14.279950618743896
12.980706453323364
BUT
If i run an example with threading:
from queue import Queue
import requests
from requests_html import HTMLSession
from threading import Thread
import time
num_fetch_threads = 50
enclosure_queue = Queue()
html_session = HTMLSession()
urls = [...] # 100 urls
def fetch(i, q):
while True:
url = q.get()
try:
response = html_session.get(url, timeout=5)
status = 200
except requests.exceptions.ConnectionError:
status = 404
except requests.exceptions.ReadTimeout:
status = 408
q.task_done()
if __name__ == "__main__":
for i in range(num_fetch_threads):
worker = Thread(target=fetch, args=(i, enclosure_queue,))
worker.setDaemon(True)
worker.start()
start_time = time.time()
for url in urls:
enclosure_queue.put(url)
enclosure_queue.join()
print(time.time() - start_time)
Execution time (multiple tests):
7.476433515548706
6.786043643951416
6.717151403427124
The thing that i don't understand .. both libraries are used against I/O problems, but why are threads faster ? The more i increase the number of threads, the more resources it uses but it's a lot faster.. Can someone please explain to me why are threads faster than asyncio in my example ?
Thanks in advance.
It turns out requests-html uses a pool of threads for running the requests. The default number of threads is the number of core on the machine multiplied by 5. This probably explains the difference in performance you noticed.
You might want to try the experiment again using aiohttp instead. In the case of aiohttp, the underlying socket for the HTTP connection is actually registered in the asyncio event loop, so no threads should be involved here.
I'm trying to create a script that send's over 1000 requests to one page at the same time. But requests library with threading (1000) threads. Seems to be doing to first 50 or so requests all within 1 second, whereas the other 9950 are taking considerably longer. I measured it like this.
def print_to_cmd(strinng):
queueLock.acquire()
print strinng
queueLock.release()
start = time.time()
resp = requests.get('http://test.net/', headers=header)
end = time.time()
print_to_cmd(str(end-start))
I'm thinking requests library is limiting how fast they are getting sent.
Doe's anybody know a way in python to send requests all at the same time? I have a VPS with 200mb upload so that is not the issue its something to do with python or requests library limiting it. They all need to hit the website within 1 second of each other.
Thanks for reading and I hope somebody can help.
I have generally found that the best solution is to use an asynchronous library like tornado. The easiest solution that I found however is to use ThreadPoolExecutor.
import requests
from concurrent.futures import ThreadPoolExecutor
def get_url(url):
return requests.get(url)
with ThreadPoolExecutor(max_workers=50) as pool:
print(list(pool.map(get_url,list_of_urls)))
I know this is an old question, but you can now do this using asyncio and aiohttp.
import asyncio
import aiohttp
from aiohttp import ClientSession
async def fetch_html(url: str, session: ClientSession, **kwargs) -> str:
resp = await session.request(method="GET", url=url, **kwargs)
resp.raise_for_status()
return await resp.text()
async def make_requests(url: str, **kwargs) -> None:
async with ClientSession() as session:
tasks = []
for i in range(1,1000):
tasks.append(
fetch_html(url=url, session=session, **kwargs)
)
results = await asyncio.gather(*tasks)
# do something with results
if __name__ == "__main__":
asyncio.run(make_requests(url='http://test.net/'))
You can read more about it and see an example here.
Assumed that you know what you are doing, I first suggest you to implement a backoff policy with a jitter to prevent "predictable thundering hoardes" to your server. That said, you should consider to do some threading
import threading
class FuncThread(threading.Thread):
def __init__(self, target, *args):
self._target = target
self._args = args
threading.Thread.__init__(self)
def run(self):
self._target(*self._args)
so that you would do something like
t = FuncThread(doApiCall, url)
t.start()
where your method doApiCall is defined like this
def doApiCall(self, url):
Is there a possible way to speed up my code using multiprocessing interface? The problem is that this interface uses map function, which works only with 1 function. But my code has 3 functions. I tried to combine my functions into one, but didn't get success. My script reads the URL of site from file and performs 3 functions over it. For Loop makes it very slow, because I got a lot of URLs
import requests
def Login(url): #Log in
payload = {
'UserName_Text' : 'user',
'UserPW_Password' : 'pass',
'submit_ButtonOK' : 'return buttonClick;'
}
try:
p = session.post(url+'/login.jsp', data = payload, timeout=10)
except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
print "site is DOWN! :", url[8:]
session.cookies.clear()
session.close()
else:
print 'OK: ', p.url
def Timer(url): #Measure request time
try:
timer = requests.get(url+'/login.jsp').elapsed.total_seconds()
except (requests.exceptions.ConnectionError):
print 'Request time: None'
print '-----------------------------------------------------------------'
else:
print 'Request time:', round(timer, 2), 'sec'
def Logout(url): # Log out
try:
logout = requests.get(url+'/logout.jsp', params={'submit_ButtonOK' : 'true'}, cookies = session.cookies)
except(requests.exceptions.ConnectionError):
pass
else:
print 'Logout '#, logout.url
print '-----------------------------------------------------------------'
session.cookies.clear()
session.close()
for line in open('text.txt').read().splitlines():
session = requests.session()
Login(line)
Timer(line)
Logout(line)
Yes, you can use multiprocessing.
from multiprocessing import Pool
def f(line):
session = requests.session()
Login(session, line)
Timer(session, line)
Logout(session, line)
if __name__ == '__main__':
urls = open('text.txt').read().splitlines()
p = Pool(5)
print(p.map(f, urls))
The requests session cannot be global and shared between workers, every worker should use its own session.
You write that you already "tried to combine my functions into one, but didn't get success". What exactly didn't work?
There are many ways to accomplish your task, but multiprocessing is not needed at that level, it will just add complexity, imho.
Take a look at gevent, greenlets and monkey patching, instead!
Once your code is ready, you can wrap a main function into a gevent loop, and if you applied the monkey patches, the gevent framework will run N jobs concurrently (you can create a jobs pool, set the limits of concurrency, etc.)
This example should help:
#!/usr/bin/python
# Copyright (c) 2009 Denis Bilenko. See LICENSE for details.
"""Spawn multiple workers and wait for them to complete"""
from __future__ import print_function
import sys
urls = ['http://www.google.com', 'http://www.yandex.ru', 'http://www.python.org']
import gevent
from gevent import monkey
# patches stdlib (including socket and ssl modules) to cooperate with other greenlets
monkey.patch_all()
if sys.version_info[0] == 3:
from urllib.request import urlopen
else:
from urllib2 import urlopen
def print_head(url):
print('Starting %s' % url)
data = urlopen(url).read()
print('%s: %s bytes: %r' % (url, len(data), data[:50]))
jobs = [gevent.spawn(print_head, url) for url in urls]
gevent.wait(jobs)
You can find more here and in the Github repository, from where this example comes from
P.S.
Greenlets will works with requests as well, you don't need to change your code.
In python I want to create an async method in a class that create a thread without blocking the main thread. When the new thread finish, I return a value from that function/thread.
For example the class is used for retrieve some information from web pages. I want run parallel processing in a function that download the page and return a object.
class WebDown:
def display(self, url):
print 'display(): ' + content
def download(self, url):
thread = Thread(target=self.get_info)
# thread join
print 'download(): ' + content
# return the info
def get_info(self, url):
# download page
# retrieve info
return info
if __name__ == '__main__':
wd = WebDown()
ret = wd.download('http://...')
wd.display('http://...')
I this example, in order I call download() for retrieve the info, after display() for print others information. The print output should be
display(): foo, bar, ....
download(): blue, red, ....
One way to write asynchronous, non blocking code in python involves using Python's Twisted. Twisted does not rely on multithreading but uses multiprocessing instead. It gives you convenient way to create Deferred objects, adding callbacks and errbacks to them. The example you give would look like this in Twisted, I'm using treq (Twisted Requests) library which makes generating requests a little quicker and easier:
from treq import get
from twisted.internet import reactor
class WebAsync(object):
def download(self, url):
request = get(url)
request.addCallback(self.deliver_body)
def deliver_body(self, response):
deferred = response.text()
deferred.addCallback(self.display)
return deferred
def display(self, response_body):
print response_body
reactor.stop()
if __name__ == "__main__":
web_client = WebAsync()
web_client.download("http://httpbin.org/html")
reactor.run()
Both 'download' and 'deliver_body' methods return deferreds, you add callbacks to them that are going to be executed when results is available.
I would simply use request and gevent called grequests.
import grequests
>>> urls = [
'http://...',
'http://...'
]
>>> rs = (grequests.get(u) for u in urls)
>>> grequests.map(rs)
[<Response [200]>, <Response [200]>]