Hi I am trying to send this processing to different cores as they are all independent of one another, however none of them are awaiting so the tasks never run. I thought that was what futures were for?
async def process_object(filename):
# await 1 - download file from S3
# await 2 - parse XML file
if "__main__" == __name__:
objects = get_objects(
bucket_name=bucket_name, prefix=prefix, file_extension=".xml", top_n=top_n
)
futures = []
with concurrent.futures.ProcessPoolExecutor(
multiprocessing.cpu_count()
) as executor:
futures = [executor.submit(process_object, filename) for filename in objects]
concurrent.futures.wait(futures)
You don't need to use asyncio if you're submitting a task to ProcessPoolExecutor. Those tasks will be executed in another process so they are already running concurrently without the use of asyncio. Your process_object function never runs with your current code because a coroutine must be awaited before it will execute.
That is, you want something like:
def process_object(filename):
    """Handle one object; a plain function so it can run in a worker process."""
    # download file
    # parse file
    ...


if __name__ == "__main__":
    objects = get_objects(
        bucket_name=bucket_name,
        prefix=prefix,
        file_extension=".xml",
        top_n=top_n,
    )

    futures = []
    worker_count = multiprocessing.cpu_count()
    with concurrent.futures.ProcessPoolExecutor(worker_count) as executor:
        # One submission per object; each call runs in a separate process.
        for filename in objects:
            futures.append(executor.submit(process_object, filename))
        concurrent.futures.wait(futures)
Related
I have some HTML pages that I am trying to extract the text from using asynchronous web requests through aiohttp and asyncio, after extracting them I save the files locally. I am using BeautifulSoup(under extract_text()), to process the text from the response and extract the relevant text within the HTML page(exclude the code, etc.) but facing an issue where my synchronous version of the script is faster than my asynchronous + multiprocessing.
As I understand, using the BeautifulSoup function causes the main event loop to block within parse(), so based on these two StackOverflow questions[0, 1], I figured the best thing to do was to run the extract_text() within its own process(as its a CPU task) which should prevent the event loop from blocking.
This results in the script taking 1.5 times longer than the synchronous version (with no multiprocessing).
To confirm that this was not an issue with my implementation of the asynchronous code, I removed the use of the extract_text() and instead saved the raw text from the response object. Doing this resulted in my asynchronous code being much faster, showcasing that the issue is purely from the extract_text() being run on a separate process.
Am I missing some important detail here?
import asyncio
from asyncio import Semaphore
import json
import logging
from pathlib import Path
from typing import List, Optional
import aiofiles
from aiohttp import ClientSession
import aiohttp
from bs4 import BeautifulSoup
import concurrent.futures
import functools
def extract_text(raw_text: str) -> str:
    """Parse ``raw_text`` as HTML and return its stripped strings joined by spaces."""
    soup = BeautifulSoup(raw_text, "html.parser")
    fragments = soup.stripped_strings
    return " ".join(fragments)
async def fetch_text(
url: str,
session: ClientSession,
semaphore: Semaphore,
**kwargs: dict,
) -> str:
async with semaphore:
response = await session.request(method="GET", url=url, **kwargs)
response.raise_for_status()
logging.info("Got response [%s] for URL: %s", response.status, url)
text = await response.text(encoding="utf-8")
return text
async def parse(
url: str,
session: ClientSession,
semaphore: Semaphore,
**kwargs,
) -> Optional[str]:
try:
text = await fetch_text(
url=url,
session=session,
semaphore=semaphore,
**kwargs,
)
except (
aiohttp.ClientError,
aiohttp.http_exceptions.HttpProcessingError,
) as e:
logging.error(
"aiohttp exception for %s [%s]: %s",
url,
getattr(e, "status", None),
getattr(e, "message", None),
)
except Exception as e:
logging.exception(
"Non-aiohttp exception occured: %s",
getattr(e, "__dict__", None),
)
else:
loop = asyncio.get_running_loop()
with concurrent.futures.ProcessPoolExecutor() as pool:
extract_text_ = functools.partial(extract_text, text)
text = await loop.run_in_executor(pool, extract_text_)
logging.info("Found text for %s", url)
return text
async def process_file(
url: dict,
session: ClientSession,
semaphore: Semaphore,
**kwargs: dict,
) -> None:
category = url.get("category")
link = url.get("link")
if category and link:
text = await parse(
url=f"{URL}/{link}",
session=session,
semaphore=semaphore,
**kwargs,
)
if text:
save_path = await get_save_path(
link=link,
category=category,
)
await write_file(html_text=text, path=save_path)
else:
logging.warning("Text for %s not found, skipping it...", link)
async def process_files(
    html_files: List[dict],
    semaphore: Semaphore,
) -> None:
    """Run ``process_file`` for every entry of ``html_files`` concurrently.

    A single ``ClientSession`` is opened and shared by all the calls; the
    session is closed once every call has finished.
    """
    async with ClientSession() as session:
        pending = []
        for entry in html_files:
            pending.append(
                process_file(url=entry, session=session, semaphore=semaphore)
            )
        await asyncio.gather(*pending)
async def write_file(
html_text: str,
path: Path,
) -> None:
# Write to file using aiofiles
...
async def get_save_path(link: str, category: str) -> Path:
# return path to save
...
async def main_async(
num_files: Optional[int],
semaphore_count: int,
) -> None:
html_files = # get all the files to process
semaphore = Semaphore(semaphore_count)
await process_files(
html_files=html_files,
semaphore=semaphore,
)
if __name__ == "__main__":
NUM_FILES = # passed through CLI args
SEMAPHORE_COUNT = # passed through CLI args
asyncio.run(
main_async(
num_files=NUM_FILES,
semaphore_count=SEMAPHORE_COUNT,
)
)
SnakeViz charts across 1000 samples
Async version with extract_text and multiprocessing
Async version without extract_text
Sync version with extract_text(notice how the html_parser from BeautifulSoup takes up the majority of the time here)
Sync version without extract_text
Here is roughly what your asynchronous program does:
Launch num_files parse() tasks concurrently
Each parse() task creates its own ProcessPoolExecutor and asynchronously awaits for extract_text (which is executed in the previously created process pool).
This is suboptimal for several reasons:
It creates num_files process pools, which are expensive to create and takes memory
Each pool is only used for one single operation, which is counterproductive: as many concurrent operations as possible should be submitted to a given pool
You are creating a new ProcessPoolExecutor each time the parse() function is called. You could try to instantiate it once (as a global, for instance, or passed through a function argument):
from concurrent.futures import ProcessPoolExecutor
async def parse(loop, executor, *args, **kwargs):
    """Fetch a page and extract its text in a caller-supplied process pool.

    ``loop`` and ``executor`` are created by the caller so that one pool is
    shared by many calls. ``*args`` / ``**kwargs`` stand in for the original
    ``url`` / ``session`` / ``semaphore`` parameters (error handling omitted).
    """
    text = await fetch_text(*args, **kwargs)
    # `text` must be passed along: run_in_executor(executor, func, *args)
    # calls func(*args) in the worker, so omitting it would call
    # extract_text() without its required argument.
    return await loop.run_in_executor(executor, extract_text, text)


# and then in `process_file` (or `process_files`):
async def process_file(*args, **kwargs):
    """Create the pool here and hand it down to ``parse``."""
    ...
    loop = asyncio.get_running_loop()
    with ProcessPoolExecutor() as executor:
        ...
        text = await parse(loop, executor, *args, **kwargs)
I benchmarked the overhead of creating a ProcessPoolExecutor on my old MacBook Air 2015 and it shows that it is quite slow (almost 100 ms for pool creation, opening, submit and shutdown):
from time import perf_counter
from concurrent.futures import ProcessPoolExecutor
def noop():
    """Do nothing.

    Defined at module level because ProcessPoolExecutor pickles the callable
    it is given; a ``lambda`` cannot be pickled, so every submitted future
    would fail instead of running.
    """
    return None


def main_1(reps=100):
    """Pool created once and reused for every submission.

    Prints the average wall-clock time per submission in milliseconds.
    """
    t1 = perf_counter()
    with ProcessPoolExecutor() as executor:
        for _ in range(reps):
            executor.submit(noop)
    # Leaving the ``with`` block waits for the submitted work, so the timing
    # covers pool creation, submission and shutdown.
    t2 = perf_counter()
    print(f"{(t2 - t1) / reps * 1_000} ms")  # author measured ~2 ms/it


def main_2(reps=100):
    """Pool created at each iteration.

    Prints the average wall-clock time per iteration in milliseconds.
    """
    t1 = perf_counter()
    for _ in range(reps):
        with ProcessPoolExecutor() as executor:
            executor.submit(noop)
    t2 = perf_counter()
    print(f"{(t2 - t1) / reps * 1_000} ms")  # author measured ~100 ms/it


if __name__ == "__main__":
    main_1()
    main_2()
You may again hoist it up into the process_files function, which avoids recreating the pool for each file.
Also, try to inspect more closely your first SnakeViz chart in order to know what exactly in process.py:submit is taking that much time.
One last thing, be careful of the semantics of using a context manager on an executor:
from concurrent.futures import ProcessPoolExecutor
with ProcessPoolExecutor() as executor:
for i in range(100):
executor.submit(some_work, i)
Not only does this create an executor and submit work to it, but it also waits for all the work to finish before exiting the with statement.
I have a task that is IO bound running in a loop. This task does a lot of work and is often hogging the loop (is that the right word for it?). My plan is to run it in a separate process or thread, using run_in_executor with ProcessPoolExecutor or ThreadPoolExecutor, so that it runs separately and the main loop can do its work. Currently I use asyncio.PriorityQueue() and asyncio.Event() for communication between tasks and would like to reuse these, or something with the same interface, if possible.
Current code:
# Getter for events and queues so communication can happen
send, receive, send_event, receive_event = await process_obj.get_queues()
# Creates task based off the process object
future = asyncio.create_task(process_obj.main())
Current process code:
async def main():
while True:
#does things that hogs loop
What I want to do:
# Getter for events and queues so communication can happen
send, receive, send_event, receive_event = await process_obj.get_queues()
# I assume I could use Thread or Process executors
pool = concurrent.futures.ThreadPoolExecutor()
result = await loop.run_in_executor(pool, process_obj.run())
New process code:
def run():
asyncio.create_task(main())
async def main():
while True:
#does things that hogs loop
How do I communicate between this new thread and the original loop like I could originally?
There is not much in your question that I could use to reproduce your code, so please consider this code from a YouTube downloader as an example; I hope it will help you understand how to get a result from a thread function:
example code:
def on_download(self, is_mp3: bool, is_mp4: bool, url: str) -> None:
    '''
    Validate the selection and schedule the download.

    If neither format is selected, only the info label is updated.
    Otherwise the request is stored on the instance and two Clock callbacks
    are registered: ``schedule_download`` (once) and ``start_progress_bar``
    (repeating).
    '''
    if not (is_mp3 or is_mp4):
        self.ids.info_lbl.text = 'Please select a type of file to download.'
        return

    self.ids.info_lbl.text = 'Downloading...'
    # Stored on the instance so schedule_download can read them later.
    self.is_mp3 = is_mp3
    self.is_mp4 = is_mp4
    self.url = url
    Clock.schedule_once(self.schedule_download, 2)
    Clock.schedule_interval(self.start_progress_bar, 0.1)
def schedule_download(self, dt: float) -> None:
    '''
    Callback method for the download.

    Submits ``Downloader.download`` to a one-thread pool, keeps the pending
    result in ``self.async_result`` and schedules ``check_process`` to poll
    it.

    :param dt: argument supplied by the Clock callback; not used here.
    '''
    pool = ThreadPool(processes=1)
    _downloader = Downloader(self.d_path)
    self.async_result = pool.apply_async(_downloader.download,
                                         (self.is_mp3, self.is_mp4, self.url))
    # Nothing else is submitted to this pool. close() still lets the queued
    # download finish, then allows the worker thread to exit instead of
    # leaking one pool per download.
    pool.close()
    Clock.schedule_interval(self.check_process, 0.1)
def check_process(self, dt: float) -> None:
'''
Check if download is complete.
'''
if self.async_result.ready():
resp = self.async_result.get()
if resp[0] == 'Error. Download failed.':
self.ids.info_lbl.text = resp[0]
# progress bar gray if error
self.stop_progress_bar(value=0)
else:
# progress bar blue if success
self.stop_progress_bar(value=100)
self.ids.file_name.text = resp[0]
self.ids.info_lbl.text = 'Finished downloading.'
self.ids.url_input.text = ''
Clock.unschedule(self.check_process)
Personally I prefer from multiprocessing.pool import ThreadPool. Right now it looks like your code 'hogs' the loop because you are awaiting the result, so the program waits until there is a result (and that may take a long time). If you look at my example code:
on_download schedules an event, schedule_download, and that one schedules another event, check_process. I can't tell whether your app is a GUI app or a terminal app, as there is pretty much no code in your question, but what you have to do in your loop is schedule a check_process event.
If you look at my check_process: the body under if self.async_result.ready(): only runs once my result is ready; until then the call returns immediately.
Right now you are waiting for the result; here everything happens in the background, and every now and then the main loop checks for the result (it won't hog, because if there is no result yet the main loop carries on doing what it has to do rather than waiting for it).
So basically you have to schedule some events (especially the one for the result) in your loop rather than going line by line and waiting for each one. Does that make sense, and is my example code helpful? Sorry, I am really bad at explaining what is in my head ;)
-> mainloop
-> new Thread if there is any
-> check for result if there is any Threads
-> if there is a result
-> do something
-> mainloop keeps running
-> back to top
When you execute the while True in your main coroutine, it doesn't merely hog the loop: it blocks the loop, preventing the other tasks from doing their jobs. Running a separate process in your event-based application is not the best solution, as processes are not very friendly when it comes to sharing data.
It is possible to do everything concurrently without using parallelism. All you need is to execute await asyncio.sleep(0) at the end of each while True iteration. It yields control back to the loop and allows the other tasks to run, without exiting the coroutine.
In the following example, I have a listener that uses while True and handles the data added by emitter to the queue.
import asyncio
from queue import Empty
from queue import Queue
from random import choice
queue = Queue()


async def listener():
    """Poll the shared ``queue`` forever, printing each item that arrives.

    After every poll the coroutine yields to the event loop with
    ``asyncio.sleep(0)`` so that other tasks can run between polls.
    """
    while True:
        try:
            # data polling from the queue
            data = queue.get_nowait()
        except Empty:
            # Nothing queued right now. Only this expected case is ignored,
            # so real errors are no longer silently swallowed.
            pass
        else:
            print(data)  # {"type": "event", "data": {...}}
        # the magic action: hand control back to the event loop
        await asyncio.sleep(0)
async def emitter():
# add a data to the queue
queue.put({"type": "event", "data": {...}})
async def main():
# first create a task for listener
running_loop = asyncio.get_running_loop()
running_loop.create_task(listener())
for _ in range(5):
# create tasks for emitter with random intervals to
# demonstrate that the listener is still running in
# the loop and handling the data put into the queue
running_loop.create_task(emitter())
await asyncio.sleep(choice(range(2)))
if __name__ == "__main__":
    # asyncio.run() creates a fresh event loop, runs main() to completion,
    # then cancels the still-running listener task and closes the loop.
    # The previous get_event_loop()/run_until_complete() pair left the
    # listener pending on a loop that was never closed.
    asyncio.run(main())
P.S. Started an issue https://github.com/robinhood/faust/issues/702
Developing Faust-app:
from concurrent.futures import ProcessPoolExecutor, as_completed
import faust
app = faust.App('my-app-name', broker='kafka://localhost:9092')
sink = app.topic('topic')
#app.task()
async def check():
# 3 is amount of different folders where archives are laced
with ProcessPoolExecutor(max_workers=3) as executor:
fs = [executor.submit(handle, directory) for directory in ['dir1', 'dir2', 'dir3']]
for future in as_completed(fs):
future.result()
def handle(directory):
# finding archives in directory
# unpacking 7z with mdb-files
# converting mdb tables to csv
# reading csv to dataframe
# some data manipulating
# and at last sending dataframe records to kafka
f = sink.send_soon(value={'ts': 1234567890, 'count': 10}) # always in pending status
I ran into a problem: the method sink.send_soon returns a FutureMessage(asyncio.Future, Awaitable[RecordMetadata]) that is always in pending status.
This is a situation where one future is nested inside another future.
Note: the function handle has to be synchronous, because one cannot pass an async function to ProcessPoolExecutor. The method send_soon is a sync method. According to this example https://github.com/robinhood/faust/blob/b5e159f1d104ad4a6aa674d14b6ba0be19b5f9f5/examples/windowed_aggregation.py#L47 awaiting it is not necessary.
Is there any way to handle the pending future?
Also tried this:
import asyncio
from concurrent.futures import ProcessPoolExecutor
import faust
loop = asyncio.get_event_loop()
app = faust.App('my-app-name', broker='kafka://localhost:9092', loop=loop)
sink = app.topic('topic')
#app.task()
async def check():
tasks = []
with ProcessPoolExecutor(max_workers=3) as executor:
for dir_ in ['dir1', 'dir2', 'dir3']:
task = asyncio.create_task(run_dir_handling(executor, dir_))
tasks.append(task)
await asyncio.gather(*tasks)
async def run_dir_handling(executor, dir_):
print('running blocking')
await loop.run_in_executor(executor, handle, dir_)
def handle(directory):
print('Handle directory')
# finding archives in directory
# unpacking 7z with mdb-files
# converting mdb tables to csv
# reading csv to dataframe
# some data manipulating
# and at last sending dataframe records to kafka
# `send_soon` is not non-`async def but `send` is async
# async `soon` cannot be implemented because of
# `await loop.run_in_executor(executor, handle, dir_) TypeError: cannot pickle 'coroutine' object` error
f = sink.send_soon(value={'ts': 1234567890, 'count': 10, 'dir': directory})
print(f) # always <FutureMessage pending>
But it didn't work too.
It seems the loop does not even get a chance to run the send_soon method.
Changed code structure for this:
import asyncio
from concurrent.futures import ProcessPoolExecutor
import faust
loop = asyncio.get_event_loop()
app = faust.App('my-app-name', broker='kafka://localhost:9092')
sink = app.topic('topic1')
#app.task()
async def check():
tasks = []
with ProcessPoolExecutor(max_workers=3) as executor:
for dir_ in ['dir1', 'dir2', 'dir3']:
task = asyncio.create_task(run_dir_handling(executor, dir_))
tasks.append(task)
await asyncio.gather(*tasks)
async def run_dir_handling(executor, dir_):
directory = await loop.run_in_executor(executor, handle, dir_)
await sink.send(value={'dir': directory})
def handle(directory):
print('Handle directory')
# finding archives in directory
# unpacking 7z with mdb-files
# converting mdb tables to csv
# reading csv to dataframe
# some data manipulating
# and at last sending dataframe records to kafka
return directory
I am developing a crawler and I would like to utilize asyncio to crawl links asynchronously, in order to improve performance.
I have already utilized Celery in my synchronous crawler, which makes me able to run several crawlers in parallel. However the crawler itself is sync, so the performance is poor. I have redesigned my code using asyncio and defined the new crawler as a Celery task, but the crawler is getting network errors for no apparent reason, which makes me think is it even possible to use a coroutine as a Celery task?
The answer to that question would help me to figure out if there is a problem with my code, or there is incompatibility with the stack I'm using.
async def url_helper(url):
return url.html.absolute_links
async def url_worker(session, queue, urls, request_count):
    """Process URLs from ``queue`` until it is empty.

    For every in-domain URL below the configured depth, the page is fetched
    and each newly discovered in-domain link is recorded in ``urls`` (with
    its depth) and put on the queue.

    Args:
        session: object whose awaitable ``get`` fetches a URL.
        queue: ``asyncio.Queue`` of URLs still to visit.
        urls: dict mapping every known URL to its depth; shared between
            workers and mutated in place.
        request_count: starting value of this worker's request counter.
            NOTE: it is an int, so increments stay local to this call and
            are not visible to the caller.
    """
    while True:
        # Get a "work item" out of the queue. get_nowait() is what raises
        # QueueEmpty; ``await queue.get()`` would block forever on an empty
        # queue and the worker would never finish.
        try:
            current = queue.get_nowait()
        except asyncio.QueueEmpty:
            return

        try:
            # Stay in domain
            if not config.regex_url.match(current):
                continue

            # Check against the desired level of depth
            depth = urls.get(current, 0)
            if depth == config.max_depth:
                logger.info("Out of depth: {}".format(current))
                continue

            # Get all URLs from the page
            try:
                logger.info("current link: {}".format(current))
                resp = await session.get(
                    current, allow_redirects=True, timeout=15
                )
                new_urls = await url_helper(resp)
                request_count += 1
            except OSError as e:  # TimeoutError is a subclass of OSError
                logger.exception(e)
                # A failed URL is skipped; the worker carries on with the
                # next item rather than exiting.
                continue

            # Add URLs to queue if internal and not already in urls
            for link in new_urls:
                if config.regex_url.match(link) and link not in urls:
                    urls[link] = depth + 1
                    await queue.put(link)
        finally:
            # Notify the queue that the "work item" has been processed,
            # whichever path was taken above.
            queue.task_done()
async def url_crawler(url):
"""
Crawls.
"""
# Set time
start = time.time()
http_asession = requests_html.AsyncHTMLSession()
logger.debug("Sending GET request to start_url...")
start_response = await http_asession.get(url=url)
logger.debug("Received {} response.".format(start_response.status_code))
request_count = 1
# Dict to hold all URLs on the website
urls = {}
# List containing all URLs from the start page
start_urls = list(start_response.html.absolute_links)
logger.info("start urls: %s", start_urls)
# A queue to store our to-be-visited urls
queue = asyncio.Queue()
# Max concurrency
max_workers = config.max_concurrency
# Put urls from start page in the queue
for url in start_urls:
await queue.put(url)
# Create three worker tasks to process the queue concurrently.
coros = asyncio.gather(
*[
url_worker(
queue=queue,
session=http_asession,
urls=urls,
request_count=request_count,
)
for i in range(max_workers)
]
)
await coros
await http_asession.close()
logger.info("Crawled {} links.".format(len(urls)))
logger.info(urls)
logger.debug("Made {} HTTP requests.".format(request_count))
finish = time.time()
logger.info("Execution time: {}".format(finish - start))
#celery_app.task
def run_crawler(url):
    """Run the ``url_crawler`` coroutine to completion from synchronous code.

    ``asyncio.run`` creates a new event loop for each call and closes it
    afterwards. The previous version closed the shared default loop, so a
    second call in the same process got the closed loop back from
    ``get_event_loop()`` and failed with "Event loop is closed".
    """
    asyncio.run(url_crawler(url))
I have a large file, with a JSON record on each line. I'm writing a script to upload a subset of these records to CouchDB via the API, and experimenting with different approaches to see what works the fastest. Here's what I've found to work fastest to slowest (on a CouchDB instance on my localhost):
Read each needed record into memory. After all records are in memory, generate an upload coroutine for each record, and gather/run all the coroutines at once
Synchronously read file and when a needed record is encountered, synchronously upload
Use aiofiles to read the file, and when a needed record is encountered, asynchronously update
Approach #1 is much faster than the other two (about twice as fast). I am confused why approach #2 is faster than #3, especially in contrast to this example here, which takes half as much time to run asynchronously than synchronously (sync code not provided, had to rewrite it myself). Is it the context switching from file i/o to HTTP i/o, especially with file reads ocurring much more often than API uploads?
For additional illustration, here's some Python pseudo-code that represents each approach:
Approach 1 - Sync File IO, Async HTTP IO
import json
import asyncio
import aiohttp
records = []
with open('records.txt', 'r') as record_file:
for line in record_file:
record = json.loads(line)
if valid(record):
records.append(record)
async def batch_upload(records):
    """Upload every record concurrently over one shared aiohttp session."""
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(
            *(async_upload(record, session) for record in records)
        )


# Pass the list built above; `properties` was never defined.
asyncio.run(batch_upload(records))
Approach 2 - Sync File IO, Sync HTTP IO
import json
with open('records.txt', 'r') as record_file:
for line in record_file:
record = json.loads(line)
if valid(record):
sync_upload(record)
Approach 3 - Async File IO, Async HTTP IO
import json
import asyncio
import aiohttp
import aiofiles
async def batch_upload():
    """Read records.txt line by line and upload each valid record in turn.

    Each upload is awaited before the next line is read, so although both
    the file read and the upload are asynchronous, nothing overlaps.
    """
    async with aiohttp.ClientSession() as session:
        # aiofiles.open gives the async file object; the builtin open()
        # does not support ``async with``.
        async with aiofiles.open('records.txt', 'r') as record_file:
            line = await record_file.readline()
            while line:
                record = json.loads(line)
                if valid(record):
                    await async_upload(record, session)
                line = await record_file.readline()


asyncio.run(batch_upload())
The file I'm developing this with is about 1.3 GB, with 100000 records total, 691 of which I upload. Each upload begins with a GET request to see if the record already exists in CouchDB. If it does, then a PUT is performed to update the CouchDB record with any new information; if it doesn't, then a the record is POSTed to the db. So, each upload consists of two API requests. For dev purposes, I'm only creating records, so I run the GET and POST requests, 1382 API calls total.
Approach #1 takes about 17 seconds, approach #2 takes about 33 seconds, and approach #3 takes about 42 seconds.
Your code uses async, but it does the work sequentially, and in this case it will be slower than the sync approach. Async won't speed up execution if it is not constructed/used effectively.
You can create two coroutines and make them run concurrently; perhaps that speeds up the operation.
Example:
#!/usr/bin/env python3
import asyncio
async def upload(event, queue):
    """Consume records from ``queue`` until the producer is done.

    The loop ends only when ``event`` is set *and* the queue has been
    drained, so records still queued when shutdown is signalled are not
    dropped.

    Args:
        event: ``asyncio.Event`` set by the producer after its last put.
        queue: ``asyncio.Queue`` carrying the records to upload.
    """
    while not (event.is_set() and queue.empty()):
        try:
            # Bounded wait, so the shutdown condition is re-checked even if
            # no further record ever arrives.
            record = await asyncio.wait_for(queue.get(), timeout=0.1)
        except asyncio.TimeoutError:
            continue
        print(f'uploading record : {record}')
async def read(event, queue):
# dummy logic : instead read here and populate the queue.
for i in range(1, 10):
await queue.put(i)
# Initiate shutdown..
event.set()
async def main():
event = asyncio.Event()
queue = asyncio.Queue()
uploader = asyncio.create_task(upload(event, queue))
reader = asyncio.create_task(read(event, queue))
tasks = [uploader, reader]
await asyncio.gather(*tasks)
if __name__ == '__main__':
asyncio.run(main())