I'm using python boto and threading to download many files from S3 rapidly. I use this several times in my program and it works great. However, there is one time when it doesn't work. In that step, I try to download 3,000 files on a 32 core machine (Amazon EC2 cc2.8xlarge).
The code below actually succeeds in downloading every file (except sometimes there is an httplib.IncompleteRead error that doesn't get fixed by the retries). However, only 10 or so of the 32 threads actually terminate and the program just hangs. Not sure why this is. All the files have been downloaded and all the threads should have exited. They do on other steps when I download fewer files. I've been reduced to downloading all these files with a single thread (which works but is super slow). Any insights would be greatly appreciated!
from boto.ec2.connection import EC2Connection
from boto.s3.connection import S3Connection
from boto.s3.key import Key
from boto.exception import BotoClientError
from socket import error as socket_error
from httplib import IncompleteRead
import multiprocessing
from time import sleep
import os
import Queue
import threading
def download_to_dir(keys, dir):
"""
Given a list of S3 keys and a local directory filepath,
downloads the files corresponding to the keys to the local directory.
Returns a list of filenames.
"""
filenames = [None for k in keys]
class DownloadThread(threading.Thread):
def __init__(self, queue, dir):
# call to the parent constructor
threading.Thread.__init__(self)
# create a connection to S3
connection = S3Connection(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
self.conn = connection
self.dir = dir
self.__queue = queue
def run(self):
while True:
key_dict = self.__queue.get()
print self, key_dict
if key_dict is None:
print "DOWNLOAD THREAD FINISHED"
break
elif key_dict == 'DONE': #last job for last worker
print "DOWNLOADING DONE"
break
else: #still work to do!
index = key_dict.get('idx')
key = key_dict.get('key')
bucket_name = key.bucket.name
bucket = self.conn.get_bucket(bucket_name)
k = Key(bucket) #clone key to use new connection
k.key = key.key
filename = os.path.join(dir, k.key)
#make dirs if don't exist yet
try:
f_dirname = os.path.dirname(filename)
if not os.path.exists(f_dirname):
os.makedirs(f_dirname)
except OSError: #already written to
pass
#inspired by: http://code.google.com/p/s3funnel/source/browse/trunk/scripts/s3funnel?r=10
RETRIES = 5 #attempt at most 5 times
wait = 1
for i in xrange(RETRIES):
try:
k.get_contents_to_filename(filename)
break
except (IncompleteRead, socket_error, BotoClientError), e:
if i == RETRIES-1: #failed final attempt
raise Exception('FAILED TO DOWNLOAD %s, %s' % (k, e))
break
wait *= 2
sleep(wait)
#put filename in right spot!
filenames[index] = filename
num_cores = multiprocessing.cpu_count()
q = Queue.Queue(0)
for i, k in enumerate(keys):
q.put({'idx': i, 'key':k})
for i in range(num_cores-1):
q.put(None) # add end-of-queue markers
q.put('DONE') #to signal absolute end of job
#Spin up all the workers
workers = [DownloadThread(q, dir) for i in range(num_cores)]
for worker in workers:
worker.start()
#Block main thread until completion
for worker in workers:
worker.join()
return filenames
Upgrade to AWS SDK version 1.4.4.0 or newer, or stick to exactly 2 threads. Older versions have a limit of at most 2 simultaneous connections. This means that your code will work well if you launch 2 threads; if you launch 3 or more, you are bound to see incomplete reads and exhausted timeouts.
You will see that while 2 threads can boost your throughput greatly, more than 2 does not change much because your network card is busy all the time anyway.
S3Connection uses httplib.py and that library is not threadsafe so ensuring each thread has it's own connection is critical. It looks like you are doing that.
Boto already has it's own retry mechanism but you are layering one on top of that to handle certain other errors. I wonder if it would be advisable to create a new S3Connection object inside the except block. It just seems like the underlying http connection could be in an unusual state at that point and it might be best to start with a fresh connection.
Just a thought.
Related
I'm trying to improving the interactive output of small CLI program walking a directory to process files, and using a Rich progress bar to display the progression of the tasks.
At the moment, I'm doing this in 2 steps:
pool.submit() all the tasks
for future in as_completed(xxxx) wait for the next future available.
The problem is that the first step (pool.submit) might take some time (since I'm walking the directory), and the UI isn't updated, even though futures have already been available.
So, I tried to come up with a Thread that would submit on my pool, while the main thread would wait on the next Future and update the UI:
"""
Usage: walker.py [options] <file/directory>...
Options:
-r --recursive Walk directories recursively
-w WORKERS --workers=WORKERS Specify the number of process pool workers [default: 4]
-d --debug Enable debug output
-h --help Display this message
"""
import os
import threading
import time
from concurrent.futures._base import as_completed
from concurrent.futures.process import ProcessPoolExecutor
from pathlib import Path
from random import randint
from typing import List
from docopt import docopt
from rich.console import Console
from rich.progress import BarColumn, Progress, TextColumn
def walk_filepath_list(filepath_list: List[Path], recursive: bool = False):
for path in filepath_list:
if path.is_dir() and not path.is_symlink():
if recursive:
for f in os.scandir(path):
yield from walk_filepath_list([Path(f)], recursive)
else:
yield from (Path(f) for f in os.scandir(path))
elif path.is_file():
yield path
def process_task(filepath):
rand = randint(0, 1)
time.sleep(rand)
def thread_submit(pool, filepath_list, recursive, future_to_filepath):
for filepath in walk_filepath_list(filepath_list, recursive):
future = pool.submit(process_task, filepath)
# update shared dict
future_to_filepath[future] = filepath
def main(args):
filepath_list = [Path(entry) for entry in args["<file/directory>"]]
debug = args["--debug"]
workers = int(args["--workers"])
recursive = args["--recursive"]
console = Console()
process_bar = Progress(
TextColumn("[bold blue]Processing...", justify="left"),
BarColumn(bar_width=None),
"{task.completed}/{task.total}",
"•",
"[progress.percentage]{task.percentage:>3.1f}%",
console=console,
)
process_bar.start()
# we need to consume the iterator once to get the total
# for the progress bar
count = sum(1 for i in walk_filepath_list(filepath_list, recursive))
task_process_bar = process_bar.add_task("Main task", total=count)
with ProcessPoolExecutor(max_workers=workers) as pool:
# shared dict between threads
# [Future] => [filepath]
future_to_filepath = {}
submit_thread = threading.Thread(
target=thread_submit, args=(pool, filepath_list, recursive, future_to_filepath)
)
submit_thread.start()
while len(future_to_filepath.keys()) != count:
for future in as_completed(future_to_filepath):
filepath = future_to_filepath[future]
# print(f"processing future: {filepath}")
try:
data = future.result()
finally:
# update progress bar
process_bar.update(task_process_bar, advance=1)
process_bar.stop()
def entrypoint():
args = docopt(__doc__)
main(args)
if __name__ == "__main__":
entrypoint()
However, the progress bar isn't updated as expected.
Worse, there are cases where the processing doesn't seem to end.
is it a race conditions when I update my dict future_to_filepath ?
how would you go to have a submit thread and a process_results thread with concurrent.futures ?
Thank you SO !
See my comments to your question and then:
Change:
submit_thread = threading.Thread(
target=thread_submit, args=(pool, filepath_list, recursive, future_to_filepath)
)
submit_thread.start()
To:
thread_submit(pool, filepath_list, recursive, future_to_filepath)
(a change to this function name, since it is no longer running as a separate thread, would be a good thing -- how about create_futures?)
And remove the outer loop:
while len(future_to_filepath.keys()) != count:
Finally, it is not clear what your real process_task will do with the file but it certainly seems possible that it will be I/O bound. In that case, you might benefit instead from using the ThreadPoolExecutor class, easily substitutable for the ProcessPoolExecutor class, in which case you should consider specifying a much larger number of workers, possibly equal to count. Since your current process_task is doing nothing much more than sleeping, it would probably profit from threading with the larger number of workers.
Update
One thing you can do to reduce the time it takes to run walk_filepath_list is to modify the function to be passed a single path to walk rather than a list and to process each path that was in the original list concurrently in separate threads. In the code below I am using the ThreadPoolExecutor map function for convenience which really requires that the arguments to the (newly renamed) walk_filepath function be reversed so that I can use functools.partial to "harcode" the first argument, recursive, for all the calls:
from concurrent.futures import ThreadPoolExecutor
from functools import partial
def walk_filepath(recursive: bool = False, path: Path = None):
if path.is_dir() and not path.is_symlink():
if recursive:
for f in os.scandir(path):
yield from walk_filepath(recursive, Path(f))
else:
yield from (Path(f) for f in os.scandir(path))
elif path.is_file():
yield path
def walker(recursive, path):
return list(walk_filepath(recursive, path))
def thread_submit(pool, filepath_list, recursive, future_to_filepath):
n_workers = len(filepath_list)
with ThreadPoolExecutor(max_workers=n_workers) as executor:
filepath_lists = executor.map(partial(walker, recursive), filepath_list)
for filepath_list in filepath_lists:
for filepath in filepath_list:
future = pool.submit(process_task, filepath)
# update shared dict
future_to_filepath[future] = filepath
Update 2
A benchmark of the above code reveals that it does not save time (perhaps if the directories were on different physical drives?).
I would like to create a queue of about 256K paths to files and have the paths dequeued and processed by parallel worker processes. This is multiprocessing rather than threads.
However, when I create a multiprocessing.queue there seems to be a hard limit at 32K objects in the queue. This might be even smaller if the objects were full paths to files, as intended.
What would be an alternate way to create a multiserver queue for multiprocessing?
import multiprocessing
import sys
q = multiprocessing.Queue()
for i in range(32768 * 2):
print i
try:
q.put('abcdef')
except:
print "Unexpected error on ()".format(i), sys.exc_info()[0]
raise
yields:
...
32766
32767
Traceback (most recent call last):
Unexpected error on () <type 'exceptions.KeyboardInterrupt'>
File "/Users/Wes/Dropbox/Programming/ElectionTransparency/vops_addons/dead/tryq.py", line 13, in <module>
q.put('abc')
File "/usr/local/Cellar/python#2/2.7.16/Frameworks/Python.framework/Versions/2.7/lib/python2.7/multiprocessing/queues.py", line 101, in put
if not self._sem.acquire(block, timeout):
KeyboardInterrupt
You could try using celery - http://www.celeryproject.org/ - the queue limit would be up to the broker configuration.
Moreover, you would not be limited to workers on the same machine - any computer that could mount the same filesystem could run celery workers to process your tasks. (Although if remote processing then is not an option, using celery workers could still have advantages over raw multiprocessing, as there are niceties such as automatic retry)
Here is what I finally found that worked. I made the array of paths available to all the worker processes and used a multiprocessing.Value() object to create a shared index into the array protected with a lock.
from multiprocessing import Process, Lock, Value
import os
import sys
import time
def info(title, lock, item=None):
pid = os.getpid()
lock.acquire()
print '<', title, item,' ', __name__, pid, '>'
sys.stdout.flush()
lock.release()
def f(stdout_lock, next_item, worklist):
while True:
with next_item.get_lock():
if len(worklist) <= next_item.value:
return
item = worklist[next_item.value]
next_item.value += 1
info('queue item: ', stdout_lock, item)
time.sleep(0.0001)
if __name__ == '__main__':
next_item = Value('l')
worklist = [str(i) for i in range(250000)]
next_item.value = 0
stdout_lock = Lock()
plist = []
for i in range(3):
plist.append(Process(target=f, args=(stdout_lock, next_item, worklist)))
plist[-1].start()
for i in range(3):
plist[i].join()
I am trying to download images from a list of URLs using Python. To make the process faster, I used the multiprocessing library.
The problem I am facing is that the script often hangs/freezes on its own, and I don't know why.
Here is the code that I am using
...
import multiprocessing as mp
def getImages(val):
#Dowload images
try:
url= # preprocess the url from the input val
local= #Filename Generation From Global Varables And Rand Stuffs...
urllib.request.urlretrieve(url,local)
print("DONE - " + url)
return 1
except Exception as e:
print("CAN'T DOWNLOAD - " + url )
return 0
if __name__ == '__main__':
files = "urls.txt"
lst = list(open(files))
lst = [l.replace("\n", "") for l in lst]
pool = mp.Pool(processes=4)
res = pool.map(getImages, lst)
print ("tempw")
It often gets stuck halfway through the list (it prints DONE, or CAN't DOWNLOAD to half of the list it has processed but I don't know what is happening on the rest of them). Has anyone faced this problem? I have searched for similar problems (e.g. this link) but found no answer.
Thanks in advance
Ok, I have found an answer.
A possible culprit was the script was stuck in connecting/downloading from the URL. So what I added was a socket timeout to limit the time to connect and download the image.
And now, the issue no longer bothers me.
Here is my complete code
...
import multiprocessing as mp
import socket
# Set the default timeout in seconds
timeout = 20
socket.setdefaulttimeout(timeout)
def getImages(val):
#Dowload images
try:
url= # preprocess the url from the input val
local= #Filename Generation From Global Varables And Rand Stuffs...
urllib.request.urlretrieve(url,local)
print("DONE - " + url)
return 1
except Exception as e:
print("CAN'T DOWNLOAD - " + url )
return 0
if __name__ == '__main__':
files = "urls.txt"
lst = list(open(files))
lst = [l.replace("\n", "") for l in lst]
pool = mp.Pool(processes=4)
res = pool.map(getImages, lst)
print ("tempw")
Hope this solution helps others who are facing the same issue
It looks like you're facing a GIL issue : The python Global Interpreter Lock basically forbid python to do more than one task at the same time.
The Multiprocessing module is really launching separate instances of python to get the work done in parallel.
But in your case, urllib is called in all these instances : each of them is trying to lock the IO process : the one who succeed (e.g. come first) get you the result, while the others (trying to lock an already locked process) fail.
This is a very simplified explanation, but here are some additionnal ressources :
You can find another way to parallelize requests here : Multiprocessing useless with urllib2?
And more info about the GIL here : What is a global interpreter lock (GIL)?
I'm trying to download a lot of data using multiple threads from Yahoo Finance. I'm using concurrent.futures.ThreadPoolExecutor to speed things up. Everything goes well until I consume all the available file descriptors (1024 by default).
When urllib.request.urlopen() raises an exception the file descriptor is left open (no matter what timeout for socket I use). Normally this file descriptor is reused if I run stuff only from a single (main) thread so this problem doesn't occur. But when these exceptional urlopen() calls are made from ThreadPoolExecutor threads these file descriptors are left open. The only solution I have come up with so far is to use either processes (ProcessPoolExecutor) which is very cumbersome and inefficient or increase the number of allowed file descriptors to something really big (not all the potential users of my library are going to do this anyway). There must be a smarter way to deal with this problem.
And also I wonder whether this is a bug in Python libraries or am I just doing something wrong...
I'm running Python 3.4.1 on Debian (testing, kernel 3.10-3-amd64).
This is an example code that demonstrates this behaviour:
import concurrent
import concurrent.futures
import urllib.request
import os
import psutil
from time import sleep
def fetchfun(url):
urllib.request.urlopen(url)
def main():
print(os.getpid())
p = psutil.Process(os.getpid())
print(p.get_num_fds())
# this url doesn't exist
test_url = 'http://ichart.finance.yahoo.com/table.csv?s=YHOOxyz' + \
'&a=00&b=01&c=1900&d=11&e=31&f=2019&g=d'
with concurrent.futures.ThreadPoolExecutor(1) as executor:
futures = []
for i in range(100):
futures.append(executor.submit(fetchfun, test_url))
count = 0
for future in concurrent.futures.as_completed(futures):
count += 1
print("{}: {} (ex: {})".format(count, p.get_num_fds(), future.exception()))
print(os.getpid())
sleep(60)
if __name__ == "__main__":
main()
When the HTTPError is raised, it saves a reference to the HTTPResponse object for the request as the fp attribute of the HTTPError. That reference gets saved in your futures list, which isn't destroyed until your program ends. That means there's a reference to the HTTPResponse being kept alive for your entire program. As long as that reference exists, the socket used in the HTTPResponse stays open. One way you can work around this is by explicitly closing the HTTPResponse when you handle the exception:
with concurrent.futures.ThreadPoolExecutor(1) as executor:
futures = []
for i in range(100):
futures.append(executor.submit(fetchfun, test_url))
count = 0
for future in concurrent.futures.as_completed(futures):
count += 1
exc = future.exception()
print("{}: {} (ex: {})".format(count, p.get_num_fds(), exc))
exc.fp.close() # Close the HTTPResponse
Excuse the unhelpful variable names and unnecessarily bloated code, but I just quickly whipped this together and haven't had time to optimise or tidy up yet.
I wrote this program to dump all the images my friend and I had sent to each other using a webcam photo sharing service ( 321cheese.com ) by parsing a message log for the URLs. The problem is that my multithreading doesn't seem to work.
At the bottom of my code, you'll see my commented-out non-multithreaded download method, which consistently produces the correct results (which is 121 photos in this case). But when I try to send this action to a new thread, the program sometimes downloads 112 photos, sometimes 90, sometimes 115 photos, etc, but never gives out the correct result.
Why would this create a problem? Should I limit the number of simultaneous threads (and how)?
import urllib
import thread
def getName(input):
l = input.split(".com/")
m = l[1]
return m
def parseMessages():
theFile = open('messages.html', 'r')
theLines = theFile.readlines()
theFile.close()
theNewFile = open('new321.txt','w')
for z in theLines:
if "321cheese" in z:
theNewFile.write(z)
theNewFile.close()
def downloadImage(inputURL):
urllib.urlretrieve (inputURL, "./grabNew/" + d)
parseMessages()
f = open('new321.txt', 'r')
lines = f.readlines()
f.close()
g = open('output.txt', 'w')
for x in lines:
a = x.split("<a href=\"")
b = a[1].split("\"")
c = b[0]
if ".png" in c:
d = getName(c)
g.write(c+"\n")
thread.start_new_thread( downloadImage, (c,) )
##downloadImage(c)
g.close()
There are multiple issues in your code.
The main issue is d global name usage in multiple threads. To fix it, pass the name explicitly as an argument to downloadImage().
The easy way (code-wise) to limit the number of concurrent downloads is to use concurrent.futures (available on Python 2 as futures) or multiprocessing.Pool:
#!/usr/bin/env python
import urllib
from multiprocessing import Pool
from posixpath import basename
from urllib import unquote
from urlparse import urlsplit
download_dir = "grabNew"
def url2filename(url):
return basename(unquote(urlsplit(url).path).decode('utf-8'))
def download_image(url):
filename = None
try:
filename = os.path.join(download_dir, url2filename(url))
return urllib.urlretrieve(url, filename), None
except Exception as e:
return (filename, None), e
def main():
pool = Pool(processes=10)
for (filename, headers), error in pool.imap_unordered(download_image, get_urls()):
pass # do something with the downloaded file or handle an error
if __name__ == "__main__":
main()
Did you make sure your parsing is working correctly?
Also, you are launching too many threads.
And finally... threads in python are FAKE! Use the multiprocessing module if you want real parallelism, but since the images are probably all from the same server, if you open one hundred connections at the same time with the same server, probably its firewall will start dropping your connections.