Use of subprocess.call results in "too many open files" - python

I have the following code to create thumbnails and save images. However, after about 1000 items it raises an error saying too many open files. Where is this coming from? And how would I fix the code?
def download_file(url, extension='jpg'):
    """ Download a large file. Return path to saved file.
    """
    req = requests.get(url)
    if not req.ok:
        return None
    guid = str(uuid.uuid4())
    tmp_filename = '/tmp/%s.%s' % (guid, extension)
    with open(tmp_filename, 'w') as f:
        for chunk in req.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)
                f.flush()
    return tmp_filename
def update_artwork_item(item):
    # Download the file
    tmp_filename = util.download_file(item.artwork_url)
    # Create thumbs
    THUMB_SIZES = [(1000, 120), (1000, 30)]
    guid = str(uuid.uuid4())
    S3_BASE_URL = 'https://s3-us-west-1.amazonaws.com/xxx/'
    try:
        for size in THUMB_SIZES:
            outfile = '%s_%s.jpg' % (guid, size[1])
            img = Image.open(tmp_filename).convert('RGB')
            img.thumbnail(size, Image.ANTIALIAS)
            img.save(outfile, "JPEG")
            s3_cmd = '%s %s premiere-avails --norr --public' % (S3_CMD, outfile) ## doesn't work half the time
            x = subprocess.check_call(shlex.split(s3_cmd))
            if x: raise
            subprocess.call(['rm', outfile], stdout=FNULL, stderr=subprocess.STDOUT)
    except Exception, e:
        print '&&&&&&&&&&', Exception, e
    else:
        # Save the artwork icons
        item.artwork_120 = S3_BASE_URL + guid + '_120.jpg'
        item.artwork_30 = S3_BASE_URL + guid + '_30.jpg'
    # hack to fix parallel saving
    while True:
        try:
            item.save()
        except Exception, e:
            print '******************', Exception, e
            time.sleep(random.random()*1e-1)
            continue
        else:
            subprocess.call(['rm', tmp_filename], stdout=FNULL, stderr=subprocess.STDOUT)
            break

It's almost certainly the way you shell out with subprocess, though one correction is in order: subprocess.call is synchronous, not asynchronous. Per the documentation it runs the command, waits for it to complete, and returns the exit code, so it is not handing you a pipe object that you forgot to close. What it does do is fork a child process for every single file, and each child needs file descriptors of its own while it runs, so spawning an external rm per item churns through handles needlessly and will tip you over the limit as soon as anything else in the loop is leaking descriptors.
By far the easiest thing to do is to remove the file from Python by calling os.remove instead of piping to the Unix rm command, which takes subprocess out of the cleanup path entirely. Your use of check_call is fine; since check_call waits for the command and raises CalledProcessError on a nonzero exit, the if x: raise line is redundant anyway.
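A minimal sketch of that cleanup change, reusing the names from the question's code (the thumbnail and S3 upload steps are unchanged):

import os

for size in THUMB_SIZES:
    outfile = '%s_%s.jpg' % (guid, size[1])
    # ... create the thumbnail and run the S3 upload exactly as before ...
    subprocess.check_call(shlex.split(s3_cmd))
    os.remove(outfile)       # delete in-process instead of spawning rm

os.remove(tmp_filename)      # likewise for the temp file at the end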

Related

Can't write to file from child process

I can't wrap my head around this... I have the following code:
def launch(command):
    pf = os.path.join(working_directory, '.pid')
    pid = os.fork()
    if pid == 0:
        stdout = os.open(..., os.O_WRONLY | os.O_CREAT)
        try:
            proc = Popen(command, shell=False, stdout=stdout, cwd=workdir)
            print(proc.pid)
            with open(pf, 'wb') as p: # pf should not be open, as the file was just created
                p.write(proc.pid)
            print("Hello World")
        except OSError as proc_error:
            ...
        finally:
            os._exit(0) # socketserver catches SystemExit exception (flask)
    else:
        start = time.time()
        while not os.path.isfile(pf): # just checking whether the file exists; it was never opened here
            if time.time() - start >= 30:
                raise TimeoutError...
            time.sleep(5)
        pid = int(open(pf, 'rb').read())
Here's the output:
$pid
TimeoutError occurred
The script seems to hang at opening pf for writing. I verified: the file is not created, and Hello World never gets printed.
Why is this happening, and how can I fix it?
Thanks!
I have reduced your code to this (removing everything I could not reproduce given your code):
import os
import time
from subprocess import Popen

s = "foo"
pid = os.fork()
if pid == 0:
    proc = Popen(["sleep", "3"])
    with open(s, "w") as p:
        p.write(str(proc.pid)) # <- Only real error I could see
    os._exit(0)
else:
    start = time.time()
    while not os.path.isfile(s):
        if time.time() - start >= 30:
            raise TimeoutError("Command took too long")
        time.sleep(5)
    print("Read from file: " + open(s, 'r').read())
However, it works just fine and prints Read from file: 12075. So the issue is not in the part that can be reproduced from the code you posted.
To read/write the proc id to the binary file, I successfully used the pickle module:
pickle.dump(proc.pid, p)     # write to file
pickle.load(open(s, "rb"))   # read from file

Python - write() adding content I didn't expect

I am playing with the file I/O functions, and I am having issues writing to a file.
To get a feel for it, I either ran a for loop over a range, writing each number on a new line, or did the same for a list. Either way, the following gets appended to the file after the loop:
98
99
is dropped.
"""
global quitting
try:
    raise
except SystemExit:
    raise
except EOFError:
    global exit_now
    exit_now = True
    thread.interrupt_main()
except:
    erf = sys.__stderr__
    print>>erf, '\n' + '-'*40
    print>>erf, 'Unhandled server exception!'
    print>>erf, 'Thread: %s' % threading.currentThread().getName()
    print>>erf, 'Client Address: ', client_address
    print>>erf, 'Request: ', repr(request)
    traceback.print_exc(file=erf)
    print>>erf, '\n*** Unrecoverable, server exiting!'
    print>>erf, '-'*40
    quitting = True
    thread.interrupt_main()

class MyHandler(rpc.RPCHandler):

    def handle(self):
        """Override base method"""
        executive = Executive(self)
        self.register("exec", executive)
        self.console = self.get_remote_proxy("console")
        sys.stdin = PyShell.PseudoInputFile(self.console, "stdin",
                                            IOBinding.encoding)
        sys.stdout = PyShell.PseudoOutputFile(self.console, "stdout",
                                              IOBinding.encoding)
        sys.stderr = PyShell.PseudoOutputFile(self.console, "stderr",
                                              IOBinding.encoding)
        # Keep a reference to stdin so that it won't try to exit IDLE if
        # sys.stdin gets changed from within IDLE's shell. See issue17838.
        self._keep_stdin = sys.stdin
        self.interp = self.get_remote_proxy("interp")
        rpc.RPCHandler.getresponse(self, myseq=None, wait=0.05)

    def exithook(self):
        "override SocketIO method - wait for MainThread to shut us down"
        time.sleep(10)

<ad nauseum>
The code creating this is:
f = open('test.txt', 'w+')
for x in range(100):
    f.write(str(x) + '\n')
f.read()
This stuff is appended even if I close the file and open it directly.
How can I just write the data to the file without this extra stuff?
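What is likely happening, assuming this runs under CPython 2 (whose file objects wrap C stdio; note the dumped text above is IDLE's own run.py source): switching from writing to reading on a 'w+' file without an intervening seek or flush is undefined behavior in C stdio, so the stray f.read() can corrupt the stream state and end up flushing stale buffer contents into the file. A minimal sketch of the fix is to reposition before reading:

f = open('test.txt', 'w+')
for x in range(100):
    f.write(str(x) + '\n')
f.seek(0)        # reposition before switching from writing to reading
print f.read()   # reads back exactly the 100 lines written
f.close()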

using python stream downloading file with server limit

I am trying to download files from a server using Python. Sometimes the files are very large, so I would like to show a progress bar; one way to do that is to download in a stream and print the progress as chunks arrive. So far I have tried the standard urlopen, urlretrieve, and the requests module (with streaming on).
urlopen does not let me download the file as a stream. requests does support this, but the server limits how many files I can download at one time (its limit is 1), so every time I try requests, I just get a web page telling me to wait. Is there any other way to do this?
I have very recently downloaded many types of media with this function:
import sys
import requests
import time

def download_resource(domain, url, file_name=None, download=True):
    cookies = {}
    s = requests.Session()
    s.config['keep_alive'] = True  # note: Session.config only exists in requests < 1.0; sessions keep connections alive by default on newer versions
    # add your own cookies here; I have a specific function I call
    # for my application, but yours is different
    r = s.get(url, cookies=cookies, stream=True)
    if not r.ok:
        print "error in downloading"
        return -1
    file_size = int(r.headers['content-length'])
    if not file_name:
        try:
            temp = r.headers['content-disposition']
        except Exception as e:
            # failing download
            return -1
        if not temp:
            return -1
        file_name = temp.split("filename=")[-1]
        # return_obj["filename"] = file_name  # leftover from my application; return_obj is not defined here
    #print "File size:", file_size
    #print "\n", str(self.entire_size / float(1024*1024*1024)), "\n"
    print "Downloading:", file_name
    if download:
        with open(file_name, "wb") as fh:
            count = 1
            chunk_size = 1048576
            start_time = time.time()
            try:
                for block in r.iter_content(chunk_size):
                    total_time = time.time() - start_time
                    percent = count*chunk_size/float(file_size) * 100.0
                    fraction = int(percent/5)
                    download_speed = 1.0 / total_time
                    sys.stdout.write('\r')
                    sys.stdout.write("[%-20s] %d%% %3.2f MB/s " % ('='* fraction, percent, download_speed))
                    sys.stdout.flush()
                    if not block:
                        break
                    fh.write(block)
                    count += 1
                    start_time = time.time()
            except Exception as e:
                print e
            finally:
                # close up the stream
                r.close()
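Hypothetical call (the domain and URL are placeholders; cookie handling is whatever your application needs):

download_resource("example.com", "http://example.com/media/big_file.mp4",
                  file_name="big_file.mp4")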

url fetch gets stuck when multiple urls are passed

In the code below I first check the URL's status code, then start the relevant thread and add it to the queue; however, if too many URLs are passed, I get a Timeout error.
All the code is added below.
I also just discovered another bug: if I pass an mp3 file along with some jpeg images, the mp3 downloads at its correct size but opens as one of the images from the URLs passed.
_fdUtils
def getParser():
    parser = argparse.ArgumentParser(prog='FileDownloader',
        description='Utility to download files from internet')
    parser.add_argument('-v', '--verbose', default=logging.DEBUG,
        help='by default its on, pass None or False to not spit in shell')
    parser.add_argument('-st', '--saveTo', default=None, action=FullPaths,
        help='location where you want files to download to')
    parser.add_argument('-urls', nargs='*',
        help='urls of files you want to download.')
    parser.add_argument('-se', nargs='*', default=[1], help='Split each url passed to urls by the'
        " respective split order; if a url doesn't have a split, the default of 1 is used")
    return parser.parse_args()

def getResponse(url):
    return requests.head(url, allow_redirects=True, timeout=10,
                         headers={'Accept-Encoding': 'identity'})

def isWorkingURL(url):
    response = getResponse(url)
    return response.status_code in [302, 200, 100, 204, 300]

def getUrl(url):
    """ Gets the actual url to download the file from.
    """
    response = getResponse(url)
    return response.headers.get('location', url)
Error stack trace:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/threading.py", line 810, in __bootstrap_inner
    self.run()
  File "python/file_download.py", line 181, in run
    _grabAndWriteToDisk(self, split, url, self.__saveTo, 0, self.queue)
  File "python/file_download.py", line 70, in _grabAndWriteToDisk
    resp = requests.get(url, headers={'Range': 'bytes=%s' % irange}, stream=True)
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/requests-2.1.0-py2.7.egg/requests/api.py", line 55, in get
    return request('get', url, **kwargs)
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/requests-2.1.0-py2.7.egg/requests/api.py", line 44, in request
    return session.request(method=method, url=url, **kwargs)
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/requests-2.1.0-py2.7.egg/requests/sessions.py", line 382, in request
    resp = self.send(prep, **send_kwargs)
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/requests-2.1.0-py2.7.egg/requests/sessions.py", line 505, in send
    history = [resp for resp in gen] if allow_redirects else []
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/requests-2.1.0-py2.7.egg/requests/sessions.py", line 167, in resolve_redirects
    allow_redirects=False,
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/requests-2.1.0-py2.7.egg/requests/sessions.py", line 485, in send
    r = adapter.send(request, **kwargs)
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/requests-2.1.0-py2.7.egg/requests/adapters.py", line 381, in send
    raise Timeout(e)
Timeout: HTTPConnectionPool(host='ia600506.us.archive.org', port=80): Read timed out. (read timeout=<object object at 0x1002b40b0>)
there we go again:
import argparse
import logging
import Queue
import os
import requests
import signal
import socket
import sys
import time
import threading
import utils as _fdUtils
from collections import OrderedDict
from itertools import izip_longest
from socket import error as SocketError, timeout as SocketTimeout

# timeout in seconds
TIMEOUT = 10
socket.setdefaulttimeout(TIMEOUT)

DESKTOP_PATH = os.path.expanduser("~/Desktop")
appName = 'FileDownloader'
logFile = os.path.join(DESKTOP_PATH, '%s.log' % appName)

_log = _fdUtils.fdLogger(appName, logFile, logging.DEBUG, logging.DEBUG, console_level=logging.DEBUG)

queue = Queue.Queue()
STOP_REQUEST = threading.Event()
maxSplits = threading.BoundedSemaphore(3)
threadLimiter = threading.BoundedSemaphore(5)
lock = threading.Lock()
pulledSize = 0
dataDict = {}

def _grabAndWriteToDisk(threadName, url, saveTo, first=None, queue=None, mode='wb', irange=None):
    """ Function to download file..

    Args:
        url(str): url of file to download
        saveTo(str): path where to save file
        first(int): starting byte of the range
        queue(Queue.Queue): queue object to set status for file download
        mode(str): mode of file to be downloaded
        irange(str): range of byte to download
    """
    fileName = _fdUtils.getFileName(url)
    filePath = os.path.join(saveTo, fileName)
    fileSize = _fdUtils.getUrlSizeInBytes(url)
    downloadedFileSize = 0 if not first else first
    block_sz = 8192
    resp = requests.get(url, headers={'Range': 'bytes=%s' % irange}, stream=True)
    for fileBuffer in resp.iter_content(block_sz):
        if not fileBuffer:
            break
        with open(filePath, mode) as fd:
            downloadedFileSize += len(fileBuffer)
            fd.write(fileBuffer)
            mode = 'a'
            status = r"%10d [%3.2f%%]" % (downloadedFileSize, downloadedFileSize * 100. / fileSize)
            status = status + chr(8)*(len(status)+1)
            sys.stdout.write('%s\r' % status)
            time.sleep(.01)
            sys.stdout.flush()
        if downloadedFileSize == fileSize:
            STOP_REQUEST.set()
            queue.task_done()
            _log.debug("Downloaded %s %s%% using %s and saved to %s", fileName,
                       downloadedFileSize * 100. / fileSize, threadName.getName(), saveTo)

def _downloadChunk(url, idx, irange, fileName, sizeInBytes):
    _log.debug("Downloading %s for first chunk %s of %s " % (irange, idx+1, fileName))
    pulledSize = irange[-1]
    try:
        resp = requests.get(url, allow_redirects=False, timeout=TIMEOUT,
                            headers={'Range': 'bytes=%s-%s' % (str(irange[0]), str(irange[-1]))},
                            stream=True)
    except (SocketTimeout, requests.exceptions.RequestException), e:  # requests.exceptions is a module; catch the exception base class
        _log.error(e)
        return
    chunk_size = str(irange[-1])
    for chunk in resp.iter_content(chunk_size):
        status = r"%10d [%3.2f%%]" % (pulledSize, pulledSize * 100. / int(chunk_size))
        status = status + chr(8)*(len(status)+1)
        sys.stdout.write('%s\r' % status)
        sys.stdout.flush()
        pulledSize += len(chunk)
        dataDict[idx] = chunk
        time.sleep(.03)
        if pulledSize == sizeInBytes:
            _log.info("%s downloaded %3.0f%%", fileName, pulledSize * 100. / sizeInBytes)

class ThreadedFetch(threading.Thread):
    """ docstring for ThreadedFetch
    """
    def __init__(self, saveTo, queue):
        super(ThreadedFetch, self).__init__()
        self.queue = queue
        self.__saveTo = saveTo

    def run(self):
        threadLimiter.acquire()
        try:
            items = self.queue.get()
            url = items[0]
            split = items[-1]
            fileName = _fdUtils.getFileName(url)

            # grab split chunks in separate threads.
            if split > 1:
                maxSplits.acquire()
                try:
                    sizeInBytes = _fdUtils.getUrlSizeInBytes(url)
                    byteRanges = _fdUtils.getRangeSegements(sizeInBytes, split)
                    filePath = os.path.join(self.__saveTo, fileName)
                    downloaders = [
                        threading.Thread(
                            target=_downloadChunk,
                            args=(url, idx, irange, fileName, sizeInBytes),
                        )
                        for idx, irange in enumerate(byteRanges)
                    ]
                    # start threads, let them run in parallel, wait for all to finish
                    for th in downloaders:
                        th.start()
                    # waiting for all threads to finish
                    # confirms the dataDict is up-to-date
                    for th in downloaders:
                        th.join()
                    downloadedSize = 0
                    with open(filePath, 'wb') as fh:
                        for _idx, chunk in sorted(dataDict.iteritems()):
                            downloadedSize += len(chunk)
                            status = r"%10d [%3.2f%%]" % (downloadedSize, downloadedSize * 100. / sizeInBytes)
                            status = status + chr(8)*(len(status)+1)
                            fh.write(chunk)
                            sys.stdout.write('%s\r' % status)
                            time.sleep(.04)
                            sys.stdout.flush()
                    if downloadedSize == sizeInBytes:
                        _log.info("%s, saved to %s", fileName, self.__saveTo)
                    self.queue.task_done()
                finally:
                    maxSplits.release()
            else:
                while not STOP_REQUEST.isSet():
                    self.setName("primary_%s_thread" % fileName.split(".")[0])
                    # if downloading the whole file in a single chunk, there is no need
                    # to start a new thread, so download directly here.
                    _grabAndWriteToDisk(self, url, self.__saveTo, 0, self.queue)
        finally:
            threadLimiter.release()

def main(appName):
    args = _fdUtils.getParser()
    saveTo = args.saveTo if args.saveTo else DESKTOP_PATH

    # spawn a pool of threads, and pass them the queue instance;
    # each url will be downloaded concurrently
    unOrdUrls = dict(izip_longest(args.urls, args.se, fillvalue=1))
    ordUrls = OrderedDict([(k, unOrdUrls[k]) for k in sorted(unOrdUrls, key=unOrdUrls.get, reverse=False)
                           if _fdUtils.isWorkingURL(k, _log) and _fdUtils.notOnDisk(k, saveTo)])
    print "length: %s " % len(ordUrls)
    for i in xrange(len(ordUrls)):
        t = ThreadedFetch(saveTo, queue)
        t.daemon = True
        t.start()
    try:
        # populate queue with data
        for url, split in ordUrls.iteritems():
            url = _fdUtils.getUrl(url)
            print url
            queue.put((url, int(split)))
        # wait on the queue until everything has been processed
        queue.join()
        _log.info('All tasks completed.')
    except (KeyboardInterrupt, SystemExit):
        _log.critical('! Received keyboard interrupt, quitting threads.')

if __name__ == "__main__":
    # change the name of MainThread.
    threading.currentThread().setName("FileDownloader")
    myapp = threading.currentThread().getName()
    main(myapp)
I see two problems in your code. Since it's incomplete, I'm not sure how it's supposed to work, so I can't promise either one is the particular one you're running into first, but I'm pretty sure you need to fix both.
First:
queue.put((_fdUtils.getUrl(url), int(split)))
That's going to call _fdUtils.getUrl(url) in the main thread, and put the result on the queue. Your comments clearly imply that you intended the downloading to happen on the background threads.
If you wanted to pass a function to be called, just pass the function and its argument as separate members of the tuple, or wrap it up in a closure or a partial. Note that a bare lambda would capture the loop variable url by reference (late binding), so bind it explicitly via a default argument:
queue.put((lambda url=url: _fdUtils.getUrl(url), int(split)))
Second:
t = ThreadedFetch(saveTo, queue)
t.daemon = True
t.start()
This starts a thread for every URL. That's almost never a good idea. Generally, downloaders don't use more than 4-16 threads at a time, and no more than 2-4 to the same site. You could easily be timing out because you're spamming some site too fast and its server or router is making you back off for a while. Or, with a huge number of requests, you could be flooding your own network and blocking ACKs or even rebooting the router (especially if you have either a cheap home WiFi router or ADSL with a crappy provider).
Also, a much simpler way to do this would be to use a smart pool, like a multiprocessing.dummy.Pool (multiprocessing.dummy means it acts like the multiprocessing module but uses threads) or, even better, a concurrent.futures.ThreadPoolExecutor. In fact, if you look at the docs, a parallel downloader is the first example for ThreadPoolExecutor.
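A minimal sketch of the pool approach, with 4 workers per the guideline above (fetch_url and the URLs are placeholders standing in for your download logic; on Python 2 this needs the futures backport from PyPI):

from concurrent.futures import ThreadPoolExecutor, as_completed
import requests

def fetch_url(url):
    # stand-in for the real download; streams the body and returns its size
    resp = requests.get(url, stream=True, timeout=10)
    resp.raise_for_status()
    return url, len(resp.content)

urls = ['http://example.com/a.jpg', 'http://example.com/b.mp3']  # placeholders

pool = ThreadPoolExecutor(max_workers=4)
futures = [pool.submit(fetch_url, url) for url in urls]
for future in as_completed(futures):
    url, nbytes = future.result()
    print "%s: %d bytes" % (url, nbytes)
pool.shutdown()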

Upload a file-like object with Paramiko?

I have a bunch of code that looks like this:
with tempfile.NamedTemporaryFile() as tmpfile:
    tmpfile.write(fileobj.read()) # fileobj is some file-like object
    tmpfile.flush()
    try:
        self.sftp.put(tmpfile.name, path)
    except IOError:
        # error handling removed for ease of reading
        pass
Is it possible to do an upload like this without having to write the file out somewhere?
Update: As of Paramiko 1.10, you can use putfo:
self.sftp.putfo(fileobj, path)
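For example, a sketch assuming a connected SFTPClient named sftp (StringIO stands in for any file-like object):

from StringIO import StringIO

fileobj = StringIO("contents to upload")
sftp.putfo(fileobj, '/tmp/remote_file.txt')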
Instead of using paramiko.SFTPClient.put, you can use paramiko.SFTPClient.open, which opens a file-like object. You can write to that. Something like this:
f = self.sftp.open(path, 'wb')
f.write(fileobj.read())
f.close()
Note that it may be worthwhile to feed paramiko the data in 32 KiB chunks, since that's the largest chunk the underlying SSH protocol can handle without breaking it into multiple packets.
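A sketch of that chunked variant, under the same assumptions (sftp is a connected SFTPClient, fileobj is any readable file-like object):

f = sftp.open(path, 'wb')
try:
    while True:
        data = fileobj.read(32768)  # 32 KiB per write, one SSH packet's worth
        if not data:
            break
        f.write(data)
finally:
    f.close()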
Is StringIO what you're looking for? (doc page)
SFTPClient's get() and put() functions take paths and not file-handles, which makes things a bit awkward.
You could write a wrapper for paramiko.SFTPClient to give it the functionality that you want.
Here's my best untested attempt:
import os

from paramiko import SFTPClient, SFTPAttributes  # SFTPAttributes and os are needed below

class SFTPClient2(SFTPClient):

    def put(self, local_file, remotepath, callback=None, confirm=True):
        fl = local_file  # was `source_file`, an undefined name
        file_size = os.fstat(fl.fileno()).st_size
        try:
            fr = self.file(remotepath, 'wb')
            fr.set_pipelined(True)
            size = 0
            try:
                while True:
                    data = fl.read(32768)
                    if len(data) == 0:
                        break
                    fr.write(data)
                    size += len(data)
                    if callback is not None:
                        callback(size, file_size)
            finally:
                fr.close()
        finally:
            fl.close()
        if confirm:
            s = self.stat(remotepath)
            if s.st_size != size:
                raise IOError('size mismatch in put! %d != %d' % (s.st_size, size))
        else:
            s = SFTPAttributes()
        return s

    def get(self, remotepath, local_file, callback=None):
        fr = self.file(remotepath, 'rb')
        file_size = self.stat(remotepath).st_size
        fr.prefetch()
        try:
            fl = local_file
            try:
                size = 0
                while True:
                    data = fr.read(32768)
                    if len(data) == 0:
                        break
                    fl.write(data)
                    size += len(data)
                    if callback is not None:
                        callback(size, file_size)
            finally:
                fl.close()
        finally:
            fr.close()
        # compare the byte count directly; calling os.fstat on the handle after
        # closing it, as the original draft did, would raise an error here
        if size != file_size:
            raise IOError('size mismatch in get! %d != %d' % (file_size, size))
If it works, the get and put functions should now take local file-handles rather than paths.
All I had to do was get rid of the code that opens the file from the path, and change the code that gets the size of the file to use os.fstat instead of os.stat.
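Hypothetical usage, assuming an already-authenticated paramiko.Transport named t:

sftp = SFTPClient2.from_transport(t)
sftp.put(open('local.bin', 'rb'), '/remote/remote.bin')  # put() closes the handle when done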
