Best approach to stop a thread that's downloading a file - python

I'm using PySide, and running downloads on secondary threads, so that the UI doesn't block.
It's just basically a button that starts the download on a new thread and saves the file to disk. I'd like to have a "cancel" button that stops the download. I've looked up ways to stop threads, but they seem hackish and look like something that shouldn't really be done.
If stopping a thread is hackish, what is the proper way of doing this? It's obviously done in hundreds of programs. Maybe with a flag? My thread looks something like this; once it's started, I don't know how to "pause" it or stop it.
import threading
import urllib.request

goOn = True

class MyThread(threading.Thread):
    def __init__(self, url):
        super().__init__()
        self.url = url

    def run(self):
        data = urllib.request.urlopen(self.url)
        f = open('fileName', 'wb')
        size = int(data.headers['Content-Length'])
        downloaded = 0
        blockSize = 1024 * 8
        while True:
            buffer = data.read(blockSize)
            if not buffer:
                break
            f.write(buffer)
            downloaded += blockSize
            '''
            if not goOn:  # Something like this?
                break
            '''
Thanks.

import threading
import urllib.request

class MyThread(threading.Thread):
    def __init__(self, url):
        super().__init__()
        self.url = url

    def run(self):
        data = urllib.request.urlopen(self.url)
        f = open('fileName', 'wb')
        size = int(data.headers['Content-Length'])
        downloaded = 0
        blockSize = 1024 * 8
        self.running = 1
        while self.running:
            buffer = data.read(blockSize)
            if not buffer:
                break
            f.write(buffer)
            downloaded += blockSize

    def stop_running(self):
        self.running = 0

should work, I think ...
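A common variation (my own sketch, not from the answer above; the names DownloadThread and cancel() are hypothetical) is to use a threading.Event instead of a plain attribute, which makes the intent explicit and is easy to trigger from the GUI thread's cancel handler:

import threading
import urllib.request

class DownloadThread(threading.Thread):
    def __init__(self, url, dest='fileName'):
        super().__init__()
        self.url = url
        self.dest = dest
        self._stop_event = threading.Event()

    def run(self):
        response = urllib.request.urlopen(self.url)
        blockSize = 1024 * 8
        with open(self.dest, 'wb') as f:
            while not self._stop_event.is_set():
                buffer = response.read(blockSize)
                if not buffer:
                    break  # download finished normally
                f.write(buffer)

    def cancel(self):
        # Call this from the GUI thread when the "cancel" button is clicked.
        self._stop_event.set()

Because the event is checked once per block, cancellation takes effect after at most one 8 KB read, which is usually responsive enough for a UI.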

Related

Python multithreading - memory leak when using a shared object (SO)

I have a Python program that leaks memory when it uses a third-party SO (shared object).
I simplified my code down to this:
import time
import sys
import threading
import codecs
import ctypes

sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())

class TestThirdPartySo(object):
    def __init__(self):
        # this so uses thread-specific data
        self.call_stat_so = ctypes.CDLL("./third_party_fun.so")
        self.handle = self.call_stat_so._handle

    def test_fun(self):
        self.call_stat_so.fun_xxx()

def thread_fun():
    TestThirdPartySo().test_fun()

def test_main(num):
    count = 0
    while True:
        # create 3 * num threads
        thread_num = 3
        thread_list = []
        for _ in range(thread_num):
            thread_list.append(threading.Thread(target=thread_fun))
        for thread in thread_list:
            thread.start()
        for thread in thread_list:
            thread.join()
        count += thread_num
        time.sleep(0.01)
        if count % 100 == 0:
            print("finished %s" % count)
        if count > num:
            break
    print("end !!!!")

if __name__ == '__main__':
    num = sys.argv[1]
    test_main(int(num))
Now, I know this shared object uses thread-specific data, and I have tried to close the SO after calling it, like this:
class TestThirdPartySo(object):
    def __init__(self):
        # this so uses thread-specific data
        self.call_stat_so = ctypes.CDLL("./third_party_fun.so")
        self.handle = self.call_stat_so._handle

    def test_fun(self):
        self.call_stat_so.fun_xxx()

    def __del__(self):
        dlclose_func(self.handle)

def dlclose_func(_handle):
    dlclose_func_tmp = ctypes.cdll.LoadLibrary('libdl.so').dlclose
    dlclose_func_tmp.argtypes = [ctypes.c_void_p]
    dlclose_func_tmp(_handle)
But I failed to close the SO, and I'm also not sure whether the leaked memory would be freed after closing it.
If the program does not use multiple threads, or creates only a fixed number of threads (a thread pool), it works fine.
For some reason, I need to create threads constantly in my program. What can I do to prevent this memory leak?
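No answer is shown here, but as a sketch of the fixed-thread-pool idea the question already says works fine, the threads can be created once and fed work through a queue instead of being constructed in a loop (the names worker and run_pool below are hypothetical; TestThirdPartySo is the class defined above):

import threading
import queue

def worker(task_queue):
    # Each worker loads the shared object once and reuses it for every task,
    # so thread-specific data is allocated only once per worker.
    so_wrapper = TestThirdPartySo()
    while True:
        task = task_queue.get()
        if task is None:          # sentinel tells the worker to exit
            break
        so_wrapper.test_fun()

def run_pool(num_tasks, thread_num=3):
    task_queue = queue.Queue()
    workers = [threading.Thread(target=worker, args=(task_queue,))
               for _ in range(thread_num)]
    for w in workers:
        w.start()
    for _ in range(num_tasks):
        task_queue.put("call")    # placeholder task payload
    for _ in workers:
        task_queue.put(None)      # one sentinel per worker
    for w in workers:
        w.join()

Whether this avoids the leak depends on whether the SO's thread-specific data is the only thing leaking, but it removes the constant thread creation that the question identifies as the trigger.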

Python: Slow generator streaming into fast consumer depletes buffer and terminates early

Say I have a generator that slowly creates my stream of data:
import threading
import time

class SlowStreamSource():
    def __init__(self):
        self.buffer = ""
        self.gen_data = threading.Thread(target=self.generate_stream)
        self.gen_data.start()

    def generate_stream(self):
        i = 0
        while i < 10:
            self.buffer += str(i)
            # Other processing happens
            time.sleep(0.1)
            i += 1
        self.buffer += "-Stream Finished-"

    def read(self, hint=-1):
        if hint is None or hint < 0:
            result = self.buffer
            self.buffer = ""
        else:
            result = self.buffer[:hint]
            self.buffer = self.buffer[hint:]
        return result
This data is sent to a consumer that is much faster than the generator and follows the standard practice of invoking read() until there is no more data, then exiting:
import time

class FastStreamDestination():
    def __init__(self, source):
        self.source = source

    def process_stream(self):
        while True:
            data = self.source.read()
            if not data:
                break
            print(f'read "{data}"')
            # Other processing happens
            time.sleep(0.05)
(I have no control over the consumer. It's Amazon's boto3 upload_fileobj, but I have reviewed their code to determine that this is essentially how it functions.)
When I feed my generator into my consumer, it very quickly depletes the buffer, concludes that the stream is finished and exits prematurely.
src = SlowStreamSource()
dst = FastStreamDestination(src)
dst.process_stream()
yields read "0", but I ultimately need something like
read "0"
read "1"
read "2"
read "3"
read "4"
read "5"
read "6"
read "7"
read "8"
read "9"
read "-Stream Finished-"
Is there any way to ensure my consumer reads the entire stream from my generator, keeping in mind that I cannot meaningfully speed up the generator, nor can I modify the consumer in any way?
Ok, with some help from a co-worker, I think I have the solution.
My generator can be aware of whether there is more data for it to provide, even if it doesn't have the data ready yet. Since it's a file-like object, it has a close function that can be invoked once I'm sure all the data has been generated.
With that awareness, I can make the read function block as long as it needs in order to ensure that it has some data to return.
import threading
import time

class SlowStreamSource():
    def __init__(self):
        self.buffer = ""
        self.done = False
        self.gen_data = threading.Thread(target=self.generate_stream)
        self.gen_data.start()
        self.closed = False

    def generate_stream(self):
        i = 0
        while i < 10:
            self.buffer += str(i)
            # Other processing happens
            time.sleep(0.1)
            i += 1
        self.buffer += "-Stream Finished-"
        self.closed = True

    def read(self, hint=-1):
        while not self.closed and len(self.buffer) == 0:
            time.sleep(0.1)
        if hint is None or hint < 0:
            result = self.buffer
            self.buffer = ""
        else:
            result = self.buffer[:hint]
            self.buffer = self.buffer[hint:]
        return result
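As an aside (my own sketch, not part of the original answer), the time.sleep polling in read() could be replaced with a threading.Condition, so the consumer wakes up as soon as data is appended or the stream is closed:

import threading
import time

class SlowStreamSource():
    # Same idea as above, but blocking on a Condition instead of polling (sketch).
    def __init__(self):
        self.buffer = ""
        self.closed = False
        self.cond = threading.Condition()
        threading.Thread(target=self.generate_stream).start()

    def generate_stream(self):
        for i in range(10):
            with self.cond:
                self.buffer += str(i)
                self.cond.notify()
            time.sleep(0.1)  # other processing happens
        with self.cond:
            self.buffer += "-Stream Finished-"
            self.closed = True
            self.cond.notify()

    def read(self, hint=-1):
        with self.cond:
            while not self.closed and not self.buffer:
                self.cond.wait()  # block until data arrives or the stream ends
            if hint is None or hint < 0:
                result, self.buffer = self.buffer, ""
            else:
                result, self.buffer = self.buffer[:hint], self.buffer[hint:]
            return result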

IP Camera recorder class not working properly

Edit: I figured it out shortly after posting; it's amazing how asking a question can help you re-examine things :-). I've commented the fix in the code.
I'm working on a class to download single frames from motion JPEG streams every so many seconds. It worked fine until I added some code to disconnect from the stream once a single JPEG is loaded. Now the "recorder" seems to be storing the first frame retrieved and not replacing the "image" with new content. I can destroy and recreate the object every time to fix this, but that's not a very elegant solution. Any help would be appreciated!
Here's the code for the Recorder class:
import cv2
import urllib2
import threading
from PIL import Image
import numpy as np

class MJpegCam:
    def __init__(self, ip, timeout=5):
        self.ip = ip
        self.bytes = ''
        self.image = ''
        self.stream = None
        self.stopcam = False
        self.timeout = timeout

    def start(self):
        self.stream = urllib2.urlopen(self.ip, timeout=self.timeout)

        def run():
            while True:
                if self.stopcam:
                    self.stopcam = False
                    return
                try:
                    self.bytes += self.stream.read(1024)
                    a = self.bytes.find('\xff\xd8')
                    b = self.bytes.find('\xff\xd9')
                    if a != -1 and b != -1:
                        jpg = self.bytes[a:b+2]
                        self.bytes = self.bytes[b+2:]
                        cv2img = cv2.imdecode(np.fromstring(jpg, dtype=np.uint8), cv2.CV_LOAD_IMAGE_COLOR)
                        try:
                            cv2RGB = cv2.cvtColor(cv2img, cv2.COLOR_BGR2RGB)
                            self.image = Image.fromarray(cv2RGB)
                            self.stop()  # This is where the program was breaking
                            return
                        except Exception as e:
                            pass
                except AttributeError as e:
                    pass

        thread = threading.Thread(target=run)
        thread.start()

    def stop(self):
        self.stream.close()
        self.stopcam = False  # this needed to be switched to False
        self.bytes = ''

    def getImage(self):
        return self.image
In the actual program I'm running multiple objects on separate processes but here's the basic idea:
cam = MJpegCam(ip)
secs = 10
while i < end:
    cam.start()
    time.sleep(cam.timeout)
    cam.stop()
    time.sleep(secs - cam.timeout)

Python Queue and Threading Module - Impose an extra custom lock?

Using Linux and Python 2.7.6, I have a script that uploads lots of files at one time. I am using multi-threading with the Queue and Threading modules.
I have an object that keeps track of the files that have been successfully uploaded and decrements a counter after each successful upload. I need to make this operation atomic/thread safe. Since the Queue module is high level and has its own mutex at the lower level, can I impose my own lock/acquire in addition to it? I tried doing this and had no errors (at the bottom of the last code block, where file_quantity.deduct() is). But I am not sure if it is truly working as it should. Here is the shortened version for readability:
class FileQuantity(object):
    """Keeps track of files that have been uploaded and how many are left"""
    def __init__(self, file_quantity):
        self.quantity = file_quantity
        self.total = file_quantity

    def deduct(self):
        self.quantity -= 1

kill_received = False
lock = threading.Lock()

class CustomQueue(Queue.Queue):
    # Can not use .join() because it would block any processing
    # for SIGINT until threads are done. To counter this,
    # wait() is given a time out along with while not kill_received
    # to be checked
    def join(self):
        self.all_tasks_done.acquire()
        try:
            while not kill_received and self.unfinished_tasks:
                self.all_tasks_done.wait(10.0)
        finally:
            self.all_tasks_done.release()

def do_the_uploads(file_list, file_quantity,
                   retry_list, authenticate):
    """The uploading engine"""
    value = raw_input(
        "\nPlease enter how many concurrent "
        "uploads you want at one time (example: 200)> ")
    value = int(value)
    logger.info('{} concurrent uploads will be used.'.format(value))
    confirm = raw_input(
        "\nProceed to upload files? Enter [Y/y] for yes: ").upper()
    if confirm == "Y":
        kill_received = False
        sys.stdout.write("\x1b[2J\x1b[H")
        q = CustomQueue()

        def worker():
            global kill_received
            while not kill_received:
                item = q.get()
                upload_file(item, file_quantity, retry_list, authenticate, q)
                q.task_done()

        for i in range(value):
            t = Thread(target=worker)
            t.setDaemon(True)
            t.start()

        for item in file_list:
            q.put(item)

        q.join()
        print "Finished. Cleaning up processes...",
        # Allowing the threads to cleanup
        time.sleep(4)
        print "done."

def upload_file(file_obj, file_quantity, retry_list, authenticate, q):
    """Uploads a file. One file per its own thread. No batch style. This way if one upload
    fails no others are affected."""
    absolute_path_filename, filename, dir_name, token, url = file_obj
    url = url + dir_name + '/' + filename
    try:
        with open(absolute_path_filename) as f:
            r = requests.put(url, data=f, headers=header_collection, timeout=20)
    except requests.exceptions.ConnectionError as e:
        pass
    if src_md5 == r.headers['etag']:
        lock.acquire()
        file_quantity.deduct()
        lock.release()
Well, the code you posted doesn't define lock anywhere, so hard to say for sure. It would be more common to protect the code that actually needs protecting:
def deduct(self):
    with lock:
        self.quantity -= 1
Sanest is to allocate a lock in the structure that needs it, like so:
class FileQuantity(object):
    """Keeps track of files that have been uploaded and how many are left"""
    def __init__(self, file_quantity):
        self.quantity = file_quantity
        self.total = file_quantity
        self.lock = threading.Lock()

    def deduct(self):
        with self.lock:
            self.quantity -= 1
and use self.lock similarly for any other mutations of FileQuantity data members that may be invoked simultaneously by multiple threads.
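For example (my own illustrative addition; retry_failed is a hypothetical method, not part of the original code), any other mutator would follow the same pattern:

    def retry_failed(self, n=1):
        # Hypothetical: put n files back into the outstanding count, under the same lock.
        with self.lock:
            self.quantity += n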

python threading in a loop

I have a project that requires a bunch of large matrices, which are stored in ~200 MB files, to be cross-correlated (i.e. FFT * conj(FFT)) with each other. The number of files is such that I can't just load them all up and then do my processing. On the other hand, reading in each file as I need it is slower than I'd like.
What I have so far is something like:
result = 0
for i in xrange(N_files):
    f1 = file_reader(file_list[i])
    ############################################################################
    # here I want to have file_reader go start reading the next file I'll need #
    ############################################################################
    in_place_processing(f1)
    for j in xrange(i+1, N_files):
        f2 = file_reader(file_list[j])
        ##################################################################
        # here I want to have file_reader go start reading the next file #
        ##################################################################
        in_place_processing(f2)
        result += processing_function(f1, f2)
So basically, I just want to have two threads that will each read a file, give it to me when I ask for it (or as soon as it's done after I ask for it), and then go start reading the next file for when I ask for it. The object the file_reader returns is rather large and complicated, so I'm not sure if multiprocessing is the way to go here...
I've read about threading and queues but can't seem to figure out the part where I ask the thread to go read the file and can proceed with the program while it does. I don't want the threads to simply go about their business in the background -- am I missing a detail here, or is threading not the way to go?
Below is an example of using the multiprocessing module that will spawn off child processes to call your file_reader method and queue up their results. The queue should block when full, so you can control the number of read-aheads you'd like to perform with the QUEUE_SIZE constant.
This utilizes a standard Producer/Consumer model of multiprocess communication, with the child processes acting as Producers and the main thread acting as the Consumer. The join method call in the class destructor ensures the child process resources are cleaned up properly. There are some print statements interspersed for demonstration purposes.
Additionally, I added the ability for the QueuedFileReader class to offload work to a worker thread or run in the main thread, rather than using a child process, for comparison. This is done by setting the mode parameter at class initialization to MODE_THREADS or MODE_SYNCHRONOUS, respectively.
import multiprocessing as mp
import Queue
import threading
import time

QUEUE_SIZE = 2  # buffer size of queue

## Placeholder for your functions and variables
N_files = 10
file_list = ['file %d' % i for i in range(N_files)]

def file_reader(filename):
    time.sleep(.1)
    result = (filename, 'processed')
    return result

def in_place_processing(f):
    time.sleep(.2)

def processing_function(f1, f2):
    print f1, f2
    return id(f1) & id(f2)

MODE_SYNCHRONOUS = 0  # file_reader called in main thread synchronously
MODE_THREADS = 1      # file_reader executed in worker thread
MODE_PROCESS = 2      # file_reader executed in child process

##################################################
## Class to encapsulate multiprocessing objects.
class QueuedFileReader():
    def __init__(self, idlist, mode=MODE_PROCESS):
        self.mode = mode
        self.idlist = idlist
        if mode == MODE_PROCESS:
            self.queue = mp.Queue(QUEUE_SIZE)
            self.process = mp.Process(target=QueuedFileReader.worker,
                                      args=(self.queue, idlist))
            self.process.start()
        elif mode == MODE_THREADS:
            self.queue = Queue.Queue(QUEUE_SIZE)
            self.thread = threading.Thread(target=QueuedFileReader.worker,
                                           args=(self.queue, idlist))
            self.thread.start()

    @staticmethod
    def worker(queue, idlist):
        for i in idlist:
            queue.put((i, file_reader(file_list[i])))
            print id(queue), 'queued', file_list[i]
        queue.put('done')

    def __iter__(self):
        if self.mode == MODE_SYNCHRONOUS:
            self.index = 0
        return self

    def next(self):
        if self.mode == MODE_SYNCHRONOUS:
            if self.index == len(self.idlist): raise StopIteration
            q = (self.idlist[self.index],
                 file_reader(file_list[self.idlist[self.index]]))
            self.index += 1
        else:
            q = self.queue.get()
            if q == 'done': raise StopIteration
        return q

    def __del__(self):
        if self.mode == MODE_PROCESS:
            self.process.join()
        elif self.mode == MODE_THREADS:
            self.thread.join()

#mode = MODE_PROCESS
mode = MODE_THREADS
#mode = MODE_SYNCHRONOUS

result = 0
for i, f1 in QueuedFileReader(range(N_files), mode):
    in_place_processing(f1)
    for j, f2 in QueuedFileReader(range(i+1, N_files), mode):
        in_place_processing(f2)
        result += processing_function(f1, f2)
If your intermediate values are too large to pass through the Queue, you can execute each iteration of the outer loop in its own process. A handy way to do that would be using the Pool class in multiprocessing as in the example below.
import multiprocessing as mp
import time

## Placeholder for your functions and variables
N_files = 10
file_list = ['file %d' % i for i in range(N_files)]

def file_reader(filename):
    time.sleep(.1)
    result = (filename, 'processed')
    return result

def in_place_processing(f):
    time.sleep(.2)

def processing_function(f1, f2):
    print f1, f2
    return id(f1) & id(f2)

def file_task(file_index):
    print file_index
    f1 = file_reader(file_list[file_index])
    in_place_processing(f1)
    task_result = 0
    for j in range(file_index+1, N_files):
        f2 = file_reader(file_list[j])
        in_place_processing(f2)
        task_result += processing_function(f1, f2)
    return task_result

pool = mp.Pool(processes=None)  # processes defaults to mp.cpu_count()
result = 0
for file_result in pool.map(file_task, range(N_files)):
    result += file_result
print 'result', result

#or simply
#result = sum(pool.map(file_task, range(N_files)))
