How to get the output of a thread with _thread? - python
I created a function in Python for polling some devices, and the need for fast response times led me to the idea of using threads. The Python code I wrote works and is very fast, and the peripherals respond (verified with Wireshark). Now, however, I need each thread to give me the output of the function it runs, so that I can collect all the outputs in one output vector. How can I save the output of each thread I launch with this "_thread" library?
below is the code I used:
import _thread
import time
import atenapy
try:
    tic = time.process_time()
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.172',9761,'5A0000005A'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.170',9761,'2600000026'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.172',9761,'5100000051'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.170',9761,'2700000027'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.172',9761,'5000000050'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.170',9761,'6000000060'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.172',9761,'5200000052'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.170',9761,'2D0000002D'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.172',9761,'5700000057'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.170',9761,'5F0000005F'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.172',9761,'5300000053'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.170',9761,'2200000022'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.172',9761,'5600000056'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.170',9761,'2300000023'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.172',9761,'5500000055'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.170',9761,'2B0000002B'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.172',9761,'5400000054'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.170',9761,'2C0000002C'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.164',9761,'0C0000000C'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.170',9761,'2800000028'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.164',9761,'0D0000000D'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.170',9761,'2900000029'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.164',9761,'0E0000000E'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.170',9761,'2A0000002A'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.164',9761,'0F0000000F'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.164',9761,'1400000014'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.164',9761,'1800000018'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.164',9761,'1900000019'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.164',9761,'1A0000001A'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.164',9761,'1B0000001B'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.164',9761,'1C0000001C'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.164',9761,'1D0000001D'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.164',9761,'1E0000001E'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.164',9761,'1F0000001F'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.164',9761,'2000000020'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.164',9761,'2100000021'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.162',9761,'0200000002'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.162',9761,'0300000003'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.162',9761,'0800000008'))
    toc = time.process_time()
    print("all PE time pooling = "+str(toc - tic))
except:
    print("Error: unable to start thread")
Wrap your function in a worker function that collects the result and appends it to a list. The lock is optional when appending to a list (Ref: What kinds of global value mutation are thread-safe).
import threading

lock = threading.Lock()
results = []

def func(a, b):
    with lock:
        results.append(a + b)

threads = [threading.Thread(target=func, args=(a, b))
           for a in range(3) for b in range(3)]
for thread in threads:
    thread.start()
for thread in threads:
    thread.join()
print(results)
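Applied to the question's code, the same pattern might look like the sketch below. It assumes atenapy.connect_PE returns the device's response (the question does not show its return value); the worker wrapper, the results list, and the shortened device list are illustrative.

import threading
import atenapy  # the asker's module; connect_PE is assumed to return the device response

results = []
lock = threading.Lock()

def worker(ip, port, command):
    # call the original function and keep its return value
    output = atenapy.connect_PE(ip, port, command)
    with lock:
        results.append((ip, command, output))

devices = [
    ('192.168.2.172', 9761, '5A0000005A'),
    ('192.168.2.170', 9761, '2600000026'),
    # ... the remaining (ip, port, command) tuples from the question ...
]

threads = [threading.Thread(target=worker, args=d) for d in devices]
for t in threads:
    t.start()
for t in threads:
    t.join()

print(results)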
Related
Python Multi-Processing List Issue
I am attempting to dynamically open and parse through several text files (~10) to extract a particular value for a key, for which I am utilizing multi-processing within Python. My issue is that the function I am calling writes particular data to a class list, which I can see inside the method, however outside the method that list is empty. Refer to the following:

class:

class MyClass(object):
    __id_list = []

    def __init__(self):
        self.process_wrapper()

Caller Method:

def process_wrapper(self):
    from multiprocessing import Pool
    import multiprocessing

    info_file = 'info*'
    file_list = []

    p = Pool(processes = multiprocessing.cpu_count() - 1)
    for file_name in Path('c:/').glob('**/*/' + info_file):
        file_list.append(str(os.path.join('c:/', file_name)))

    p.map_async(self.get_ids, file_list)
    p.close()
    p.join()

    print(self.__id_list)  # this is showing as empty

Worker method:

def get_ids(self, file_name):
    try:
        with open(file_name) as data:
            for line in data:
                temp_split = line.split()
                for item in temp_split:
                    value_split = str(item).split('=')
                    if 'id' == value_split[0].lower():
                        if int(value_split[1]) not in self._id_list:
                            self.__id_list.append(int(value_split[1]))
    except:
        raise FileReadError(f'There was an issue parsing "{file_name}".')

    print(self.__id_list)  # here the list prints fine
The map_async call returns an AsyncResult object. You should use that to wait for the processing to finish before checking self.__id_list. Also, you might consider returning a local list from each worker, collecting those lists, and aggregating them into the final list.
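A minimal sketch of that "wait via get() and aggregate returned lists" approach follows. The get_ids here is a standalone, illustrative variant that returns its findings instead of appending to shared state, and the file names are hypothetical.

from multiprocessing import Pool

def get_ids(file_name):
    found = []
    with open(file_name) as data:
        for line in data:
            for item in line.split():
                parts = item.split('=')
                if len(parts) == 2 and parts[0].lower() == 'id':
                    found.append(int(parts[1]))
    return found

if __name__ == '__main__':
    file_list = ['info_a.txt', 'info_b.txt']  # hypothetical paths
    with Pool() as p:
        result = p.map_async(get_ids, file_list)
        per_file = result.get()  # blocks until every worker has finished
    id_list = sorted({i for ids in per_file for i in ids})
    print(id_list)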
1. It looks like you have a typo in your get_ids method (self._id_list instead of self.__id_list). You can see it if you wait for the result:

result = p.map_async(self.get_ids, file_list)
result.get()

2. When a new child process is created, it gets a copy of the parent's address space; however, any subsequent changes (either by parent or child) are not reflected in the memory of the other process. They each have their own private address space. Example:

$ cat fork.py
import os

l = []
l.append('global')

# Return 0 in the child and the child's process id in the parent
pid = os.fork()

if pid == 0:
    l.append('child')
    print(f'Child PID: {os.getpid()}, {l}')
else:
    l.append('parent')
    print(f'Parent PID: {os.getpid()}, {l}')

print(l)

$ python3 fork.py
Parent PID: 9933, ['global', 'parent']
['global', 'parent']
Child PID: 9934, ['global', 'child']
['global', 'child']

Now back to your problem: you can use multiprocessing.Manager.list to create an object that is shared between processes:

from multiprocessing import Manager, Pool

m = Manager()
self.__id_list = m.list()

Docs: Sharing state between processes

Or use threads, as your workload seems to be I/O bound anyway:

from multiprocessing.dummy import Pool as ThreadPool

p = ThreadPool(processes=multiprocessing.cpu_count() - 1)

Alternatively, check concurrent.futures.
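For reference, a short sketch of the concurrent.futures route mentioned above; extract_ids is an illustrative placeholder for the real per-file parsing, and the file names are hypothetical.

from concurrent.futures import ThreadPoolExecutor

def extract_ids(file_name):
    # placeholder: return whatever ids this file contains
    return [1, 2, 3]

if __name__ == '__main__':
    file_list = ['info_a.txt', 'info_b.txt']  # hypothetical paths
    with ThreadPoolExecutor(max_workers=4) as executor:
        per_file = executor.map(extract_ids, file_list)
        id_list = sorted({i for ids in per_file for i in ids})
    print(id_list)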
python multiprocessing to create an excel file with multiple sheets [duplicate]
I am new to Python and I am trying to save the results of five different processes to one excel file (each process writes to a different sheet). I have read different posts here, but still can't get it done, as I'm very confused about pool.map, queues, and locks, and I'm not sure what is required here to fulfill this task. This is my code so far:

list_of_days = ["2017.03.20", "2017.03.21", "2017.03.22", "2017.03.23", "2017.03.24"]
results = pd.DataFrame()

if __name__ == '__main__':
    global list_of_days
    writer = pd.ExcelWriter('myfile.xlsx', engine='xlsxwriter')
    nr_of_cores = multiprocessing.cpu_count()
    l = multiprocessing.Lock()
    pool = multiprocessing.Pool(processes=nr_of_cores, initializer=init, initargs=(l,))
    pool.map(f, range(len(list_of_days)))
    pool.close()
    pool.join()

def init(l):
    global lock
    lock = l

def f(k):
    global results

    # *** DO SOME STUFF HERE***
    results = results[ *** finished pandas dataframe *** ]

    lock.acquire()
    results.to_excel(writer, sheet_name=list_of_days[k])
    writer.save()
    lock.release()

The result is that only one sheet gets created in Excel (I assume it is from the process finishing last). Some questions about this code:

How do I avoid defining global variables?
Is it even possible to pass around dataframes?
Should I move the locking to main instead?

I'd really appreciate some input here, as I consider mastering multiprocessing instrumental. Thanks
1) Why did you implement time.sleep in several places in your 2nd method?

In __main__, time.sleep(0.1) gives the started process a timeslice to start up.
In f2(fq, q), time.sleep(0.1) gives the queue a timeslice to flush all buffered data to the pipe, since q.get_nowait() is used.
In w(q), it was only for testing, simulating a long run of writer.to_excel(...); I removed that one.

2) What is the difference between pool.map and pool = [mp.Process( . )]?

Using pool.map needs no Queue and no parameter passing, and is shorter code. The worker process has to return the result immediately and terminates. pool.map starts new processes as long as not all iterations are done. The results have to be processed after that.
Using pool = [mp.Process( . )] starts n processes. A process terminates on queue.Empty.

Can you think of a situation where you would prefer one method over the other?

Method 1: Quick setup, serialized, only interested in the result to continue.
Method 2: If you want to do all the workload in parallel.

You couldn't use a global writer in the processes. The writer instance has to belong to one process.

Usage of mp.Pool, for instance:

def f1(k):
    # *** DO SOME STUFF HERE***
    results = pd.DataFrame(df_)
    return results

if __name__ == '__main__':
    pool = mp.Pool()
    results = pool.map(f1, range(len(list_of_days)))

    writer = pd.ExcelWriter('../test/myfile.xlsx', engine='xlsxwriter')
    for k, result in enumerate(results):
        result.to_excel(writer, sheet_name=list_of_days[k])
    writer.save()
    pool.close()

This leads to .to_excel(...) being called in sequence in the __main__ process. If you want parallel .to_excel(...), you have to use mp.Queue(). For instance:

The worker process:

# mp.Queue exceptions have to be imported from the queue module
try:
    # Python 3
    import queue
except ImportError:
    # Python 2
    import Queue as queue

def f2(fq, q):
    while True:
        try:
            k = fq.get_nowait()
        except queue.Empty:
            exit(0)

        # *** DO SOME STUFF HERE***
        results = pd.DataFrame(df_)
        q.put( (list_of_days[k], results) )
        time.sleep(0.1)

The writer process:

def w(q):
    writer = pd.ExcelWriter('myfile.xlsx', engine='xlsxwriter')
    while True:
        try:
            titel, result = q.get()
        except ValueError:
            writer.save()
            exit(0)

        result.to_excel(writer, sheet_name=titel)

The __main__ process:

if __name__ == '__main__':
    w_q = mp.Queue()
    w_p = mp.Process(target=w, args=(w_q,))
    w_p.start()
    time.sleep(0.1)

    f_q = mp.Queue()
    for i in range(len(list_of_days)):
        f_q.put(i)

    pool = [mp.Process(target=f2, args=(f_q, w_q,)) for p in range(os.cpu_count())]
    for p in pool:
        p.start()
    time.sleep(0.1)

    for p in pool:
        p.join()

    w_q.put('STOP')
    w_p.join()

Tested with Python 3.4.2 - pandas 0.19.2 - xlsxwriter 0.9.6
Python - For loop finishing before it is supposed to
I am currently executing tasks via a thread pool based on a for loop length, and it is ending its execution when it is not supposed to (before the end of the loop). Any ideas why? Here is the relevant code:

from classes.scraper import size
from multiprocessing import Pool
import threading

if __name__ == '__main__':
    print("Do something")
    size = size()
    pool = Pool(processes=50)

    with open('size.txt','r') as file:
        asf = file.read()

    for x in range(0,1000000):
        if '{num:06d}'.format(num=x) in asf:
            continue
        else:
            res = pool.apply_async(size.scrape, ('{num:06d}'.format(num=x),))

Here is the console output (I am printing out the values inside size.scrape()):

...
...
...
013439
013440
013441
013442
013443

Process finished with exit code 0
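No answer is included in this excerpt, but one common reason for apply_async appearing to stop early is that the submitted tasks are never waited on, so the script (and the pool's worker processes) exits while work is still pending. A minimal sketch of the usual close/join pattern, reusing the names from the question, is shown below; size, size.scrape, and size.txt are taken from the question, the rest is standard Pool usage.

# sketch only: assumes classes.scraper and size.txt exist as in the question
from classes.scraper import size
from multiprocessing import Pool

if __name__ == '__main__':
    size = size()
    pool = Pool(processes=50)

    with open('size.txt', 'r') as file:
        asf = file.read()

    for x in range(0, 1000000):
        num = '{num:06d}'.format(num=x)
        if num in asf:
            continue
        pool.apply_async(size.scrape, (num,))

    pool.close()  # stop accepting new tasks
    pool.join()   # wait for every scheduled task to finish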
What is wrong with my threaded programming?
There is an efficiency issue with my threaded programming, where I try to simulate a producer-consumer schema. Phase 1's output is phase 2's input. The problem is that when I run phase 1 and phase 2 independently, I get efficient results in terms of execution time, but when I run them in a threaded fashion it takes about 1000 times longer to execute. This is my main module for the threaded programming:

# Phase 1 is the producer
phase1_thread = Phase1Thread()
phase1_thread.daemon = True
phase1_thread.start()

# Phase 2 is the consumer
phase2_thread = Phase2Thread()
phase2_thread.daemon = True
phase2_thread.start()

import time
while True:
    time.sleep(1)

This is phase 1 (the producer):

global queue
self.Phase1 = phase1()
input = ....

for i, batch in input.groupby(np.arange(len(input)) // 1000):
    # print(i)
    tuple_of = self.Phase1.extract(batch, i)
    output = tuple_of[0]
    queue.put(output)
    time.sleep(random.random())

This is phase 2 (the consumer):

self.Phase2 = phase2.EntityResolver()
global queue
global stream_count
global thread_processed

counter = 0
while True:
    tuple_of = queue.get()
    input = tuple_of[0]
    queue.task_done()
    self.Phase2.run(input)
    counter = counter + 1
    time.sleep(random.random())
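For orientation only (this excerpt contains no answer for this question), here is a bare-bones version of the producer-consumer schema being described, with the question's Phase1/Phase2 work replaced by placeholders and without the per-item time.sleep(random.random()) calls:

# generic producer-consumer sketch; the "work" stands in for Phase1.extract / Phase2.run
import queue
import threading

work_queue = queue.Queue()

def producer(n_batches):
    for i in range(n_batches):
        batch = list(range(i * 1000, (i + 1) * 1000))  # placeholder batch
        work_queue.put(batch)
    work_queue.put(None)  # sentinel: nothing more to produce

def consumer():
    while True:
        batch = work_queue.get()
        if batch is None:
            break
        total = sum(batch)  # placeholder for the real consumer work

p = threading.Thread(target=producer, args=(10,))
c = threading.Thread(target=consumer)
p.start()
c.start()
p.join()
c.join()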
Multiprocessing in Python not calling the worker functions
I'm fairly new to multiprocessing and I have written the script below, but the methods are not getting called. I don't understand what I'm missing. What I want to do is the following: call two different methods asynchronously, and call one method before the other.

# import all necessary modules
import Queue
import logging
import multiprocessing
import time, sys
import signal

debug = True

def init_worker():
    signal.signal(signal.SIGINT, signal.SIG_IGN)

research_name_id = {}
ids = [55, 125, 428, 429, 430, 895, 572, 126, 833, 502, 404]

# declare all the static variables
num_threads = 2  # number of parallel threads
minDelay = 3     # minimum delay
maxDelay = 7     # maximum delay

# declare an empty queue which will hold the publication ids
queue = Queue.Queue(0)

proxies = []
#print (proxies)

def split(a, n):
    """Function to split data evenly among threads"""
    k, m = len(a) / n, len(a) % n
    return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)]
            for i in xrange(n))

def run_worker(i, data, queue, research_name_id, proxies, debug, minDelay, maxDelay):
    """
    Function to pull out all publication links from nist
    data - research ids pulled using a different script
    queue - add the publication urls to the list
    research_name_id - dictionary with research id as key and name as value
    proxies - scraped proxies
    """
    print 'getLinks', i
    for d in data:
        print d
        queue.put(d)

def fun_worker(i, queue, proxies, debug, minDelay, maxDelay):
    print 'publicationData', i
    try:
        print queue.pop()
    except:
        pass

def main():
    print "Initializing workers"
    pool = multiprocessing.Pool(num_threads, init_worker)
    distributed_ids = list(split(list(ids), num_threads))
    for i in range(num_threads):
        data_thread = distributed_ids[i]
        print data_thread
        pool.apply_async(run_worker,
                         args=(i + 1, data_thread, queue, research_name_id,
                               proxies, debug, minDelay, maxDelay, ))
        pool.apply_async(fun_worker,
                         args=(i + 1, queue, proxies, debug, minDelay, maxDelay, ))

    try:
        print "Waiting 10 seconds"
        time.sleep(10)
    except KeyboardInterrupt:
        print "Caught KeyboardInterrupt, terminating workers"
        pool.terminate()
        pool.join()
    else:
        print "Quitting normally"
        pool.close()
        pool.join()

if __name__ == "__main__":
    main()

The only output that I get is:

Initializing workers
[55, 125, 428, 429, 430, 895]
[572, 126, 833, 502, 404]
Waiting 10 seconds
Quitting normally
There are a couple of issues:

You're not using multiprocessing.Queue. If you want to share a queue with a subprocess via apply_async etc., you need to use a manager (see example).

However, you should take a step back and ask yourself what you are trying to do. Is apply_async really the way to go? You have a list of items that you want to map over repeatedly, applying some long-running transformations that are compute intensive (because if they're just blocking on I/O, you might as well use threads). It seems to me that imap_unordered is actually what you want:

pool = multiprocessing.Pool(num_threads, init_worker)
links = pool.imap_unordered(run_worker1, ids)
output = pool.imap_unordered(fun_worker1, links)

run_worker1 and fun_worker1 need to be modified to take a single argument. If you need to share other data, then you should pass it in the initializer instead of passing it to the subprocesses over and over again.
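A self-contained sketch of that two-stage imap_unordered pipeline follows. It is written for Python 3 (the question's code is Python 2), and the worker bodies are placeholders rather than the asker's scraping logic; collecting the first stage with list() keeps the sketch simple, whereas the answer's snippet chains the two iterators directly.

# sketch of the imap_unordered pipeline with single-argument workers
import multiprocessing

ids = [55, 125, 428, 429, 430, 895, 572, 126, 833, 502, 404]

def run_worker1(research_id):
    # placeholder for "turn one research id into a publication link"
    return 'https://example.org/publication/{}'.format(research_id)  # hypothetical URL

def fun_worker1(link):
    # placeholder for "process one publication link"
    return len(link)

if __name__ == '__main__':
    with multiprocessing.Pool(2) as pool:
        links = list(pool.imap_unordered(run_worker1, ids))
        output = list(pool.imap_unordered(fun_worker1, links))
    print(output)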