How to get output from _thread? - Python

I created a function in Python for polling some devices. Because I need fast response times, I had the idea of using threads. The Python code I wrote works and the peripherals respond very quickly (verified with Wireshark), but now I need each thread to return the output of the function it runs, so that I can collect them all in an output vector. How can I save the output of each thread I launch with this "_thread" library?
Below is the code I used:
import _thread
import time
import atenapy
try:
    tic = time.process_time()
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.172',9761,'5A0000005A'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.170',9761,'2600000026'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.172',9761,'5100000051'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.170',9761,'2700000027'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.172',9761,'5000000050'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.170',9761,'6000000060'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.172',9761,'5200000052'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.170',9761,'2D0000002D'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.172',9761,'5700000057'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.170',9761,'5F0000005F'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.172',9761,'5300000053'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.170',9761,'2200000022'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.172',9761,'5600000056'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.170',9761,'2300000023'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.172',9761,'5500000055'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.170',9761,'2B0000002B'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.172',9761,'5400000054'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.170',9761,'2C0000002C'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.164',9761,'0C0000000C'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.170',9761,'2800000028'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.164',9761,'0D0000000D'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.170',9761,'2900000029'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.164',9761,'0E0000000E'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.170',9761,'2A0000002A'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.164',9761,'0F0000000F'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.164',9761,'1400000014'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.164',9761,'1800000018'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.164',9761,'1900000019'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.164',9761,'1A0000001A'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.164',9761,'1B0000001B'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.164',9761,'1C0000001C'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.164',9761,'1D0000001D'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.164',9761,'1E0000001E'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.164',9761,'1F0000001F'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.164',9761,'2000000020'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.164',9761,'2100000021'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.162',9761,'0200000002'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.162',9761,'0300000003'))
    _thread.start_new_thread(atenapy.connect_PE,('192.168.2.162',9761,'0800000008'))
    toc = time.process_time()
    print("all PE time pooling = "+str(toc - tic))
except:
    print("Error: unable to start thread")

Wrap your function in a worker function that collects the result and appends it to a list. The lock is optional when appending to a list (Ref: What kinds of global value mutation are thread safe).
import threading

lock = threading.Lock()
results = []

def func(a, b):
    with lock:
        results.append(a + b)

threads = [threading.Thread(target=func, args=(a, b))
           for a in range(3) for b in range(3)]

for thread in threads:
    thread.start()
for thread in threads:
    thread.join()

print(results)
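Applied to the question's code, the same pattern might look like the sketch below. It assumes atenapy.connect_PE returns the value you want to collect, and the device list is abbreviated to three entries; extend it with the remaining (IP, code) pairs:
import threading
import time
import atenapy

lock = threading.Lock()
results = []

def worker(ip, port, code):
    # run the original function and keep whatever it returns
    value = atenapy.connect_PE(ip, port, code)
    with lock:
        results.append((ip, code, value))

# abbreviated device list from the question
devices = [
    ('192.168.2.172', '5A0000005A'),
    ('192.168.2.170', '2600000026'),
    ('192.168.2.162', '0800000008'),
]

tic = time.perf_counter()  # wall-clock time, since the work is network I/O

threads = [threading.Thread(target=worker, args=(ip, 9761, code)) for ip, code in devices]
for t in threads:
    t.start()
for t in threads:
    t.join()

toc = time.perf_counter()
print(results)
print("all PE time pooling = " + str(toc - tic))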

Related

Python Multi-Processing List Issue

I am attempting to dynamically open and parse several text files (~10) to extract a particular value by key, and I am using multiprocessing in Python to do this. My issue is that the function I am calling appends data to a class-level list, which I can see inside the method; however, outside the method that list is empty. Refer to the following:
class:
class MyClass(object):

    __id_list = []

    def __init__(self):
        self.process_wrapper()
Caller Method:
def process_wrapper(self):
    from multiprocessing import Pool
    import multiprocessing

    info_file = 'info*'
    file_list = []

    p = Pool(processes = multiprocessing.cpu_count() - 1)
    for file_name in Path('c:/').glob('**/*/' + info_file):
        file_list.append(str(os.path.join('c:/', file_name)))

    p.map_async(self.get_ids, file_list)
    p.close()
    p.join()

    print(self.__id_list)  # this is showing as empty
Worker method:
def get_ids(self, file_name):
    try:
        with open(file_name) as data:
            for line in data:
                temp_split = line.split()
                for item in temp_split:
                    value_split = str(item).split('=')
                    if 'id' == value_split[0].lower():
                        if int(value_split[1]) not in self._id_list:
                            self.__id_list.append(int(value_split[1]))
    except:
        raise FileReadError(f'There was an issue parsing "{file_name}".')

    print(self.__id_list)  # here the list prints fine
The map_async call returns an AsyncResult object. You should use it to wait for the processing to finish before checking self.__id_list. You might also consider having each worker return a local list, then collecting those lists and aggregating them into the final list.
1. It looks like you have a typo in your get_ids method (self._id_list instead of self.__id_list). You can see it if you wait for the result, because get() re-raises any exception that was raised in the worker:
result = p.map_async(self.get_ids, file_list)
result.get()
2. When a new child process is created, it gets a copy of the parent's address space; however, any subsequent changes (by either the parent or the child) are not reflected in the memory of the other process. Each has its own private address space.
Example:
$ cat fork.py
import os

l = []
l.append('global')

# Return 0 in the child and the child's process id in the parent
pid = os.fork()

if pid == 0:
    l.append('child')
    print(f'Child PID: {os.getpid()}, {l}')
else:
    l.append('parent')
    print(f'Parent PID: {os.getpid()}, {l}')

print(l)

$ python3 fork.py
Parent PID: 9933, ['global', 'parent']
['global', 'parent']
Child PID: 9934, ['global', 'child']
['global', 'child']
Now back to your problem, you can use multiprocessing.Manager.list to create an object that is shared between processes:
from multiprocessing import Manager, Pool
m = Manager()
self.__id_list = m.list()
Docs: Sharing state between processes
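For illustration, here is a minimal self-contained sketch of sharing a Manager().list() with Pool workers; the collect_ids helper and the file names are made up for the example, not taken from the question:
from multiprocessing import Manager, Pool

def collect_ids(args):
    shared_list, file_name = args
    # stand-in for the real parsing: record something derived from the file name
    shared_list.append(len(file_name))

if __name__ == '__main__':
    m = Manager()
    shared = m.list()  # proxy list visible to all worker processes
    files = ['info_a.txt', 'info_b.txt', 'info_c.txt']

    with Pool(processes=2) as p:
        result = p.map_async(collect_ids, [(shared, f) for f in files])
        result.get()  # wait for the workers and surface any exceptions

    print(list(shared))  # copy the proxy into a plain list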
or use threads as your workload seems to be I/O bound anyway:
from multiprocessing.dummy import Pool as ThreadPool
p = ThreadPool(processes = multiprocessing.cpu_count() - 1)
Alternatively, check concurrent.futures.
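A rough sketch of that route, where each worker returns a local list and the parent aggregates the lists into the final result (parse_ids and the file names are illustrative, not from the question):
from concurrent.futures import ThreadPoolExecutor

def parse_ids(file_name):
    # stand-in per-file parser: return the ids found in one file
    ids = []
    with open(file_name) as data:
        for line in data:
            for item in line.split():
                key, _, value = item.partition('=')
                if key.lower() == 'id' and value.isdigit():
                    ids.append(int(value))
    return ids

if __name__ == '__main__':
    files = ['info_a.txt', 'info_b.txt']  # placeholder file names
    all_ids = set()
    with ThreadPoolExecutor() as executor:
        for local_ids in executor.map(parse_ids, files):
            all_ids.update(local_ids)  # aggregate the per-file lists
    print(sorted(all_ids))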

python multiprocessing to create an excel file with multiple sheets [duplicate]

I am new to Python and I am trying to save the results of five different processes to one Excel file (each process writes to a different sheet). I have read different posts here, but still can't get it done, as I'm very confused about pool.map, queues, and locks, and I'm not sure what is required to fulfill this task.
This is my code so far:
list_of_days = ["2017.03.20", "2017.03.21", "2017.03.22", "2017.03.23", "2017.03.24"]
results = pd.DataFrame()

if __name__ == '__main__':
    global list_of_days

    writer = pd.ExcelWriter('myfile.xlsx', engine='xlsxwriter')
    nr_of_cores = multiprocessing.cpu_count()
    l = multiprocessing.Lock()

    pool = multiprocessing.Pool(processes=nr_of_cores, initializer=init, initargs=(l,))
    pool.map(f, range(len(list_of_days)))
    pool.close()
    pool.join()

def init(l):
    global lock
    lock = l

def f(k):
    global results
    *** DO SOME STUFF HERE***
    results = results[ *** finished pandas dataframe *** ]

    lock.acquire()
    results.to_excel(writer, sheet_name=list_of_days[k])
    writer.save()
    lock.release()
The result is that only one sheet gets created in Excel (I assume it is from the process that finishes last). Some questions about this code:
How do I avoid defining global variables?
Is it even possible to pass DataFrames around?
Should I move the locking to main instead?
I'd really appreciate some input here, as I consider mastering multiprocessing instrumental. Thanks.
1) Why did you implement time.sleep in several places in your 2nd method?
In __main__: time.sleep(0.1) gives the started process a timeslice to start up.
In f2(fq, q): it gives the queue a timeslice to flush all buffered data to the pipe, since q.get_nowait() is used.
In w(q): it was only there for testing, to simulate a long run of writer.to_excel(...); I removed that one.
2) What is the difference between pool.map and pool = [mp.Process( . )]?
Using pool.map needs no Queue and no parameter passing, so the code is shorter. The worker process has to return its result immediately and terminate. pool.map keeps starting new processes until all iterations are done, and the results have to be processed afterwards.
Using pool = [mp.Process( . )] starts n processes. A process terminates on queue.Empty.
Can you think of a situation where you would prefer one method over the other?
Method 1: quick setup, serialized, when you are only interested in the result and want to continue.
Method 2: if you want to do all of the workload in parallel.
You can't use a global writer in the processes. The writer instance has to belong to one process.
Usage of mp.Pool, for instance:
def f1(k):
    # *** DO SOME STUFF HERE***
    results = pd.DataFrame(df_)
    return results

if __name__ == '__main__':
    pool = mp.Pool()
    results = pool.map(f1, range(len(list_of_days)))

    writer = pd.ExcelWriter('../test/myfile.xlsx', engine='xlsxwriter')
    for k, result in enumerate(results):
        result.to_excel(writer, sheet_name=list_of_days[k])

    writer.save()
    pool.close()
This way, .to_excel(...) is called sequentially in the __main__ process. If you want .to_excel(...) to run in parallel, you have to use mp.Queue().
For instance:
The worker process:
# mp.Queue exceptions have to be imported from here
try:
    # Python 3
    import queue
except ImportError:
    # Python 2
    import Queue as queue

def f2(fq, q):
    while True:
        try:
            k = fq.get_nowait()
        except queue.Empty:
            exit(0)

        # *** DO SOME STUFF HERE***
        results = pd.DataFrame(df_)

        q.put( (list_of_days[k], results) )
        time.sleep(0.1)
The writer process:
def w(q):
    writer = pd.ExcelWriter('myfile.xlsx', engine='xlsxwriter')

    while True:
        try:
            titel, result = q.get()
        except ValueError:
            # unpacking the 'STOP' string raises ValueError, which ends the writer
            writer.save()
            exit(0)

        result.to_excel(writer, sheet_name=titel)
The __main__ process:
if __name__ == '__main__':
    w_q = mp.Queue()
    w_p = mp.Process(target=w, args=(w_q,))
    w_p.start()
    time.sleep(0.1)

    f_q = mp.Queue()
    for i in range(len(list_of_days)):
        f_q.put(i)

    pool = [mp.Process(target=f2, args=(f_q, w_q,)) for p in range(os.cpu_count())]
    for p in pool:
        p.start()
    time.sleep(0.1)

    for p in pool:
        p.join()

    w_q.put('STOP')
    w_p.join()
Tested with Python:3.4.2 - pandas:0.19.2 - xlsxwriter:0.9.6

Python - For loop finishing before it is supposed to

I am currently executing tasks via a thread pool based on the length of a for loop, and it ends its execution when it is not supposed to (before the end of the loop). Any ideas why? Here is the relevant code:
from classes.scraper import size
from multiprocessing import Pool
import threading

if __name__ == '__main__':
    print("Do something")

    size = size()
    pool = Pool(processes=50)

    with open('size.txt','r') as file:
        asf = file.read()

    for x in range(0,1000000):
        if '{num:06d}'.format(num=x) in asf:
            continue
        else:
            res = pool.apply_async(size.scrape, ('{num:06d}'.format(num=x),))
Here is the console output (I am printing out the values inside size.scrape()):
...
...
...
013439
013440
013441
013442
013443
Process finished with exit code 0
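One likely explanation, offered here as an assumption rather than a confirmed diagnosis: apply_async only schedules the work and returns immediately, so once the loop has submitted everything, the main process reaches the end of the script and exits, taking the pool (and any unfinished tasks) with it. Closing and joining the pool makes the script wait, as in this sketch of the same loop (everything else as in the question):
    for x in range(0, 1000000):
        ident = '{num:06d}'.format(num=x)
        if ident in asf:
            continue
        res = pool.apply_async(size.scrape, (ident,))

    pool.close()  # no more tasks will be submitted
    pool.join()   # block until every outstanding task has finished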

What is wrong with my threaded programming?

There is an efficiency issue with my thread programming, where I try to simulate a producer-consumer schema.
Phase 1's output is phase 2's input.
The problem is that when I run phase 1 and phase 2 independently, I get efficient results in terms of execution time, but when I run them in a threaded fashion it takes about 1000 times longer to execute.
This is my main module for threaded programming:
# Phase 1 is the producer
phase1_thread = Phase1Thread()
phase1_thread.daemon = True
phase1_thread.start()

# Phase 2 is the consumer
phase2_thread = Phase2Thread()
phase2_thread.daemon = True
phase2_thread.start()

import time
while True:
    time.sleep(1)
This is phase 1 (producer):
global queue

self.Phase1 = phase1()
input = ....

for i, batch in input.groupby(np.arange(len(input)) // 1000):
    # print(i)
    tuple_of = self.Phase1.extract(batch, i)
    output = tuple_of[0]
    queue.put(output)
    time.sleep(random.random())
This is phase 2 (consumer):
self.Phase2 = phase2.EntityResolver()

global queue
global stream_count
global thread_processed

counter = 0
while True:
    tuple_of = queue.get()
    input = tuple_of[0]
    queue.task_done()

    self.Phase2.run(input)
    counter = counter + 1
    time.sleep(random.random())
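For reference, here is a minimal producer-consumer sketch with queue.Queue and no per-item sleeps; the names and the range(10) workload are illustrative, not the original Phase1/Phase2 classes. queue.get() already blocks until data is available, so neither side needs time.sleep to stay in step:
import queue
import threading

work_queue = queue.Queue()
SENTINEL = object()  # marks the end of production

def producer():
    for item in range(10):
        work_queue.put(item)      # phase 1 output becomes phase 2 input
    work_queue.put(SENTINEL)

def consumer():
    while True:
        item = work_queue.get()   # blocks until an item arrives
        if item is SENTINEL:
            break
        # ... do the phase 2 work on item here ...

p = threading.Thread(target=producer)
c = threading.Thread(target=consumer)
p.start(); c.start()
p.join(); c.join()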

Multiprocessing in Python not calling the worker functions

I'm fairly new to multiprocessing and I have written the script below, but the methods are not getting called. I don't understand what I'm missing.
What I want to do is the following:
call two different methods asynchronously.
call one method before the other.
# import all necessary modules
import Queue
import logging
import multiprocessing
import time, sys
import signal

debug = True

def init_worker():
    signal.signal(signal.SIGINT, signal.SIG_IGN)

research_name_id = {}
ids = [55, 125, 428, 429, 430, 895, 572, 126, 833, 502, 404]

# declare all the static variables
num_threads = 2  # number of parallel threads
minDelay = 3     # minimum delay
maxDelay = 7     # maximum delay

# declare an empty queue which will hold the publication ids
queue = Queue.Queue(0)

proxies = []
#print (proxies)

def split(a, n):
    """Function to split data evenly among threads"""
    k, m = len(a) / n, len(a) % n
    return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)]
            for i in xrange(n))

def run_worker(i, data, queue, research_name_id, proxies, debug, minDelay, maxDelay):
    """ Function to pull out all publication links from nist
        data - research ids pulled using a different script
        queue - add the publication urls to the list
        research_name_id - dictionary with research id as key and name as value
        proxies - scraped proxies
    """
    print 'getLinks', i
    for d in data:
        print d
        queue.put(d)

def fun_worker(i, queue, proxies, debug, minDelay, maxDelay):
    print 'publicationData', i
    try:
        print queue.pop()
    except:
        pass

def main():
    print "Initializing workers"
    pool = multiprocessing.Pool(num_threads, init_worker)
    distributed_ids = list(split(list(ids), num_threads))

    for i in range(num_threads):
        data_thread = distributed_ids[i]
        print data_thread
        pool.apply_async(run_worker, args=(i + 1, data_thread, queue, research_name_id,
                                           proxies, debug, minDelay, maxDelay))
        pool.apply_async(fun_worker, args=(i + 1, queue, proxies, debug, minDelay, maxDelay))

    try:
        print "Waiting 10 seconds"
        time.sleep(10)
    except KeyboardInterrupt:
        print "Caught KeyboardInterrupt, terminating workers"
        pool.terminate()
        pool.join()
    else:
        print "Quitting normally"
        pool.close()
        pool.join()

if __name__ == "__main__":
    main()
The only output that I get is
Initializing workers
[55, 125, 428, 429, 430, 895]
[572, 126, 833, 502, 404]
Waiting 10 seconds
Quitting normally
There are a couple of issues:
You're not using multiprocessing.Queue
If you want to share a queue with a subprocess via apply_async etc, you need to use a manager (see example).
However, you should take a step back and ask yourself what you are trying to do. Is apply_async really the way to go? You have a list of items that you want to map over repeatedly, applying some long-running transformations that are compute intensive (because if they were just blocking on I/O, you might as well use threads). It seems to me that imap_unordered is actually what you want:
pool = multiprocessing.Pool(num_threads, init_worker)
links = pool.imap_unordered(run_worker1, ids)
output = pool.imap_unordered(fun_worker1, links)
run_worker1 and fun_worker1 need to be modified to take a single argument. If you need to share other data, then you should pass it in the initializer instead of passing it to the subprocesses over and over again.
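A rough sketch of that shape, with placeholder worker bodies (init_shared, run_worker1, and fun_worker1 here only illustrate the single-argument and initializer ideas, not the scraping logic from the question):
import multiprocessing

shared = {}  # per-worker storage filled once by the initializer

def init_shared(research_name_id, proxies):
    shared['research_name_id'] = research_name_id
    shared['proxies'] = proxies

def run_worker1(research_id):
    # placeholder: map one research id to one "link" using the shared data
    return shared['research_name_id'].get(research_id, 'unknown')

def fun_worker1(link):
    # placeholder: turn one link into one result
    return link.upper()

if __name__ == '__main__':
    ids = [55, 125, 428, 429, 430, 895]
    pool = multiprocessing.Pool(2, init_shared, ({55: 'alpha', 125: 'beta'}, []))

    links = pool.imap_unordered(run_worker1, ids)
    results = pool.imap_unordered(fun_worker1, links)

    for r in results:
        print(r)

    pool.close()
    pool.join()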
