Python multiprocessing not working as intended with fuzzywuzzy - python

Either my processes run one after another (each one starting only after the previous one has finished), or they start simultaneously but never call the target function. I have tried many variants, but somehow it does not behave the way the tutorials describe.
My goal is to fuzzy-match (with fuzzywuzzy) an 80k-item list of text sentences, dropping unnecessary 90%+ matches while keeping the string with the most information (scorer=fuzz.token_set_ratio).
Thank you!
IDE is Anaconda Spyder 4.0, IPython 7.10.1, Python 3.7.5
# -*- coding: utf-8 -*-
import pandas as pd
import multiprocessing
import time
from datetime import datetime
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
# Cleaned, de-duplicated descriptions; assigned by fuzzyPrepare().
preparedDF = []
# Input chunks, one per worker process; assigned by fuzzySplit().
df1 = []
df2 = []
df3 = []
df4 = []
df5 = []
df6 = []
df7 = []
df8 = []
# De-duplicated output for each chunk; assigned by fuzzyMatch().
xdf1 = []
xdf2 = []
xdf3 = []
xdf4 = []
xdf5 = []
xdf6 = []
xdf7 = []
xdf8 = []
#########
def fuzzyPrepare():
    """Load ``newEN.csv`` and store its distinct descriptions in ``preparedDF``.

    Missing descriptions are replaced by a placeholder, duplicates are
    dropped while keeping first-seen order, and the placeholder itself is
    then filtered out again.
    """
    global preparedDF
    placeholder = "#####"
    frame = pd.read_csv("newEN.csv")
    descriptions = frame["description"].fillna(placeholder).tolist()
    # list.remove() works in place and returns None, so its result must not
    # be assigned back; filtering builds the cleaned list directly.
    preparedDF = [
        text for text in dict.fromkeys(descriptions) if text != placeholder
    ]
def fuzzySplit(df=None):
    """Split *df* into the eight 100-item chunks ``df1`` .. ``df8``.

    Args:
        df: sequence to split. When omitted, the current value of the
            module-level ``preparedDF`` is used.

    Only the first 800 items are distributed; chunks past the end of a
    shorter input come out empty.
    """
    global df1, df2, df3, df4, df5, df6, df7, df8
    # A default of ``preparedDF`` would be evaluated once, at definition
    # time, and keep pointing at the initial empty list.
    if df is None:
        df = preparedDF
    df1, df2, df3, df4, df5, df6, df7, df8 = (
        df[start:start + 100] for start in range(0, 800, 100)
    )
def fuzzyMatch(x):
    """De-duplicate chunk number *x* and store the result in ``xdf<x>``.

    Returns the string "error in fuzzyCases!" when *x* is not one of 1..8;
    otherwise returns None after assigning the module-level result list.
    """
    slot = next((number for number in range(1, 9) if x == number), None)
    if slot is None:
        return "error in fuzzyCases!"
    module_vars = globals()
    chunk = module_vars["df{}".format(slot)]
    # process.dedupe returns a dict_keys object, so it is turned into a list.
    module_vars["xdf{}".format(slot)] = list(
        process.dedupe(chunk, threshold=90, scorer=fuzz.token_set_ratio)
    )
# The guard is required: with the "spawn" start method every child
# re-imports this module, and unguarded top-level code would run again in
# each child.
if __name__ == '__main__':
    fuzzyPrepare()
    fuzzySplit(preparedDF)
    # target must be the function object itself. Writing
    # target=fuzzyMatch(1) calls fuzzyMatch in the parent and hands its
    # return value to Process instead of the function.
    jobs = [
        multiprocessing.Process(name="p{}".format(number), target=fuzzyMatch, args=(number,))
        for number in range(1, 9)
    ]
    for j in jobs:
        print("process "+ j.name +" started at "+ datetime.now().strftime('%H:%M:%S'))
        j.start()
        time.sleep(0.3)
    for j in jobs:
        j.join()
    # NOTE: each child assigns its own copy of xdf1..xdf8; the lists in this
    # parent process are not updated by the workers.
    print ("processing complete at "+datetime.now().strftime('%H:%M:%S'))

Ok, you are dealing with a non-trivial problem here. I have taken the liberty
of DRYing (Don't Repeat Yourself)
your code a bit. I also don't have your data or pandas installed, so I have simplified
the inputs and outputs. The principles, however, are all the same, and with a few changes
you should be able to make your code work!
Attempt #1
I have an array of 800 int elements and each process is going to calculate the
sum of 100 of them. Look for # DRY: comments
# -*- coding: utf-8 -*-
import multiprocessing
import time
from datetime import datetime
# Shared configuration and state
# Number of worker processes, and therefore of 100-item chunks.
number_of_proc = 8
preparedDF = []
# DRY: This is now a list of lists. This allows us to refer to the old df1 as dfs[0]
dfs = []
# DRY: A dict of results. The key will be int (the process number!)
xdf = {}
#########
def fuzzyPrepare():
    """Fill the global ``preparedDF`` with fake data: 100 ints per process."""
    global preparedDF
    item_count = number_of_proc * 100
    preparedDF = range(item_count)
def fuzzySplit(df):
    """Append one 100-item slice of *df* per process to the global ``dfs``."""
    global dfs
    chunk_size = 100
    # DRY: one slice per process instead of eight hand-written variables
    for from_element in range(0, number_of_proc * chunk_size, chunk_size):
        to_element = from_element + chunk_size
        print("Packing [{}, {})".format(from_element, to_element))
        dfs.append(df[from_element:to_element])
def fuzzyMatch(x):
    """Sum chunk *x* of ``dfs``, store it in ``xdf[x]`` and print it."""
    global xdf
    # DRY: a dict keyed by process number replaces the if/elif chain
    chunk_total = sum(dfs[x])
    xdf[x] = chunk_total
    print("In process: x={}, xdf[{}]={}".format(x, x, xdf[x]))
if __name__ == '__main__':
    fuzzyPrepare()
    fuzzySplit(preparedDF)
    # DRY: build all N processes in one expression
    jobs = [
        multiprocessing.Process(name="p{}".format(index), target=fuzzyMatch, args=(index,))
        for index in range(number_of_proc)
    ]
    for job in jobs:
        print("process "+ job.name +" started at "+ datetime.now().strftime('%H:%M:%S'))
        job.start()
        time.sleep(0.3)
    for job in jobs:
        job.join()
    print ("processing complete at "+datetime.now().strftime('%H:%M:%S'))
    print("results:")
    # xdf is read here from the parent process's own dictionary.
    for x in range(number_of_proc):
        print("In process: x={}, xdf[{}]={}".format(x, x, xdf[x]))
Output:
Packing [0, 100)
Packing [100, 200)
Packing [200, 300)
Packing [300, 400)
Packing [400, 500)
Packing [500, 600)
Packing [600, 700)
Packing [700, 800)
process p0 started at 19:12:00
In process: x=0, xdf[0]=4950
process p1 started at 19:12:00
In process: x=1, xdf[1]=14950
process p2 started at 19:12:00
In process: x=2, xdf[2]=24950
process p3 started at 19:12:01
In process: x=3, xdf[3]=34950
process p4 started at 19:12:01
In process: x=4, xdf[4]=44950
process p5 started at 19:12:01
In process: x=5, xdf[5]=54950
process p6 started at 19:12:01
In process: x=6, xdf[6]=64950
process p7 started at 19:12:02
In process: x=7, xdf[7]=74950
processing complete at 19:12:02
results:
Traceback (most recent call last):
File "./tmp/proctest.py", line 58, in <module>
print("In process: x={}, xdf[{}]={}".format(x, x, xdf[x]))
KeyError: 0
What happened? I printed the values in the processing function and they were there?!
Well, I am not an expert, but a Python process works much like fork().
The basic principle is that it spawns and initializes a new child process. The
child process has a COPY(!) of the parent's memory. This means that
the parent and child processes do not share any data/memory!!!
So in our case:
We prepare our data
We create N processes
Each process has a COPY of dfs and xdf variables
While for dfs we do not care too much (since they are only used for input), each
process now has its own xdf and not the parent's one! You see why the KeyError?
How to fix this (Attempt #2)
It is now obvious that we need to return data from the child processes to the parent.
There are many ways of doing this, but the simplest (code-wise) is to use a
multiprocessing.Manager to share data between your child processes (look for the # NEW:
tag in the code - note I have only changed 2 lines!):
# -*- coding: utf-8 -*-
import multiprocessing
import time
from datetime import datetime
# NEW: This can manage data between processes
from multiprocessing import Manager
# Shared configuration and state
# Number of worker processes, and therefore of 100-item chunks.
number_of_proc = 8
preparedDF = []
dfs = []
# NEW: we create a manager object to store the results
# NOTE(review): Manager() starts a helper process when this module is
# imported. With the "spawn" start method every child re-imports the module,
# so this probably belongs under the __main__ guard - confirm on the target
# platform.
manager = Manager()
xdf = manager.dict()
#########
def fuzzyPrepare():
    """Fill the global ``preparedDF`` with fake data: 100 ints per process."""
    global preparedDF
    item_count = number_of_proc * 100
    preparedDF = range(item_count)
def fuzzySplit(df):
    """Append one 100-item slice of *df* per process to the global ``dfs``."""
    global dfs
    chunk_size = 100
    # DRY: one slice per process instead of eight hand-written variables
    for from_element in range(0, number_of_proc * chunk_size, chunk_size):
        to_element = from_element + chunk_size
        print("Packing [{}, {})".format(from_element, to_element))
        dfs.append(df[from_element:to_element])
def fuzzyMatch(x):
    """Sum chunk *x* of ``dfs``, store it in the shared ``xdf[x]`` and print it."""
    global xdf
    # DRY: since we now have a dict, the if/elif chain is not needed any more
    chunk_total = sum(dfs[x])
    xdf[x] = chunk_total
    print("In process: x={}, xdf[{}]={}".format(x, x, xdf[x]))
if __name__ == '__main__':
    fuzzyPrepare()
    fuzzySplit(preparedDF)
    # DRY: build all N processes in one expression
    jobs = [
        multiprocessing.Process(name="p{}".format(index), target=fuzzyMatch, args=(index,))
        for index in range(number_of_proc)
    ]
    for job in jobs:
        print("process "+ job.name +" started at "+ datetime.now().strftime('%H:%M:%S'))
        job.start()
        time.sleep(0.3)
    for job in jobs:
        job.join()
    print ("processing complete at "+datetime.now().strftime('%H:%M:%S'))
    print("results:")
    # The values are read back in the parent through the manager's dict proxy.
    for x in range(number_of_proc):
        print("Out of process: x={}, xdf[{}]={}".format(x, x, xdf[x]))
And the output:
Packing [0, 100)
Packing [100, 200)
Packing [200, 300)
Packing [300, 400)
Packing [400, 500)
Packing [500, 600)
Packing [600, 700)
Packing [700, 800)
process p0 started at 19:34:50
In process: x=0, xdf[0]=4950
process p1 started at 19:34:50
In process: x=1, xdf[1]=14950
process p2 started at 19:34:50
In process: x=2, xdf[2]=24950
process p3 started at 19:34:51
In process: x=3, xdf[3]=34950
process p4 started at 19:34:51
In process: x=4, xdf[4]=44950
process p5 started at 19:34:51
In process: x=5, xdf[5]=54950
process p6 started at 19:34:52
In process: x=6, xdf[6]=64950
process p7 started at 19:34:52
In process: x=7, xdf[7]=74950
processing complete at 19:34:52
results:
Out of process: x=0, xdf[0]=4950
Out of process: x=1, xdf[1]=14950
Out of process: x=2, xdf[2]=24950
Out of process: x=3, xdf[3]=34950
Out of process: x=4, xdf[4]=44950
Out of process: x=5, xdf[5]=54950
Out of process: x=6, xdf[6]=64950
Out of process: x=7, xdf[7]=74950
Read more about this here and
note the warning about Manager being slower than a multiprocessing.Array (which actually also solves your problem here)

Related

Modify multiprocess function to accept a list of args

How do I update the multi_proc_parallel_functions function below to accept a list of args. This is using the multiprocess module.
Please note I will be using this with AWS Lambda and other multiprocessing modules can have issues on Lambda.
adder functions below are simply toy functions used to demo the issue.
import multiprocess as mp
def parallel_functions(function,send_end):
    """Call *function* without arguments and send its result through *send_end*."""
    outcome = function()
    send_end.send(outcome)
def multi_proc_parallel_functions(function_list,target_func):
    """Run every function in its own process; return the results in input order.

    *target_func* is the process entry point and is called as
    ``target_func(function, send_end)``.
    """
    launched = []
    for job_function in function_list:
        receiver, sender = mp.Pipe(False)
        child = mp.Process(target=target_func, args=(job_function, sender))
        launched.append((child, receiver))
        child.start()
    # Read all results before joining, in the order the jobs were started.
    result_list = [receiver.recv() for _, receiver in launched]
    for child, _ in launched:
        child.join()
    return result_list
def adder10():
    """Return 10 plus a random integer from 0 to 4."""
    jitter = np.random.randint(5)
    return jitter + 10
def adder1000():
    """Return 1000 plus a random integer from 0 to 4."""
    jitter = np.random.randint(5)
    return jitter + 1000
Create a list of functions
function_list = [adder10,adder10,adder10,adder1000]
Run all functions
multi_proc_parallel_functions(function_list,parallel_functions)
[13, 13, 13, 1003]
How do I update the multi_proc_parallel_functions to accept a varied length list of args which will vary per function as follows:
def adder10(x,y):
    """Return a random integer from 0 to 4, plus 10, plus the product x * y."""
    jitter = np.random.randint(5)
    product = x * y
    return jitter + 10 + product
def adder1000(a,b, c):
    """Return a random integer from 0 to 4, plus 1000, minus a, plus b and c."""
    total = np.random.randint(5) + 1000
    total = total - a
    total = total + b
    return total + c
I think this will require *args.
This would be one way of doing it (with positional argument support):
import multiprocessing as mp
def parallel_functions(function, send_end, *args):
    """Call ``function(*args)`` and send its result through *send_end*."""
    outcome = function(*args)
    send_end.send(outcome)
def multi_proc_parallel_functions(function_list, target_func):
    """Run each job in its own process and return the results in input order.

    Args:
        function_list: iterable of ``(function, *args)`` tuples.
        target_func: process entry point, called in the child as
            ``target_func(function, send_end, *args)``. It is expected to
            send exactly one result through ``send_end``.

    Returns:
        A list with one result per job, in the order of *function_list*.

    Raises:
        EOFError: if a child exits without sending a result.
    """
    jobs = []
    pipe_list = []
    try:
        for (function, *args) in function_list:
            recv_end, send_end = mp.Pipe(False)
            p = mp.Process(target=target_func, args=(function, send_end, *args))
            p.start()
            jobs.append(p)
            pipe_list.append(recv_end)
            # Drop the parent's copy of the write end. While it stays open,
            # recv() can never see end-of-file and would block forever if
            # the child died before sending.
            send_end.close()
        return [x.recv() for x in pipe_list]
    finally:
        # Close the read ends first so a child still blocked in send() fails
        # instead of keeping join() waiting.
        for recv_end in pipe_list:
            recv_end.close()
        for proc in jobs:
            proc.join()
import numpy as np
def adder10(x, y):
    """Return a random integer from 0 to 4, plus 10, plus the product x * y."""
    jitter = np.random.randint(5)
    product = x * y
    return jitter + 10 + product
def adder1000(a, b, c):
    """Return a random integer from 0 to 4, plus 1000, minus a, plus b and c."""
    total = np.random.randint(5) + 1000
    total = total - a
    total = total + b
    return total + c
# Guarded so that child processes started with the "spawn" method, which
# re-import this module, do not launch the jobs again themselves.
if __name__ == "__main__":
    # Each entry is (function, *positional_args).
    results = multi_proc_parallel_functions(
        [
            (adder10, 5, 4),
            (adder10, 1, 2),
            (adder1000, 5, 6, 7),
        ],
        parallel_functions,
    )
    print(results)
Note that how the multiprocessing module works depends on whether you are on Windows, macOS or Linux.
On Linux, the default way of creating an mp.Process has traditionally been the fork start method (the fork() system call; Python 3.14 changes the default to forkserver), which means the function and its arguments do not need to be serializable/picklable: the child process inherits the parent's memory. macOS also supports fork; Windows does not.
On Windows and macOS, the spawn start method is used by default instead (on macOS since Python 3.8). Spawn starts a fresh interpreter, so everything sent to the child process must be serializable/picklable. This means you won't be able to send lambda expressions or dynamically created functions, for example.
Example of something that would work on Linux (with your original implementation), but not on Windows (or macOS by default):
# Lambdas cannot be pickled, so these zero-argument jobs can only be handed
# to child processes that are forked.
multi_proc_parallel_functions(
[ lambda: adder10(5, 4),
lambda: adder10(1, 2),
lambda: adder1000(5, 6, 7) ],
parallel_functions
)
# Observed result with each start method:
# spawn: _pickle.PicklingError: Can't pickle <function <lambda> at 0x7fce7cc43010>: attribute lookup <lambda> on __main__ failed
# fork: [30, 12, 1008]
I would suggest using the operator module, which has functions for the math operations. This way, you can send a list of operators and values to modify the initial value in a flexible way.
Example where each argument is a tuple of (operator, value):
import operator
import numpy as np
np.random.seed(123)
def adder(*args):
    """Apply a chain of binary operations to a random start value.

    Each argument is an ``(operation, value)`` pair. The start value is a
    random integer from 0 to 4; every intermediate result is printed and the
    final one is returned.
    """
    running = np.random.randint(5)
    print(running)
    for apply_op, operand in args:
        running = apply_op(running, operand)
        print(running)
    return running
adder((operator.add, 5), (operator.mul, 10))
This operation ((2 + 5) * 10) outputs:
2
7
70

Python concurrent.futures.ProcessPoolExecutor() not executing methods inside objects

I am trying to execute methods from two objects concurrently for a computer vision task. My idea is to use two different feature detectors to compute their respective feature descriptions inside a base class.
In this regard, I built the following toy example to understand python concurrent.futures.ProcessPoolExecutor class.
When executed, the first part of the code runs as expected with 20 Heartbeat (10 from each method executed 10 times in total) strings printed out with the sum for two objects coming out correctly as 100, -100.
But in the second half of the code, it appears the ProcessPoolExecutor is not running the do_math(self, numx) method at all. What am I doing wrong here?
With best,
Azmyin
import numpy as np
import concurrent.futures as cf
import time
def current_milli_time():
    """Return the current time tick as a whole number of milliseconds."""
    seconds = time.time()
    return round(seconds * 1000)
class masterClass:
    """Toy accumulator: keeps a running total ``sumx`` next to a value ``var1``."""

    # Class variable: factor applied to every value added to sumx.
    super_multiplier = 1

    def __init__(self, ls):
        """Take ``var1`` from ``ls[0]`` and the starting ``sumx`` from ``ls[1]``."""
        first, second = ls[0], ls[1]
        self.var1 = first
        self.sumx = second

    def __rep__(self):
        """Print the current total (a plain method, called explicitly)."""
        print(f"sumx value -- {self.sumx}")

    def apply_sup_mult(self, var_in):
        """Add ``var_in * super_multiplier`` to sumx, pause briefly, then print."""
        increment = var_in * masterClass.super_multiplier
        self.sumx = self.sumx + increment
        time.sleep(0.025)
        print("Hearbeat!!")

    def do_math(self, numx):
        """Regular method that forwards to apply_sup_mult."""
        self.apply_sup_mult(numx)
# Guarded so that worker processes, which may re-import this module, do not
# run the driver themselves.
if __name__ == "__main__":
    ls = [10,0]
    ls2 = [-10,0]
    numx = 10
    obj1 = masterClass(ls)
    obj2 = masterClass(ls2)
    t1 = current_milli_time()
    # Run methods one by one
    for _ in range(numx):
        obj1.do_math(ls[0])
        obj2.do_math(ls2[0])
    obj1.__rep__()
    obj2.__rep__()
    t2 = current_milli_time()
    print(f"Time taken -- {t2 - t1} ms")
    print()
    ## Using multiprocessing to concurrently run two methods
    # Intentionally reinitialize BOTH objects (the second assignment used to
    # overwrite obj1 and left obj2 holding the total of the serial run).
    obj1 = masterClass(ls)
    obj2 = masterClass(ls2)
    t1 = current_milli_time()
    futures = []
    with cf.ProcessPoolExecutor() as executor:
        for i in range(numx):
            futures.append(executor.submit(obj1.do_math, ls[0]))
            futures.append(executor.submit(obj2.do_math, ls2[0]))
    # Asking for each result re-raises any exception from a worker instead of
    # letting it disappear with the discarded future.
    for future in futures:
        future.result()
    # NOTE: submit() pickles the bound method together with its object, so
    # do_math runs on a copy inside the worker process; obj1 and obj2 in this
    # process are not modified by those calls.
    # State of sumx
    obj1.__rep__()
    obj2.__rep__()
    t2 = current_milli_time()
    print(f"Time taken -- {t2 - t1} ms")

Converting from ThreadPool to ProcessExecutorPool

I have the following code, which I would like to convert from ThreadPool to ProcessPoolExecutor, since it consists entirely of CPU-intensive calculations; when I watch the CPU monitor, I see that my 8-core processor is only using a single thread.
import datetime
from multiprocessing.dummy import Pool as ThreadPool
def thread_run(q, clients_credit_array, clients_terr_array,
               freq_small_list, freq_large_list, clients, year, admin,
               client_cnt=1000, loop_incr=8):
    """Run ``call_claim`` for every client index and collect the claims.

    Args:
        q: unused; kept so existing positional callers keep working.
        clients_credit_array, clients_terr_array, freq_small_list,
        freq_large_list, clients, year, admin: passed through unchanged to
            ``call_claim`` as elements 1..7 of its argument tuple.
        client_cnt: number of client indices (0 .. client_cnt - 1) to process.
        loop_incr: number of worker threads in the pool.

    Returns:
        A dict of equally long lists, one entry per claim: ``claim_id`` is a
        running number starting at 1, and the other columns are taken from
        positions 0..6 of each ``call_claim`` result.
    """
    claim_id = []
    claim_client_id = []
    claim_company_id = []
    claim_year = []
    claim_type = []
    claim_closed = []
    claim_cnt = []
    claim_amount = []
    print(datetime.datetime.utcnow())
    call_vars = [
        (client_index,
         clients_credit_array,
         clients_terr_array,
         freq_small_list,
         freq_large_list,
         clients,
         year,
         admin)
        for client_index in range(client_cnt)
    ]
    results = []
    if call_vars:
        # One pool for the whole run; map() returns results in input order.
        with ThreadPool(loop_incr) as pool:
            results = pool.map(call_claim, call_vars)
    # The counter has to exist before the first increment.
    claim_index = 0
    for result in results:
        # Walk over every claim in the result, not just the first one; an
        # empty result contributes nothing.
        for r in range(len(result[0])):
            claim_index += 1
            claim_id.append(claim_index)
            claim_client_id.append(result[0][r])
            claim_company_id.append(result[1][r])
            claim_year.append(result[2][r])
            claim_type.append(result[3][r])
            claim_closed.append(result[4][r])
            claim_cnt.append(result[5][r])
            claim_amount.append(result[6][r])
    print(datetime.datetime.utcnow())
    return {
        "claim_id": claim_id,
        "claim_client_id": claim_client_id,
        "claim_company_id": claim_company_id,
        "claim_year": claim_year,
        "claim_type": claim_type,
        "claim_closed": claim_closed,
        "claim_cnt": claim_cnt,
        "claim_amount": claim_amount,
    }
The difficulty I am having, however, is that when I modify the code as follows, I get error messages:
# Attempted drop-in replacement of the thread pool with a process pool.
from concurrent.futures import ProcessPoolExecutor as PThreadPool
pool = PThreadPool(max_workers=len(call_var))
#pool = ThreadPool(len(call_var))
# NOTE(review): Executor.map returns a lazy iterator rather than a list.
results = pool.map(call_claim, call_var)
# NOTE(review): executors expose shutdown() (or use them in a `with` block)
# instead of close()/join(), which is why the two calls below fail.
#pool.close()
#pool.join()
I had to remove the pool.close() and pool.join() as it generated errors. But when I removed them, my code was not utilizing parallel processors and it ran much longer and slower than originally. What am I missing?
As was pointed out in the comments, it is common to see Executor used as part of a context manager and without the need for join or close operations. Below is a simplified example to illustrate the concepts.
Example:
import concurrent.futures
import random
import time
import os
values = [1, 2, 3, 4, 5]
def times_two(n):
    """Sleep for a random 1-4 seconds, print this process's pid, return n * 2."""
    delay = random.randrange(1, 5)
    time.sleep(delay)
    print("pid:", os.getpid())
    return n * 2
def main():
    """Double every entry of ``values`` in a process pool and print the results."""
    with concurrent.futures.ProcessPoolExecutor() as pool:
        doubled = pool.map(times_two, values)
    # map() yields results in input order, whatever order the workers finish in.
    for one_result in doubled:
        print(one_result)
if __name__ == "__main__":
main()
Output:
pid: 396
pid: 8904
pid: 25440
pid: 20592
pid: 14636
2
4
6
8
10

How to get return value of different functions in a for loop with multiprocessing

I'm currently making some numerical solver for current simulation. To make my code faster, I made a function that returns the result of elementwise matrix multiplication, and gradient... and so on.
def mmul(A, B, procname, return_dict):
    """Store the element-wise product of A and B in return_dict[procname]."""
    product = np.multiply(A, B)
    return_dict[procname] = product
def mgrad(A, procname, return_dict):
    """Store np.gradient of A / dx (dx is a module-level name) in return_dict[procname]."""
    scaled = A / dx
    return_dict[procname] = np.gradient(scaled)
def madd(A, B, procname, return_dict):
    """Store A + B in return_dict[procname]."""
    total = A + B
    return_dict[procname] = total
Now here's the body of the code. I first create a dictionary (return_dict) to store the results from each process, and then read the values (Vgrad, Pgrad, Psquare) back from that dictionary.
# One manager serves the whole run; starting a new manager process on every
# iteration is needless overhead.
manager = multiprocessing.Manager()
for k in range(0,max_iter-1, 1):
    #0. Firstly generate all of the auxiliary calculation arrays
    post_V, post_p, Vij_coeff = np.zeros((3, lx, ly), dtype = float)
    # Calculate carrier density of the next step
    #---------------------------- # Const/mtx for calculating p
    return_dict = manager.dict()
    # The result keys must be the strings that are read back below; passing
    # the variables Vgrad/Pgrad/Psquare would use their values as keys.
    tasks = [
        (mgrad, (V, 'Vgrad', return_dict)),
        (mgrad, (p, 'Pgrad', return_dict)),
        (mmul, (p, p, 'Psquare', return_dict)),
    ]
    processes = []
    for target, args in tasks:
        proc = multiprocessing.Process(target = target, args = args)
        processes.append(proc)
        proc.start()
    for process in processes:
        process.join()
    Vgrad = return_dict['Vgrad']
    Pgrad = return_dict['Pgrad']
    Psquare = return_dict['Psquare']
However, this code makes the error below
PicklingError: Can't pickle <function mgrad at 0x000002776C3614C8>: it's not the same object as __main__.mgrad
Is there any solution for getting the calculated values back from the functions while running them in multiple processes?

Converting a serial task to parallel to map inputs and outputs

I have tens of thousands of simulations to run on a system with several cores. Currently, it is done in serial, where I know my input parameters, and store my results in a dict.
Serial version
import time
import random
class MyModel:
    """Stand-in for an expensive simulation: ``output`` becomes ``input * 10``."""

    # Parameter to process; assigned by the caller before run().
    input = None
    # Result of the last run(); stays None until run() has been called.
    output = None

    def run(self):
        """Sleep for a random fraction of a second, then compute the output."""
        pause = random.random()
        time.sleep(pause)  # simulate a complex task
        self.output = self.input * 10
def _run_one(value):
    """Run a single model for *value* and return its output."""
    model = MyModel()
    model.input = value
    model.run()
    return model.output


# Run the tasks one after another, keyed by their input parameter
parameters = range(10)
results = {value: _run_one(value) for value in parameters}
print('results: ' + str(results))
Which takes <10 seconds, and displays correct results:
results: {0: 0, 1: 10, 2: 20, 3: 30, 4: 40, 5: 50, 6: 60, 7: 70, 8: 80, 9: 90}
Parallel version
My attempts to parallelize this procedure are based on the example in the multiprocessing module near the text "An example showing how to use queues to feed tasks to a collection of worker processes and collect the results" (sorry, no URL anchor available).
The following builds on the top half of the serial version:
from multiprocessing import Process, Queue
NUMBER_OF_PROCESSES = 4
def worker(input, output):
    """Run a model for every task on *input* until 'STOP' arrives; put each output on *output*."""
    while True:
        task = input.get()
        if task == 'STOP':
            break
        model = MyModel()
        model.input = task[0]
        model.run()
        output.put(model.output)
# Run the tasks in parallel and store one result per finished task
parameters = range(10)
# Queues carrying the tasks to the workers and the outputs back
task_queue = Queue()
done_queue = Queue()
# Submit every task as a one-element tuple
tasks = [(t,) for t in parameters]
for task in tasks:
    task_queue.put(task)
# Start the worker processes
for _ in range(NUMBER_OF_PROCESSES):
    Process(target=worker, args=(task_queue, done_queue)).start()
# Results arrive in completion order, so the key here is only the arrival
# position, not the input parameter.
results = {position: done_queue.get() for position in range(len(tasks))}
# One 'STOP' per worker tells the child processes to finish
for _ in range(NUMBER_OF_PROCESSES):
    task_queue.put('STOP')
print('results: ' + str(results))
Takes only a few seconds now, but the mapping orders between inputs and results are mixed up.
results: {0: 10, 1: 0, 2: 60, 3: 40, 4: 20, 5: 80, 6: 30, 7: 90, 8: 70, 9: 50}
I realise that I'm populating the results based on an unordered done_queue.get(), but I'm not sure how to get the correct mapping to task_queue. Any ideas? Any other way to make this somehow cleaner?
A-ha! The worker needs to embed some kind of ID, such as the input parameter(s) used to return to the output queue, which can be used to identify the returned process. Here are the required modifications:
def worker(input, output):
    """Run a model per task until 'STOP'; put ``(input parameter, output)`` pairs on *output*."""
    while True:
        task = input.get()
        if task == 'STOP':
            break
        model = MyModel()
        model.input = task[0]
        model.run()
        # The input parameter travels with the result as its ID.
        output.put((model.input, model.output))
and
# Collect one (parameter, output) pair per submitted task, in whatever order
# the workers finish, and file each output under its own input parameter.
for _ in tasks:
    parameter, model_output = done_queue.get()
    results[parameter] = model_output

Categories

Resources