I want to write data to the same list via Python multiprocessing, sharing the data between processes with mp.Manager().list(). The code is shown below; it is just a demo in which I want to append matching numbers to the same sub-list. However, counter increases as expected, but grp remains the same. Where is the problem?
import multiprocessing as mp
import random
import time
import numpy as np

class A:
    def __init__(self):
        self.raw = [random.randint(1, 4) for _ in range(100)]
        self.manager = mp.Manager()
        self.grp = self.manager.list([[1], [2], [3], [4]])
        self.use_cpu_num = 2
        self.counter = self.manager.Value('i', 0)

    def run(self):
        subsets = np.array_split(self.raw, self.use_cpu_num)
        subsets = [each.tolist() for each in subsets]
        process = []
        for i in range(self.use_cpu_num):
            process.append(mp.Process(target=self.process, args=(subsets[i], )))
        for each in process:
            each.start()
        for each in process:
            each.join()
            each.close()
        print(self.grp)

    def process(self, subset):
        for each in subset:
            for i in range(len(self.grp)):
                each_grp = self.grp[i]
                if each in each_grp:
                    self.counter.set(self.counter.value + 1)
                    self.grp[i].append(each)
        print(self.counter.value)

if __name__ == '__main__':
    a = A()
    a.run()
I tried using mp.Lock(), but that doesn't share data between different processes.
Put it this way: self.grp is a managed object, so any change made directly on it, such as self.grp.append(x) or self.grp[i] = x, is transferred to the manager process.
The objects inside self.grp are not managed; changes to them are not transferred to the manager, and you only get a copy of them when you access self.grp[i].
For modifications to the lists inside self.grp to propagate, those lists must themselves be manager.list objects (note that nesting managed objects is not supported on Python versions below 3.6):
self.grp = self.manager.list([self.manager.list(x) for x in ([1], [2], [3], [4])])
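Alternatively, if you want to keep plain lists inside self.grp, a minimal sketch of the read-modify-reassign pattern (same layout as in your question): grab the copy, change it locally, and assign it back through the proxy so the whole slot is sent to the manager.

# inside class A
def process(self, subset):
    for each in subset:
        for i in range(len(self.grp)):
            each_grp = self.grp[i]          # a local copy of the inner list
            if each in each_grp:
                self.counter.set(self.counter.value + 1)
                each_grp.append(each)       # modify the copy...
                self.grp[i] = each_grp      # ...and reassign, so the manager sees it
    print(self.counter.value)

Note that this read-modify-reassign (like the counter update) is not atomic, so with several workers touching the same slot you would still want a manager.Lock() around it.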
If you are only storing numbers, you can use a multiprocessing.Array, which can be wrapped as a numpy ndarray for convenience; however, you cannot append to it and must know its size beforehand.
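A minimal sketch of that approach (the array size, dtype and worker split here are illustrative, not taken from your code):

import multiprocessing as mp
import numpy as np

def worker(shared_arr, start, stop):
    # view the shared buffer as a numpy array; writes go straight to shared memory
    view = np.frombuffer(shared_arr.get_obj(), dtype=np.int32)  # 'i' is a C int, normally 32-bit
    view[start:stop] += 1

if __name__ == '__main__':
    shared = mp.Array('i', 8)                  # the size must be known up front
    ps = [mp.Process(target=worker, args=(shared, 0, 4)),
          mp.Process(target=worker, args=(shared, 4, 8))]
    for p in ps:
        p.start()
    for p in ps:
        p.join()
    print(shared[:])                           # [1, 1, 1, 1, 1, 1, 1, 1]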
Edit: on Windows you will get an error when trying to pickle the self.manager object, so I moved it out of the class in the example below.
import multiprocessing as mp
import random
import time
import numpy as np

class A:
    def __init__(self):
        self.raw = [random.randint(1, 4) for _ in range(100)]
        self.grp = manager.list([manager.list(x) for x in ([1], [2], [3], [4])])
        self.use_cpu_num = 2
        self.counter = manager.Value('i', 0)

    def run(self):
        subsets = np.array_split(self.raw, self.use_cpu_num)
        subsets = [each.tolist() for each in subsets]
        process = []
        for i in range(self.use_cpu_num):
            process.append(mp.Process(target=self.process, args=(subsets[i], )))
        for each in process:
            each.start()
        for each in process:
            each.join()
            each.close()
        print([list(x) for x in self.grp])

    def process(self, subset):
        for each in subset:
            for i in range(len(self.grp)):
                each_grp = self.grp[i]
                if each in each_grp:
                    self.counter.set(self.counter.value + 1)
                    self.grp[i].append(each)
        print(self.counter.value)

if __name__ == '__main__':
    manager = mp.Manager()
    a = A()
    a.run()
Related
I've been trying to parallelize a process inside a class method. When I try using Pool() from multiprocessing I get pickling errors. When I use Pool() from multiprocessing.dummy my execution is slower than serialized execution.
I've attempted several variations of my code below, using Stackoverflow posts as a guide, but none of them were a successful workaround for the problem outlined above.
One example: if I move process_function above the class definition (globalizing it), it doesn't work because I can't access my object's attributes.
Anyway, my code is similar to:
from multiprocessing.dummy import Pool as ThreadPool
from my_other_module import other_module_class
import numpy as np

class myClass:
    def __init__(self, some_list, number_iterations):
        self.my_interface = other_module_class
        self.relevant_list = []
        self.some_list = some_list
        self.number_iterations = number_iterations
        # self.other_attributes = stuff from import statements

    def load_relevant_data(self):
        self.relevant_list = self.my_interface.other_function

    def compute_foo(self, relevant_list_member_value):
        # math involving class attributes
        return foo_scalar

    def higher_function(self):
        self.relevant_list = self.load_relevant_data
        np.random.seed(0)
        pool = ThreadPool()  # I've tried different args here, no help
        pool.map(self.process_function, self.relevant_list)

    def process_function(self, dict_from_relevant_list):
        foo_bar = self.compute_foo(dict_from_relevant_list['key'])
        a = 0
        for i in some_other_list:
            # do other stuff involving class attributes and foo_bar
            # a = some of that
            pass
        dict_from_relevant_list['other_key'] = a

if __name__ == '__main__':
    import time
    import pprint as pp

    some_list = blah
    number_of_iterations = 10**4
    my_obj = myClass(some_list, number_of_iterations)
    my_obj.load_third_parties()

    start = time.time()
    my_obj.higher_function()
    execution_time = time.time() - start

    print()
    print("Execution time for %s simulation runs: %s" % (number_of_iterations, execution_time))
    print()
    pp.pprint(my_obj.relevant_list[0:5])
I have a few hundred dictionaries inside relevant_list. I just want to populate each of those dictionaries' 'other_key' field from a computationally expensive simulation in my innermost loop, which yields a scalar value, like a above. It seems like there should be a simple way to do this, since in Matlab I could just write parfor and it's done automatically. Maybe that instinct is wrong for Python.
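One common workaround for the pickling error, sketched below, is to give the pool a module-level function and pass it the pieces of the instance it needs explicitly, then write the results back into the dicts in the parent process (where the mutation actually sticks). The constants argument and the arithmetic are placeholders for whatever compute_foo really needs:

from multiprocessing import Pool
from functools import partial

def process_function(constants, d):
    # placeholder math: stands in for compute_foo plus the inner loop
    return d['key'] ** 2 + constants

class myClass:
    def __init__(self, relevant_list, constants):
        self.relevant_list = relevant_list
        self.constants = constants

    def higher_function(self):
        pool = Pool()
        results = pool.map(partial(process_function, self.constants),
                           self.relevant_list)
        pool.close()
        pool.join()
        # mutate the dicts here, in the parent, where the changes persist
        for d, a in zip(self.relevant_list, results):
            d['other_key'] = a

if __name__ == '__main__':
    obj = myClass([{'key': k} for k in range(5)], constants=3)
    obj.higher_function()
    print(obj.relevant_list)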
In the example problem below, the main program creates a list of random strings of length data_size. Without multiprocessing the data is sent directly to Test.iterate(), where the class merely adds the string Test- to the beginning of each random string. When run without multiprocessing the code works very well for both small and large values of data_size.
I decided to add a multiprocessing ability to this test problem and broke the core components of multiprocessing down into a class titled MultiProc. The member function MultiProc.run_processes() manages all functions in the class. The function assumes that the input list will be divided into x smaller lists depending on how many processes the user wishes to utilize. As a result, the function starts by determining the upper and lower indices of each sub-list relative to the initial list, so the code knows which portion to iterate over in each process. The function then initiates the processes, starts them, joins them, extracts the data from the Queue, and then re-orders the returned data based on a counter that is passed to the primary function. The MultiProc class works fairly well at small values of data_size, but above a value of ~500 the code never terminates, although I suspect the threshold will vary from computer to computer depending on memory. At some point the multiprocessing function stops working, and I suspect it has something to do with the way data is returned from the processes. Does anyone know what might be causing this problem and how to fix it?
from multiprocessing import Process, Queue
from itertools import chain
import string
import random

class Test:
    def __init__(self, array_list):
        self.array_list = array_list

    def func(self, names):
        return 'Test-' + names

    def iterate(self, upper, lower, counter):
        output = [self.func(self.array_list[i]) for i in range(lower, upper)]
        return output, counter

class MultiProc:
    def __init__(self, num_procs, data_array, func):
        self.num_procs = num_procs
        self.data_array = data_array
        self.func = func
        if self.num_procs > len(self.data_array):
            self.num_procs = len(self.data_array)
        self.length = int((len(self.data_array) / self.num_procs) // 1)

    def run_processes(self):
        upper = self.__determine_upper_indices()
        lower = self.__determine_lower_indices(upper)
        p, q = self.__initiate_proc(self.func, upper, lower)
        self.__start_thread(p)
        self.__join_threads(p)
        results = self.__extract_data(q)
        new = self.__reorder_data(results)
        return new

    def __determine_upper_indices(self):
        upper = [i * self.length for i in range(1, self.num_procs)]
        upper.append(len(self.data_array))
        return upper

    def __determine_lower_indices(self, upper):
        lower = [upper[i] for i in range(len(upper) - 1)]
        lower = [0] + lower
        return lower

    def __initiate_proc(self, func, upper, lower):
        q = Queue()
        p = [Process(target=self.run_and_send_back_output,
                     args=(q, func, upper[i], lower[i], i))
             for i in range(self.num_procs)]
        return p, q

    def __start_thread(self, p):
        [p[i].start() for i in range(self.num_procs)]

    def __join_threads(self, p):
        [p[i].join() for i in range(self.num_procs)]

    def __extract_data(self, q):
        results = []
        while not q.empty():
            results.extend(q.get())
        return results

    def __reorder_data(self, results):
        new = [results[i - 1] for j in range(self.num_procs)
               for i in range(len(results)) if results[i] == j]
        new = list(chain.from_iterable(new))
        return new

    def run_and_send_back_output(self, queue, func, *args):
        result = func(*args)  # run the func
        queue.put(result)     # send the result back

def id_generator(size=6, chars=string.ascii_uppercase + string.digits):
    return ''.join(random.choice(chars) for _ in range(size))

if __name__ == "__main__":
    random.seed(1234)
    data_size = 9
    num_proc = 2
    test_list = [id_generator() for i in range(data_size)]
    obj1 = Test(test_list)
    result1 = obj1.iterate(data_size, 0, 1)
    print(result1)
    multi = MultiProc(num_proc, test_list, obj1.iterate)
    result2 = multi.run_processes()
    print(result2)
    # >> ['Test-2HAFCF', 'Test-GWPBBB', 'Test-W43JFL', 'Test-HA65PE',
    #     'Test-83EF6C', 'Test-R9ET4W', 'Test-RPM37B', 'Test-6EAVJ4',
    #     'Test-YKDE5K']
Your main problem is this:
self.__start_thread(p)
self.__join_threads(p)
results = self.__extract_data(q)
You start your workers, which try to put something on a queue, then you join the workers, and only after that do you start retrieving data from the queue. The workers, however, can only exit after all of their data has been flushed to the underlying pipe, and will block on exit otherwise. Joining processes that are blocked like this before retrieving any elements from the pipe can result in a deadlock.
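If you want to keep the hand-rolled Process/Queue design, the usual fix is to drain the queue before joining: collect exactly one result per worker (q.get() blocks until it arrives) and only then join. A sketch of run_processes with that change, everything else in your class staying as it is:

    def run_processes(self):
        upper = self.__determine_upper_indices()
        lower = self.__determine_lower_indices(upper)
        p, q = self.__initiate_proc(self.func, upper, lower)
        self.__start_thread(p)
        results = []
        for _ in range(self.num_procs):
            results.extend(q.get())      # one (output, counter) pair per worker
        self.__join_threads(p)           # safe now: the pipes have been drained
        new = self.__reorder_data(results)
        return new

This also avoids relying on q.empty(), which the documentation warns is not reliable.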
Maybe you should look into multiprocessing.Pool, since what you're trying to implement is essentially a map() operation. Your example could be rewritten more elegantly, something like this:
from multiprocessing import Pool
import string
import random

def func(name):
    return 'Test-' + name

def id_generator(size=6, chars=string.ascii_uppercase + string.digits):
    return ''.join(random.choice(chars) for _ in range(size))

if __name__ == "__main__":
    random.seed(1234)
    data_size = 5000
    num_proc = 2
    test_list = [id_generator() for i in range(data_size)]
    with Pool(num_proc) as pool:
        result = pool.map(func, test_list)
    print(result)
My program needs to spawn multiple instances of a class, each processing data that is coming from a streaming data source.
For example:
parameters = [1, 2, 3]

class FakeStreamingApi:
    def __init__(self):
        pass
    def data(self):
        return 42
    pass

class DoStuff:
    def __init__(self, parameter):
        self.parameter = parameter

    def run(self):
        data = streaming_api.data()
        output = self.parameter ** 2 + data  # Some CPU intensive task
        print output

streaming_api = FakeStreamingApi()

# Here's how this would work with no multiprocessing
instance_1 = DoStuff(parameters[0])
instance_1.run()
Once the instances are running they don't need to interact with each other, they just have to get the data as it comes in. (and print error messages, etc)
I am totally at a loss how to make this work with multiprocessing, since I first have to create a new instance of the class DoStuff, and then have it run.
This is definitely not the way to do it:
# Let's try multiprocessing
import multiprocessing

for parameter in parameters:
    processes = [ multiprocessing.Process(target = DoStuff, args = (parameter)) ]

# Hmm, this doesn't work...
We could try defining a function to spawn classes, but that seems ugly:
import multiprocessing

def spawn_classes(parameter):
    instance = DoStuff(parameter)
    instance.run()

for parameter in parameters:
    processes = [ multiprocessing.Process(target = spawn_classes, args = (parameter,)) ]

# Can't tell if it works -- no output on screen?
Plus, I don't want to have 3 different copies of the API interface class running, I want that data to be shared between all the processes... and as far as I can tell, multiprocessing creates copies of everything for each new process.
Ideas?
Edit:
I think I may have got it... is there anything wrong with this?
import multiprocessing

parameters = [1, 2, 3]

class FakeStreamingApi:
    def __init__(self):
        pass
    def data(self):
        return 42
    pass

class Worker(multiprocessing.Process):
    def __init__(self, parameter):
        super(Worker, self).__init__()
        self.parameter = parameter

    def run(self):
        data = streaming_api.data()
        output = self.parameter ** 2 + data  # Some CPU intensive task
        print output

streaming_api = FakeStreamingApi()

if __name__ == '__main__':
    jobs = []
    for parameter in parameters:
        p = Worker(parameter)
        jobs.append(p)
        p.start()
    for j in jobs:
        j.join()
I came to the conclusion that it would be necessary to use multiprocessing.Queues to solve this. The data source (the streaming API) needs to pass copies of the data to all the different processes, so they can consume it.
There's another way to solve this using multiprocessing.Manager to create a shared dict, but I didn't explore it further, as it looks fairly inefficient and cannot propagate changes to inner values (e.g. if you have a dict of lists, changes to the inner lists will not propagate).
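For completeness, here is a rough sketch of the queue-based approach I had in mind; the per-worker queue and the None sentinel are choices made for this demo, not requirements:

import multiprocessing

class FakeStreamingApi:
    def data(self):
        return 42

class Worker(multiprocessing.Process):
    def __init__(self, parameter, queue):
        super(Worker, self).__init__()
        self.parameter = parameter
        self.queue = queue

    def run(self):
        while True:
            data = self.queue.get()
            if data is None:                 # sentinel: no more data
                break
            print(self.parameter ** 2 + data)

if __name__ == '__main__':
    parameters = [1, 2, 3]
    queues = [multiprocessing.Queue() for _ in parameters]
    workers = [Worker(p, q) for p, q in zip(parameters, queues)]
    for w in workers:
        w.start()
    api = FakeStreamingApi()
    for _ in range(5):                       # pretend five data points arrive
        item = api.data()
        for q in queues:                     # every worker gets its own copy
            q.put(item)
    for q in queues:
        q.put(None)                          # tell each worker to stop
    for w in workers:
        w.join()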
I have a large list of elements (~100,000) and need to map over it as follows:
from multiprocessing import Pool
import itertools

def mark_diff(args):
    item = args[0]
    pi = args[1]
    item.marked_diff = (item.p/pi[0] + item.c/pi[1] + item.f/pi[2] - 3)**2
    return item

def mark(f_set, goal):
    with Pool(3) as p:
        data = p.map(mark_diff, zip(f_set, itertools.repeat(goal)))
    return data
The default value of item.marked_diff is 0, and item is a mongoengine document.
I am resorting to multiprocessing because mark_diff is substantially more complicated than shown here and involves a lot of exponents and logarithms, for which I am using numpy.
Now for the problem: the returned data still has item.marked_diff as 0, while if I add a print statement at the bottom of mark_diff, correct non-zero values are being assigned.
Definition of item:
import random, mongoengine

class F(mongoengine.Document):
    p = mongoengine.FloatField()
    c = mongoengine.FloatField()
    f = mongoengine.FloatField()
    marked_diff = 0

f_sets = F.objects.all()
goal = [0.2, 0.35, 0.45]
So something is going on in what you didn't show. When I flesh this out into a complete, executable program, it appears to work fine. Here's the output from one run under Python 3.6.1:
0.7024116548559156
13.468354599594324
6.036133666404753
0.16520292241977205
0.17073749475275496
1.903674418518389
0.2432159511273063
7.743326563037492
4.1990243814914425
19.36243187965931
And here's the full program:
from multiprocessing import Pool
import random
import itertools

class F:
    def __init__(self):
        self.p = random.random()
        self.c = random.random()
        self.f = random.random()

def mark_diff(args):
    item = args[0]
    pi = args[1]
    item.marked_diff = (item.p/pi[0] + item.c/pi[1] + item.f/pi[2] - 3)**2
    return item

def mark(f_set, goal):
    with Pool(3) as p:
        data = p.map(mark_diff, zip(f_set, itertools.repeat(goal)))
    return data

if __name__ == "__main__":
    f_set = [F() for _ in range(10)]
    goal = [0.2, 0.35, 0.45]
    xs = mark(f_set, goal)
    for x in xs:
        print(x.marked_diff)
Is it possible that you're looking at marked_diff in the original f_set instead of in the items returned by mark()?
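To check that concretely, you could append something like the following to the end of the program above: the objects handed to pool.map are pickled into the workers, so the parent's originals are never touched, and only the returned copies carry the computed values.

    # appended at the end of the __main__ block
    print(getattr(f_set[0], 'marked_diff', 'never set on the original'))  # the parent's object is unchanged
    print(xs[0].marked_diff)                                              # the value lives on the returned copy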
For more setup, see this question. I want to create lots of instances of class Toy, in parallel. Then I want to write them to an xml tree.
import itertools
import pandas as pd
import lxml.etree as et
import numpy as np
import sys
import multiprocessing as mp

def make_toys(df):
    l = []
    for index, row in df.iterrows():
        toys = [Toy(row) for _ in range(row['number'])]
        l += [x for x in toys if x is not None]
    return l

class Toy(object):
    def __new__(cls, *args, **kwargs):
        if np.random.uniform() <= 1:
            return super(Toy, cls).__new__(cls, *args, **kwargs)

    def __init__(self, row):
        self.id = None
        self.type = row['type']

    def set_id(self, x):
        self.id = x

    def write(self, tree):
        et.SubElement(tree, "toy", attrib={'id': str(self.id), 'type': self.type})

if __name__ == "__main__":
    table = pd.DataFrame({
        'type': ['a', 'b', 'c', 'd'],
        'number': [5, 4, 3, 10]})
    n_cores = 2
    split_df = np.array_split(table, n_cores)
    p = mp.Pool(n_cores)
    pool_results = p.map(make_toys, split_df)
    p.close()
    p.join()
    l = [a for L in pool_results for a in L]

    box = et.Element("box")
    box_file = et.ElementTree(box)
    for i, toy in itertools.izip(range(len(l)), l):
        Toy.set_id(toy, i)
    [Toy.write(x, box) for x in l]
    box_file.write(sys.stdout, pretty_print=True)
This code runs beautifully. But I redefined the __new__ method to give it only a random chance of actually instantiating the class. So if I set if np.random.uniform() < 0.5, I want to create roughly half as many instances as I asked for, randomly determined. Doing this raises the following error:
Exception in thread Thread-3:
Traceback (most recent call last):
  File "/usr/lib/python2.7/threading.py", line 810, in __bootstrap_inner
    self.run()
  File "/usr/lib/python2.7/threading.py", line 763, in run
    self.__target(*self.__args, **self.__kwargs)
  File "/usr/lib/python2.7/multiprocessing/pool.py", line 380, in _handle_results
    task = get()
AttributeError: 'NoneType' object has no attribute '__dict__'
I don't know what this even means, or how to avoid it. If I do this process monolithically, as in l = make_toys(table), it runs well for any random chance.
Another solution
By the way, I know that this can be solved by leaving the __new__ method alone and instead rewriting make_toys() as
def make_toys(df):
    l = []
    for index, row in df.iterrows():
        prob = np.random.binomial(row['number'], 0.1)
        toys = [Toy(row) for _ in range(prob)]
        l += [x for x in toys if x is not None]
    return l
But I'm trying to learn about the error.
I think you've uncovered a surprising "gotcha" caused by Toy instances becoming None as they are passed through the multiprocessing Pool's result queue.
The multiprocessing.Pool uses queues to pass results from the worker processes back to the main process.
Per the docs:
When an object is put on a queue, the object is pickled and a background
thread later flushes the pickled data to an underlying pipe.
While the actual serialization might be different, in spirit
the pickling of an instance of Toy becomes a stream of bytes such as this:
In [30]: import pickle
In [31]: pickle.dumps(Toy(table.iloc[0]))
Out[31]: "ccopy_reg\n_reconstructor\np0\n(c__main__\nToy\np1\nc__builtin__\nobject\np2\nNtp3\nRp4\n(dp5\nS'type'\np6\nS'a'\np7\nsS'id'\np8\nNsb."
Notice that the module and class of the object is mentioned in the stream of
bytes: __main__\nToy.
The class itself is not pickled. There is only a reference to the name of the class.
When the stream of bytes is unpickled on the other side of the pipe, Toy.__new__ is called to instantiate a new instance of Toy. The new object's __dict__ is then reconstituted using unpickled data from the byte stream. When the new object is None, it has no __dict__ attribute, and hence the AttributeError is raised.
Thus, as a Toy instance is passed through the Queue, it might become None on the other side.
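You can reproduce this without multiprocessing at all. Here is a minimal sketch (protocol 2 is forced so that unpickling goes through Toy.__new__, which mirrors what happens on the pool's result pipe; the allow switch stands in for the random draw):

import pickle

class Toy(object):
    allow = True                            # class-level switch standing in for np.random.uniform()

    def __new__(cls, *args, **kwargs):
        if cls.allow:
            return super(Toy, cls).__new__(cls)
        # falling through returns None, like the unlucky random draws

    def __init__(self, kind):
        self.kind = kind

data = pickle.dumps(Toy('a'), 2)            # pickled while __new__ still returns instances
Toy.allow = False                           # from now on __new__ returns None
try:
    pickle.loads(data)
except AttributeError as e:
    print(e)                                # 'NoneType' object has no attribute '__dict__'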
I believe this is the reason why using
class Toy(object):
    def __new__(cls, *args, **kwargs):
        x = np.random.uniform() <= 0.5
        if x:
            return super(Toy, cls).__new__(cls, *args, **kwargs)
        logger.info('Returning None')
leads to
AttributeError: 'NoneType' object has no attribute '__dict__'
If you add logging to your script,
import itertools
import pandas as pd
import lxml.etree as et
import numpy as np
import sys
import multiprocessing as mp
import logging

logger = mp.log_to_stderr(logging.INFO)

def make_toys(df):
    result = []
    for index, row in df.iterrows():
        toys = [Toy(row) for _ in range(row['number'])]
        result += [x for x in toys if x is not None]
    return result

class Toy(object):
    def __new__(cls, *args, **kwargs):
        x = np.random.uniform() <= 0.97
        if x:
            return super(Toy, cls).__new__(cls, *args, **kwargs)
        logger.info('Returning None')

    def __init__(self, row):
        self.id = None
        self.type = row['type']

    def set_id(self, x):
        self.id = x

    def write(self, tree):
        et.SubElement(tree, "toy", attrib={'id': str(self.id), 'type': self.type})

if __name__ == "__main__":
    table = pd.DataFrame({
        'type': ['a', 'b', 'c', 'd'],
        'number': [5, 4, 3, 10]})
    n_cores = 2
    split_df = np.array_split(table, n_cores)
    p = mp.Pool(n_cores)
    pool_results = p.map(make_toys, split_df)
    p.close()
    p.join()
    l = [a for L in pool_results for a in L]

    box = et.Element("box")
    box_file = et.ElementTree(box)
    for i, toy in itertools.izip(range(len(l)), l):
        toy.set_id(i)
    for x in l:
        x.write(box)
    box_file.write(sys.stdout, pretty_print=True)
you will find that the AttributeError only
occurs after a logging message of the form
[INFO/MainProcess] Returning None
Notice that the logging message comes from the MainProcess, not one of the
PoolWorker processes. Since the Returning None message comes from
Toy.__new__, this shows that Toy.__new__ was called by the main process.
This corroborates the claim that unpickling is calling
Toy.__new__ and transforming instances of Toy into None.
The moral of the story is that for Toy instances to be passed through a multiprocessing Pool's Queue, Toy.__new__ must always return an instance of
Toy. And as you noted, the code can be fixed by instantiating only the desired number of Toys in make_toys:
def make_toys(df):
    result = []
    for index, row in df.iterrows():
        prob = np.random.binomial(row['number'], 0.1)
        result.extend([Toy(row) for _ in range(prob)])
    return result
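If you would rather keep the randomness attached to the class, another possible sketch is to leave __new__ alone and add a small factory classmethod that may return None; that way only real Toy instances (or nothing) are ever created, and unpickling always succeeds. This is a variation of my own, not something from your original code; np is numpy, as in the script above.

class Toy(object):
    def __init__(self, row):
        self.id = None
        self.type = row['type']

    # set_id and write stay exactly as before

    @classmethod
    def maybe(cls, row, p=0.5):
        # the random draw happens here, not in __new__,
        # so unpickling a Toy can never yield None
        return cls(row) if np.random.uniform() <= p else None

def make_toys(df):
    result = []
    for index, row in df.iterrows():
        toys = [Toy.maybe(row) for _ in range(row['number'])]
        result += [x for x in toys if x is not None]
    return result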
By the way, it is non-standard to call instance methods as Toy.write(x, box) when x is an instance of Toy. The preferred way is to use
x.write(box)
Similarly, use toy.set_id(i) instead of Toy.set_id(toy, i).