Python multiprocessing example never terminates when the dataset is too large

In the example problem below, the main program creates a list of random strings of length data_size. Without multiprocessing, the data is sent directly to Test.iterate(), where the class merely prepends the string Test- to each random string. Run without multiprocessing, the code works well for both small and large values of data_size.
I decided to add multiprocessing to this test problem and broke the core components of multiprocessing into a class titled MultiProc. The member function MultiProc.run_processes() manages all functions in the class. The function assumes that the input list will be divided into smaller lists depending on how many processes the user wishes to utilize, so it starts by determining the upper and lower indices of each sub-list relative to the initial list so the code knows which portion to iterate over in each process. The function then creates the processes, starts them, joins them, extracts the data from the Queue, and re-orders the returned data based on a counter that is passed to the primary function. The MultiProc class works fairly well at small values of data_size, but above a value of ~500 the code never terminates (I suspect the exact threshold will vary from computer to computer depending on memory). At some point the multiprocessing version stops working, and I suspect it has something to do with the way data is returned from the processes. Does anyone know what might be causing this problem and how to fix it?
from multiprocessing import Process, Queue
from itertools import chain
import string
import random


class Test:
    def __init__(self, array_list):
        self.array_list = array_list

    def func(self, names):
        return 'Test-' + names

    def iterate(self, upper, lower, counter):
        output = [self.func(self.array_list[i]) for i in range(lower, upper)]
        return output, counter


class MultiProc:
    def __init__(self, num_procs, data_array, func):
        self.num_procs = num_procs
        self.data_array = data_array
        self.func = func
        if self.num_procs > len(self.data_array):
            self.num_procs = len(self.data_array)
        self.length = int((len(self.data_array) / self.num_procs) // 1)

    def run_processes(self):
        upper = self.__determine_upper_indices()
        lower = self.__determine_lower_indices(upper)
        p, q = self.__initiate_proc(self.func, upper, lower)
        self.__start_thread(p)
        self.__join_threads(p)
        results = self.__extract_data(q)
        new = self.__reorder_data(results)
        return new

    def __determine_upper_indices(self):
        upper = [i * self.length for i in range(1, self.num_procs)]
        upper.append(len(self.data_array))
        return upper

    def __determine_lower_indices(self, upper):
        lower = [upper[i] for i in range(len(upper) - 1)]
        lower = [0] + lower
        return lower

    def __initiate_proc(self, func, upper, lower):
        q = Queue()
        p = [Process(target=self.run_and_send_back_output,
                     args=(q, func, upper[i], lower[i], i))
             for i in range(self.num_procs)]
        return p, q

    def __start_thread(self, p):
        [p[i].start() for i in range(self.num_procs)]

    def __join_threads(self, p):
        [p[i].join() for i in range(self.num_procs)]

    def __extract_data(self, q):
        results = []
        while not q.empty():
            results.extend(q.get())
        return results

    def __reorder_data(self, results):
        new = [results[i - 1] for j in range(self.num_procs)
               for i in range(len(results)) if results[i] == j]
        new = list(chain.from_iterable(new))
        return new

    def run_and_send_back_output(self, queue, func, *args):
        result = func(*args)   # run the func
        queue.put(result)      # send the result back


def id_generator(size=6, chars=string.ascii_uppercase + string.digits):
    return ''.join(random.choice(chars) for _ in range(size))


if __name__ == "__main__":
    random.seed(1234)
    data_size = 9
    num_proc = 2
    test_list = [id_generator() for i in range(data_size)]
    obj1 = Test(test_list)
    result1 = obj1.iterate(data_size, 0, 1)
    print(result1)
    multi = MultiProc(num_proc, test_list, obj1.iterate)
    result2 = multi.run_processes()
    print(result2)

# >> ['Test-2HAFCF', 'Test-GWPBBB', 'Test-W43JFL', 'Test-HA65PE',
#     'Test-83EF6C', 'Test-R9ET4W', 'Test-RPM37B', 'Test-6EAVJ4',
#     'Test-YKDE5K']

Your main problem is this:
self.__start_thread(p)
self.__join_threads(p)
results = self.__extract_data(q)
You start your workers, which try to put something in a queue, then join the workers, and only after that do you start retrieving data from the queue. The workers, however, can only exit after all of their data has been flushed to the underlying pipe, and will block on exit otherwise. Joining processes that are blocked like this before retrieving elements from the pipe can result in a deadlock.
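A minimal reordering sketch (not a tested drop-in fix, but it keeps the question's method names): inside MultiProc, drain the queue before joining. Queue.get() blocks until a worker has put its result, so every worker can flush its pipe and exit:
    # Inside MultiProc -- reordered run_processes() sketch:
    def run_processes(self):
        upper = self.__determine_upper_indices()
        lower = self.__determine_lower_indices(upper)
        p, q = self.__initiate_proc(self.func, upper, lower)
        self.__start_thread(p)
        # Collect one result per worker *before* joining, so no worker
        # is left blocked trying to flush its data to the pipe.
        results = []
        for _ in range(self.num_procs):
            results.extend(q.get())
        self.__join_threads(p)
        return self.__reorder_data(results)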
Maybe you should look into multiprocessing.Pool, as what you're trying to implement is some kind of map() operation. Your example could be rewritten more elegantly as something like this:
from multiprocessing import Pool
import string
import random


def func(name):
    return 'Test-' + name


def id_generator(size=6, chars=string.ascii_uppercase + string.digits):
    return ''.join(random.choice(chars) for _ in range(size))


if __name__ == "__main__":
    random.seed(1234)
    data_size = 5000
    num_proc = 2
    test_list = [id_generator() for i in range(data_size)]
    with Pool(num_proc) as pool:
        result = pool.map(func, test_list)
    print(result)

Related

python multiprocessing write data to the same list

I want to write data to the same list via Python multiprocessing, sharing data between processes with mp.Manager().list(). The code is shown below; it is just a demo in which I want to add equal numbers to the same sub-list. However, counter gets increased, but grp remains the same. Where is the problem?
import multiprocessing as mp
import random
import time
import numpy as np


class A:
    def __init__(self):
        self.raw = [random.randint(1, 4) for _ in range(100)]
        self.manager = mp.Manager()
        self.grp = self.manager.list([[1], [2], [3], [4]])
        self.use_cpu_num = 2
        self.counter = self.manager.Value('i', 0)

    def run(self):
        subsets = np.array_split(self.raw, self.use_cpu_num)
        subsets = [each.tolist() for each in subsets]
        process = []
        for i in range(self.use_cpu_num):
            process.append(mp.Process(target=self.process, args=(subsets[i], )))
        for each in process:
            each.start()
        for each in process:
            each.join()
            each.close()
        print(self.grp)

    def process(self, subset):
        for each in subset:
            for i in range(len(self.grp)):
                each_grp = self.grp[i]
                if each in each_grp:
                    self.counter.set(self.counter.value + 1)
                    self.grp[i].append(each)
        print(self.counter.value)


if __name__ == '__main__':
    a = A()
    a.run()
I tried using mp.Lock(), but that doesn't share data between different processes.
Put it this way: self.grp is a managed object, so any change made to it through self.grp.append(x) or self.grp[i] = x is transferred to the manager process.
The objects inside self.grp are not managed; any change to them will not be transferred to the manager, and you only get a copy of them when you use self.grp[i].
In order for modifications to the lists inside self.grp to propagate, those lists must themselves be manager.list objects, and nesting managed objects is not supported for versions of Python below 3.6:
self.grp = self.manager.list([self.manager.list(x) for x in ([1], [2], [3], [4])])
If you are only storing numbers, you can use a multiprocessing.Array, which can be wrapped as a numpy ndarray for convenience, but you cannot append to it and must know its size beforehand.
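A minimal sketch of that approach, assuming a fixed array of 100 doubles (the names shared and view are just for illustration):
import multiprocessing as mp
import numpy as np

shared = mp.Array('d', 100)             # synchronized, zero-initialized shared memory
view = np.frombuffer(shared.get_obj())  # numpy view over the same buffer
with shared.get_lock():                 # guard concurrent in-place writes
    view[:10] = np.arange(10)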
Edit: on Windows you will get an error when trying to pickle the self.manager object, so I moved it out of the class in the example below.
import multiprocessing as mp
import random
import time
import numpy as np


class A:
    def __init__(self):
        self.raw = [random.randint(1, 4) for _ in range(100)]
        self.grp = manager.list([manager.list(x) for x in ([1], [2], [3], [4])])
        self.use_cpu_num = 2
        self.counter = manager.Value('i', 0)

    def run(self):
        subsets = np.array_split(self.raw, self.use_cpu_num)
        subsets = [each.tolist() for each in subsets]
        process = []
        for i in range(self.use_cpu_num):
            process.append(mp.Process(target=self.process, args=(subsets[i], )))
        for each in process:
            each.start()
        for each in process:
            each.join()
            each.close()
        print([list(x) for x in self.grp])

    def process(self, subset):
        for each in subset:
            for i in range(len(self.grp)):
                each_grp = self.grp[i]
                if each in each_grp:
                    self.counter.set(self.counter.value + 1)
                    self.grp[i].append(each)
        print(self.counter.value)


if __name__ == '__main__':
    manager = mp.Manager()
    a = A()
    a.run()

Multiprocessing event-queue not updating

So I'm writing a program with an event system.
I have a list of events to be handled, and one process is supposed to push new events onto that to-handle list.
This part seems to work: when I print the to-handle list after pushing an event, it gets longer and longer. But when I print the list inside the handle_event method, it is empty all the time.
Here is my event_handler code:
from collections import deque
from multiprocessing import Lock
# (Event, Event_Prio and the controll handler are defined elsewhere in the program)


class Event_Handler:
    def __init__(self):
        self._to_handle_list = [deque() for _ in range(Event_Prio.get_num_prios())]
        self._controll_handler = None
        self._process_lock = Lock()

    def init(self, controll_EV_handler):
        self._controll_handler = controll_EV_handler

    def new_event(self, event):  # adds a new event to the list
        with self._process_lock:
            self._to_handle_list[event.get_Prio()].append(event)  # this list grows

    def handle_event(self):  # deals with the to_handle_list
        self._process_lock.acquire()
        for i in range(Event_Prio.get_num_prios()):  # here I only ever see a list of empty deques
            print(self._to_handle_list)
            if self._to_handle_list[i]:  # checks if the to-do deque is non-empty; it never is here
                self._process_lock.release()
                self._controll_handler.controll_event(self._to_handle_list[i].popleft())
                return
        self._process_lock.release()

    def create_Event(self, prio, type):
        return Event(prio, type)
I tried everything. I checked that the event-handler id is the same for both processes (and the lock works).
I even checked that the to-handle-list id is the same in both methods; it is.
Still, the list grows in one process while it stays empty in the other.
Can someone please tell me why that list is empty?
Edit: It works just fine if I push an event through the system with only one process, so it has to be something to do with multiprocessing.
Edit: Because someone asked, here is a simple use case (essentials only):
class EV_Main():
    def __init__(self):
        self.e_h = Event_Handler()
        self.e_controll = None  # the controller doesn't even matter, because the
                                # controll function never gets called; the list is always empty

    def run(self):
        self.e_h.init(self.e_controll)
        process1 = Process(target=self.create_events)
        process2 = Process(target=self.handle_events)
        process1.start()
        process2.start()

    def create_events(self):
        while True:
            self.e_h.new_event(self.e_h.create_Event(0, 3))  # eEvent_Type.S_TOUCH_EVENT
            time.sleep(0.3)

    def handle_events(self):
        while True:
            self.e_h.handle_event()
            time.sleep(0.1)
Each Process gets its own copy of the Event_Handler instance (and of the deques inside it), so events appended in one process are never visible to the other; that is why the list printed in handle_event stays empty. To have a shareable set of deque instances, you could create a special class DequeArray that holds an internal list of deque instances and exposes whatever methods you might need, and then turn it into a shareable, managed object. When the manager creates an instance of this class, what is returned is a proxy to the actual instance, which resides in the manager's address space. Any method calls you make on this proxy are shipped off to the manager's process using pickle, and any results are returned the same way. Since the individual deque instances are not shareable, managed objects, do not add a method that returns one of these deques: modifying such a returned deque would leave the version in the manager's address space unmodified.
Individual operations on a deque are serialized. But if you are doing an operation that consists of multiple method calls on the deque and you require atomicity, that sequence is a critical section that needs to run under control of a lock, as in the left_rotate function below.
from multiprocessing import Process, Lock
from multiprocessing.managers import BaseManager
from collections import deque


# Add methods to this as required:
class DequeArray:
    def __init__(self, array_size):
        self._deques = [deque() for _ in range(array_size)]

    def __repr__(self):
        l = []
        l.append('DequeArray [')
        for d in self._deques:
            l.append(' ' + str(d))
        l.append(']')
        return '\n'.join(l)

    def __len__(self):
        """
        Return our length (i.e. the number of deque
        instances we have).
        """
        return len(self._deques)

    def append(self, i, value):
        """
        Append value to the ith deque.
        """
        self._deques[i].append(value)

    def popleft(self, i):
        """
        Execute a popleft operation on the ith deque
        and return the result.
        """
        return self._deques[i].popleft()

    def length(self, i):
        """
        Return the length of the ith deque.
        """
        return len(self._deques[i])


class DequeArrayManager(BaseManager):
    pass

DequeArrayManager.register('DequeArray', DequeArray)


# Demonstrate how to use a sharable DequeArray
def left_rotate(deque_array, lock, i):
    # Rotate the first element to be the last element.
    # This is not an atomic operation, so do it under control of a lock:
    with lock:
        deque_array.append(i, deque_array.popleft(i))


# Required for Windows:
if __name__ == '__main__':
    # This starts the manager process:
    with DequeArrayManager() as manager:
        # Two deques:
        deque_array = manager.DequeArray(2)
        # Initialize with some values:
        deque_array.append(0, 0)
        deque_array.append(0, 1)
        deque_array.append(0, 2)
        # Same values in the second deque:
        deque_array.append(1, 0)
        deque_array.append(1, 1)
        deque_array.append(1, 2)
        print(deque_array)
        # Both processes will be modifying the same deque in a
        # non-atomic way, so we definitely need to be doing this under
        # control of a lock. We don't care which process acquires the
        # lock first because the result will be the same regardless.
        lock = Lock()
        p1 = Process(target=left_rotate, args=(deque_array, lock, 0))
        p2 = Process(target=left_rotate, args=(deque_array, lock, 0))
        p1.start()
        p2.start()
        p1.join()
        p2.join()
        print(deque_array)
Prints:
DequeArray [
 deque([0, 1, 2])
 deque([0, 1, 2])
]
DequeArray [
 deque([2, 0, 1])
 deque([0, 1, 2])
]

Multiprocessing Pool creating and killing processes indefinitely

EDIT
The short code below triggers the same issue.
# top_level.py
import to_import

if __name__ == '__main__':
    # This does not work
    t = to_import.Test()
    from pprint import pprint
    pprint(t.test())

# to_import.py
import multiprocessing as mp


def test_func(a, b):
    return a * b


class Test:
    def __init__(self):
        self.pairs = list()
        for i in range(10):
            for j in range(10):
                self.pairs.append((i, j))

    def test(self):
        pairs = tuple(self.pairs)
        with mp.Pool() as pool:
            results = pool.starmap(test_func, pairs)
        return results


if __name__ == '__main__':
    # This works fine
    t = Test()
    from pprint import pprint
    pprint(t.test())
END EDIT
DOUBLE EDIT
Interestingly, this code works correctly when run from my command prompt, as opposed to running it from within Spyder as I had been doing previously.
EDIT END
I have a class Tin which stores a 3d surface as a series of points and triangles and can generate a regular grid of points on that surface. Creating these points works fine when the multiprocessing flag is False.
However, for very dense grids on large surfaces this process can be quite slow, so I implemented multiprocessing to speed it up.
# tin.py
from time import time
import multiprocessing as mp


def _points_from_face(points, grid_size):
    """Create 3d points within a triangle on the grid; uses other functions within this module."""


def _multiprocess_function(function, vals_gen, pool_size):
    with mp.Pool(processes=pool_size) as pool:
        results = pool.starmap(func=function,
                               iterable=vals_gen)
    return results


class Tin:
    def __init__(self, name, surface_dict):
        self.name = name
        self.points = surface_dict['Points']
        self.faces = dict(enumerate(surface_dict['Faces']))

    def generate_regular_grid(self, grid_size,
                              multiprocess=False,
                              pool_size=(mp.cpu_count() // 2)):
        return_grid = dict()
        if pool_size < 1:
            multiprocess = False
        if multiprocess:
            faces_tuple = tuple(self.faces.values())
            vals_tuple = tuple((tuple(self.points[pid] for pid in face), grid_size)
                               for face in faces_tuple)
            results = _multiprocess_function(_points_from_face,
                                             vals_tuple,
                                             pool_size)
            for result in results:
                return_grid.update(result)
        else:
            for face in self.faces.values():
                points = tuple(self.points[pid] for pid in face)
                return_grid.update(_points_from_face(points, grid_size))
        return return_grid
When the Tin class and associated functions are in the same Python file as the code calling them, the script works fine: the processes spin up, do their thing, and then close. But when I import tin.py into another script and try to use multiprocessing, the program gets stuck creating and killing processes over and over without returning anything. For example:
# landxml.py
from time import time
from tin import Tin


def parse_landxml(xml_path: str, print_times=False) -> Tin:
    """Read the xml file and return the Tin contained within."""


if __name__ == '__main__':
    st = time()
    surface = parse_landxml('some_tin.xml',
                            print_times=True)
    grid = surface.generate_regular_grid(grid_size=2,
                                         print_times=True,
                                         multiprocess=True)
Do I need to keep everything in one long script, or is there a way I can still use multiprocessing inside an imported script?
In addition, landxml.py will itself be imported into another file; is this likely to cause the same problem again?

How to slow down asynchronous API calls to match API limits?

I have a list of ~300K URLs for an API I need to get data from.
The API limit is 100 calls per second.
I have made a class for the asynchronous calls, but it is working too fast and I am hitting an error from the API.
How do I slow down the asynchronous calls so that I make at most 100 calls per second?
import grequests

lst = ['url.com', 'url2.com']


class Test:
    def __init__(self):
        self.urls = lst

    def exception(self, request, exception):
        print("Problem: {}: {}".format(request.url, exception))

    def async(self):
        return grequests.map((grequests.get(u) for u in self.urls),
                             exception_handler=self.exception, size=5)

    def collate_responses(self, results):
        return [x.text for x in results]


test = Test()
# here we collect the results returned by the async function
results = test.async()
response_text = test.collate_responses(results)
The first step I took was to create an object that can distribute a maximum of n coins every t ms.
import time


class CoinsDistribution:
    """Object that distributes a maximum of maxCoins every timeLimit ms."""
    def __init__(self, maxCoins, timeLimit):
        self.maxCoins = maxCoins
        self.timeLimit = timeLimit
        self.coin = maxCoins
        self.time = time.perf_counter()

    def getCoin(self):
        if self.coin <= 0 and not self.restock():
            return False
        self.coin -= 1
        return True

    def restock(self):
        t = time.perf_counter()
        if (t - self.time) * 1000 < self.timeLimit:
            return False
        self.coin = self.maxCoins
        self.time = t
        return True
Now we need a way of forcing functions to only be called if they can get a coin.
To do that, we can write a decorator that we could use like this:
@limitCalls(callLimit=1, timeLimit=1000)
def uniqFunctionRequestingServer1():
    return 'response from s1'
But sometimes multiple functions send requests to the same server, so we would want them to get coins from the same CoinsDistribution object.
Therefore, another use of the decorator is to supply the CoinsDistribution object:
server_2_limit = CoinsDistribution(3, 1000)

@limitCalls(server_2_limit)
def sendRequestToServer2():
    return 'it worked !!'

@limitCalls(server_2_limit)
def sendAnOtherRequestToServer2():
    return 'it worked too !!'
We now have to create the decorator; it can take either a CoinsDistribution object or enough data to create a new one.
import functools


def limitCalls(obj=None, *, callLimit=100, timeLimit=1000):
    if obj is None:
        obj = CoinsDistribution(callLimit, timeLimit)
    def limit_decorator(func):
        @functools.wraps(func)
        def limit_wrapper(*args, **kwargs):
            if obj.getCoin():
                return func(*args, **kwargs)
            return 'limit reached, please wait'
        return limit_wrapper
    return limit_decorator
And it's done! Now you can limit the number of calls to any API that you use, and you can build a dictionary to keep track of your CoinsDistribution objects if you have to manage a lot of them (for different API endpoints or different APIs).
Note: here I have chosen to return an error message if no coins are available. You should adapt this behaviour to your needs.
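For example, here is a sketch of an alternative wrapper that blocks until a coin becomes available instead of returning an error message (it reuses the CoinsDistribution class above; the polling interval is just a rough guess, not something tuned):
import functools
import time

def limitCallsBlocking(obj=None, *, callLimit=100, timeLimit=1000):
    if obj is None:
        obj = CoinsDistribution(callLimit, timeLimit)
    def limit_decorator(func):
        @functools.wraps(func)
        def limit_wrapper(*args, **kwargs):
            # Poll until a coin becomes available, then call through.
            while not obj.getCoin():
                time.sleep(obj.timeLimit / 1000.0 / obj.maxCoins)
            return func(*args, **kwargs)
        return limit_wrapper
    return limit_decorator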
You can just keep track of how much time has passed and decide whether you want to make more requests or not.
This will print 100 numbers per second, for example:
from datetime import datetime
import time

start = datetime.now()
time.sleep(1)
counter = 0
while True:
    end = datetime.now()
    s = (end - start).seconds
    if counter >= 100:
        if s <= 1:
            time.sleep(1)  # You can keep track of the time and sleep less, actually
        start = datetime.now()
        counter = 0
    print(counter)
    counter += 1
This other question on SO shows exactly how to do this. By the way, what you need is usually called throttling.

Mongoengine Document resets value of attribute after multiprocessing map

I have a large list of ~100,000 elements and need to map it as follows:
def mark_diff(args):
    item = args[0]
    pi = args[1]
    item.marked_diff = (item.p/pi[0] + item.c/pi[1] + item.f/pi[2] - 3)**2
    return item


def mark(f_set, goal):
    with Pool(3) as p:
        data = p.map(mark_diff, zip(f_set, itertools.repeat(goal)))
    return data
The default value of item.marked_diff is 0, and item is a mongoengine document.
I am resorting to multiprocessing because mark_diff is substantially more complicated than shown here and involves a lot of exponents and logarithms, for which I am using numpy.
Now for the problem: the returned data still has item.marked_diff as 0, while if I add a print statement at the bottom of mark_diff, correct non-zero values are being assigned.
Definition of item:
import random, mongoengine


class F(mongoengine.Document):
    p = mongoengine.FloatField()
    c = mongoengine.FloatField()
    f = mongoengine.FloatField()
    marked_diff = 0

f_sets = F.objects.all()
goal = [0.2, 0.35, 0.45]
So something is going on in what you didn't show. When I flesh this out into a complete, executable program, it appears to work fine. Here's the output from one run under Python 3.6.1:
0.7024116548559156
13.468354599594324
6.036133666404753
0.16520292241977205
0.17073749475275496
1.903674418518389
0.2432159511273063
7.743326563037492
4.1990243814914425
19.36243187965931
And here's the full program:
from multiprocessing import Pool
import random
import itertools


class F:
    def __init__(self):
        self.p = random.random()
        self.c = random.random()
        self.f = random.random()


def mark_diff(args):
    item = args[0]
    pi = args[1]
    item.marked_diff = (item.p/pi[0] + item.c/pi[1] + item.f/pi[2] - 3)**2
    return item


def mark(f_set, goal):
    with Pool(3) as p:
        data = p.map(mark_diff, zip(f_set, itertools.repeat(goal)))
    return data


if __name__ == "__main__":
    f_set = [F() for _ in range(10)]
    goal = [0.2, 0.35, 0.45]
    xs = mark(f_set, goal)
    for x in xs:
        print(x.marked_diff)
Is it possible that you're looking at marked_diff in the original f_set instead of in the items returned by mark()?
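A quick way to check, as a sketch reusing the F class and mark() from the program above: pool.map pickles each item on the way to the workers and pickles the mutated copy on the way back, so the objects in the original f_set are never touched.
if __name__ == "__main__":
    f_set = [F() for _ in range(3)]
    goal = [0.2, 0.35, 0.45]
    returned = mark(f_set, goal)
    print([getattr(x, "marked_diff", 0) for x in f_set])  # originals: unchanged
    print([x.marked_diff for x in returned])              # returned copies: non-zero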
