I have a large list of ~100,000 elements and need to map over it as follows:
def mark_diff(args):
    item = args[0]
    pi = args[1]
    item.marked_diff = (item.p/pi[0] + item.c/pi[1] + item.f/pi[2] - 3)**2
    return item

def mark(f_set, goal):
    with Pool(3) as p:
        data = p.map(mark_diff, zip(f_set, itertools.repeat(goal)))
    return data
The default value of item.marked_diff is 0, and item is a mongoengine document.
I am resorting to multiprocessing because mark_diff is substantially more complicated than shown here and involves a lot of exponents and logarithms, for which I am using numpy.
Now for the problem: the returned data still has item.marked_diff as 0. Yet if I add a print statement at the bottom of mark_diff, correct non-zero values are being assigned.
Definition of item:
import random, mongoengine

class F(mongoengine.Document):
    p = mongoengine.FloatField()
    c = mongoengine.FloatField()
    f = mongoengine.FloatField()
    marked_diff = 0

f_sets = F.objects.all()
goal = [0.2, 0.35, 0.45]
So something is going on in what you didn't show. When I flesh this out into a complete, executable program, it appears to work fine. Here's the output from one run under Python 3.6.1:
0.7024116548559156
13.468354599594324
6.036133666404753
0.16520292241977205
0.17073749475275496
1.903674418518389
0.2432159511273063
7.743326563037492
4.1990243814914425
19.36243187965931
And here's the full program:
from multiprocessing import Pool
import random
import itertools

class F:
    def __init__(self):
        self.p = random.random()
        self.c = random.random()
        self.f = random.random()

def mark_diff(args):
    item = args[0]
    pi = args[1]
    item.marked_diff = (item.p/pi[0] + item.c/pi[1] + item.f/pi[2] - 3)**2
    return item

def mark(f_set, goal):
    with Pool(3) as p:
        data = p.map(mark_diff, zip(f_set, itertools.repeat(goal)))
    return data

if __name__ == "__main__":
    f_set = [F() for _ in range(10)]
    goal = [0.2, 0.35, 0.45]
    xs = mark(f_set, goal)
    for x in xs:
        print(x.marked_diff)
Is it possible that you're looking at marked_diff in the original f_set instead of in the items returned by mark()?
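To make the pickling round trip visible, here is a minimal standalone sketch (a plain class stands in for the mongoengine document; names mirror the program above) contrasting the caller's originals with the copies the workers return:

from multiprocessing import Pool
import itertools
import random

class F:
    def __init__(self):
        self.p = random.random()
        self.c = random.random()
        self.f = random.random()
        self.marked_diff = 0   # same default the document class declares

def mark_diff(args):
    item, pi = args
    item.marked_diff = (item.p/pi[0] + item.c/pi[1] + item.f/pi[2] - 3)**2
    return item

def mark(f_set, goal):
    with Pool(3) as p:
        return p.map(mark_diff, zip(f_set, itertools.repeat(goal)))

if __name__ == "__main__":
    f_set = [F() for _ in range(3)]
    xs = mark(f_set, [0.2, 0.35, 0.45])
    print(f_set[0].marked_diff)   # still 0: the parent's objects were never touched
    print(xs[0].marked_diff)      # non-zero: this is the copy the worker mutated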
I want to write data to the same list via Python multiprocessing, using mp.Manager().list() for interprocess data sharing. The code is shown below; it is just a demo in which I want to append matching numbers to the corresponding sub-list. However, counter is incremented, but grp stays the same. Where is the problem?
import multiprocessing as mp
import random
import time
import numpy as np

class A:
    def __init__(self):
        self.raw = [random.randint(1, 4) for _ in range(100)]
        self.manager = mp.Manager()
        self.grp = self.manager.list([[1], [2], [3], [4]])
        self.use_cpu_num = 2
        self.counter = self.manager.Value('i', 0)

    def run(self):
        subsets = np.array_split(self.raw, self.use_cpu_num)
        subsets = [each.tolist() for each in subsets]
        process = []
        for i in range(self.use_cpu_num):
            process.append(mp.Process(target=self.process, args=(subsets[i], )))
        for each in process:
            each.start()
        for each in process:
            each.join()
            each.close()
        print(self.grp)

    def process(self, subset):
        for each in subset:
            for i in range(len(self.grp)):
                each_grp = self.grp[i]
                if each in each_grp:
                    self.counter.set(self.counter.value + 1)
                    self.grp[i].append(each)
        print(self.counter.value)

if __name__ == '__main__':
    a = A()
    a.run()
I tried using mp.Lock(), but that doesn't share data between different processes.
Put it this way: self.grp is a managed object, so any change made through it, such as self.grp.append(x) or self.grp[i] = x, is transferred to the manager process.
The objects inside self.grp are not managed, though; changes to them are not transferred to the manager, and you only get a copy of them when you use self.grp[i].
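A small standalone illustration of that copy semantics (not the question's code): mutating the list you read back only changes a local copy, while re-assigning the slot on the proxy does propagate:

import multiprocessing as mp

if __name__ == '__main__':
    manager = mp.Manager()
    grp = manager.list([[1], [2]])

    inner = grp[0]      # a plain local copy, not a proxy
    inner.append(99)
    print(grp[0])       # [1] -- the mutation never reached the manager

    inner = grp[0]
    inner.append(99)
    grp[0] = inner      # re-assigning the slot sends the new value back
    print(grp[0])       # [1, 99]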
In order to allow modifications to the lists inside self.grp to propagate, those lists must themselves be manager.list objects; note that nesting managed objects is not supported on versions of Python below 3.6:
self.grp = self.manager.list([self.manager.list(x) for x in ([1], [2], [3], [4])])
If you are only storing numbers, you can instead use a multiprocessing.Array, which can be wrapped as a NumPy ndarray for convenience, but you cannot append to it and must know the size beforehand.
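For the multiprocessing.Array route, a minimal sketch of the idea (the worker and slice bounds are hypothetical, and a fixed number of integer slots is assumed; np.frombuffer simply creates a NumPy view over the same shared buffer):

import multiprocessing as mp
import numpy as np

def worker(shared, start, stop):
    # re-wrap the shared buffer as an ndarray view inside the child process
    view = np.frombuffer(shared.get_obj(), dtype=np.intc)
    view[start:stop] += 1          # ranges don't overlap, so no lock needed here

if __name__ == '__main__':
    shared = mp.Array('i', 8)      # 8 C ints, zero-initialised, lock included
    procs = [mp.Process(target=worker, args=(shared, 0, 4)),
             mp.Process(target=worker, args=(shared, 4, 8))]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    print(np.frombuffer(shared.get_obj(), dtype=np.intc))   # [1 1 1 1 1 1 1 1]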
Edit: on Windows you will get an error when trying to pickle the self.manager object, so I moved it out of the class in the example below.
import multiprocessing as mp
import random
import time
import numpy as np

class A:
    def __init__(self):
        self.raw = [random.randint(1, 4) for _ in range(100)]
        self.grp = manager.list([manager.list(x) for x in ([1], [2], [3], [4])])
        self.use_cpu_num = 2
        self.counter = manager.Value('i', 0)

    def run(self):
        subsets = np.array_split(self.raw, self.use_cpu_num)
        subsets = [each.tolist() for each in subsets]
        process = []
        for i in range(self.use_cpu_num):
            process.append(mp.Process(target=self.process, args=(subsets[i], )))
        for each in process:
            each.start()
        for each in process:
            each.join()
            each.close()
        print([list(x) for x in self.grp])

    def process(self, subset):
        for each in subset:
            for i in range(len(self.grp)):
                each_grp = self.grp[i]
                if each in each_grp:
                    self.counter.set(self.counter.value + 1)
                    self.grp[i].append(each)
        print(self.counter.value)

if __name__ == '__main__':
    manager = mp.Manager()
    a = A()
    a.run()
In the example problem below, the main program creates a list of data_size random strings. Without multiprocessing, the data is sent directly to Test.iterate(), where the class merely adds the string Test- to the beginning of each random string. When run without multiprocessing, the code works very well for both small and large values of data_size.
I decided to add a multiprocessing capability to this test problem and broke the core components of multiprocessing down into a class titled MultiProc. The member function MultiProc.run_processes() manages all functions in the class. The function assumes that the input list will be divided into x smaller lists, depending on how many processes the user wishes to utilize. It therefore starts by determining the upper and lower indices of each sub-list relative to the initial list, so the code knows which portion each process should iterate over. The function then creates the processes, starts them, joins them, extracts the data from the Queue, and re-orders the returned data based on a counter that is passed to the primary function.

The MultiProc class works fairly well at small values of data_size, but above a value of ~500 the code never terminates, although I suspect the exact threshold will vary from computer to computer depending on memory. At some point the multiprocessing version stops working, and I suspect it has something to do with the way data is returned from the processes. Does anyone know what might be causing this problem and how to fix it?
from multiprocessing import Process, Queue
from itertools import chain
import string
import random

class Test:
    def __init__(self, array_list):
        self.array_list = array_list

    def func(self, names):
        return 'Test-' + names

    def iterate(self, upper, lower, counter):
        output = [self.func(self.array_list[i]) for i in range(lower, upper)]
        return output, counter

class MultiProc:
    def __init__(self, num_procs, data_array, func):
        self.num_procs = num_procs
        self.data_array = data_array
        self.func = func
        if self.num_procs > len(self.data_array):
            self.num_procs = len(self.data_array)
        self.length = int((len(self.data_array) / self.num_procs) // 1)

    def run_processes(self):
        upper = self.__determine_upper_indices()
        lower = self.__determine_lower_indices(upper)
        p, q = self.__initiate_proc(self.func, upper, lower)
        self.__start_thread(p)
        self.__join_threads(p)
        results = self.__extract_data(q)
        new = self.__reorder_data(results)
        return new

    def __determine_upper_indices(self):
        upper = [i * self.length for i in range(1, self.num_procs)]
        upper.append(len(self.data_array))
        return upper

    def __determine_lower_indices(self, upper):
        lower = [upper[i] for i in range(len(upper) - 1)]
        lower = [0] + lower
        return lower

    def __initiate_proc(self, func, upper, lower):
        q = Queue()
        p = [Process(target=self.run_and_send_back_output,
                     args=(q, func, upper[i], lower[i], i))
             for i in range(self.num_procs)]
        return p, q

    def __start_thread(self, p):
        [p[i].start() for i in range(self.num_procs)]

    def __join_threads(self, p):
        [p[i].join() for i in range(self.num_procs)]

    def __extract_data(self, q):
        results = []
        while not q.empty():
            results.extend(q.get())
        return results

    def __reorder_data(self, results):
        new = [results[i - 1] for j in range(self.num_procs)
               for i in range(len(results)) if results[i] == j]
        new = list(chain.from_iterable(new))
        return new

    def run_and_send_back_output(self, queue, func, *args):
        result = func(*args)  # run the func
        queue.put(result)     # send the result back

def id_generator(size=6, chars=string.ascii_uppercase + string.digits):
    return ''.join(random.choice(chars) for _ in range(size))

if __name__ == "__main__":
    random.seed(1234)
    data_size = 9
    num_proc = 2
    test_list = [id_generator() for i in range(data_size)]
    obj1 = Test(test_list)
    result1 = obj1.iterate(data_size, 0, 1)
    print(result1)
    multi = MultiProc(num_proc, test_list, obj1.iterate)
    result2 = multi.run_processes()
    print(result2)

# >> ['Test-2HAFCF', 'Test-GWPBBB', 'Test-W43JFL', 'Test-HA65PE',
#     'Test-83EF6C', 'Test-R9ET4W', 'Test-RPM37B', 'Test-6EAVJ4',
#     'Test-YKDE5K']
Your main problem is this:
self.__start_thread(p)
self.__join_threads(p)
results = self.__extract_data(q)
You start your workers, which try to put something in a queue, then join the workers, and only after that do you start retrieving data from the queue. The workers, however, can only exit after all their data has been flushed to the underlying pipe, and will block on exit otherwise. Joining processes blocked like this before retrieving any elements from the pipe can result in a deadlock.
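A minimal standalone sketch of the correct ordering (hypothetical worker function, not the question's class): drain the queue first, then join:

from multiprocessing import Process, Queue

def worker(q, chunk):
    q.put(['Test-' + s for s in chunk])

if __name__ == "__main__":
    q = Queue()
    chunks = [["AAAAAA", "BBBBBB"], ["CCCCCC", "DDDDDD"]]
    procs = [Process(target=worker, args=(q, c)) for c in chunks]
    for p in procs:
        p.start()
    # One blocking get() per worker *before* join(): this lets every child
    # flush its pipe and exit instead of deadlocking on large payloads.
    results = []
    for _ in procs:
        results.extend(q.get())   # arrival order is arbitrary
    for p in procs:
        p.join()
    print(results)

The same reordering (get before join) inside run_processes would let the original class cope with large data_size values.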
Maybe you should look into multiprocessing.Pool, as what you're trying to implement is essentially a map() operation. Your example could be rewritten more elegantly, something like this:
from multiprocessing import Pool
import string
import random

def func(name):
    return 'Test-' + name

def id_generator(size=6, chars=string.ascii_uppercase + string.digits):
    return ''.join(random.choice(chars) for _ in range(size))

if __name__ == "__main__":
    random.seed(1234)
    data_size = 5000
    num_proc = 2
    test_list = [id_generator() for i in range(data_size)]
    with Pool(num_proc) as pool:
        result = pool.map(func, test_list)
    print(result)
I have a script which has a function that is used in various classes and other functions throughout the script.
For example:
from scipy.stats import beta
import matplotlib.pyplot as plt

def function(i):
    x = beta.pdf(i, a=10, b=2, scale=100, loc=-50)
    return x

def plotme():
    graphme = []
    for i in range(500):
        graphme.append(function(i))
    plt.plot(graphme)

def average():
    averageme = []
    for i in range(500):
        averageme.append(function(i))
    average = sum(averageme)/float(len(averageme))
    return print(average)
Now if I import the module and call plotme() or average(), they use the values hard-coded in function(i). But is there a way for me to change the values of a, b, scale, and loc in function(i) when importing it? I know I could change each function to accept these as parameters, but I am hoping I can just adjust the initial function.
Ideally, I would like to be able to do something like this:
import mymodule
mymodule.function(i, a = 500, b = 200, scale = 50, loc = 0)
mymodule.plotme()
And plotme() would then be based on the new values, not on what is hard-coded in the script.
Without touching your module, you could monkey-patch it after importing it. One way, using your example:
from scipy.stats import beta
import mymodule

def function(i):
    x = beta.pdf(i, a=500, b=200, scale=50, loc=0)
    return x

mymodule.function = function
mymodule.plotme()
I agree with commenter jasonharper that mymodule's functions could have a better API.
Ideally, if you can change the existing implementation, it should have a class that makes it possible to modify these parameters:
from scipy.stats import beta
import matplotlib.pyplot as plt

class WhateverYouCallIt:
    def __init__(self, a=10, b=2, scale=100, loc=-50):
        self.a = a
        self.b = b
        self.scale = scale
        self.loc = loc

    def function(self, i):
        return beta.pdf(i, a=self.a, b=self.b, scale=self.scale, loc=self.loc)

    def plotme(self):
        graphme = []
        for i in range(500):
            graphme.append(self.function(i))
        plt.plot(graphme)

    def average(self):
        averageme = []
        for i in range(500):
            averageme.append(self.function(i))
        average = sum(averageme)/float(len(averageme))
        return print(average)
Then you can have several differently parameterized instances:
default_one = WhateverYouCallIt() # the default
default_one.plotme()
default_one.average()
a_different_one = WhateverYouCallIt(a=500, b=200, scale=50, loc=0)
a_different_one.plotme()
a_different_one.average()
You can use functools.partial:
from functools import partial
import mymodule

# whenever you need to, rebind the module-level function to a partial with the new defaults
mymodule.function = partial(mymodule.function, a=500, b=200, scale=50, loc=0)
...
mymodule.function(i)
...
I've been implementing a model with Spark via a Python class. I had some headaches calling class methods on an RDD defined in the class (see this question for details), but have finally made some progress. Here is an example of a class method I'm working with:
@staticmethod
def alpha_sampler(model):
    # all the variables in this block are numpy arrays or floats
    var_alpha = model.params.var_alpha
    var_rating = model.params.var_rating
    b = model.params.b
    beta = model.params.beta
    S = model.params.S
    Z = model.params.Z
    x_user_g0_inner_over_var = model.x_user_g0_inner_over_var

    def _alpha_sampler(row):
        feature_arr = row[2]
        var_alpha_given_rest = 1/((1/var_alpha) + feature_arr.shape[0]*(1/var_rating))
        i = row[0]
        items = row[1]
        O = row[3] - np.inner(feature_arr, b) - beta[items] - np.inner(S[i], Z[items])
        E_alpha_given_rest = var_alpha_given_rest * (x_user_g0_inner_over_var[i] + O.sum()/var_rating)
        return np.random.normal(E_alpha_given_rest, np.sqrt(var_alpha_given_rest))

    return _alpha_sampler
As you can see, to avoid serialization errors, I define a static method that returns a function that is in turn applied to each row of an RDD (model is the parent class here, and this is called from within another method of model):
# self.grp_user is the RDD
self.params.alpha = np.array(self.grp_user.map(model.alpha_sampler(self)).collect())
Now, this all works fine, but is not leveraging Spark's broadcast variables at all. Ideally, all the variables I'm passing in this function (var_alpha, beta, S, etc.) could first be broadcast to the workers, so that I wasn't redundantly passing them as part of the map. But I'm not sure how to do this.
My question, then, is the following: How/where should I make these into broadcast variables such that they are available to the alpha_sampler function that I map to grp_user? One thing I believe will work would be to make them globals, e.g.
global var_alpha
var_alpha = sc.broadcast(model.params.var_alpha)
# and similarly for the other variables...
Then the alpha_sampler could be much simplified:
@staticmethod
def _alpha_sampler(row):
    feature_arr = row[2]
    var_alpha_given_rest = 1/((1/var_alpha.value) + feature_arr.shape[0]*(1/var_rating.value))
    i = row[0]
    items = row[1]
    O = row[3] - np.inner(feature_arr, b.value) - beta.value[items] - np.inner(S.value[i], Z.value[items])
    E_alpha_given_rest = var_alpha_given_rest * (x_user_g0_inner_over_var.value[i] + O.sum()/var_rating.value)
    return np.random.normal(E_alpha_given_rest, np.sqrt(var_alpha_given_rest))
But of course this is really dangerous use of globals that I would like to avoid. Is there a better way that lets me leverage broadcast variables?
Assuming that the variables you use here are simply scalars, there is probably nothing to gain from a performance perspective, and using broadcast variables will make your code less readable, but you can either pass a broadcast variable as an argument to the static method:
class model(object):
    @staticmethod
    def foobar(a_model, mu):
        y = a_model.y
        def _foobar(x):
            return x - mu.value + y
        return _foobar

    def __init__(self, sc):
        self.sc = sc
        self.y = -1
        self.rdd = self.sc.parallelize([1, 2, 3])

    def get_mean(self):
        return self.rdd.mean()

    def run_foobar(self):
        mu = self.sc.broadcast(self.get_mean())
        self.data = self.rdd.map(model.foobar(self, mu))
or initialize it there:
class model(object):
    @staticmethod
    def foobar(a_model):
        mu = a_model.sc.broadcast(a_model.get_mean())
        y = a_model.y
        def _foobar(x):
            return x - mu.value + y
        return _foobar

    def __init__(self, sc):
        self.sc = sc
        self.y = -1
        self.rdd = self.sc.parallelize([1, 2, 3])

    def get_mean(self):
        return self.rdd.mean()

    def run_foobar(self):
        self.data = self.rdd.map(model.foobar(self))
I have a problem with a sequence generator. I have a file where each line contains one fragment (8 letters). I load it from the file into a list, where each element is one fragment. It is DNA, so it should work this way:
1. Take the first 8-letter element.
2. Find an element whose first 7 letters are the same as the last 7 letters of the first.
3. Add the 8th letter of that second element to the sequence.
It should look like this:
ATTGCCAT
TTGCCATA
TGCAATAC
So sequence: ATTGCCATAC
Unfortunately it only adds one element. :( The first element is given (we know it); I simply make it the first one in the file (the first line).
Here is the code:
from os import sys
import random

def frag_get(seqfile):
    frags = []
    f_in = open(seqfile, "r")
    for i in f_in.readlines():
        frags.append(i.strip())
    f_in.close()
    return frags

def frag_list_shuffle(frags):
    random.shuffle(frags)
    return frags

def seq_build(first, frags):
    seq = first
    for f in frags:
        if seq[-7:] == f[:7]:
            seq += f[-1:]
    return seq

def errors():
    pass

if __name__ == "__main__":
    frags = frag_get(sys.argv[1])
    first = frags[0]
    frags.remove(first)
    frags = frag_list_shuffle(frags)
    seq = seq_build(first, frags)
    check(sys.argv[2], seq)
    spectrum(sys.argv[2], sys.argv[3])
I have deleted the check and spectrum functions because they are simple calculations (e.g. length comparisons), so I don't think they are the cause of the problem.
I will be very thankful for any help!
Regards,
Mateusz
Because your fragments are shuffled, your algorithm needs to take that into account; currently, you're just looping through the fragments once, which is unlikely to include more than a few fragments if they're not in the right order. For example, say you have 5 fragments, which I'm going to refer to by their order in your sequence. Now the fragments are slightly out of order:
1 - 3 - 2 - 4 - 5
Your algorithm will start with 1, skip 3, then match on 2, adding a base at the end. Then it'll check against 4 and 5, fail to match either, and finish, never coming back to fragment 3.
You could easily fix this by restarting your loop each time you add a base; however, this will scale very badly for a large number of bases. Instead, I'd recommend loading your fragments into a trie, then searching the trie for the next fragment each time you add a base, until you've added one base for each fragment or you can no longer find a matching fragment.
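A trie is one option; for fixed 8-letter fragments, a plain dict keyed on the 7-letter prefix gives the same constant-time lookup. A minimal standalone sketch of that idea (a hypothetical rewrite of seq_build, using the fragments from the example above):

def seq_build(first, frags):
    # index every fragment by its 7-letter prefix so the next fragment can be
    # found in O(1) no matter how the input list is ordered
    by_prefix = {}
    for f in frags:
        by_prefix.setdefault(f[:7], []).append(f)

    seq = first
    while True:
        candidates = by_prefix.get(seq[-7:])
        if not candidates:
            break
        nxt = candidates.pop()   # take any fragment whose prefix matches the current overlap
        seq += nxt[-1]
    return seq

print(seq_build("ATTGCCAT", ["TGCCATAC", "TTGCCATA"]))  # ATTGCCATAC

Each fragment is consumed at most once, so the loop terminates even when the input contains repeats.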
works for me:
>>> seq = "ATTGCCAT"
>>> frags = ["TTGCCATA", "TGCCATAC"]
>>> for f in frags:
...     if seq[-7:] == f[:7]:
...         seq += f[-1:]
...
>>> seq
'ATTGCCATAC'
You have a spelling error in your example: TGCAATAC should be TGCCATAC. But after fixing that, it works.
For fun and interest, I've rewritten the problem using OO. See what you think:
import collections
import sys
import random

usage = """
Usage:
    sequence fname expected
Where
    fname: name of file containing fragments
    expected: result-string which should be obtained by chaining from first fragment.
"""

class Frag(str):
    MATCHLEN = 7

    def __new__(cls, s=''):
        return str.__new__(cls, s.strip())

    def head(self):
        return Frag(self[:Frag.MATCHLEN])

    def tail(self):
        return Frag(self[Frag.MATCHLEN:])

    def nexthead(self):
        return Frag(self[-Frag.MATCHLEN:])

    def check(self, s):
        return self.__eq__(s)

    def __add__(self, s):
        return Frag(str(self).__add__(s))

class Fraglist(list):
    @classmethod
    def fromFile(cls, fname):
        with open(fname, "r") as inf:
            lst = [Frag(ln) for ln in inf]
        return cls(lst)

    def shuffle(self):
        random.shuffle(self)

class Sequencer(object):
    def __init__(self, seq=None):
        super(Sequencer, self).__init__()
        self.sequences = collections.defaultdict(list)
        if seq is not None:
            for frag in seq:
                self.sequences[frag.head()].append(frag.tail())

    def build(self, frag):
        res = [frag]
        match = frag.nexthead()
        while match in self.sequences:
            next = random.choice(self.sequences[match])
            res.append(next)
            match = (match + next).nexthead()
        return Frag(''.join(res))

def main():
    if len(sys.argv) != 3:
        print usage
        sys.exit(-1)
    else:
        fname = sys.argv[1]
        expected = sys.argv[2]

    frags = Fraglist.fromFile(fname)
    frag1 = frags.pop(0)
    frags.shuffle()
    seq = Sequencer(frags)
    result = seq.build(frag1)

    if result.check(expected):
        print "Match!"
    else:
        print "No match"

if __name__=="__main__":
    main()