I have a large list containing binary encoded strings that I used to process in a single function before, like so:
""" just included this to demonstrate the 'data' structure """
data=np.zeros(250,dtype='float32, (250000,2)float32')
def numpy_array(data, peaks):
    rt_counter=0
    for x in peaks:
        if rt_counter %(len(peaks)/20) == 0:
            update_progress()
        peak_counter=0
        data_buff=base64.b64decode(x)
        buff_size=len(data_buff)/4
        unpack_format=">%dL" % buff_size
        index=0
        for y in struct.unpack(unpack_format,data_buff):
            buff1=struct.pack("I",y)
            buff2=struct.unpack("f",buff1)[0]
            if (index % 2 == 0):
                data[rt_counter][1][peak_counter][0]=float(buff2)
            else:
                data[rt_counter][1][peak_counter][1]=float(buff2)
                peak_counter+=1
            index+=1
        rt_counter+=1
I have been reading up on multiprocessing and figured I would try it to see whether I could get a big increase in performance, so I rewrote my function into two (a helper and a 'caller'), like so:
def numpy_array(data, peaks):
    processors=mp.cpu_count #Might as well throw this directly in the mp.Pool (just for clarity for now)
    pool = mp.Pool(processes=processors)
    chunk_size=len(peaks)/processors
    for i in range(processors):
        counter = i*chunk_size
        chunk=peaks[i*chunk_size:(i+1)*chunk_size-1]
        pool.map(decode(data,chunk,counter))
def decode(data,chunk,counter):
    for x in chunk:
        peak_counter=0
        data_buff=base64.b64decode(x)
        buff_size=len(data_buff)/4
        unpack_format=">%dL" % buff_size
        index=0
        for y in struct.unpack(unpack_format,data_buff):
            buff1=struct.pack("I",y)
            buff2=struct.unpack("f",buff1)[0]
            if (index % 2 == 0):
                data[counter][1][peak_counter][0]=float(buff2)
            else:
                data[counter][1][peak_counter][1]=float(buff2)
                peak_counter+=1
            index+=1
        print data[counter][1][10][0]
        counter+=1
The program runs but only uses 100-110% of CPU (according to top), and once it should be finished it throws TypeError: map() takes at least 3 arguments (2 given). Could anyone with more multiprocessing experience give me a hint as to what to look out for (what could cause the TypeError), and what might be causing my low CPU usage?
-- Code after incorporating answers --
def decode((data,chunk,counter)):
    print len(chunk), counter
    for x in chunk:
        peak_counter=0
        data_buff=base64.b64decode(x)
        buff_size=len(data_buff)/4
        unpack_format=">%dL" % buff_size
        index=0
        for y in struct.unpack(unpack_format,data_buff):
            buff1=struct.pack("I",y)
            buff2=struct.unpack("f",buff1)[0]
            if (index % 2 == 0):
                data[counter][1][peak_counter][0]=float(buff2)
            else:
                data[counter][1][peak_counter][1]=float(buff2)
                peak_counter+=1
            index+=1
        counter+=1
def numpy_array(data, peaks):
    """Fills the NumPy array 'data' with m/z-intensity values acquired
    from b64 decoding and unpacking the binary string read from the
    mzXML file, which is stored in the list 'peaks'.
    The m/z values are assumed to be ordered without validating this
    assumption.
    Note: This function uses multi-processing
    """
    processors=mp.cpu_count()
    pool = mp.Pool(processes=processors)
    chunk_size=int(len(peaks)/processors)
    map_parameters=[]
    for i in range(processors):
        counter = i*chunk_size
        chunk=peaks[i*chunk_size:(i+1)*chunk_size-1]
        map_parameters.append((data,chunk,counter))
    pool.map(decode,map_parameters)
This latest version 'works' in the sense that the array is filled inside the worker processes (it contains values there), but once all processes are done, accessing the array yields only zeros, because each process gets its own local copy of the array.
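One common way around this (a sketch of the general pattern, not the code from the post) is to have each worker return the values it decoded instead of writing into its own copy of data, and to let the parent process write the results into the array once pool.map returns. decode_chunk below is a hypothetical worker built from the same unpacking logic:

import base64
import struct
import multiprocessing as mp

def decode_chunk(args):
    chunk, offset = args
    decoded = []
    for x in chunk:
        data_buff = base64.b64decode(x)
        unpack_format = ">%dL" % (len(data_buff) // 4)
        values = [struct.unpack("f", struct.pack("I", y))[0]
                  for y in struct.unpack(unpack_format, data_buff)]
        decoded.append(list(zip(values[0::2], values[1::2])))  # (m/z, intensity) pairs
    return offset, decoded

def numpy_array(data, peaks):
    processors = mp.cpu_count()
    chunk_size = max(1, len(peaks) // processors)
    jobs = [(peaks[i:i + chunk_size], i) for i in range(0, len(peaks), chunk_size)]
    with mp.Pool(processes=processors) as pool:
        for offset, rows in pool.map(decode_chunk, jobs):
            for j, pairs in enumerate(rows):
                # the parent process owns 'data', so these writes stay visible
                data[offset + j][1][:len(pairs)] = pairs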
Something like this should work.
Note that pool.map takes a function and a list of parameters for that function, one entry per call. In your original example you are just calling decode yourself inside numpy_array and passing its result to pool.map.
The function must only take one argument, hence the packing of the arguments into a tuple and the rather odd-looking double brackets in decode (which is called tuple unpacking).
def numpy_array(data, peaks):
    processors=4
    pool = mp.Pool(processes=processors)
    chunk_size=len(data)/processors
    print range(processors)
    map_parameters = [] # new
    for i in range(processors):
        counter = i*chunk_size
        chunk=peaks[i*chunk_size:(i+1)*chunk_size-1]
        map_parameters.append((data,chunk,counter)) # new
    pool.map(decode, map_parameters) # new
def decode((data,chunk,counter)): # changed
    for x in chunk:
        peak_counter=0
        data_buff=base64.b64decode(x)
        buff_size=len(data_buff)/4
        unpack_format=">%dL" % buff_size
        index=0
        for y in struct.unpack(unpack_format,data_buff):
            buff1=struct.pack("I",y)
            buff2=struct.unpack("f",buff1)[0]
            if (index % 2 == 0):
                data[counter][1][peak_counter][0]=float(buff2)
            else:
                data[counter][1][peak_counter][1]=float(buff2)
                peak_counter+=1
            index+=1
        print data[counter][1][10][0]
        counter+=1
The bug is in your numpy_array function:
for i in range(processors):
    counter = i*chunk_size
    chunk=peaks[i*chunk_size:(i+1)*chunk_size-1]
    pool.map(decode(data,chunk,counter))
The problem is that you're calling decode yourself inside the loop and handing its result to map, so only one chunk is processed at a time. You're also not calling map correctly: you're doing pool.map(f(*args)) when the signature is map(f, ['list', 'of', 'data']).
I would use a partial so that you don't create copies of data, as I assume that array is quite large or could be larger in the future.
This should be:
import functools

decode_with_data = functools.partial(decode, data)

args = []
for i in range(processors):
    counter = i * chunk_size
    chunk = peaks[i*chunk_size:(i+1)*chunk_size-1]
    args.append((chunk, counter))  # append a single (chunk, counter) tuple

pool.map(decode_with_data, args)
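Note that with this calling convention each element of args is a (chunk, counter) pair, so decode would then receive data as its first argument and that pair as its second; a possible signature for that (my adaptation, not part of the answer) would be:

def decode(data, chunk_and_counter):
    chunk, counter = chunk_and_counter
    # ... same decoding loop as before, writing into data[counter] ...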
Related
I have a problem where I need to read a large text file and conditionally send data from it to two different database tables using Python. I would like to use a generator pipeline to avoid loading all the data into memory.
The logic of what I am trying to do is equivalent to turning a generator that yields integers into one for odd numbers and another for even numbers, and then writing them simultaneously to separate files.
I can split into two generators with itertools.tee as follows:
def generate_numbers(limit):
    # My real-life function has this form.
    for i in range(limit):
        yield i

numbers1, numbers2 = itertools.tee(generate_numbers(10), 2)
evens = (num for num in numbers1 if num % 2 == 0)
odds = (num for num in numbers2 if num % 2 != 0)
This produces the expected result:
>>> list(evens)
[0, 2, 4, 6, 8]
>>> list(odds)
[1, 3, 5, 7, 9]
The tee documentation warns that lots of temporary data will be stored if one iterator uses most or all of the data before the other starts. This is what happens when I write the data (from fresh iterators) to files as below.
def write_to_file(filename, numbers):
    # My real-life function takes an iterable as an argument.
    with open(filename, 'wt') as outfile:
        for i in numbers:
            outfile.write(f"{i}\n")

write_to_file('evens.txt', evens)
write_to_file('odds.txt', odds)
Is it possible to consume the generators simultaneously? The tee documentation also warns that it isn't thread safe. Can this be done with asyncio?
Alternatively, is there another approach? Would chunking the data help? My main constraints are that I don't want to hold all the data for a single table in memory and that my consuming functions expect an iterable of items, rather than an individual item.
Similar questions
This question: Separate odd and even lines in a generator with python is very similar to mine. The accepted answer suggests passing through the input file twice. I may end up doing this, but it would be nice to do a single pass.
There is another answer that opens both output files at once for writing and then processes each item one by one. This isn't suitable for me, as my real-life consuming function expects to read all the items in an iterator.
Edit: After posting, I found this answer from 2015: https://stackoverflow.com/a/28030261/3508733, which suggests that you can't do it with itertools.tee, because of the memory issue. Is there an alternative way?
After reading more about itertools, I've found a way that works for me. I still create two generators with tee as before, but then I split the data into chunks with itertools.islice. That way I can alternate between generators without letting one get too far ahead of the other.
# Create two generators using tee
numbers1, numbers2 = itertools.tee(generate_numbers(10), 2)
evens = (num for num in numbers1 if num % 2 == 0)
odds = (num for num in numbers2 if num % 2 != 0)

def append_to_file(filename, numbers):
    # Append to file, instead of writing the whole file at once
    with open(filename, 'at') as f:
        for num in numbers:
            f.write(f"{num}\n")

# Use islice to move through both generators in chunks
chunksize = 2
while True:
    odds_chunk = list(itertools.islice(odds, chunksize))
    append_to_file('/tmp/odds.txt', odds_chunk)
    evens_chunk = list(itertools.islice(evens, chunksize))
    append_to_file('/tmp/evens.txt', evens_chunk)
    if odds_chunk == evens_chunk == []:
        break
In my real-life case, I expect that a chunk size of a few thousand will be a good balance between memory use and reducing round-trips to the database.
Based on the chunking method in my earlier answer and comments from #KellyBundy about unbalanced inputs, I have modified the code. The following meets my requirements, even if it isn't technically generators all the way through.
✅ Doesn't hold more than chunksize items in memory at once
✅ Downstream function receives a generator
✅ Single pass through the data
❌ Data are temporarily materialized (but only in chunks)
import itertools

def generate_numbers(limit):
    # My real-life function has this form.
    for i in range(limit):
        yield i

def append_to_file(filename, numbers):
    # My real-life function takes an iterable as an argument.
    with open(filename, 'at') as f:
        for num in numbers:
            f.write(f"{num}\n")

numbers = generate_numbers(10)
chunksize = 2
while True:
    evens_chunk = []
    odds_chunk = []
    for num in itertools.islice(numbers, chunksize):
        # Could use `match-case` in Python >= 3.10
        if num % 2 == 0:
            evens_chunk.append(num)
        else:
            odds_chunk.append(num)
    if evens_chunk:
        evens_gen = (num for num in evens_chunk)
        append_to_file('evens.txt', evens_gen)
    if odds_chunk:
        odds_gen = (num for num in odds_chunk)
        append_to_file('odds.txt', odds_gen)
    if odds_chunk == evens_chunk == []:
        break
If the downstream functions don't require a generator as input, the loop can be simplified further:
while True:
    evens_chunk = []
    odds_chunk = []
    for num in itertools.islice(numbers, chunksize):
        if num % 2 == 0:
            evens_chunk.append(num)
        else:
            odds_chunk.append(num)
    append_to_file('evens.txt', evens_chunk)
    append_to_file('odds.txt', odds_chunk)
    if odds_chunk == evens_chunk == []:
        break
I want to process some array/list/timeseries data, and want to use many different filters for this.
This led to two problems: I don't want to copy-paste the function every time, especially if I change something. Also, with different dependencies (there might be a dependency on the previous element, the n-th previous element, or the n-th following element), the loop over the array can go out of bounds if I don't adjust the ranges.
The conditions for the filters could be arbitrarily complex, but always involve relative position in the data.
Here is a minimal example:
import random as r

data = [r.random() for _ in range(100)]

def example_filter(data):
    counter = 0
    for i in range(1, len(data)):
        if((data[i-1]>0.8) and (data[i]<0.5)):
            counter +=1
            #might want to change something here
            #right now I would need to do this in all filters separately
    return counter

def example_filter_2(data):
    counter = 0
    for i in range(2, len(data)):
        if((data[i-2]>0.8) or ((data[i-1]>0.9) and (data[i]<0.2))):
            counter +=1
    return counter
My idea was to somehow compress the conditions (they are more complicated in the real example), use a converter function to make the real condition out of them, pass it as a string to a template function, and then use the condition, like this:
def filter_template(condition):
    def instance_of_filter(data):
        counter = 0
        #problem: the range isn't adjusted to account for out of bounds here
        for i in range(len(data)):
            #problem: condition will be passed as a string, so how can I evaluate it
            #also, I can't evaluate condition before I know what 'data' is, so I need to keep the dependency
            if condition:
                counter += 1
        return counter
    return instance_of_filter
Any ideas?
You can use your last code idea; just change the condition from a variable to a predicate function that takes the data and an index.
Example:
def filter_template(condition_func, start_at=0):
    def instance_of_filter(data):
        counter = 0
        for i in range(start_at, len(data)):
            if condition_func(data, i):
                counter += 1
        return counter
    return instance_of_filter

def condition1(data, i):
    return (data[i-1]>0.8) and (data[i]<0.5)

def condition2(data, i):
    return ((data[i-2]>0.8) or ((data[i-1]>0.9) and (data[i]<0.2)))

# usage
filter_template(condition1, 1)
filter_template(condition2, 2)
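For completeness, a short usage sketch showing how the returned filters are applied to the data from the question (the counts match the original example_filter and example_filter_2):

import random as r

data = [r.random() for _ in range(100)]

example_filter = filter_template(condition1, start_at=1)
example_filter_2 = filter_template(condition2, start_at=2)

print(example_filter(data))    # same count as the original example_filter
print(example_filter_2(data))  # same count as the original example_filter_2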
I am trying to loop over a directory and load all files. I've tried using one generator to load files and another one to generate batches and call the first generator when it runs out of memory.
def file_gen(b):
    # iterate over my directory and load two audio files at a time
    for n in range(len(b)):
        path_ = os.path.join(os.path.join(path,'Mixtures'), 'Dev')
        os.chdir(os.path.join(path_,b[n]))
        y, _ = librosa.load('mixture.wav', sr=rate)
        path_vox = os.path.join(os.path.join(path,'Sources'), 'Dev')
        os.chdir(os.path.join(path_vox,b[n]))
        x, _ = librosa.load('vocals.wav', sr=rate)
        yield y, x

list_titles = os.listdir(os.path.join(os.path.join(path,'Mixtures'),'Dev'))
gen_file = file_gen(list_titles)
# second generator
def memory_test():
    memory = 0
    if memory == 0:
        a, b = next(gen_file)
        a, _ = mag_phase(spectrogram(a))
        b, _ = mag_phase(spectrogram(b))
    # calculate how many batches I can generate from the file
    memory = a.shape[1]/(n_frames*(time_len-overlap) + time_len)
    for n in range(memory):
        yield memory
        memory = memory - 1

test = memory_test()
The second generator is where the problem is. Ideally, I would like both generators to iterate indefinitely, though (the first one should go back to the beginning of the list).
Thank you!
itertools.cycle()
One way you could do this is to use itertools.cycle(), which essentially stores the results of the generator and then loops over them continuously (see the docs).
If you choose to do that, you will consume a lot of additional memory storing those results.
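A minimal sketch of that approach, reusing file_gen and list_titles from the question (process here is just a placeholder for whatever work is done per file pair):

import itertools

looped_files = itertools.cycle(file_gen(list_titles))  # caches every (y, x) pair it yields

# take as many pairs as needed; after one full pass the cached results are replayed
for y, x in itertools.islice(looped_files, 10):
    process(y, x)  # placeholder for the real work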
except StopIteration
As an alternative, you could wrap the call to your generator in try/except StopIteration in order to reset it back to the beginning. A generator always raises StopIteration if you call __next__ on it once it is exhausted.
Edit: I originally linked to a wrapper function here but the code in that example actually doesn't work. Below is code that I have tested to work which is hopefully helpful. My answer here is based on the same concept.
def Primes(max):  # primary generator
    number = 1
    while number < max:
        number += 1
        if check_prime(number):
            yield number

primes = Primes(100)

def primer():  # this acts as a loop and resets your generator
    global primes
    try:
        ok = next(primes)
        return ok
    except StopIteration:
        primes = Primes(100)
        ok = next(primes)
        return ok

while True:  # this is the actual loop continuing forever
    primer()
You'll notice we couldn't implicitly refer to our own function in order to reset itself, and we also couldn't use a standard for loop because it will always catch StopIteration before you can, by design [more info].
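The same try/except idea can also be packaged as a small wrapper generator so that no global is needed. This is only a sketch of the pattern: make_gen is any zero-argument callable that returns a fresh generator, e.g. lambda: file_gen(list_titles) for the question above.

def looped(make_gen):
    gen = make_gen()
    while True:
        try:
            yield next(gen)
        except StopIteration:
            gen = make_gen()  # exhausted: build a fresh generator and keep going

# usage: for y, x in looped(lambda: file_gen(list_titles)): ...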
Is it possible to create a python for-loop with a modulo operation? I have a ringbuffer in Python and I want to iterate the elements between the startPos and endPos indexes, where startPos can have a bigger value than endPos. In other programming languages, I would intuitively implement this with a modulo operator:
int startPos = 6;
int endPos = 2;
int ringBufferSize = 8;

for(int i = startPos; i != endPos; i = (i+1) % ringBufferSize) {
    print buffer.getElementAt(i);
}
Is there a way to do this easily in Python? I only found the
for i in list:
    print buffer[i]
syntax, but nothing that provides an equivalent solution to my problem.
My next approach would be to create the list of indexes in advance and then iterate over that list. But is there a way to do this as a one-liner, like in other programming languages, by using the modulo operation directly in the for loop?
There are a few ways of doing that:
As you do in "other programming languages" (i.e. C-derived syntaxes), except that you basically have to write their for loop in a while form - and then you realize that C's for is just a while nonetheless:
start_pos = 6
end_pos = 2
ring_buffer_size = 8

i = start_pos
while True:
    # your code here (uses i)
    i = (i + 1) % ring_buffer_size
    if i == end_pos:
        break
Now, for the for statement, Python only has what is called "for each" - which always walks an iterable or sequence. So you can create an iterable that will yield your values -
def ring(start, end, buffer_size, increment=1):
    i = start
    while i != end:
        yield i
        i += increment
        i %= buffer_size

for slot in ring(6, 2, 8):
    pass  # your code here
Note that while this second form is "bigger", it abstracts away your circular-buffer logic, so hard-coded values don't get mixed in where you don't need to look at them - that is, inside the for body itself.
Note that the actual idea of for in Python is to iterate over the buffer contents itself, not an index that will lead to its contents.
So the Python standard library already includes a ready-made circular buffer object that always has its indexes normalized to 0 and (len - 1) -
just import deque from the collections module.
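For example, a deque created with a maxlen behaves like a fixed-size ring: once it is full, each append silently drops the oldest element, and iteration always runs from oldest to newest with no index arithmetic:

from collections import deque

ring = deque(maxlen=8)
for value in range(12):
    ring.append(value)   # once full, the oldest entry is discarded automatically

print(list(ring))        # [4, 5, 6, 7, 8, 9, 10, 11]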
If you want a circular buffer with changing start and end indexes that will wrap around and work automatically in for statements, that is also relatively easy to do - if you don't need the full functionality, just subclass list, add the start and end indexes, and make a custom implementation of its __iter__ method:
class Circular(list):
    def __init__(self, content, start, end):
        super(Circular, self).__init__(content)
        self.start = start
        self.end = end

    def __iter__(self):
        for i in range(self.start, self.start + len(self)):
            if i % len(self) == self.end: break
            yield self[i % len(self)]
And now you can use this custom container in your code:
In [22]: mylist = Circular(range(8), 6, 2)

In [23]: for i in mylist:
    ...:     print(i)
    ...:
6
7
0
1
For loops can take any iterable. As such you can create your own to do the work and drop it into the for loop (when end_pos has wrapped around past start_pos, add ring_buffer to it first so that the range isn't empty). For example:

for i in [i % ring_buffer for i in range(start_pos, end_pos)]:
    # Do stuff...
Or, to create an iterator directly:

for i in (i % ring_buffer for i in range(start_pos, end_pos)):
    # Do stuff...
See the docs for more information about when you might want to create an iterator directly for this purpose.
Use range:

for i in range(0, len(a)):
    # code
    i = i % x
I believe you should be able to use while instead of a for loop. This should work as long as you just want to increment i by 1 each time, then calculate the mod.
Try:
i = startPos
while i != endPos:
    print buffer.getElementAt(i)
    i = (i+1) % ringBufferSize
for x in records:
    data = {}
    for y in sObjectName.describe()['fields']:
        data[y['name']] = x[y['name']]
    ls.append(adapter.insert_posts(collection, data))
I want to execute the code ls.append(adapter.insert_posts(collection, x)) in batches of 500, where x should contain 500 data dicts. I could create a list of 500 data dicts using a double for loop and then insert it; I could do that in the following way, but is there a better way to do it?
for x in records:
    for i in xrange(0,len(records)/500):
        for j in xrange(0,500):
            l=[]
            data = {}
            for y in sObjectName.describe()['fields']:
                data[y['name']] = x[y['name']]
                #print data
            #print data
            l.append(data)
        ls.append(adapter.insert_posts(collection, data))
    for i in xrange(0,len(records)%500):
        l=[]
        data = {}
        for y in sObjectName.describe()['fields']:
            data[y['name']] = x[y['name']]
            #print data
        #print data
        l.append(data)
        ls.append(adapter.insert_posts(collection, data))
The general structure I use looks like this:
worklist = [...]
batchsize = 500

for i in range(0, len(worklist), batchsize):
    batch = worklist[i:i+batchsize]  # the result might be shorter than batchsize at the end
    # do stuff with batch
Note that we're using the step argument of range to simplify the batch processing considerably.
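Applied to the records from the question, the same pattern might look like the sketch below (sObjectName, adapter, collection and ls all come from the original code, so this is only illustrative):

batchsize = 500

for i in range(0, len(records), batchsize):
    batch = []
    for x in records[i:i + batchsize]:
        data = {}
        for y in sObjectName.describe()['fields']:
            data[y['name']] = x[y['name']]
        batch.append(data)
    ls.append(adapter.insert_posts(collection, batch))  # one insert per batch of up to 500 dicts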
If you're working with sequences, the solution by #nneonneo is about as performant as you can get. If you want a solution which works with arbitrary iterables, you can look into some of the itertools recipes. e.g. grouper:
def grouper(iterable, n, fillvalue=None):
    "Collect data into fixed-length chunks or blocks"
    # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
    args = [iter(iterable)] * n
    return itertools.izip_longest(fillvalue=fillvalue, *args)
I tend to not use this one because it "fills" the last group with None so that it is the same length as the others. I usually define my own variant which doesn't have this behavior:
def grouper2(iterable, n):
    iterable = iter(iterable)
    while True:
        tup = tuple(itertools.islice(iterable, 0, n))
        if tup:
            yield tup
        else:
            break
This yields tuples of the requested size. This is generally good enough, but, for a little fun, we can write a generator which returns lazy iterables of the correct size if we really want to...
The "best" solution here I think depends a bit on the problem at hand -- particularly the size of the groups and objects in the original iterable and the type of the original iterable. Generally, these last 2 recipes will find less use because they're more complex and rarely needed. However, if you're feeling adventurous and in the mood for a little fun, read on!
The only real modification we need to get a lazy iterable instead of a tuple is the ability to "peek" at the next value in the islice to see if there is anything there. Here I just peek at the value -- if it's missing, StopIteration will be raised, which will stop the generator just as if it had ended normally. If it's there, I put it back using itertools.chain:
def grouper3(iterable, n):
    iterable = iter(iterable)
    while True:
        group = itertools.islice(iterable, n)
        item = next(group)  # raises StopIteration if the group doesn't yield anything
        yield itertools.chain((item,), group)
Careful though, this last function only "works" if you completely exhaust each iterable yielded before moving on to the next one. In the extreme case where you don't exhaust any of the iterables, e.g. list(grouper3(..., n)), you'll get "m" iterables which yield only 1 item, not n (where "m" is the "length" of the input iterable). This behavior could actually be useful sometimes, but not typically. We can fix that too if we use the itertools "consume" recipe (which also requires importing collections in addition to itertools):
def grouper4(iterable, n):
    iterable = iter(iterable)
    group = []
    while True:
        collections.deque(group, maxlen=0)  # consume all of the last group
        group = itertools.islice(iterable, n)
        item = next(group)  # raises StopIteration if the group doesn't yield anything
        group = itertools.chain((item,), group)
        yield group
Of course, list(grouper4(..., n)) will return empty iterables -- Any value not pulled from the "group" before the next invocation of next (e.g. when the for loop cycles back to the start) will never get yielded.
I like #nneonneo's and #mgilson's answers, but doing this over and over again is tedious. The bottom of the itertools page in python3 mentions the library more-itertools (I know this question was about python2 and this is a python3 library, but some might find it useful). The following seems to do what you ask:
from more_itertools import chunked  # Note: you might also want to look at ichunked

for batch in chunked(records, 500):
    # Do the work -- `batch` is a list of 500 records (or fewer for the last batch).
Maybe something like this?
l = []
for ii, x in enumerate(records):
    data = {}
    for y in sObjectName.describe()['fields']:
        data[y['name']] = x[y['name']]
    l.append(data)
    if not ii % 500:
        ls.append(adapter.insert_posts(collection, l))
        l = []
I think one particular scenario is not covered here. Let's say the batch size is 100 and your list size is 103; the above answer might miss the last 3 elements.
list = [.....]  # 103 elements
total_size = len(list)
batch_size_count = 100

for start_index in range(0, total_size, batch_size_count):
    list[start_index : start_index+batch_size_count]  # slicing operation
Each sliced list above can be passed to the method call so that all the elements are processed.