I have the following inheritance:
class Processor(object):
    def get_listings(self):
        """
        Returns a list of data.
        """
        raise NotImplementedError()

    def run(self):
        for listing in self.get_listings():
            do_stuff(listing)


class DBProcessor(Processor):
    def get_listings(self):
        """
        Returns a large set of paginated data.
        """
        ...
        for page in pages:
            for data in db.fetch_from_query(...):
                yield data
Although this works, it fails on len(self.get_listings()) and any other list operation.
My question is: how can I refactor my code so that DBProcessor.get_listings can handle list operations, but still returns a generator when iterated over?
I think I got an idea:
class DBListings(object):
    def __iter__(self):
        for page in pages:
            for data in db.fetch_from_query(...):
                yield data

    def __len__(self):
        return db.get_total_from_query(...)
        # Or the following:
        # counter = 0
        # for x in self:
        #     counter += 1
        # return counter


class DBProcessor(Processor):
    def get_listings(self):
        """
        Returns a large set of paginated data.
        """
        return DBListings()
UPDATE: Just tested the above code, works.
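To illustrate why this works: len() now goes through DBListings.__len__, while each for loop calls __iter__, which builds a fresh generator, so the listings can be iterated more than once:

listings = DBProcessor().get_listings()
print(len(listings))   # answered by db.get_total_from_query, no iteration
for row in listings:   # __iter__ returns a new generator on every call
    do_stuff(row)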
It depends on which list operations you want to support. Some of them fall back to iterating and will simply consume the generator.
If you know the result of an operation (for example len) beforehand, you can bypass it by creating a GeneratorContainer:
class GeneratorContainer:
    def __init__(self, generator, length):
        self.generator = generator
        self.length = length

    def __iter__(self):
        return self.generator

    def __len__(self):
        return self.length

result = GeneratorContainer(DBProcessor().get_listings(), length)
# But you need to know the length value.
Calling len will then not try to iterate over the generator. But you can always just create a list so that the data will not be exhausted:
result = list(DBProcessor().get_listings())
and use it as a plain list, without the generator's advantages and disadvantages.
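For example, a small sketch with a stand-in generator (fake_listings is made up for illustration):

def fake_listings():
    yield from (1, 2, 3)

wrapped = GeneratorContainer(fake_listings(), 3)
print(len(wrapped))   # 3 -- answered from the stored length
print(list(wrapped))  # [1, 2, 3] -- this pass consumes the generator

Note that __iter__ returns the stored generator itself, so unlike DBListings above, a GeneratorContainer can only be iterated once.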
If you wish to convert the generator (a kind of iterator) produced by get_listings to a list, simply use
listings = list(get_listings())
I'm trying to address a performance issue by preloading the data used by a batching iterator, and I'm stuck on fitting this into an idiomatic Python style.
Currently I'm using a sequence of iterators composed one on top of the other:
iterator = list(dirin.iterdir())
iterator = TranformIterator(iterator, lambda path: load_img(path))  # This takes most of the time
iterator = PreloadingIterator(iterator)  # I want to use this iterator to preload some of the data
iterator = BatchingIterator(iterator, batchsize=100)
iterator = BatchingIteratorWithMeta(iterator)

for batch in iterator:
    ...
All of these are implemented with the exception of PreloadingIterator:
import typing

import psutil


class PreloadingIterator:
    inner: typing.Iterator

    def __init__(self, inner: typing.Union[typing.Sized, typing.Iterable]):
        self.inner = inner
        self.total = len(inner)
        self.index = 0
        self.__cache__ = []

    def __len__(self):
        return len(self.inner)

    def __iter__(self):
        mem = psutil.virtual_memory()
        for item in self.inner:
            memconsumed = psutil.virtual_memory()
            if self.should_preload(mem, memconsumed):
                pass
                # threading.Thread(target=
                # peeker = more_itertools.peekable(self.inner)
                # preload_item = peeker.peek()
            yield item
            self.index += 1

    def should_preload(self, oldmem, newmem):
        return True  # TODO
What I'm trying to do is peek ahead at the next item in the iterator (preload_item = peeker.peek()) and use it to start a thread that begins loading the next result from the iterator. However, I'm struggling to see how I can change the item in for item in self.inner: so that it refers to the next item rather than the one from the underlying iterator.
How can I iterate over an iterator in a way that lets me source each item from a precached result when one is available?
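One possible shape for this (a sketch, not from the question: it preloads exactly one item ahead on a background thread, and ignores the memory-based should_preload check):

import threading

class LookaheadIterator:
    """Wraps an iterator and fetches its next item on a background
    thread while the caller is still processing the current one."""

    def __init__(self, inner):
        self.inner = iter(inner)
        self._start_preload()

    def _start_preload(self):
        def fetch():
            try:
                self._next = next(self.inner)
            except StopIteration:
                self._next = StopIteration  # sentinel: inner is exhausted

        self._thread = threading.Thread(target=fetch)
        self._thread.start()

    def __iter__(self):
        return self

    def __next__(self):
        self._thread.join()        # wait until the preloaded item is ready
        item = self._next
        if item is StopIteration:
            raise StopIteration
        self._start_preload()      # immediately start fetching the next one
        return item

Dropped into the pipeline in place of PreloadingIterator, this would overlap the expensive work with the processing of the previous item, assuming TranformIterator computes lazily in its next().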
I'm trying to use the iterparse function of ElementTree to parse a large OpenStreetMap (OSM) XML file. I'm trying to define my OSM file and its elements as class objects to help with my later analysis.
Now I've successfully defined an OSM class:
parse() uses a generator to iterparse the OSM file. It yields elem (elements of the OpenStreetMap data).
reset() resets the generator to avoid exhaustion.
slice() uses itertools to make views of the OSM file from given start, stop and step indices. It returns a list of elements.
iloc() calls slice() to locate a specific element.
getchild() returns the child elements of the element at a given index.
My problem is: I want to make another class, elem, with functions that operate on a single element. However, an instance of elem must only be created by calling a function of OSM.
That is, I have class1, and when I call a function on an instance of class1, it should create and return an instance of class2.
How can I achieve this?
Here is my current code:
import xml.etree.cElementTree as ET
from collections import defaultdict
import itertools


class OSM:
    def __init__(self, data):
        self.data = data
        self.parser = ET.iterparse(self.data, events=('start',))

    def parse(self):
        _, root = next(self.parser)
        for event, elem in self.parser:
            yield elem
            elem.clear()
            root.clear()

    def reset(self):
        self.parser = ET.iterparse(self.data, events=('start',))

    def slice(self, start=0, stop=1, step=1):
        self.reset()
        view = []
        for i in itertools.islice(self.parse(), start, stop, step):
            view.append(i)
        return view

    def iloc(self, index):
        self.reset()
        return self.slice(index, index + 1)[0]

    def getchild(self, index):
        self.reset()
        elem = self.iloc(index)
        childdict = defaultdict(list)
        for i, child in enumerate(list(elem)):
            childdict[i] = [child.tag, child.attrib]
        return dict(childdict)
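Usage then looks roughly like this (map.osm is a made-up filename):

osm = OSM('map.osm')          # hypothetical OSM export
first_ten = osm.slice(0, 10)  # list of the first ten elements
third = osm.iloc(3)           # a single element
children = osm.getchild(3)    # {0: [tag, attrib], 1: [tag, attrib], ...}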
I've found a way:
class Mother(object):
    def __init__(self):
        pass

    def reproduce(self):
        return Child()


class Child(object):
    def __init__(self):
        pass
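Applied to the OSM class, the same pattern might look like this (the Elem wrapper and its attribute name are assumptions for illustration, not from the original code):

class Elem:
    def __init__(self, xml_element):
        self.xml_element = xml_element
    # ...functions on a single element go here...


class OSM:
    # ...methods as above...
    def iloc(self, index):
        self.reset()
        return Elem(self.slice(index, index + 1)[0])  # OSM creates the Elem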
Is there a way to make a thread go to sleep if the list is empty and wake it up again when there are items? I don't want to use Queues since I want to be able to index into the data structure.
Yes, the solution will probably involve a threading.Condition variable as you note in comments.
Without more information or a code snippet, it's difficult to know what API suits your needs. How are you producing new elements? How are you consuming them? At base, you could do something like this:
cv = threading.Condition()
elements = []  # elements is protected by, and signaled by, cv

def produce(...):
    with cv:
        ... add elements somehow ...
        cv.notify_all()

def consume(...):
    with cv:
        while len(elements) == 0:
            cv.wait()
        ... remove elements somehow ...
I would go with this:
import threading

class MyList(list):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._cond = threading.Condition()

    def append(self, item):
        with self._cond:
            super().append(item)
            self._cond.notify_all()

    def pop_or_sleep(self):
        with self._cond:
            while not len(self):
                self._cond.wait()
            return self.pop()
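A quick usage sketch (the two threads are illustrative, not part of the answer):

import threading
import time

buf = MyList()

def consumer():
    print('got', buf.pop_or_sleep())  # blocks until something is appended

t = threading.Thread(target=consumer)
t.start()
time.sleep(0.1)  # consumer is now waiting on the condition
buf.append(42)   # append() calls notify_all(), waking the consumer
t.join()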
I'm trying to create an iterator class that will give me a path through a tree graph, where every iteration returns the next step according to certain conditions.
So I looked up how to do this here: Build a Basic Python Iterator.
This is what I have written so far:
def travel_path_iterator(self, article_name):
    return Path_Iter(article_name)


class Path_Iter:
    def __init__(self, article):
        self.article = article

    def __iter__(self):
        return next(self)

    def __next__(self):
        answer = self.article.get_max_out_nb()
        if answer != self.article.get_name():
            return answer
        else:
            raise StopIteration
But I have a problem calling this class.
My output is always:
<__main__.Path_Iter object at 0x7fe94049fc50>
Any guesses as to what I'm doing wrong?
While Path_Iter is already an iterator, the __iter__ method should return self:
def __iter__(self):
    return self
Next, to iterate an iterator, you need some kind of loop. E.g. to print the contents, you could convert the iterator to a list:
print(list(xyz.travel_path_iterator(article_name)))
Using a generator function:
def travel_path_generator(article):
    while True:
        answer = article.get_max_out_nb()
        if answer == article.get_name():
            break
        else:
            yield answer
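Usage is then an ordinary loop (article stands for whatever object provides get_max_out_nb() and get_name() in the question):

for step in travel_path_generator(article):
    print(step)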
What I mean by "forkable iterator": it is a regular iterator with a method fork() which creates a new iterator that iterates from the current point of iteration of the original. Even if the original iterator is iterated further, the fork stays at the point where it was forked until it is itself iterated.
My practical use case:
I have a socket connection, and some "packets" that are sent through it. The connection can be shared between "receivers", and each "packet" can be addressed to some "receiver". "Packets" can arrive out of order, so each "receiver" can potentially pull a packet meant for a different "receiver". Moreover, if one "receiver" pulls a "packet" meant for a different "receiver", that other receiver must still be able to read it.
So I want to implement such a forkable iterator to represent the connection: each receiver makes its own fork, reads it, and searches for the "packets" addressed to it.
Does somebody know of any existing implementations of what I'm describing?
You are looking for the itertools.tee() function:
Return n independent iterators from a single iterable.
Do take into account that the implementation will buffer data to service all child iterators:
This itertool may require significant auxiliary storage (depending on how much temporary data needs to be stored).
Also, you should only use the returned child iterators; iterating over the source iterator will not propagate the data to the tee() iterables.
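A minimal demonstration of those semantics (made-up data, just to show the fork-like behaviour):

from itertools import tee

source = iter(range(5))
a, b = tee(source, 2)

print(next(a))  # 0
print(next(a))  # 1
print(next(b))  # 0 -- b still starts where tee() was called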
That's my current implementation of a forkable iterator:
#!/usr/bin/env python
# coding=utf-8
from collections import Iterator, deque
import threading


class ForkableIterator(Iterator):
    def __init__(self, iterator, buffer=None, *args, **kwargs):
        self.iterator = iter(iterator)
        if buffer is None:
            self.buffer = deque()
        else:
            self.buffer = buffer
        args = iter(args)
        self.refs = kwargs.get('refs', next(args, {}))
        self.refs.setdefault('base', 0)
        self.pointer = kwargs.get('pointer', next(args, 0))
        self.lock = kwargs.get('lock', next(args, threading.Lock()))

    @property
    def pointer(self):
        return self.refs[self] + self.refs['base']

    @pointer.setter
    def pointer(self, value):
        self.refs[self] = value

    def __del__(self):
        del self.refs[self]

    def __iter__(self):
        return self

    def next(self):
        with self.lock:
            if len(self.buffer) - self.pointer == 0:
                elem = next(self.iterator)
                self.buffer.append(elem)
            else:
                if self.pointer == min(self.refs.itervalues()):
                    elem = self.buffer.popleft()
                    self.refs['base'] -= 1
                else:
                    elem = self.buffer[self.pointer]
            self.pointer += 1
            return elem

    def fork(self):
        return self.__class__(self.iterator, self.buffer,
                              refs=self.refs, pointer=self.pointer,
                              lock=self.lock)
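For what it's worth, the intended fork semantics of the class above can be exercised like this (Python 2, matching the implementation; the outputs are traced by hand from the code):

it = ForkableIterator('abcdef')
print next(it)    # 'a'
print next(it)    # 'b'

fork = it.fork()  # fork remembers the current position
print next(it)    # 'c' -- the original moves on
print next(fork)  # 'c' -- the fork is still where it was forked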