I'm trying to subclass str - not for anything important, just an experiment to learn more about Python built-in types. I've subclassed str this way (using __new__ because str is immutable):
class MyString(str):
def __new__(cls, value=''):
return str.__new__(cls, value)
def __radd__(self, value): # what method should I use??
return MyString(self + value) # what goes here??
def write(self, data):
self.__radd__(data)
It initializes right, as far as I can tell. but I cant get it to modify itself in-place using the += operator. I've tried overriding __add__, __radd__, __iadd__ and a variety of other configurations. Using a return statement, ive managed to get it to return a new instance of the correct appended MyString, but not modify in place. Success would look like:
b = MyString('g')
b.write('h') # b should now be 'gh'
Any thoughts?
UPDATE
To possibly add a reason why someone might want to do this, I followed the suggestion of creating the following mutable class that uses a plain string internally:
class StringInside(object):
def __init__(self, data=''):
self.data = data
def write(self, data):
self.data += data
def read(self):
return self.data
and tested with timeit:
timeit.timeit("arr+='1234567890'", setup="arr = ''", number=10000)
0.004415035247802734
timeit.timeit("arr.write('1234567890')", setup="from hard import StringInside; arr = StringInside()", number=10000)
0.0331270694732666
The difference increases rapidly at the number goes up - at 1 million interactions, StringInside took longer than I was willing to wait to return, while the pure str version returned in ~100ms.
UPDATE 2
For posterity, I decided to write a cython class wrapping a C++ string to see if performance could be improved compared to one loosely based on Mike Müller's updated version below, and I managed to succeed. I realize cython is "cheating" but I provide this just for fun.
python version:
class Mike(object):
def __init__(self, data=''):
self._data = []
self._data.extend(data)
def write(self, data):
self._data.extend(data)
def read(self, stop=None):
return ''.join(self._data[0:stop])
def pop(self, stop=None):
if not stop:
stop = len(self._data)
try:
return ''.join(self._data[0:stop])
finally:
self._data = self._data[stop:]
def __getitem__(self, key):
return ''.join(self._data[key])
cython version:
from libcpp.string cimport string
cdef class CyString:
cdef string buff
cdef public int length
def __cinit__(self, string data=''):
self.length = len(data)
self.buff = data
def write(self, string new_data):
self.length += len(new_data)
self.buff += new_data
def read(self, int length=0):
if not length:
length = self.length
return self.buff.substr(0, length)
def pop(self, int length=0):
if not length:
length = self.length
ans = self.buff.substr(0, length)
self.buff.erase(0, length)
return ans
performance:
writing
>>> timeit.timeit("arr.write('1234567890')", setup="from pyversion import Mike; arr = Mike()", number=1000000)
0.5992741584777832
>>> timeit.timeit("arr.write('1234567890')", setup="from cyversion import CyBuff; arr = CyBuff()", number=1000000)
0.17381906509399414
reading
>>> timeit.timeit("arr.write('1234567890'); arr.read(5)", setup="from pyversion import Mike; arr = Mike()", number=1000000)
1.1499049663543701
>>> timeit.timeit("arr.write('1234567890'); arr.read(5)", setup="from cyversion import CyBuff; arr = CyBuff()", number=1000000)
0.2894480228424072
popping
>>> # note I'm using 10e3 iterations - the python version wouldn't return otherwise
>>> timeit.timeit("arr.write('1234567890'); arr.pop(5)", setup="from pyversion import Mike; arr = Mike()", number=10000)
0.7390561103820801
>>> timeit.timeit("arr.write('1234567890'); arr.pop(5)", setup="from cyversion import CyBuff; arr = CyBuff()", number=10000)
0.01501607894897461
Solution
This is an answer to the updated question.
You can use a list to hold data and only construct the string when reading it:
class StringInside(object):
def __init__(self, data=''):
self._data = []
self._data.append(data)
def write(self, data):
self._data.append(data)
def read(self):
return ''.join(self._data)
Performance
The performance of this class:
%%timeit arr = StringInside()
arr.write('1234567890')
1000000 loops, best of 3: 352 ns per loop
is much closer to that of the native str:
%%timeit str_arr = ''
str_arr+='1234567890'
1000000 loops, best of 3: 222 ns per loop
Compare with your version:
%%timeit arr = StringInsidePlusEqual()
arr.write('1234567890')
100000 loops, best of 3: 87 µs per loop
Reason
The my_string += another_string way of building a string has been an anti-pattern performance wise for a long time. CPython has some optimizations for this case. Seems like CPython cannot detect that this pattern is used here. This likely because it a bit hidden inside a class.
Not all implementations have this optimization for various reasons. For example. PyPy, which in general is much faster than CPython, is considerably slower for this use case:
PyPy 2.6.0 (Python 2.7.9)
>>>> import timeit
>>>> timeit.timeit("arr+='1234567890'", setup="arr = ''", number=10000)
0.08312582969665527
CPython 2.7.11
>>> import timeit
>>> timeit.timeit("arr+='1234567890'", setup="arr = ''", number=10000)
0.002151966094970703
Slice-able version
This version supports slicing:
class StringInside(object):
def __init__(self, data=''):
self._data = []
self._data.extend(data)
def write(self, data):
self._data.extend(data)
def read(self, start=None, stop=None):
return ''.join(self._data[start:stop])
def __getitem__(self, key):
return ''.join(self._data[key])
You can slice the normal way:
>>> arr = StringInside('abcdefg')
>>> arr[2]
'c'
>>> arr[1:3]
'bc'
Now, read() also supports optional start and stop indices:
>>> arr.read()
'abcdefg'
>>> arr.read(1, 3)
'bc'
>>> arr.read(1)
'bcdefg'
Related
One of the exercises (namely, #6) asks us to compare performance of queue implementations (with head in the beinning / at the end of a list). That sounds like there could be some difference, so I tried to figure it out. Here's my code
import timeit
class QueueStart(object):
'''Queue implementation with head in the beginning of a list'''
def __init__(self):
self.items = []
def enqueue(self, i):
self.items.append(i)
def dequeue(self):
return self.items.pop(0)
def isEmpty(self):
return len(self.items) == 0
def size(self):
return len(self.items)
class QueueEnd(object):
'''Queue implementation with head at the end of a list'''
def __init__(self):
self.items = []
def enqueue(self, item):
self.items.insert(0, item)
def dequeue(self):
return self.items.pop()
def isEmpty(self):
return len(self.items) == 0
def size(self):
return len(self.items)
# store results for further plotting
start_add_list = [] # QueueStart.enqueue(item) runtimes for inputs of different sizes
start_pop_list = [] # the same for QueueStart.dequeue(item)
end_add_list = [] # the same for QueueEnd.enqueue(item)
end_pop_list = [] # the same for QueueEnd.dequeue(item)
for i in range(100000, 500000, 10000):
qs = QueueStart()
qs.items = list(range(i))
qe = QueueEnd()
qe.items = list(range(i))
start_add = timeit.Timer('qs.enqueue(1)', 'from __main__ import qs')
start_pop = timeit.Timer('qs.dequeue()', 'from __main__ import qs')
end_add = timeit.Timer('qe.enqueue(1)', 'from __main__ import qe')
end_pop = timeit.Timer('qe.dequeue()', 'from __main__ import qe')
start_add_list.append(start_add.timeit(number=1000))
start_pop_list.append(start_pop.timeit(number=1000))
end_add_list.append(end_add.timeit(number=1000))
end_pop_list.append(end_pop.timeit(number=1000))
And here are plots that reflect results of my experiment
It's known that insert and pop(index) are O(n). The interesting thing, though, is that from the graphs we see that insert(0, item) takes twice as long as pop(0). That observation made me wonder, why this is the case. On the surface, two methods look very similar, but, apparently, under the hood there's something interesting going on. So, the question is: could you help me figure it out?
Some reading on CPython's list implementation: http://www.laurentluce.com/posts/python-list-implementation/
Basically, lists are designed to have about twice as much memory as they need at any given time, so they can change length slightly without needing to reallocate memory. When lists need more memory, they sometimes need to move the whole list to another location in memory that has enough space. When they shrink, they can just free memory at the end of the list.
I have two iterators in python and both should follow the same "random" distribution (both should run in parallel). For instance:
class Iter1(object):
def __iter__(self):
for i in random_generator():
yield i
class Iter2(object):
def __iter__(self):
for i in random_generator():
yield i
for el1, el2 in zip(Iter1(), Iter2()):
print '{} {}'.format(el1, el2)
output should be somethig like:
0.53534 0.53534
0.12312 0.12312
0.19238 0.19238
How can I define random_generator() in a way that it creates the same random distributions in parallel for both iterators.
Note:
They should run in parallel
I can't generate the sequence in advance (it is a streaming, so I don't know the size of the sequence)
Thanks.
Specify the same seed to each call of random_generator:
import random
def random_generator(l, seed=None):
r = random.Random(seed)
for i in range(l):
yield r.random()
class Iter1(object):
def __init__(self, seed):
self.seed = seed
def __iter__(self):
for i in random_generator(10, self.seed):
yield i
class Iter2(object):
def __init__(self, seed):
self.seed = seed
def __iter__(self):
for i in random_generator(10, self.seed):
yield i
# The seed can be any hashable object, but don't use None; that
# tells random.seed() to use the current time. But make sure that
# Python itself isn't using hash randomization.
common_seed = object()
for el1, el2 in zip(Iter1(common_seed), Iter2(common_seed)):
print '{} {}'.format(el1, el2)
There is no way to control the random generation number in this way. If you want to do that you should create your own random function. But as another pythonic and simpler way you can just create one object and use itertools.tee in order to copy your iterator object to having the same result for your random sequences:
In [28]: class Iter1(object):
def __init__(self, number):
self.number = number
def __iter__(self):
for _ in range(self.number):
yield random.random()
....:
In [29]:
In [29]: num = Iter1(5)
In [30]: from itertools import tee
In [31]: num, num2 = tee(num)
In [32]: list(zip(num, num2))
Out[32]:
[(0.485400998727448, 0.485400998727448),
(0.8801649381536764, 0.8801649381536764),
(0.9684025615967844, 0.9684025615967844),
(0.9980073706742334, 0.9980073706742334),
(0.1963579685642387, 0.1963579685642387)]
I am trying to generate the following sequence:
011212201220200112 ... constructed as follows: first is 0,
then repeated the following action:
already written part is attributed to the right with replacement
0 to 1, 1 to 2, 2 to 0.
E.g.
0 -> 01 -> 0112 -> 01121220 -> ...
I am trying to find the 3 billion-th element of this sequence.
I realized that the sequence grows exponentially and hence derived that:
log(base2) (3 billion) ~ 32
So I just need to generate this sequence 32 times.
Here is what I tried in python:
import os
import sys
s=['0']
num_dict = {'0':'1' , '1':'2' , '2':'0'}
def mapper(b):
return num_dict[b]
def gen(s):
while True:
yield s
s.extend( map(mapper,s) )
a = gen(s)
for i in xrange(32):
a.next()
print a.next()[3000000000 - 1]
The problem is my RAM gets filled up before hitting the 3 billion mark.
Is there a better way to do this problem ?
EDIT: This program could crash your machine.Please try for xrange(25) for testing purposes
There are enough hints in the comments that you should be able to find the one-line solution. I think that it's more interesting to try to derive it with a more general tool, namely, implicit data structures. Here's a class for singleton lists.
class Singleton:
def __init__(self, x):
self.x = x
def __getitem__(self, i):
if not isinstance(i, int): raise TypeError(i)
elif not (0 <= i < len(self)): raise IndexError(i)
else: return self.x
def __len__(self): return 1
We can use this class like so.
>>> lst = Singleton(42)
>>> lst[0]
42
>>> len(lst)
1
Now we define a concatenation class and a mapper class, where the latter takes a function and implicitly applies it to each list element.
class Concatenation:
def __init__(self, lst1, lst2):
self.lst1 = lst1
self.lst2 = lst2
self.cachedlen = len(lst1) + len(lst2)
def __getitem__(self, i):
if not isinstance(i, int): raise TypeError(i)
elif not (0 <= i < len(self)): raise IndexError(i)
elif i < len(self.lst1): return self.lst1[i]
else: return self.lst2[i - len(self.lst1)]
def __len__(self): return self.cachedlen
class Mapper:
def __init__(self, f, lst):
self.f = f
self.lst = lst
def __getitem__(self, i): return self.f(self.lst[i])
def __len__(self): return len(self.lst)
Now let's rewrite your code to use these classes.
a = Singleton(0)
for i in range(32):
a = Concatenation(a, Mapper({0: 1, 1: 2, 2: 0}.get, a))
print(a[3000000000 - 1])
As an exercise: why do we need cachedlen?
I was trying to port a function from C to Python and to make it easy to debug, I'd prefer it performed the same CPU word-size limited operations so I could compare the intermediate results. In other words, I'd like something like:
a = UnsignedBoundedInt(32, 399999)
b = UnsignedBoundedInt(32, 399999)
print(a*b) # prints 1085410049 (159999200001 % 2**32)
What's the best way to achieve this so that all operations (including bitwise shifts) would work as in C?
You can try using ctypes.uint_32 to bound the results for you:
>>> import ctypes
>>> print ctypes.c_uint32(399999 * 399999).value
1085410049
Alternatively you can use numpy's data types:
>>> import numpy as np
>>> a = np.uint32(399999)
>>> b = np.uint32(399999)
>>> a * b
__main__:1: RuntimeWarning: overflow encountered in uint_scalars
1085410049
Here's an interesting solution, though it only works under Python 2:
class U32:
"""Emulates 32-bit unsigned int known from C programming language."""
def __init__(self, num=0, base=None):
"""Creates the U32 object.
Args:
num: the integer/string to use as the initial state
base: the base of the integer use if the num given was a string
"""
if base is None:
self.int_ = int(num) % 2**32
else:
self.int_ = int(num, base) % 2**32
def __coerce__(self, ignored):
return None
def __str__(self):
return "<U32 instance at 0x%x, int=%d>" % (id(self), self.int_)
def __getattr__(self, attribute_name):
print("getattr called, attribute_name=%s" % attribute_name)
# you might want to take a look here:
# https://stackoverflow.com/q/19611001/1091116
r = getattr(self.int_, attribute_name)
if callable(r): # return a wrapper if integer's function was requested
def f(*args, **kwargs):
if args and isinstance(args[0], U32):
args = (args[0].int_, ) + args[1:]
ret = r(*args, **kwargs)
if ret is NotImplemented:
return ret
if attribute_name in ['__str__', '__repr__', '__index__']:
return ret
ret %= 2**32
return U32(ret)
return f
return r
print(U32(4) / 2)
print(4 / U32(2))
print(U32(4) / U32(2))
For Python 3 compatibility, have a look here.
Thanks to some great folks on SO, I discovered the possibilities offered by collections.defaultdict, notably in readability and speed. I have put them to use with success.
Now I would like to implement three levels of dictionaries, the two top ones being defaultdict and the lowest one being int. I don't find the appropriate way to do this. Here is my attempt:
from collections import defaultdict
d = defaultdict(defaultdict)
a = [("key1", {"a1":22, "a2":33}),
("key2", {"a1":32, "a2":55}),
("key3", {"a1":43, "a2":44})]
for i in a:
d[i[0]] = i[1]
Now this works, but the following, which is the desired behavior, doesn't:
d["key4"]["a1"] + 1
I suspect that I should have declared somewhere that the second level defaultdict is of type int, but I didn't find where or how to do so.
The reason I am using defaultdict in the first place is to avoid having to initialize the dictionary for each new key.
Any more elegant suggestion?
Thanks pythoneers!
Use:
from collections import defaultdict
d = defaultdict(lambda: defaultdict(int))
This will create a new defaultdict(int) whenever a new key is accessed in d.
Another way to make a pickleable, nested defaultdict is to use a partial object instead of a lambda:
from functools import partial
...
d = defaultdict(partial(defaultdict, int))
This will work because the defaultdict class is globally accessible at the module level:
"You can't pickle a partial object unless the function [or in this
case, class] it wraps is globally accessible ... under its __name__
(within its __module__)"
-- Pickling wrapped partial functions
Look at nosklo's answer here for a more general solution.
class AutoVivification(dict):
"""Implementation of perl's autovivification feature."""
def __getitem__(self, item):
try:
return dict.__getitem__(self, item)
except KeyError:
value = self[item] = type(self)()
return value
Testing:
a = AutoVivification()
a[1][2][3] = 4
a[1][3][3] = 5
a[1][2]['test'] = 6
print a
Output:
{1: {2: {'test': 6, 3: 4}, 3: {3: 5}}}
As per #rschwieb's request for D['key'] += 1, we can expand on previous by overriding addition by defining __add__ method, to make this behave more like a collections.Counter()
First __missing__ will be called to create a new empty value, which will be passed into __add__. We test the value, counting on empty values to be False.
See emulating numeric types for more information on overriding.
from numbers import Number
class autovivify(dict):
def __missing__(self, key):
value = self[key] = type(self)()
return value
def __add__(self, x):
""" override addition for numeric types when self is empty """
if not self and isinstance(x, Number):
return x
raise ValueError
def __sub__(self, x):
if not self and isinstance(x, Number):
return -1 * x
raise ValueError
Examples:
>>> import autovivify
>>> a = autovivify.autovivify()
>>> a
{}
>>> a[2]
{}
>>> a
{2: {}}
>>> a[4] += 1
>>> a[5][3][2] -= 1
>>> a
{2: {}, 4: 1, 5: {3: {2: -1}}}
Rather than checking argument is a Number (very non-python, amirite!) we could just provide a default 0 value and then attempt the operation:
class av2(dict):
def __missing__(self, key):
value = self[key] = type(self)()
return value
def __add__(self, x):
""" override addition when self is empty """
if not self:
return 0 + x
raise ValueError
def __sub__(self, x):
""" override subtraction when self is empty """
if not self:
return 0 - x
raise ValueError
Late to the party, but for arbitrary depth I just found myself doing something like this:
from collections import defaultdict
class DeepDict(defaultdict):
def __call__(self):
return DeepDict(self.default_factory)
The trick here is basically to make the DeepDict instance itself a valid factory for constructing missing values. Now we can do things like
dd = DeepDict(DeepDict(list))
dd[1][2].extend([3,4])
sum(dd[1][2]) # 7
ddd = DeepDict(DeepDict(DeepDict(list)))
ddd[1][2][3].extend([4,5])
sum(ddd[1][2][3]) # 9
def _sub_getitem(self, k):
try:
# sub.__class__.__bases__[0]
real_val = self.__class__.mro()[-2].__getitem__(self, k)
val = '' if real_val is None else real_val
except Exception:
val = ''
real_val = None
# isinstance(Avoid,dict)也是true,会一直递归死
if type(val) in (dict, list, str, tuple):
val = type('Avoid', (type(val),), {'__getitem__': _sub_getitem, 'pop': _sub_pop})(val)
# 重新赋值当前字典键为返回值,当对其赋值时可回溯
if all([real_val is not None, isinstance(self, (dict, list)), type(k) is not slice]):
self[k] = val
return val
def _sub_pop(self, k=-1):
try:
val = self.__class__.mro()[-2].pop(self, k)
val = '' if val is None else val
except Exception:
val = ''
if type(val) in (dict, list, str, tuple):
val = type('Avoid', (type(val),), {'__getitem__': _sub_getitem, 'pop': _sub_pop})(val)
return val
class DefaultDict(dict):
def __getitem__(self, k):
return _sub_getitem(self, k)
def pop(self, k):
return _sub_pop(self, k)
In[8]: d=DefaultDict()
In[9]: d['a']['b']['c']['d']
Out[9]: ''
In[10]: d['a']="ggggggg"
In[11]: d['a']
Out[11]: 'ggggggg'
In[12]: d['a']['pp']
Out[12]: ''
No errors again.
No matter how many levels nested.
pop no error also
dd=DefaultDict({"1":333333})