Custom key function for python defaultdict - python

What is a good way to define a custom key function analogous to the key argument to list.sort, for use in a collections.defaultdict?
Here's an example use case:
import collections
class Path(object):
    """A path defined by a start point, an end point, and any extra features."""

    def __init__(self, start, end, *other_features):
        # Keep the two endpoints and whatever additional data was supplied.
        self._first = start
        self._last = end
        self._rest = other_features

    def startpoint(self):
        """Return the point this path starts from."""
        return self._first

    def endpoint(self):
        """Return the point this path ends at."""
        return self._last

    # Maybe it has __eq__ and __hash__, maybe not
paths = [... a list of Path objects ...]
by_endpoint = collections.defaultdict(list)
for p in paths:
by_endpoint[p.endpoint()].append(p)
# do stuff that depends on lumping paths with the same endpoint together
What I desire is a way to tell by_endpoint to use Path.endpoint as the key function, similar to the key argument to list.sort, and not have to put this key definition into the Path class itself (via __eq__ and __hash__), since it is just as sensible to also support "lumping by start point" as well.

Something like this maybe:
from collections import defaultdict
class defaultkeydict(defaultdict):
    """A defaultdict that maps every key through a caller-supplied key function.

    Analogous to the ``key=`` argument of ``list.sort``: each key passed to
    ``[]`` is normalized by ``key_func`` before the underlying defaultdict
    sees it.  If the key function cannot be applied to a key, the raw key is
    used unchanged, so ordinary lookups (e.g. by string) still work.
    """

    def __init__(self, default_factory, key=lambda x: x, *args, **kwargs):
        super(defaultkeydict, self).__init__(default_factory, *args, **kwargs)
        self.key_func = key

    def get_key(self, raw):
        """Normalize *raw* through the key function, falling back to *raw*."""
        try:
            return self.key_func(raw)
        except Exception:
            return raw

    def __getitem__(self, raw):
        return super(defaultkeydict, self).__getitem__(self.get_key(raw))

    def __setitem__(self, raw, value):
        super(defaultkeydict, self).__setitem__(self.get_key(raw), value)
Note the logic that falls back to the passed-in key if the key function can't be executed. That way you can still access the items using strings or whatever keys.
Now:
p = Path("Seattle", "Boston")
d = defaultkeydict(list, key=lambda x: x.endpoint())
d[p].append(p)
print(d) # defaultdict(<type 'list'>, {'Boston': [<__main__.Path object at ...>]})

Related

How to evaluate values in a dictionary?

I have a dictionary (d).
I want to use this dictionary with different values for a and b.
How can I update the dictionary's output without calling the whole dictionary again?
a=1
b=2
d = {'value_1': a+b , 'value_2':a-b}
Based on your feedback to my first answer, it sounds like what you want is something that behaves more-or-less like a spreadsheet. Below is how to implement one that is very dictionary-like. It's based on Raymond Hettinger's ActiveState recipe by that name with some modifications and extensions.
Note that except for the special case of keyword arguments passed when an instance of the class is created, the values in it should all be strings, not numerical values.
Also note that since it uses eval(), for security purposes it should only be used with input from trusted sources.
I think it's important to understand that although the Spreadsheet class presented below isn't technically a dictionary, it behaves a lot like (a subclass of) one and if used instead of a regular dictionary will give you the capabilities you want. See the description of mapping in the online documentation's glossary.
from collections.abc import MutableMapping
class SpreadSheet(MutableMapping):
    """A dict-like container whose values are formula strings evaluated on access.

    Cells hold text; ``sheet[key]`` runs the stored formula through ``eval()``
    with the other cells visible by name, so formulas can refer to each other
    (evaluated recursively).  Because it uses eval(), only feed it trusted
    input.
    """

    def __init__(self, tools=None, **kwargs):
        # Coerce any non-string initial values to text so every cell stores a
        # formula (or literal) as a string.
        self._cells = {name: (val if isinstance(val, str) else str(val))
                       for name, val in kwargs.items()}
        self._tools = {'__builtins__': None}  # Prevent eval() from supplying.
        if tools is not None:
            self._tools.update(tools)  # Add any caller-supplied functions.

    def clear(self):
        return self._cells.clear()

    def copy(self):
        # NOTE: returns a plain dict of the raw cell strings, not a SpreadSheet.
        return self._cells.copy()

    def __contains__(self, key):
        return key in self._cells

    def __setitem__(self, key, formula):
        self._cells[key] = formula

    def __getitem__(self, key):
        # `self` serves as the locals mapping, exposing other cells by name.
        return eval(self._cells[key], self._tools, self)

    def __len__(self):
        return len(self._cells)

    def __iter__(self):
        return iter(self._cells)

    def __delitem__(self, key):
        del self._cells[key]

    def getformula(self, key):
        """ Return raw un-evaluated contents of cell. """
        return self._cells[key]

    def update(self, *args, **kwargs):
        for name, formula in dict(*args, **kwargs).items():
            self[name] = formula
Sample usage:
d = SpreadSheet(a=1, b=2)
d.update({'x': 'x1',
'x1': 'a+2',
'x2': 'b+1',
'x3': 'a+b'})
xx = d['x']
print(xx) # -> 3
You could do it by storing functions in a separate dictionary and then create new ones by evaluating it.
Here's a simple example illustrating what I'm suggesting:
funcs = {'value_1': lambda: a+b,
'value_2': lambda: a-b}
a=1
b=2
d = {k: v() for k, v in funcs.items()}
print(d) # -> {'value_1': 3, 'value_2': -1}
a=3
b=5
d = {k: v() for k, v in funcs.items()}
print(d) # -> {'value_1': 8, 'value_2': -2}

Custom OrderedDict that returns itself

I have the following custom class:
class MyArray (OrderedDict):
    """OrderedDict subclass: indexing with an iterable of keys returns a new
    MyArray of those entries; a single key behaves like a plain dict lookup."""
    def __init__ (self,*args):
        # NOTE(review): redundant -- it only forwards to the parent initializer.
        OrderedDict.__init__(self,*args)
    def __getitem__ (self, key):
        if not hasattr (key, '__iter__'):
            # Single (non-iterable) key: ordinary dict lookup.
            # NOTE(review): on Python 3 strings are iterable, so string keys
            # would take the multi-key branch instead -- confirm intended use.
            return OrderedDict.__getitem__ (self,key)
        # Iterable of keys: collect (key, value) pairs into a new MyArray.
        # self[k] re-enters this __getitem__ for each element.
        return MyArray((k,self[k]) for k in key)
This class does exactly what i want for when i have multiple keys, but doesn't do what i want for single keys.
Let me demonstrate what my code outputs:
x = MyArray()
x[0] = 3
x[1] = 4
x[2] = 5
print x[1,0,2]
MyArray([(1,4),(0,3),(2,5)])
But then:
print x[1]
4
I want it to be:
MyArray([(1,4)])
Here was my attempt to fix it to act the way i want (which led to infinite recursion):
class MyArray (OrderedDict):
    # NOTE(review): this attempt still recurses infinitely when printed:
    # OrderedDict.__repr__ goes through items()/self[key], which re-enters
    # this __getitem__ and wraps each value in yet another MyArray.
    def __getitem__ (self, key):
        if not hasattr (key, '__iter__'):
            # Wrap even a single entry in a one-element MyArray.
            return MyArray({key:OrderedDict.__getitem__ (self,key)})
        # Iterable of keys: bypass self[...] to avoid re-entering __getitem__.
        return MyArray((k,OrderedDict.__getitem__ (self,k)) for k in key)
The key here is to realize that self[k] is the same as self.__getitem__(k) so you don't want to use self[k] inside __getitem__, unless you are in fact trying to do some recursion. Instead always use OrderedDict.__getitem__ (self,key).
On an unrelated note, you generally don't want to create a method that just calls the same method of the parent class, ie:
class MyArray (OrderedDict):
    def __init__ (self,*args):
        # NOTE(review): this override adds nothing -- it merely forwards to
        # the parent class, so the whole method can simply be deleted.
        OrderedDict.__init__(self,*args)
Just delete the method and python will call the parent class method for you, inheritance is awesome :).
update:
After some digging I found that you get infinite recursion when you try to print a MyArray because OrderedDict.__repr__ calls OrderDict.items which then calls OrderDict.__getitem__ (in the form of self[key]), then it calls __repr__ on each of the items ... The issue here is that you're modifying __getitem__ to do something very different than what it does in the Parent class. If you want this class to have the full functionality of a python class, you'll need to override every method that uses self[key] anywhere in the method. You can start with items, ie something like:
def items(self):
    """od.items() -> list of (key, value) pairs in od."""
    # Go through OrderedDict.__getitem__ directly so a customized
    # self[key] is not invoked for each value.
    pairs = []
    for k in self:
        pairs.append((k, OrderedDict.__getitem__(self, k)))
    return pairs
When you hit this kind of thing it's often better to drop the subclassing and just have the OrderedDict be an attribute of the new class, something like:
class MyArray(object):
    """Wraps an OrderedDict (composition instead of inheritance); indexing
    always yields a new MyArray holding the selected (key, value) pairs."""

    def __init__(self, *args):
        self.data = OrderedDict(*args)

    def __getitem__(self, key):
        # An iterable key selects several entries; anything else selects one.
        if hasattr(key, '__iter__'):
            return MyArray([(k, self.data[k]) for k in key])
        return MyArray([(key, self.data[key])])
The infinite recursion was happening in self.items() as Bi Rico pointed out.
Here is a code that works (essentially overrides self.items())
class MyArray(OrderedDict):
    """OrderedDict where indexing (single key or iterable of keys) returns a
    new MyArray of the selected entries."""

    def __getitem__(self, key):
        fetch = OrderedDict.__getitem__  # bypass self[...] to avoid recursion
        if hasattr(key, '__iter__'):
            return MyArray((k, fetch(self, k)) for k in key)
        return MyArray({key: fetch(self, key)})

    def items(self):
        """Called when the dictionary is printed."""
        fetch = OrderedDict.__getitem__
        return [(k, fetch(self, k)) for k in self]
The code above would have worked without the items definition if I had inherited from dict instead of OrderedDict.

Python OrderedSet with .index() method

Does anyone know about a fast OrderedSet implementation for python that:
remembers insertion order
has an index() method (like the one lists offer)
All implementations I found are missing the .index() method.
You can always add it in a subclass. Here is a basic implementation for the OrderedSet you linked in a comment:
class IndexOrderedSet(OrderedSet):
    """OrderedSet extended with a list-style index() method."""

    def index(self, elem):
        """Return the position of *elem* in insertion order.

        Raises KeyError if elem is not in the set.
        """
        # BUG FIX: the original tested `key in self.map`, but no name `key`
        # exists in this scope -- the parameter is `elem` (NameError at
        # runtime).
        if elem in self.map:
            return next(i for i, e in enumerate(self) if e == elem)
        else:
            raise KeyError("That element isn't in the set")
You mentioned you only need add, index, and in-order iteration. You can get this by using an OrderedDict as storage. As a bonus, you can subclass the collections.Set abstract class to get the other set operations frozensets support:
from itertools import count, izip
from collections import OrderedDict, Set
class IndexOrderedSet(Set):
    """An OrderedFrozenSet-like object
    Allows constant time 'index'ing
    But doesn't allow you to remove elements"""

    def __init__(self, iterable=()):
        # Each element maps to the value of the insertion counter at the time
        # it was added, giving O(1) index() lookups.
        self.num = count()
        # PORTABILITY FIX: itertools.izip does not exist on Python 3; the
        # builtin zip is the lazy equivalent there and is correct here.
        self.dict = OrderedDict(zip(iterable, self.num))

    def add(self, elem):
        """Add *elem* if absent; an existing element keeps its index."""
        if elem not in self:
            self.dict[elem] = next(self.num)

    def index(self, elem):
        """Return the insertion index of *elem* (KeyError if absent)."""
        return self.dict[elem]

    def __contains__(self, elem):
        return elem in self.dict

    def __len__(self):
        return len(self.dict)

    def __iter__(self):
        return iter(self.dict)

    def __repr__(self):
        # list(...) so the repr shows the elements themselves rather than a
        # Python 3 keys-view object.
        return 'IndexOrderedSet({})'.format(list(self.dict.keys()))
You can't subclass collections.MutableSet because you can't support removing elements from the set and keep the indexes correct.

How to implement an ordered, default dict?

I would like to combine OrderedDict() and defaultdict() from collections in one object, which shall be an ordered, default dict.
Is this possible?
The following (using a modified version of this recipe) works for me:
from collections import OrderedDict, Callable
class DefaultOrderedDict(OrderedDict):
    """An OrderedDict with defaultdict-style handling of missing keys."""
    # Source: http://stackoverflow.com/a/6190500/562769

    def __init__(self, default_factory=None, *a, **kw):
        # PORTABILITY FIX: use the builtin callable() instead of
        # isinstance(..., collections.Callable) -- that alias was removed
        # from `collections` in Python 3.10.
        if (default_factory is not None and
                not callable(default_factory)):
            raise TypeError('first argument must be callable')
        OrderedDict.__init__(self, *a, **kw)
        self.default_factory = default_factory

    def __getitem__(self, key):
        try:
            return OrderedDict.__getitem__(self, key)
        except KeyError:
            return self.__missing__(key)

    def __missing__(self, key):
        """Create, store and return default_factory() for a missing key."""
        if self.default_factory is None:
            raise KeyError(key)
        self[key] = value = self.default_factory()
        return value

    def __reduce__(self):
        """Pickle support."""
        if self.default_factory is None:
            args = tuple()
        else:
            args = self.default_factory,
        # BUG FIX: the fifth element must be an *iterator* of (key, value)
        # pairs; on Python 3, dict.items() returns a view, which pickle
        # rejects.
        return type(self), args, None, None, iter(self.items())

    def copy(self):
        """Shallow copy: values are shared with the original."""
        return self.__copy__()

    def __copy__(self):
        return type(self)(self.default_factory, self)

    def __deepcopy__(self, memo):
        import copy
        # BUG FIX: deep-copy the items as a concrete list (a dict view is not
        # deep-copyable on Python 3) and propagate `memo` as the protocol
        # requires.
        return type(self)(self.default_factory,
                          copy.deepcopy(list(self.items()), memo))

    def __repr__(self):
        # NOTE(review): the name printed here does not match the class name;
        # kept byte-for-byte for backward compatibility.
        return 'OrderedDefaultDict(%s, %s)' % (self.default_factory,
                                               OrderedDict.__repr__(self))
Here is another possibility, inspired by Raymond Hettinger's super() Considered Super, tested on Python 2.7.X and 3.4.X:
from collections import OrderedDict, defaultdict
class OrderedDefaultDict(OrderedDict, defaultdict):
    """Ordered + default dict via cooperative multiple inheritance.

    NOTE(review): relies on the MRO (OrderedDict -> defaultdict -> dict) so
    that dict's missing-key path finds defaultdict.__missing__, which reads
    the default_factory attribute set below.  The surrounding answer states
    it was tested on Python 2.7.x and 3.4.x; on later CPython versions
    OrderedDict is a C type, so confirm the two bases still combine without
    an instance lay-out conflict before relying on this.
    """
    def __init__(self, default_factory=None, *args, **kwargs):
        #in python3 you can omit the args to super
        super(OrderedDefaultDict, self).__init__(*args, **kwargs)
        # Picked up by defaultdict.__missing__ when a key is absent.
        self.default_factory = default_factory
If you check out the class's MRO (aka, help(OrderedDefaultDict)), you'll see this:
class OrderedDefaultDict(collections.OrderedDict, collections.defaultdict)
| Method resolution order:
| OrderedDefaultDict
| collections.OrderedDict
| collections.defaultdict
| __builtin__.dict
| __builtin__.object
meaning that when an instance of OrderedDefaultDict is initialized, it defers to the OrderedDict's init, but this one in turn will call the defaultdict's methods before calling __builtin__.dict, which is precisely what we want.
If you want a simple solution that doesn't require a class, you can just use OrderedDict.setdefault(key, default=None) or OrderedDict.get(key, default=None). If you only get / set from a few places, say in a loop, you can easily just setdefault.
totals = collections.OrderedDict()
for i, x in some_generator():
totals[i] = totals.get(i, 0) + x
It is even easier for lists with setdefault:
agglomerate = collections.OrderedDict()
for i, x in some_generator():
agglomerate.setdefault(i, []).append(x)
But if you use it more than a few times, it is probably better to set up a class, like in the other answers.
Here's another solution to think about if your use case is simple like mine and you don't necessarily want to add the complexity of a DefaultOrderedDict class implementation to your code.
from collections import OrderedDict
keys = ['a', 'b', 'c']
items = [(key, None) for key in keys]
od = OrderedDict(items)
(None is my desired default value.)
Note that this solution won't work if one of your requirements is to dynamically insert new keys with the default value. A tradeoff of simplicity.
Update 3/13/17 - I learned of a convenience function for this use case. Same as above but you can omit the line items = ... and just:
od = OrderedDict.fromkeys(keys)
Output:
OrderedDict([('a', None), ('b', None), ('c', None)])
And if your keys are single characters, you can just pass one string:
OrderedDict.fromkeys('abc')
This has the same output as the two examples above.
You can also pass a default value as the second arg to OrderedDict.fromkeys(...).
Another simple approach would be to use dictionary get method
>>> from collections import OrderedDict
>>> d = OrderedDict()
>>> d['key'] = d.get('key', 0) + 1
>>> d['key'] = d.get('key', 0) + 1
>>> d
OrderedDict([('key', 2)])
>>>
A simpler version of #zeekay 's answer is:
from collections import OrderedDict
class OrderedDefaultListDict(OrderedDict):  # name according to default
    """OrderedDict that auto-creates an empty list for each missing key."""

    def __missing__(self, key):
        default = []  # change to whatever default you want
        self[key] = default
        return default
A simple and elegant solution building on #NickBread.
Has a slightly different API to set the factory, but good defaults are always nice to have.
class OrderedDefaultDict(OrderedDict):
    """OrderedDict with a class-level default factory (list by default).

    Override the `factory` attribute on the class or an instance to change
    what gets created for missing keys.
    """

    factory = list

    def __missing__(self, key):
        fresh = self.factory()
        self[key] = fresh
        return fresh
I created slightly fixed and more simplified version of the accepted answer, actual for python 3.7.
from collections import OrderedDict
from copy import copy, deepcopy
import pickle
from typing import Any, Callable
class DefaultOrderedDict(OrderedDict):
    """Ordered dictionary with defaultdict-style missing-key handling.

    Simplified version of the classic recipe, targeting Python 3.7+.
    """

    def __init__(
        self,
        default_factory: Callable[[], Any],
        *args,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        self.default_factory = default_factory

    def __getitem__(self, key):
        try:
            return super().__getitem__(key)
        except KeyError:
            # Missing key: let __missing__ create and store the default.
            return self.__missing__(key)

    def __missing__(self, key):
        fresh = self.default_factory()
        self[key] = fresh
        return fresh

    def __reduce__(self):
        # Pickle support: (class, ctor args, state, list items, dict items).
        return type(self), (self.default_factory, ), None, None, iter(self.items())

    def copy(self):
        """Shallow copy: values are shared with the original."""
        return self.__copy__()

    def __copy__(self):
        return type(self)(self.default_factory, self)

    def __deepcopy__(self, memo):
        cloned = deepcopy(tuple(self.items()), memo)
        return type(self)(self.default_factory, cloned)

    def __repr__(self):
        return f'{self.__class__.__name__}({self.default_factory}, {OrderedDict(self).__repr__()})'
And, that may be even more important, provided some tests.
a = DefaultOrderedDict(list)
# testing default
assert a['key'] == []
a['key'].append(1)
assert a['key'] == [1, ]
# testing repr
assert repr(a) == "DefaultOrderedDict(<class 'list'>, OrderedDict([('key', [1])]))"
# testing copy
b = a.copy()
assert b['key'] is a['key']
c = copy(a)
assert c['key'] is a['key']
d = deepcopy(a)
assert d['key'] is not a['key']
assert d['key'] == a['key']
# testing pickle
saved = pickle.dumps(a)
restored = pickle.loads(saved)
assert restored is not a
assert restored == a
# testing order
a['second_key'] = [2, ]
a['key'] = [3, ]
assert list(a.items()) == [('key', [3, ]), ('second_key', [2, ])]
Inspired by other answers on this thread, you can use something like,
from collections import OrderedDict
class OrderedDefaultDict(OrderedDict):
    """OrderedDict whose missing keys auto-vivify to nested instances of
    this same class, allowing arbitrarily deep d[a][b][c] chains."""

    def __missing__(self, key):
        child = OrderedDefaultDict()
        self[key] = child
        return child
I would like to know if there're any downsides of initializing another object of the same class in the missing method.
I tested the defaultdict and discovered it's also sorted!
Maybe it was just a coincidence, but anyway you can use the sorted function:
sorted(s.items())
I think it's simpler.

How to properly subclass dict and override __getitem__ & __setitem__

I am debugging some code and I want to find out when a particular dictionary is accessed. Well, it's actually a class that subclasses dict and implements a couple extra features. Anyway, what I would like to do is subclass dict myself and override __getitem__ and __setitem__ to produce some debugging output. Right now, I have
class DictWatch(dict):
    """dict subclass that logs every key get/set for debugging.

    The optional 'name_label' entry, once set, identifies this instance in
    the log output.
    """

    def __init__(self, *args, **kwargs):
        # BUG FIX: the original passed `args` (the tuple of arguments) instead
        # of unpacking it, so DictWatch({'a': 1}) or DictWatch(a=1) failed.
        dict.__init__(self, *args, **kwargs)

    def __getitem__(self, key):
        val = dict.__getitem__(self, key)
        # BUG FIX: the original was missing the opening parenthesis around the
        # %-format argument tuple (a syntax error).
        log.info("GET %s['%s'] = %s" % (str(dict.get(self, 'name_label')), str(key), str(val)))
        return val

    def __setitem__(self, key, val):
        # BUG FIX: same missing parenthesis as in __getitem__.
        log.info("SET %s['%s'] = %s" % (str(dict.get(self, 'name_label')), str(key), str(val)))
        dict.__setitem__(self, key, val)
'name_label' is a key which will eventually be set that I want to use to identify the output. I have then changed the class I am instrumenting to subclass DictWatch instead of dict and changed the call to the superconstructor. Still, nothing seems to be happening. I thought I was being clever, but I wonder if I should be going a different direction.
Thanks for the help!
Another issue when subclassing dict is that the built-in __init__ doesn't call update, and the built-in update doesn't call __setitem__. So, if you want all setitem operations to go through your __setitem__ function, you should make sure that it gets called yourself:
class DictWatch(dict):
    """dict subclass that prints every get/set/update, routing all writes
    through __setitem__ (the built-in __init__ and update would bypass it)."""

    def __init__(self, *args, **kwargs):
        # Funnel the initial contents through update() -> __setitem__.
        self.update(*args, **kwargs)

    def __getitem__(self, key):
        result = dict.__getitem__(self, key)
        print('GET', key)
        return result

    def __setitem__(self, key, val):
        print('SET', key, val)
        dict.__setitem__(self, key, val)

    def __repr__(self):
        return '%s(%s)' % (type(self).__name__, dict.__repr__(self))

    def update(self, *args, **kwargs):
        print('update', args, kwargs)
        for key, val in dict(*args, **kwargs).items():
            self[key] = val
What you're doing should absolutely work. I tested out your class, and aside from a missing opening parenthesis in your log statements, it works just fine. There are only two things I can think of. First, is the output of your log statement set correctly? You might need to put a logging.basicConfig(level=logging.DEBUG) at the top of your script.
Second, __getitem__ and __setitem__ are only called during [] accesses. So make sure you only access DictWatch via d[key], rather than d.get() and d.set()
Consider subclassing UserDict or UserList. These classes are intended to be subclassed whereas the normal dict and list are not, and contain optimisations.
That should not really change the result (which should work, for good logging threshold values):
your init should be :
def __init__(self,*args,**kwargs) : dict.__init__(self,*args,**kwargs)
instead, because if you call your method with DictWatch([(1,2),(2,3)]) or DictWatch(a=1,b=2) this will fail.
(or, better, don't define a constructor for this)
As Andrew Pate's answer proposed, subclassing collections.UserDict instead of dict is much less error prone.
Here is an example showing an issue when inheriting dict naively:
class MyDict(dict):
    """dict that stores every value multiplied by 10 -- but only for writes
    that actually go through __setitem__ (init and update bypass it)."""

    def __setitem__(self, key, value):
        dict.__setitem__(self, key, value * 10)
d = MyDict(a=1, b=2) # Bad! MyDict.__setitem__ not called
d.update(c=3) # Bad! MyDict.__setitem__ not called
d['d'] = 4 # Good!
print(d) # {'a': 1, 'b': 2, 'c': 3, 'd': 40}
UserDict inherits from collections.abc.MutableMapping, so this works as expected:
class MyDict(collections.UserDict):
    """UserDict that stores every value multiplied by 10; UserDict routes
    __init__ and update through __setitem__, so every write is scaled."""

    def __setitem__(self, key, value):
        scaled = value * 10
        super().__setitem__(key, scaled)
d = MyDict(a=1, b=2) # Good: MyDict.__setitem__ correctly called
d.update(c=3) # Good: MyDict.__setitem__ correctly called
d['d'] = 4 # Good
print(d) # {'a': 10, 'b': 20, 'c': 30, 'd': 40}
Similarly, you only have to implement __getitem__ to automatically be compatible with key in my_dict, my_dict.get, …
Note: UserDict is not a subclass of dict, so isinstance(UserDict(), dict) will fail (but isinstance(UserDict(), collections.abc.MutableMapping) will work).
All you will have to do is
class BatchCollection(dict):
    """dict subclass for collecting batches."""

    def __init__(self, inpt=None):
        # BUG FIX: the original used a mutable default argument (inpt={});
        # the None-sentinel idiom avoids sharing one dict across calls.
        if inpt is None:
            inpt = {}
        super(BatchCollection, self).__init__(inpt)
A sample usage for my personal use
### EXAMPLE
class BatchCollection(dict):
    """dict keyed by (database_name, table_name) tuples with iterable values.

    Writes through [] enforce the key/value contract; note that plain-dict
    initialization does not go through __setitem__.
    """

    def __init__(self, inpt=None):
        # BUG FIX: mutable default argument (inpt={}) replaced with the
        # None-sentinel idiom.
        if inpt is None:
            inpt = {}
        super(BatchCollection, self).__init__(inpt)

    def __setitem__(self, key, item):
        # PORTABILITY FIX: collections.Iterable was removed in Python 3.10;
        # the ABC lives in collections.abc.
        from collections.abc import Iterable
        if (isinstance(key, tuple) and len(key) == 2
                and isinstance(item, Iterable)):
            super(BatchCollection, self).__setitem__(key, item)
        else:
            raise Exception(
                "Valid key should be a tuple (database_name, table_name) "
                "and value should be iterable")
Note: tested only in python3

Categories

Resources