I'm trying to find the best way to get a list of all LDAP user accounts that belong to groups which are members of a groupOfNames using python-ldap. This is on an OpenLDAP server, not AD. I wrote the function below, which does the job but takes forever to run. I'm hoping either python-ldap has some builtin function that I'm not aware of, or there's something I can modify to make this run more quickly. If not, hopefully someone else will find this code useful. Thanks in advance for any help!
# Original (slow) version kept for reference: resolves the members of a
# groupOfNames by issuing one LDAP search per DN taken from a work list.
# NOTE(review): this copy lost its indentation, the docstring quotes and the
# bodies of several loops/branches (whatever was appended to `members`,
# `to_search` and `searched`); restore them before running.
def get_nested_members(con, dn):
con : LDAPObject
An authenticated python-ldap connection object
dn : string
The dn of the groupOfNames to be checked
members : list
A list of all accounts that are members of the given dn
members = []
searched = []
# Work list of DNs still to be looked up; starts with the requested group.
to_search = [dn]
while len(to_search) > 0:
current_dn = to_search.pop()
# First RDN of the DN (e.g. "cn=foo") is reused as the search filter.
cn = current_dn.split(',')[0]
# One search_s round-trip per DN -- this is what makes the function slow.
# NOTE(review): base_dn is neither a parameter nor a local of this function;
# presumably a module-level name -- confirm.
r = con.search_s(base_dn, ldap.SCOPE_SUBTREE, cn, [])[0][1]
# Dispatch on the objectClass of the record that came back.
if 'groupOfNames' in r['objectClass']:
if 'member' in r:
for i in r['member']:
if((i != current_dn) and (i not in searched)):
elif 'posixGroup' in r['objectClass']:
if 'memberUid' in r:
for i in r['memberUid']:
elif 'posixAccount' in r['objectClass']:
if 'uid' in r:
# NOTE(review): presumably the body of a lost `else:` branch -- confirm.
print('ERROR: encountered record of unknown type:')
pprint(str([current_dn, r]))
# De-duplicate before returning; the resulting order is not defined.
return list(set(members))
I realized that running ldapsearch repeatedly was the limiting factor, so I made a new version which builds a dictionary of ALL group and groupOfNames records first. It takes up a bit more memory than the old solution, but is less taxing on the LDAP server and runs significantly faster (down from ~15 minutes to <1 second for my application). I'll leave the original code below the new version for a reference of what not to do. Credit for the merge_dicts() function goes to Aaron Hall.
import ldap
def merge_dicts(*dict_args):
    """Merge any number of dicts into a new dict (shallow copy).

    The inputs are left untouched. When the same key appears in several
    dicts, the value from the right-most dict wins.

    Args:
        *dict_args: Zero or more mappings to merge, lowest precedence first.

    Returns:
        A new dict holding the combined key/value pairs; ``{}`` when called
        without arguments.
    """
    result = {}
    for dictionary in dict_args:
        # update() overwrites earlier keys, which gives later dicts precedence.
        result.update(dictionary)
    return result
# Faster version: instead of one LDAP search per DN it fetches every
# groupOfNames and every posixGroup once, indexes them by DN in a dict and
# then resolves nested membership purely in memory.
# NOTE(review): this copy lost its indentation, the closing docstring quotes
# and several statements (see the notes below); restore them before running.
def get_nested_members(con, dn, base_dn='dc=example'):
"""Search a groupOfNames and return all posixAccount members from all its subgroups
con: LDAPObject
An authenticated LDAP connection object
dn: string
The dn of the groupOfNames to be searched for members
(optional) base_dn: string
The base dn to search on. Make sure to change the default value to fit your LDAP server
members: list
A list of all nested members from the provided groupOfNames
# NOTE(review): `logging` is used here but only `import ldap` is visible in
# this snippet -- confirm it is imported elsewhere.
logging.info('Getting nested members of ' + str(dn))
print('Getting nested members of ' + str(dn))
# Accept either a single DN or a list of DNs as the starting work list.
if type(dn) is list:
to_search = [] + dn
elif type(dn) is str:
to_search = [dn]
# NOTE(review): an `else:` appears to be missing before the next two lines;
# they read like the fallback for an unsupported `dn` type -- confirm.
print('ERROR: Invalid dn value. Please supply either a sting or list of strings.')
return []
members = []
searched = []
# One search for all groupOfNames entries, then index them by DN.
groupOfNames_list = con.search_s(base_dn, ldap.SCOPE_SUBTREE, 'objectClass=groupOfNames', ['dn', 'member', 'cn'])
groupOfNames_dict = {}
for g in range(len(groupOfNames_list)):
groupOfNames_dict[groupOfNames_list[g][0]] = groupOfNames_list[g][1]
groupOfNames_list = None #To free up memory
# Same for all posixGroup entries.
group_list = con.search_s(base_dn, ldap.SCOPE_SUBTREE, 'objectClass=posixGroup', ['dn', 'memberUid', 'cn'])
group_dict = {}
for g in range(len(group_list)):
group_dict[group_list[g][0]] = group_list[g][1]
group_list = None #To free up memory
# Single DN -> attributes lookup table covering both kinds of group.
all_groups = merge_dicts(groupOfNames_dict, group_dict)
group_dict = None #To free up memory
# NOTE(review): this name differs from `groupOfNames_dict` assigned above, so
# it binds a new variable instead of releasing that dict -- looks like a typo.
groupOfNamesdict = None #To free up memory
while len(to_search) > 0:
search_dn = to_search.pop()
# NOTE(review): raises KeyError when search_dn is not one of the fetched
# groups, unless a lost try/except guarded this lookup -- confirm.
g = all_groups[search_dn]
# posixGroup: memberUid values are taken as members directly.
if 'memberUid' in g:
members += g['memberUid']
# groupOfNames: each member DN is examined by its prefix.
elif 'member' in g:
m = g['member']
for i in m:
# NOTE(review): the bodies of the three branches below are missing in this
# copy (what is stored in `members`, `to_search` and `searched`).
if i.startswith('uid='):
elif i.startswith('cn='):
if i not in searched:
# De-duplicate before returning; the resulting order is not defined.
return list(set(members))
I have code that adds an extra memcache instance at run time, but doing so makes my keys get lost. I know there are several libraries available, such as consistent_hash and hash_ring, but I am unable to use them in my code. I know ketama is available, but I couldn't find a Python code sample for it.
import random
import string
import memcache
# memcache.Client subclass from the question: adds a server to a live client.
# NOTE(review): this copy lost its indentation, the closing docstring quotes
# and the tail of add_server (the rest of the _Host(...) call and the
# statements announced by the last two comments); restore before running.
class MemcacheClient(memcache.Client):
""" A memcache subclass. It currently allows you to add a new host at run
Sadly, this truely messes with the our keys. I.E. Adding a host at runtime
effectively wipes our cache all together...Wonder why?
def _get_server(self, key):
""" Current implementation of Memcache client
# Pure pass-through to the parent's server selection.
return super(MemcacheClient, self)._get_server(key)
def add_server(self, server):
""" Adds a host at runtime to client
# Create a new host entry
# NOTE(review): memcache._Host is a private class of python-memcached; its
# signature may differ between library versions -- confirm.
server = memcache._Host(
server, self.debug, dead_retry=self.dead_retry,
# Add this to our server choices
# Update our buckets
def random_key(size):
    """Generate a random key made of ``size`` ASCII letters.

    Uses ``string.ascii_letters`` rather than ``string.letters``: the latter
    is locale-dependent and only exists on Python 2.

    Args:
        size: Number of characters to generate; 0 yields an empty string.

    Returns:
        A string of ``size`` randomly chosen upper/lower-case ASCII letters.
    """
    return ''.join(random.choice(string.ascii_letters) for _ in range(size))
# Demo from the question: store 100 random keys, add a server at runtime and
# report how many keys can still be read back.
# NOTE(review): indentation is lost in this copy, and Python 2 print
# statements are used below.
if __name__ == '__main__':
# We have 7 running memcached servers
# NOTE(review): the format string is empty in this copy (the server address
# template was stripped), so `'' % i` raises TypeError -- restore it.
servers = ['' % i for i in range(1,8)]
# We have 100 keys to split across our servers
keys = [random_key(10) for i in range(100)]
# Init our subclass
client = MemcacheClient(servers=servers)
# Distribute the keys on our servers
for key in keys:
client.set(key, 1)
# Check how many keys come back
valid_keys = client.get_multi(keys)
print '%s percent of keys matched' % ((len(valid_keys)/float(len(keys))) * 100)
# We add another server...and pow!
# NOTE(review): the add_server(...) call announced above is missing here.
print 'Added new server'
valid_keys = client.get_multi(keys)
print '%s percent of keys stil matched' % ((len(valid_keys)/float(len(keys))) * 100)
Well, basically you have to override the _get_server() method to change the server distribution algorithm.
I've done some searching over the internet and found this article on google, amix.dk/blog/post/19367, which is a very good material written by Amir Salihefendic, that helps a lot to understand how the ketama consistent hash algorithm works, and also has a ketama implementation on a Python class called HashRing made by him.
So I basically used his class and changed it a little to fit the Memcached client's needs. The modifications were replacing the md5 module, which was deprecated, and changing the string used to generate the keys for the servers from:
key = self.gen_key('%s:%s' % (node, i))
key = self.gen_key(
'%s:%s:%s:%s' % (node.address[0],
node.address[1], i, node.weight)
I also fixed a bug that caused an infinite loop on get_nodes() method when the algorithm didn't find a server at the first loop.
The old get_nodes() method (will enter infinite loop if no server is yielded).
# Old HashRing.get_nodes, quoted to show the problem: after walking the ring
# from the key's position to the end, the `while True` loop keeps cycling
# over the whole ring, so the generator never terminates.
# NOTE(review): indentation and the closing docstring quotes are lost here.
def get_nodes(self, string_key):
"""Given a string key it returns the nodes as a generator that can hold the key.
The generator is never ending and iterates through the ring
starting at the correct position.
if not self.ring:
yield None, None
node, pos = self.get_node_pos(string_key)
# First pass: from the key's position to the end of the sorted keys.
for key in self._sorted_keys[pos:]:
yield self.ring[key]
# Then wrap around and repeat forever.
while True:
for key in self._sorted_keys:
yield self.ring[key]
The new get_nodes() method:
# New HashRing.get_nodes: visits every ring position exactly once, starting
# at the key's position and wrapping around to the entries before it.
# NOTE(review): indentation is lost in this copy.
def get_nodes(self, string_key):
# NOTE(review): on an empty ring this yields the tuple (None, None), whereas
# every other item yielded is a node object -- callers must handle both.
if not self.ring:
yield None, None
node, pos = self.get_node_pos(string_key)
# From the key's position to the end of the sorted keys ...
for key in self._sorted_keys[pos:]:
# Skip positions whose key is no longer present in the ring dict.
if key in self.ring:
yield self.ring[key]
# ... then the positions that precede it (wrap-around).
for key in self._sorted_keys[:pos]:
if key in self.ring:
yield self.ring[key]
I've added a new forloop scope on add_node() as well as on remove_node() method to consider the weight of the server for adding more replicas.
Old way:
for i in xrange(0, self.replicas):
key = self.gen_key('%s:%s' % (node, i))
self.ring[key] = node
New way:
for i in xrange(0, self.replicas):
for x in range(0, node.weight):
key = self.gen_key(
'%s:%s:%s:%s' % (node.address[0],
node.address[1], i, node.weight)
if key not in self.ring:
self.ring[key] = node
The above code refers to the add_node() method, but the same idea applies to remove_node().
Well, maybe there are some other changes I've made, I just don't recall any other for now. This is the suited HashRing class:
import bisect
from hashlib import md5
class HashRing(object):
    """Consistent-hash ring (ketama style) that maps string keys to nodes.

    Every node must expose ``address`` (a ``(host, port)`` pair) and an
    integer ``weight``. Each node is placed on the ring at
    ``replicas * weight`` virtual points, so heavier nodes own a larger share
    of the key space. ``self.ring`` maps ring position -> node and
    ``self._sorted_keys`` holds the same positions in ascending order; the
    two are always updated together.
    """

    def __init__(self, nodes=None, replicas=3):
        """Create a ring, optionally pre-populated with ``nodes``.

        Args:
            nodes: Optional iterable of node objects (see class docstring).
            replicas: Number of virtual points per node and per unit of
                weight; more points give a more even key distribution.
        """
        self.replicas = replicas
        self.ring = dict()
        self._sorted_keys = []
        if nodes:
            for node in nodes:
                self.add_node(node)

    def _node_keys(self, node):
        """Yield the ring position of every virtual point of ``node``."""
        host, port = node.address[0], node.address[1]
        for i in range(self.replicas):
            for x in range(node.weight):
                label = '%s:%s:%s:%s' % (host, port, i, node.weight)
                if x:
                    # The first copy keeps the historical label so existing
                    # placements do not move; further copies get a suffix,
                    # otherwise every copy would hash to the same position
                    # and the weight would have no effect.
                    label = '%s:%s' % (label, x)
                yield self.gen_key(label)

    def add_node(self, node):
        """Add ``node`` to the ring at all of its virtual points.

        Positions already occupied (node added twice, or a hash collision)
        are left untouched.
        """
        for key in self._node_keys(node):
            if key not in self.ring:
                self.ring[key] = node
                bisect.insort(self._sorted_keys, key)

    def remove_node(self, node):
        """Remove ``node`` and all of its virtual points from the ring.

        Removing a node that is not on the ring is a no-op.
        """
        for key in self._node_keys(node):
            if key in self.ring:
                del self.ring[key]
                idx = bisect.bisect_left(self._sorted_keys, key)
                if (idx < len(self._sorted_keys)
                        and self._sorted_keys[idx] == key):
                    del self._sorted_keys[idx]

    def get_node(self, string_key):
        """Return the node responsible for ``string_key``.

        Returns ``None`` when the ring is empty.
        """
        return self.get_node_pos(string_key)[0]

    def get_node_pos(self, string_key):
        """Return ``(node, position)`` for ``string_key``.

        ``position`` is the index into the sorted ring positions of the first
        point whose value is >= the key's hash, wrapping to index 0 when the
        hash is larger than every point. Returns ``(None, None)`` when the
        ring is empty.
        """
        if not self.ring:
            return None, None
        key = self.gen_key(string_key)
        # Binary search instead of a linear scan over all virtual points.
        pos = bisect.bisect_left(self._sorted_keys, key)
        if pos == len(self._sorted_keys):
            pos = 0
        return self.ring[self._sorted_keys[pos]], pos

    def get_nodes(self, string_key):
        """Yield the nodes that can hold ``string_key``, best candidate first.

        The generator is finite: it walks the ring once, starting at the
        key's position and wrapping around. A node appears once per virtual
        point. Nothing is yielded for an empty ring, so callers can simply
        loop over the result.
        """
        if not self.ring:
            return
        _, pos = self.get_node_pos(string_key)
        keys = self._sorted_keys
        for key in keys[pos:] + keys[:pos]:
            # Guard against the ring being modified while iterating.
            if key in self.ring:
                yield self.ring[key]

    def gen_key(self, key):
        """Return the ring position of ``key`` as a (large) integer.

        md5 is used because it mixes well; it is not used for security.
        Text keys are encoded as UTF-8 before hashing.
        """
        if not isinstance(key, bytes):
            key = key.encode('utf-8')
        return int(md5(key).hexdigest(), 16)
I changed your class a little in order to make it more flexible for deciding when to use ketama algorithm, or the default - modulo.
I noticed that when writing your add_server() method you forgot to consider the weight of the server when appending it to the buckets list.
So this is how the new MemcacheClient would look like:
from consistent_hash import HashRing
# memcache.Client subclass from the answer: picks the server for a key either
# with the ketama HashRing or with the parent's default algorithm, and lets
# servers be added at runtime.
# NOTE(review): this copy lost its indentation, the closing docstring quotes,
# several `else:` lines and the ends of some calls; restore before running.
class MemcacheClient(memcache.Client):
""" A memcache subclass. It currently allows you to add a new host at run
available_algorithms = ['ketama', 'modulo']
hash_algorithm_index = 0
def __init__(self, hash_algorithm='ketama', *args, **kwargs):
super(MemcacheClient, self).__init__(*args, **kwargs)
if hash_algorithm in self.available_algorithms:
# NOTE(review): the call below is cut off in this copy.
self.hash_algorithm_index = self.available_algorithms.index(
if hash_algorithm == 'ketama':
# The ring is seeded with the servers the parent constructor created.
self.consistent_hash_manager = HashRing(nodes=self.servers)
# NOTE(review): presumably the body of a lost `else:` (non-ketama case).
self.consistent_hash_manager = None
# NOTE(review): presumably the body of a lost `else:` (unknown algorithm).
raise Exception(
"The algorithm \"%s\" is not implemented for this client. The "
"options are \"%s\""
"" % (hash_algorithm, " or ".join(self.available_algorithms))
def _get_server(self, key):
""" Returns the most likely server to hold the key
# NOTE(review): `self.hash_algorithm` is read here and in add_server, but no
# assignment to it is visible in this copy (only hash_algorithm_index) --
# confirm it is defined, e.g. as a property.
if self.hash_algorithm == 'ketama':
""" Basic concept of the Implementation of ketama algorithm
e.g. ring = {100:server1, 110:server2, 120:server3, 140:server4}
If the hash of the current key is 105, it server will be the next
bigger integer in the ring which is 110 (server2)
If a server is added on position 108 the key will be now allocated
to it and not to server 110. Otherwise if the server on position
110 is removed the key will now belong to de server 120.
If there's no bigger integer position in the ring then the hash of
the key, it will take the first server from the ring.
# The variable "servers" is the list of the servers in the ring
# starting from the next bigger integer to the hash of the key,
# till it finds the one that holds the key
servers_generator = self.consistent_hash_manager.get_nodes(key)
# Return the first candidate that accepts a connection.
for server in servers_generator:
if server.connect():
#print server.address[1]
return server, key
# No candidate could be connected to.
return None, None
# Any other algorithm: defer to the parent implementation.
return super(MemcacheClient, self)._get_server(key)
def add_server(self, server):
""" Adds a host at runtime to client
# Uncomment this to protect the Client from adding a server in case
# there's no reliable consistent hash algorithm such as MODULO
# NOTE(review): the comment above suggests the next four lines were meant
# to be commented out; as written they are live code -- confirm.
if not self.consistent_hash_manager:
raise Exception("The current consistent hash algorithm (\"%s\") is"
" not reliable for adding a new server"
"" % self.hash_algorithm)
# Create a new host entry
# NOTE(review): the remaining arguments of this call are cut off.
server = memcache._Host(
server, self.debug, dead_retry=self.dead_retry,
# Add this to our server choices
"""This for statement will ensure that a server with a bigger weight
will have more copies into the buckets increasing it's probability to
be retrieved.
# NOTE(review): the loop body and the add_node call under the final `if`
# are missing in this copy.
for i in range(server.weight):
# Adds this node to the circle
if self.consistent_hash_manager:
def random_key(size):
    """Generate a random key made of ``size`` ASCII letters.

    Uses ``string.ascii_letters`` rather than ``string.letters``: the latter
    is locale-dependent and only exists on Python 2.

    Args:
        size: Number of characters to generate; 0 yields an empty string.

    Returns:
        A string of ``size`` randomly chosen upper/lower-case ASCII letters.
    """
    return ''.join(random.choice(string.ascii_letters) for _ in range(size))
# Demo driver: stores 500 random keys through client_obj, adds extra servers
# and reports the share of keys that can still be read back.
# NOTE(review): this copy lost its indentation and several statements: the
# print( ... ) wrappers around the bare string expressions, the flush call
# announced by "Clear all previous keys", and the add_server call inside the
# `for server in extra_servers` loop. Restore before running.
def run_consistent_hash_test(client_obj):
# We have 500 keys to split across our servers
keys = [random_key(100) for i in range(500)]
"\n/////////// CONSISTENT HASH ALGORITHM \"%s\" //////////////"
"" % client_obj.hash_algorithm.upper()
print("\n->These are the %s servers:" % len(client_obj.servers))
str_servers = ""
for server in client_obj.servers:
str_servers += "%s:%s, " % (server.address[0], server.address[1])
# Clear all previous keys from memcache
# Distribute the keys over the servers
for key in keys:
client_obj.set(key, 1)
"\n%d keys distributed for %d server(s)\n"
"" % (len(keys), len(client_obj.servers))
# Check how many keys come back
valid_keys = client_obj.get_multi(keys)
"%s percent of keys matched, before adding extra servers.\n" \
"" %((len(valid_keys) / float(len(keys))) * 100)
# Add 5 new extra servers
interval_extra_servers = range(19, 24)
# NOTE(review): the format string is empty in this copy (the address
# template was stripped), so `'' % i` raises TypeError -- restore it.
extra_servers = ['' % i for i in interval_extra_servers]
for server in extra_servers:
# Check how many keys come back after adding the extra servers
valid_keys = client_obj.get_multi(keys)
print (
"Added %d new server(s).\n%s percent of keys still matched" \
"" % (len(interval_extra_servers),
(len(valid_keys) / float(len(keys))) * 100)
# Entry point of the demo: builds the client with the ketama algorithm.
# NOTE(review): indentation is lost, and the call to
# run_consistent_hash_test(client) is not present in this copy.
if __name__ == '__main__':
# We have 8 running memcached servers
interval_servers = range(11, 19)
# NOTE(review): the format string is empty in this copy (the address
# template was stripped), so `'' % i` raises TypeError -- restore it.
servers = ['' % i for i in interval_servers]
# NOTE(review): the next two lines were comment/docstring text whose markers
# were lost. They also call "modulo" the default, while
# MemcacheClient.__init__ above declares hash_algorithm='ketama' -- confirm.
Init our subclass. The hash_algorithm paramether can be "modulo"<-
(default) or "ketama" (the new one).
client = MemcacheClient(servers=servers, hash_algorithm='ketama')
If you run this class directly on terminal it will show a proper output
This worked for me: before creating a new host entry, add a condition so that the `server = memcache._Host(...)` line is executed only if server is None.
I know it's too late to answer this question, but I'm hoping it will be helpful for some. I have the working class that you can use directly. This will be a drop in replacement for the original memcache.Client.
# memcache.Client subclass that selects servers with a ring of RING_SIZE
# slots: each server is hashed onto several slots and a key is served by the
# server owning the first slot at or after the key's own slot.
# NOTE(review): this copy lost its indentation, every docstring delimiter and
# several statements (see the notes below); restore before running.
class KetamaMemcacheClient(memcache.Client):
This memcache client implements consistent hashing algorithm "ketama".
This will make sure that the cache miss happening while adding or removing
a node from the client to very minimal.
# Server weight means the number of slots given to one server. For better
# performance it should be between 100-200 - adjust the weight to see how
# the cache-miss rate changes.
# NOTE(review): the DEFAULT_SERVER_WEIGHT constant these lines describe (it is
# used in set_servers) is missing from this copy.
# Total number of slots on the ring.
# Adding or deleting a node only causes a 1 to 5 percent cache miss on the
# current configuration, i.e. K / RING_SIZE - where K means total
# keys.
RING_SIZE = 2 ** 16
def __init__(self, *args, **kwargs):
Add some special parameters to handle the servers allocation.
# Mapping between ring slot -> server.
self._ketama_server_ring = {}
# Sorted server slots on top of the virtual ring.
self._ketama_server_slots = []
# The ring attributes are created before the parent constructor runs.
# NOTE(review): presumably because the parent __init__ calls set_servers(),
# which needs them -- confirm against python-memcached.
super(KetamaMemcacheClient, self).__init__(*args, **kwargs)
def _get_server(self, key):
Get the memcache server corresponding to the given key.
:param key: The input query.
:return: A tuple with (server_obj, key).
# map the key on to the ring slot space.
h_key = self._generate_ring_slot(key)
# Linear scan over the slot list; returns the first server at or after the
# key's slot that accepts a connection (relies on the list being sorted).
for slot in self._ketama_server_slots:
if h_key <= slot:
server = self._ketama_server_ring[slot]
if server.connect():
return (server, key)
# Even after allocating the server, if the h_key won't fit
# on any server, then pick the first server on the ring.
server = self._ketama_server_ring[self._ketama_server_slots[0]] if \
self._ketama_server_slots else None
# Attempt the connection only when a server exists; the result is ignored.
server and server.connect()
return server, key
def add_server(self, server):
Add new server to the client.
:param servers: server host in <IP>:<PORT> format.
or in tuple of (<IP>:<PORT>, weight)
# NOTE(review): the rest of this call and the statements that register
# server_obj (servers list / ring placement) are cut off in this copy.
server_obj = memcache._Host(
server if isinstance(server, tuple) else (
self.debug, dead_retry=self.dead_retry,
def set_servers(self, servers):
Add a pool of servers into the client.
:param servers: List of server hosts in <IP>:<PORT> format.
List of tuples with each tuple of the format
(<IP>:<PORT>, weight)
# Set the default weight if weight isn't passed.
self.servers = [memcache._Host(
s if isinstance(s, tuple) else (s, self.DEFAULT_SERVER_WEIGHT),
self.debug, dead_retry=self.dead_retry,
flush_on_reconnect=self.flush_on_reconnect) for s in servers]
# Place all the servers on rings based on the slot allocation
# specifications.
# List comprehension used only for its side effect; the list is discarded.
[self._place_server_on_ring(s) for s in self.servers]
def _place_server_on_ring(self, server):
Place given server on the ring.
:param server: An instance of :class:~`memcache._Host`.
server_slots = self._get_server_slots_on_ring(server)
for slot in server_slots:
# A slot already owned by another server is kept as is (first one wins).
if slot not in self._ketama_server_ring:
self._ketama_server_ring[slot] = server
# There is a key collision (<<<1% chance).
# Discarding this scenario now.
# TODO: Handle it.
# Append the sorted server slot list
# NOTE(review): the statement that updates self._ketama_server_slots is
# missing in this copy.
def _get_server_slots_on_ring(self, server):
Returns list of slot on the ring for given server.
This make sure that the slots won't collid with others server.
:param server: An object of :class:~`memcache._Host`.
:return: list of slots on the ring.
server_slots = []
# One candidate slot per unit of weight, derived from "<ip>:<port>_<i>".
for i in range(0, server.weight):
server_key = "{}_{}".format("{}:{}".format(server.ip,
server.port), i)
# NOTE(review): the lines that hash server_key and append the slot to
# server_slots are missing, so as written an empty list is returned.
return server_slots
def _generate_ring_slot(self, key):
Hash function which give random slots on the ring. Hash functon make
sure that the key distribution is even as much as possible.
:param key: Key which need to be mapped to the hash space.
:type key: str
:return: hash key corresponding to `key`
# Simple hash method using python's internal hash algorithm.
#h_key = hash(key) & 0xffff
# crc32 based hashing
#h_key = ((crc32(key) & 0xffffffff) >> 16) & 0xffff
# For better randomness
# Keeps the low 16 bits of the CRC, i.e. a value in [0, RING_SIZE).
# NOTE(review): no import of crc32 is visible in this snippet (presumably
# zlib or binascii) -- confirm.
h_key = ((crc32(key) & 0xffffffff)) & 0xffff
return h_key
client = KetamaMemcacheClient(servers)
# This change in number of servers only affect very few key misses.
I haven't added remove_server method to remove some dead server from the configured server list. That is pretty easy by keeping a inverted server mapping and remove the slots allocated to that server.
Enjoy !
Python noob here so please bear with me! I have a list that looks like this:
bookList = [("Wuthering Heights", "fred"), ("Everville", "fred"), ("Wuthering Heights", "dan")]
What I’m trying to do is write a function that looks at each nested list and sees who shares books in common with who, depending who is logged in. For example, if dan was logged in, the system would say “fred also has plums”.
I have a dictionary set up that holds usernames as keys and passwords as their values.
I’m kind of struggling with list comprehension when they involve anything nested, and help would be greatly appreciated!
I don't think your existing data structure is really ideal for this. What I would do would be to pre-process it into a dictionary whose keys are the usernames and the values are sets of books. Then you can do a loop or list comprehension to compare the logged-in user with all the other users and see if there is anything in common. So:
from collections import defaultdict

# Pre-process the (book, user) pairs into {user: set of books} so that two
# users can be compared with a single set intersection.
bookdict = defaultdict(set)
for book, name in bookList:
    bookdict[name].add(book)

logged_in_user = 'fred'
for person, books in bookdict.items():
    if person == logged_in_user:
        # Never compare the logged-in user with themselves.
        continue
    common = books.intersection(bookdict[logged_in_user])
    if common:
        # sorted() keeps the output stable; set iteration order is arbitrary.
        print('%s also has %s' % (person, ', '.join(sorted(common))))
def common_books(user):
    """Print every other user who owns a book that ``user`` also owns.

    Reads the module-level ``bookList`` of ``(book, username)`` pairs and
    prints one line per matching pair, in list order. Nothing is printed
    when ``user`` owns no books or shares none.

    Args:
        user: Username of the logged-in user.
    """
    user_books = {b for b, u in bookList if u == user}
    for b, u in bookList:
        if b in user_books and u != user:
            # Single-argument call form prints identically on Python 2 and 3.
            print('{0} also has {1}'.format(u, b))
If you're trying to get the books that fred has in the list
filter(lambda x: x[1] == "fred", bookList)
Another version as per Bakuriu's comment.
class Session:
    """Stand-in for the logged-in user's session (hard-coded sample data)."""

    def __init__(self):
        # Books owned by the logged-in user.
        self.books = ["Wuthering Heights", "Everville"]
        self.username = "fred"


# (book, username) pairs for every user in the system.
bookList = [("Wuthering Heights", "fred"), ("Everville", "fred"), ("Wuthering Heights", "dan")]

if __name__ == "__main__":
    session = Session()
    for title, owner in bookList:
        # Report other users who own one of the session user's books.
        if owner != session.username and title in session.books:
            # Single-argument call form prints identically on Python 2 and 3.
            print("{} also has {}".format(owner, title))
I'm trying to write a program that creates an address book with contact names, emails, phone numbers, etc. I store each contact as a dictionary and then place each person (dictionary) into a global list. I then convert the list to a string using repr() and write it to a file. When I try to reload the list and write what it contains, I get a list of empty dictionaries. Please help me figure out what is wrong.
Here is my code:
# Address-book code from the question: every contact is meant to be a dict
# that is collected in a global list and written to `listfile` via repr().
# NOTE(review): this copy lost its indentation and the end of addnewperson
# (the append/write/close statements); restore before running.
# NOTE(review): `list` shadows the builtin of the same name.
list = []
listfile = 'phonebook.txt'
# Subclasses dict but keeps its data in a separate attribute `d`, so the
# dict part of the instance itself stays empty.
class bookEntry(dict):
# Running count of live entries, maintained by __init__/__del__.
total = 0
def __init__(self):
bookEntry.total += 1
self.d = {}
def __del__(self):
bookEntry.total -= 1
class Person(bookEntry):
# NOTE(review): no call to bookEntry.__init__ is visible here, so `self.d`
# would not exist when addnewperson runs -- confirm whether a line was lost.
def __init__(self, n):
self.n = n
print '%s has been created' % (self.n)
def addnewperson(self, n, e = '', ph = '', note = ''):
# `file` is the Python 2 builtin; mode 'w' truncates the phonebook each call.
f = file(listfile, 'w')
self.d['name'] = n
self.d['email'] = e
self.d['phone'] = ph
self.d['note'] = note
# repr() of the global list: entries are rendered through dict's repr, which
# does not include the data stored in `d`.
listStr = repr(list)
I start the program with a startup() function:
# Interactive entry point: asks whether to add or retrieve an entry and, for
# a person, prompts for the contact details and stores them.
# NOTE(review): indentation is lost in this copy; only the add/person path is
# present. raw_input is the Python 2 name of input().
def startup():
aor = raw_input('Hello! Would you like to add an entry or retrieve one?')
if aor == 'add':
info = raw_input('Would you like to add a person or a company?')
if info == 'person':
n = raw_input('Please enter this persons name:')
e = raw_input('Please enter this persons email address:')
ph = raw_input('Please enter this persons phone number:')
note = raw_input('Please add any notes if applicable:')
# The name is passed twice: to the constructor and to addnewperson.
X = Person(n)
X.addnewperson(n, e, ph, note)
I add these answers to the prompts:
Hello! Would you like to add an entry or retrieve one?add
Would you like to add a person or a company?person
Please enter this persons name:Pig
Please enter this persons email address:pig#brickhouse.com
Please enter this persons phone number:333-333-3333
Please add any notes if applicable:one of three
Pig has been created
When I open phonebook.txt, this is what I see:
Why are empty dictionaries being returned?
You're deriving from dict, but storing all the elements in a member d. Hence, repr gives you a string representing an empty dict. If you want to use a bookEntry as a dict, insert the info with
self['name'] = n
instead of
self.d['name'] = n
(But really, you shouldn't be inheriting from dict here. Also, please don't use list as an identifier, it's the name of a builtin.)
you should save self.d instead of self:
listStr = repr(alist)
By the way, don't use list as the name of a variable; doing so overwrites the builtin list.
Your problem is that the X.d dictionary is not the same as the dictionary "bookEntry" is inheriting from. Therefore repr(X) is not showing X.d
A solution might be to override repr in BookEntry:
def __repr__(self):
    """Return the repr of the wrapped dict ``self.d``.

    The name must be exactly ``__repr__`` (two trailing underscores); with
    any other spelling Python never calls it and ``repr(obj)`` falls back to
    the inherited implementation.
    """
    return repr(self.d)
I am having problems understanding how to work with query results. I asked about half a dozen questions about this but I still do not understand. I copy from previous code and I make it work somehow but since I don't understand the underlying concept the code breaks down if I make a minor change. I would really appreciate if you could tell me how you visualize what is happenning here and explain it to me. Thank you.
# Inbound mail handler from the question: logs the message and bumps the
# score of the user whose address is the second entry of the CC header.
# NOTE(review): indentation is lost in this copy and no datastore write
# (put) is visible after the score is changed.
class ReceiveEmail(InboundMailHandler):
def receive(self, message):
logging.info("Received email from %s" % message.sender)
plaintext = message.bodies(content_type='text/plain')
for text in plaintext:
txtmsg = ""
txtmsg = text[1].decode()
logging.info("Body is %s" % txtmsg)
# split(",") keeps the blank that follows each comma, so every address
# after the first starts with a space.
logging.info("CC email is %s" % ((message.cc).split(",")[1]))
query = User.all()
query.filter("userEmail =", ((message.cc).split(",")[1]))
# At most one entity; an empty list when nothing matches the filter.
results = query.fetch(1)
for result in results:
result.userScore += 1
# Raises IndexError when the query matched nothing (results is empty).
um = results[0]
um.userScore = result.userScore
In this code, as I understand it, the query takes the second email address from the cc list and fetches the result.
Then I increment the userScore by 1.
Next, I want to update this item in Datastore so I say
um = results[0]
um.userScore = result.userScore
But this gives an index out of range error:
um = results[0]
IndexError: list index out of range
Why? I am imagining that results[0] is the zeroeth item of the results. Why is it out of range? Only thing I can think of is that, the list may be None. But I don't understand why. It must have the 1 item that was fetched.
Also, if I try to test for the first email address by changing the index from [1] to [0]
query.filter("userEmail =", ((message.cc).split(",")[0]))
then I don't get the IndexError.
What am I doing wrong here?
See comments:
left a space in front of the emails (starting with the second email), so the query was not matching them;
>>> cc.split(",")
['cc12#example.com', ' cc13#example.com', ' cc13#example.com']
adding a space after comma fixed the problem:
>>> listcc = cc.split(", ")
>>> listcc
['cc12#example.com', 'cc13#example.com', 'cc13#example.com']
To understand the code break it down and look at it piece by piece:
# Inbound mail handler from the answer: increments the score of every user
# whose address appears in the CC header of the received message.
# NOTE(review): indentation is lost in this copy.
class ReceiveEmail(InboundMailHandler):
def receive(self, message):
logging.info("Received email from %s" % message.sender)
# Get a list of CC addresses. This is basically a for loop.
# strip() removes the blank left after each comma by split(",").
cc_addresses = [address.strip() for address in message.cc.split(",")]
# The CC list goes with the message, not the bodies.
logging.info("CC email is %s" % (cc_addresses))
# Get and iterate over all of the *plain-text* bodies in the email.
plaintext = message.bodies(content_type='text/plain')
for text in plaintext:
txtmsg = ""
txtmsg = text[1].decode()
logging.info("Body is %s" % txtmsg)
# Setup a query object.
query = User.all()
# Filter the user objects to get only the emails in the CC list.
query.filter("userEmail IN", cc_addresses)
# But, only get at most 10 users.
users = query.fetch(10)
logging.info('Got %d user entities from the datastore.' % len(users))
# Iterate over each of the users increasing their score by one.
for user in users:
user.userScore += 1
# Now, write the users back to the datastore.
# NOTE(review): the write call announced above is missing in this copy, so
# the log line below reports a write that is not performed here.
logging.info('Wrote %d user entities.' % len(users))
I would make an adjustment to your model structure. When you create the User entity, I would set the key_name to the email address. You will be able to make your queries much more efficient.
Some references:
List Comprehension.
Query Object.
Say I have a simple table that contains username, firstname, lastname.
How do I express this in berkeley Db?
I'm currently using bsddb as the interface.
You have to pick one "column" as the key (must be unique; I imagine that would be "username" in your case) -- the only way searches will ever possibly happen. The other columns can be made to be the single string value of that key by any way you like, from pickling to simple joining with a character that's guaranteed to never occur in any of the columns, such as `\0' for many kind of "readable text strings".
If you need to be able to search by different keys you'll need other, supplementary and separate bsddb databases set up as "indices" into your main table -- it's lots of work, and there's lots of literature on the subject. (Alternatively, you move to a higher-abstraction technology, such as sqlite, which handles the indexing neatly on your behalf;-).
tl;dr: To express multiple columns in an ordered key/value store like Berkeley DB, you need to learn about key composition. Look up my other answers about bsddb to learn more.
There are several ways to do that using an ordered key/value store.
The simplest solution is to store documents as json values with a correct key.
Now you probably want to build index over those columns to retrieve documents without having to iterate over all the hashmap to find the correct object. For that you can use a secondaryDB that will build automatically the index for you. Or you can build the index yourself.
If you don't want to deal with key packing (and that's a good idea when starting out), you can take advantage of DB.set_bt_compare, which allows you to use cPickle, json or msgpack for both keys and values while still having an order that makes sense for creating indices and doing queries. This is a slower method, but it introduces the pattern of key composition.
To fully take advantage what ordered key is, you can make use of Cursor.set_range(key) to set the position of the db at the beginning of a query.
Another pattern, called the EAV pattern, stores tuples that follow the scheme (entity, attribute, value); you then build various indices by using permutations of that tuple. I learned this pattern while studying Datomic.
For a less resource-hungry database, you will go the "statically typed" way and store as much of the common information as possible in the "metadata" table, and split documents (which are really RDBMS tables) into their own hashmaps.
To get you started, here is an example database using bsddb (but you could build it using another ordered key/value store like wiredtiger or leveldb) that implements the EAV pattern. In this implementation I swap EAV for IKV, which translates to unique Identifier, Key, Value. The overall result is that you have a fully indexed, schemaless document database. I think it's a good compromise between efficiency and ease of use.
import struct
from json import dumps
from json import loads
from bsddb3.db import DB
from bsddb3.db import DBEnv
from bsddb3.db import DB_BTREE
from bsddb3.db import DB_CREATE
from bsddb3.db import DB_INIT_MPOOL
from bsddb3.db import DB_LOG_AUTO_REMOVE
def pack(*values):
    """Serialize ``values`` into one byte string of tagged fields.

    Each value becomes a one-byte tag followed by its payload:

    * ``b'1'`` -- plain ``int``: 8-byte big-endian signed integer.
    * ``b'2'`` -- string: 8-byte big-endian length, then the raw bytes
      (text is encoded as UTF-8 on Python 3).
    * ``b'3'`` -- anything else: 8-byte big-endian length, then the value's
      JSON encoding.

    The byte layout is identical to what the Python 2 implementation
    produced, so existing databases stay readable.

    Args:
        *values: Values to serialize, in order.

    Returns:
        The concatenated fields as a byte string; empty when called without
        arguments.

    Raises:
        struct.error: if an ``int`` does not fit into 64 signed bits.
        TypeError: if a value is neither int, string nor JSON-serializable.
    """
    def _pack_one(value):
        # Exact type checks on purpose: bool (an int subclass) and Python 2
        # long/unicode keep going through the JSON branch as before.
        if type(value) is int:
            return b'1' + struct.pack('>q', value)
        if type(value) is bytes:
            return b'2' + struct.pack('>q', len(value)) + value
        if type(value) is str:
            # Only reachable on Python 3, where str is text rather than bytes.
            data = value.encode('utf-8')
            return b'2' + struct.pack('>q', len(data)) + data
        # The old `encoding='utf-8'` keyword was the Python 2 default and no
        # longer exists in Python 3's json.dumps.
        data = dumps(value)
        if not isinstance(data, bytes):
            data = data.encode('utf-8')
        return b'3' + struct.pack('>q', len(data)) + data
    return b''.join(map(_pack_one, values))
def unpack(packed):
    """Decode a byte string of tagged fields back into a list of values.

    Fields are read front to back. Each starts with a one-byte tag:

    * ``b'1'`` -- 8-byte big-endian signed integer.
    * ``b'2'`` -- 8-byte big-endian length followed by that many string
      bytes. Returned as ``str``: unchanged on Python 2, decoded from UTF-8
      on Python 3.
    * any other tag -- 8-byte big-endian length followed by a JSON document.

    The input is walked iteratively with an offset, so long inputs neither
    hit the recursion limit nor get re-copied once per field.

    Args:
        packed: Byte string of concatenated tagged fields.

    Returns:
        List of decoded values in their original order; ``[]`` for empty
        input.

    Raises:
        struct.error: if a field header is truncated.
        ValueError: if a JSON payload is malformed.
    """
    values = []
    offset = 0
    end = len(packed)
    while offset < end:
        # Slice (not index) so the tag is a byte string on Python 2 and 3.
        kind = packed[offset:offset + 1]
        header = offset + 1
        if kind == b'1':
            value = struct.unpack_from('>q', packed, header)[0]
            offset = header + 8
        else:
            size = struct.unpack_from('>q', packed, header)[0]
            start = header + 8
            raw = packed[start:start + size]
            offset = start + size
            if kind == b'2':
                # On Python 2 str is already the byte string type.
                value = raw if str is bytes else raw.decode('utf-8')
            else:
                value = loads(raw.decode('utf-8'))
        values.append(value)
    return values
# Schemaless document store on top of two Berkeley DB tables:
#   tuples: pack(uid, key)        -> pack(value)
#   index : pack(key, value, uid) -> ''
# (layout taken from add() below).
# NOTE(review): this copy lost its indentation and many statements (the
# environment flags and open() calls, several return/break/else lines, the
# delete calls and the body of close()); restore before running.
class TupleSpace(object):
"""Generic database"""
def __init__(self, path):
self.env = DBEnv()
self.env.set_cache_max(10, 0)
self.env.set_cachesize(5, 0)
# NOTE(review): the flag list and the env.open(...) call are cut off here.
flags = (
self.env.log_set_config(DB_LOG_AUTO_REMOVE, True)
self.env.set_lg_max(1024 ** 3)
# create vertices and edges k/v stores
def new_store(name):
flags = DB_CREATE
elements = DB(self.env)
# NOTE(review): the elements.open(...) call using `name` and `flags` is
# missing in this copy.
return elements
self.tuples = new_store('tuples')
self.index = new_store('index')
self.txn = None
# Return the properties stored for `uid` as a dict.
def get(self, uid):
cursor = self.tuples.cursor()
def __get():
# Position the cursor at the first key >= pack(uid, '').
# NOTE(review): relies on bsddb's Cursor.set_range semantics -- confirm.
record = cursor.set_range(pack(uid, ''))
if not record:
key, value = record
while True:
# Keys of `tuples` unpack to [uid, property name].
other, key = unpack(key)
if other == uid:
value = unpack(value)[0]
yield key, value
record = cursor.next()
if record:
key, value = record
tuples = dict(__get())
return tuples
# Store every keyword argument as a property of `uid` and index it.
def add(self, uid, **properties):
for key, value in properties.items():
self.tuples.put(pack(uid, key), pack(value))
self.index.put(pack(key, value, uid), '')
def delete(self, uid):
# delete item from main table and index
cursor = self.tuples.cursor()
index = self.index.cursor()
record = cursor.set_range(pack(uid, ''))
if record:
key, value = record
# NOTE(review): presumably the body of a lost `else:` -- confirm.
raise Exception('not found')
while True:
other, key = unpack(key)
if other == uid:
# remove tuple from main index
# remove it from index
value = unpack(value)[0]
# Move the index cursor onto the matching index entry.
# NOTE(review): the cursor delete calls themselves are missing here.
index.set(pack(key, value, uid))
# continue
record = cursor.next()
if record:
key, value = record
# Implemented by calling add() with the same arguments.
def update(self, uid, **properties):
self.add(uid, **properties)
def close(self):
# Dump every (uid, key, value) triple of the main table.
def debug(self):
for key, value in self.tuples.items():
uid, key = unpack(key)
value = unpack(value)[0]
print(uid, key, value)
def query(self, key, value=''):
"""return `(key, value, uid)` tuples that where
`key` and `value` are expressed in the arguments"""
cursor = self.index.cursor()
# Prefix to compare against: (key, value), or just (key,) when value is falsy.
match = (key, value) if value else (key,)
record = cursor.set_range(pack(key, value))
if not record:
while True:
key, _ = record
other = unpack(key)
# True when every element of `match` equals the corresponding element of the
# unpacked index key. `reduce` and `cmp` are Python 2 builtins.
# NOTE(review): the initial value/closing parenthesis of reduce() is cut off.
ok = reduce(
lambda previous, x: (cmp(*x) == 0) and previous,
zip(match, other),
if ok:
yield other
record = cursor.next()
if not record:
# Usage example: a counter stored under uid 0 is used to hand out new uids.
# NOTE(review): indentation is lost in this copy.
db = TupleSpace('tmp')
# you can use a tuple to store a counter
db.add(0, counter=0)
# And then have a procedure doing the required work
# to always have a fresh uid
def make_uid():
counter = db.get(0)
# NOTE(review): only the local dict is incremented; no write-back of the new
# counter value to the database is visible here, so repeated calls would
# return the same uid -- confirm whether a db.update(...) line was lost.
counter['counter'] += 1
return counter['counter']
amirouche = make_uid()
db.add(amirouche, username="amirouche", age=30)