Python memcache consistent hashing using ketama

I have code which adds an extra memcache instance at run time, but doing so makes my existing keys miss. I know there are several libraries available, like consistent_hash and hash_ring, but I am unable to use them in my code. I know ketama is available, but I couldn't find a Python code sample for it.
import random
import string

import memcache


class MemcacheClient(memcache.Client):
    """ A memcache subclass. It currently allows you to add a new host at run
    time.

    Sadly, this truly messes with our keys, i.e. adding a host at runtime
    effectively wipes our cache altogether... Wonder why?
    """

    def _get_server(self, key):
        """ Current implementation of the memcache client """
        return super(MemcacheClient, self)._get_server(key)

    def add_server(self, server):
        """ Adds a host at runtime to the client """
        # Create a new host entry
        server = memcache._Host(
            server, self.debug, dead_retry=self.dead_retry,
            socket_timeout=self.socket_timeout,
            flush_on_reconnect=self.flush_on_reconnect
        )
        # Add this to our server choices
        self.servers.append(server)
        # Update our buckets
        self.buckets.append(server)


def random_key(size):
    """ Generates a random key """
    return ''.join(random.choice(string.letters) for _ in range(size))


if __name__ == '__main__':
    # We have 7 running memcached servers
    servers = ['127.0.0.1:1121%d' % i for i in range(1, 8)]
    # We have 100 keys to split across our servers
    keys = [random_key(10) for i in range(100)]
    # Init our subclass
    client = MemcacheClient(servers=servers)

    # Distribute the keys over our servers
    for key in keys:
        client.set(key, 1)

    # Check how many keys come back
    valid_keys = client.get_multi(keys)
    print '%s percent of keys matched' % ((len(valid_keys) / float(len(keys))) * 100)

    # We add another server... and pow!
    client.add_server('127.0.0.1:11219')
    print 'Added new server'

    valid_keys = client.get_multi(keys)
    print '%s percent of keys still matched' % ((len(valid_keys) / float(len(keys))) * 100)

Well, basically you have to override the _get_server() method to change the server distribution algorithm.
I searched the internet and found this article, amix.dk/blog/post/19367, written by Amir Salihefendic. It is very good material for understanding how the ketama consistent hashing algorithm works, and it also contains his ketama implementation in a Python class called HashRing.
So I basically used his class and changed it a little to fit the memcached client's needs. The modifications were replacing the deprecated md5 module and changing the string used to generate the keys for the servers from:
key = self.gen_key('%s:%s' % (node, i))
to:
key = self.gen_key(
'%s:%s:%s:%s' % (node.address[0],
node.address[1], i, node.weight)
)
I also fixed a bug that caused an infinite loop in the get_nodes() method when the algorithm didn't find a server on the first pass.
The old get_nodes() method (it will enter an infinite loop if no server is yielded):
def get_nodes(self, string_key):
    """Given a string key it returns the nodes as a generator that can hold the key.

    The generator is never ending and iterates through the ring
    starting at the correct position.
    """
    if not self.ring:
        yield None, None

    node, pos = self.get_node_pos(string_key)
    for key in self._sorted_keys[pos:]:
        yield self.ring[key]

    while True:
        for key in self._sorted_keys:
            yield self.ring[key]
The new get_nodes() method:
def get_nodes(self, string_key):
    if not self.ring:
        yield None, None

    node, pos = self.get_node_pos(string_key)
    for key in self._sorted_keys[pos:]:
        if key in self.ring:
            yield self.ring[key]

    for key in self._sorted_keys[:pos]:
        if key in self.ring:
            yield self.ring[key]
I also added a nested for loop to the add_node() and remove_node() methods so that the server's weight is taken into account when adding replicas.
Old way:
for i in xrange(0, self.replicas):
    key = self.gen_key('%s:%s' % (node, i))
    self.ring[key] = node
    self._sorted_keys.append(key)
New way:
for i in xrange(0, self.replicas):
    for x in range(0, node.weight):
        key = self.gen_key(
            '%s:%s:%s:%s' % (node.address[0],
                             node.address[1], i, node.weight)
        )
        if key not in self.ring:
            self.ring[key] = node
            self._sorted_keys.append(key)
The code above is from the add_node() method, but the same idea applies to remove_node().
Well, maybe there are some other changes I've made; I just don't recall any others for now. This is the adapted HashRing class:
from hashlib import md5
class HashRing(object):
def __init__(self, nodes=None, replicas=3):
"""Manages a hash ring.
`nodes` is a list of objects that have a proper __str__ representation.
`replicas` indicates how many virtual points should be used pr. node,
replicas are required to improve the distribution.
"""
self.replicas = replicas
self.ring = dict()
self._sorted_keys = []
if nodes:
for node in nodes:
self.add_node(node)
def add_node(self, node):
"""Adds a `node` to the hash ring (including a number of replicas).
"""
for i in xrange(0, self.replicas):
"""This will ensure that a server with a bigger weight will have
more copies into the ring increasing it's probability to be retrieved.
"""
for x in range(0, node.weight):
key = self.gen_key(
'%s:%s:%s:%s' % (node.address[0],
node.address[1], i, node.weight)
)
if key not in self.ring:
self.ring[key] = node
self._sorted_keys.append(key)
self._sorted_keys.sort()
def remove_node(self, node):
"""Removes `node` from the hash ring and its replicas.
"""
for i in xrange(0, self.replicas):
for x in range(node.weight):
key = self.gen_key(
'%s:%s:%s:%s' % (node.address[0],
node.address[1], i, node.weight)
)
if key in self.ring:
del self.ring[key]
self._sorted_keys.remove(key)
def get_node(self, string_key):
"""
Given a string key a corresponding node in the hash ring is returned.
If the hash ring is empty, `None` is returned.
"""
return self.get_node_pos(string_key)[0]
def get_node_pos(self, string_key):
"""Given a string key a corresponding node in the hash ring is returned
along with it's position in the ring.
If the hash ring is empty, (`None`, `None`) is returned.
"""
if not self.ring:
return None, None
key = self.gen_key(string_key)
nodes = self._sorted_keys
for i in xrange(0, len(nodes)):
node = nodes[i]
if key <= node:
return self.ring[node], i
return self.ring[nodes[0]], 0
def get_nodes(self, string_key):
"""Given a string key it returns the nodes as a generator that can hold
the key.
The generator is never ending and iterates through the ring
starting at the correct position.
"""
if not self.ring:
yield None, None
node, pos = self.get_node_pos(string_key)
for key in self._sorted_keys[pos:]:
if key in self.ring:
yield self.ring[key]
for key in self._sorted_keys[:pos]:
if key in self.ring:
yield self.ring[key]
@staticmethod
def gen_key(key):
"""Given a string key it returns a long value,
this long value represents a place on the hash ring.
md5 is currently used because it mixes well.
"""
m = md5()
m.update(key)
return long(m.hexdigest(), 16)
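Just as an illustration (this demo is not part of the original answer), the adapted HashRing can be exercised on its own. FakeHost below is a hypothetical stand-in for memcache._Host, providing only the address and weight attributes the ring actually reads:

class FakeHost(object):
    """Hypothetical stand-in for memcache._Host (only what HashRing needs)."""
    def __init__(self, ip, port, weight=1):
        self.address = (ip, port)
        self.weight = weight

hosts = [FakeHost('127.0.0.1', 11211 + i) for i in range(3)]
ring = HashRing(nodes=hosts)

server = ring.get_node('session:1234')
print 'key maps to %s:%s' % server.address

# Adding a fourth host should remap only a fraction of the keys.
ring.add_node(FakeHost('127.0.0.1', 11219))
server = ring.get_node('session:1234')
print 'key now maps to %s:%s' % server.address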
I changed your class a little in order to make it more flexible about deciding when to use the ketama algorithm and when to use the default (modulo).
I noticed that when writing your add_server() method you forgot to consider the weight of the server when appending it to the buckets list.
So this is how the new MemcacheClient looks:
from consistent_hash import HashRing
class MemcacheClient(memcache.Client):
""" A memcache subclass. It currently allows you to add a new host at run
time.
"""
available_algorithms = ['ketama', 'modulo']
hash_algorithm_index = 0
def __init__(self, hash_algorithm='ketama', *args, **kwargs):
super(MemcacheClient, self).__init__(*args, **kwargs)
if hash_algorithm in self.available_algorithms:
# Remember the chosen algorithm; _get_server() and the test below use it.
self.hash_algorithm = hash_algorithm
self.hash_algorithm_index = self.available_algorithms.index(
hash_algorithm)
if hash_algorithm == 'ketama':
self.consistent_hash_manager = HashRing(nodes=self.servers)
else:
self.consistent_hash_manager = None
else:
raise Exception(
"The algorithm \"%s\" is not implemented for this client. The "
"options are \"%s\""
"" % (hash_algorithm, " or ".join(self.available_algorithms))
)
def _get_server(self, key):
""" Returns the most likely server to hold the key
"""
if self.hash_algorithm == 'ketama':
""" Basic concept of the Implementation of ketama algorithm
e.g. ring = {100:server1, 110:server2, 120:server3, 140:server4}
If the hash of the current key is 105, it server will be the next
bigger integer in the ring which is 110 (server2)
If a server is added on position 108 the key will be now allocated
to it and not to server 110. Otherwise if the server on position
110 is removed the key will now belong to de server 120.
If there's no bigger integer position in the ring then the hash of
the key, it will take the first server from the ring.
"""
# The variable "servers" is the list of the servers in the ring
# starting from the next bigger integer to the hash of the key,
# till it finds the one that holds the key
servers_generator = self.consistent_hash_manager.get_nodes(key)
for server in servers_generator:
if server.connect():
#print server.address[1]
return server, key
return None, None
else:
return super(MemcacheClient, self)._get_server(key)
def add_server(self, server):
""" Adds a host at runtime to client
"""
# Uncomment this to keep the client from adding a server when the
# configured hash algorithm (e.g. modulo) is not reliable for redistribution
"""
if not self.consistent_hash_manager:
raise Exception("The current consistent hash algorithm (\"%s\") is"
" not reliable for adding a new server"
"" % self.hash_algorithm)
"""
# Create a new host entry
server = memcache._Host(
server, self.debug, dead_retry=self.dead_retry,
socket_timeout=self.socket_timeout,
flush_on_reconnect=self.flush_on_reconnect
)
# Add this to our server choices
self.servers.append(server)
"""This for statement will ensure that a server with a bigger weight
will have more copies into the buckets increasing it's probability to
be retrieved.
"""
for i in range(server.weight):
self.buckets.append(server)
# Adds this node to the circle
if self.consistent_hash_manager:
self.consistent_hash_manager.add_node(server)
def random_key(size):
""" Generates a random key
"""
return ''.join(random.choice(string.letters) for _ in range(size))
def run_consistent_hash_test(client_obj):
# We have 500 keys to split across our servers
keys = [random_key(100) for i in range(500)]
print(
"\n/////////// CONSISTENT HASH ALGORITHM \"%s\" //////////////"
"" % client_obj.hash_algorithm.upper()
)
print("\n->These are the %s servers:" % len(client_obj.servers))
str_servers = ""
for server in client_obj.servers:
str_servers += "%s:%s, " % (server.address[0], server.address[1])
print("******************************************************************")
print(str_servers)
print("******************************************************************")
# Clear all previous keys from memcache
client_obj.flush_all()
# Distribute the keys over the servers
for key in keys:
client_obj.set(key, 1)
print(
"\n%d keys distributed for %d server(s)\n"
"" % (len(keys), len(client_obj.servers))
)
# Check how many keys come back
valid_keys = client_obj.get_multi(keys)
print(
"%s percent of keys matched, before adding extra servers.\n" \
"" %((len(valid_keys) / float(len(keys))) * 100)
)
# Add 5 new extra servers
interval_extra_servers = range(19, 24)
extra_servers = ['127.0.0.1:112%d' % i for i in interval_extra_servers]
for server in extra_servers:
client_obj.add_server(server)
# Check how many keys come back after adding the extra servers
valid_keys = client_obj.get_multi(keys)
print (
"Added %d new server(s).\n%s percent of keys still matched" \
"" % (len(interval_extra_servers),
(len(valid_keys) / float(len(keys))) * 100)
)
print("\n***************************************************************"
"****\n")
if __name__ == '__main__':
# We have 8 running memcached servers
interval_servers = range(11, 19)
servers = ['127.0.0.1:112%d' % i for i in interval_servers]
"""
Init our subclass. The hash_algorithm parameter can be "modulo" or
"ketama" (the new one, used in this test).
"""
client = MemcacheClient(servers=servers, hash_algorithm='ketama')
run_consistent_hash_test(client)
If you run this module directly from a terminal it will print the test output.
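For comparison (reusing the servers list and run_consistent_hash_test defined above), the same subclass can be pointed back at python-memcache's default modulo distribution by passing the other supported value:

# Falls back to the parent class's modulo-based _get_server().
modulo_client = MemcacheClient(servers=servers, hash_algorithm='modulo')
run_consistent_hash_test(modulo_client)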

This worked for me: before creating a new host entry, add a condition, i.e. only execute the server = memcache._Host(...) line if server is None.
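Interpreting that terse suggestion, the guard might look roughly like this inside add_server(); this is one possible reading, not the answerer's exact code:

def add_server(self, server):
    """ Adds a host at runtime to the client """
    # Only build a new _Host entry when we were handed an address string,
    # i.e. when there is no host object for it yet (hypothetical guard).
    if not isinstance(server, memcache._Host):
        server = memcache._Host(
            server, self.debug, dead_retry=self.dead_retry,
            socket_timeout=self.socket_timeout,
            flush_on_reconnect=self.flush_on_reconnect
        )
    self.servers.append(server)
    self.buckets.append(server)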

I know it's too late to answer this question, but I'm hoping it will be helpful for someone. I have a working class that you can use directly. It is a drop-in replacement for the original memcache.Client.
# Imports needed by the class below: python-memcache and the standard
# library's crc32 (used by _generate_ring_slot).
from zlib import crc32

import memcache


class KetamaMemcacheClient(memcache.Client):
"""
This memcache client implements the "ketama" consistent hashing algorithm.
It makes sure that cache misses caused by adding or removing a node
from the client stay very minimal.
"""
#
# Server weight means the number of slots given to one server. For better
# performance it should be between 100 and 200 - adjust the weight to see
# how the cache miss rate changes.
#
DEFAULT_SERVER_WEIGHT = 200
# Total number of slots on the ring.
# Addition or deletion of a node should only cause a 1 to 5 percent cache
# miss on the current configuration, i.e. K / RING_SIZE, where K is the
# total number of keys.
RING_SIZE = 2 ** 16
def __init__(self, *args, **kwargs):
"""
Add some special parameters to handle the servers allocation.
"""
# Mapping between ring slot -> server.
self._ketama_server_ring = {}
# Sorted server slots on top of the virtual ring.
self._ketama_server_slots = []
super(KetamaMemcacheClient, self).__init__(*args, **kwargs)
def _get_server(self, key):
"""
Get the memcache server corresponding to the given key.
:param key: The input query.
:return: A tuple with (server_obj, key).
"""
# map the key on to the ring slot space.
h_key = self._generate_ring_slot(key)
for slot in self._ketama_server_slots:
if h_key <= slot:
server = self._ketama_server_ring[slot]
if server.connect():
return (server, key)
# Even after allocating the server, if the h_key won't fit
# on any server, then pick the first server on the ring.
server = self._ketama_server_ring[self._ketama_server_slots[0]] if \
self._ketama_server_slots else None
server and server.connect()
return server, key
def add_server(self, server):
"""
Add new server to the client.
:param servers: server host in <IP>:<PORT> format.
or in tuple of (<IP>:<PORT>, weight)
"""
server_obj = memcache._Host(
server if isinstance(server, tuple) else (
server, self.DEFAULT_SERVER_WEIGHT),
self.debug, dead_retry=self.dead_retry,
socket_timeout=self.socket_timeout,
flush_on_reconnect=self.flush_on_reconnect)
self._place_server_on_ring(server_obj)
def set_servers(self, servers):
"""
Add a pool of servers into the client.
:param servers: List of server hosts in <IP>:<PORT> format.
or
List of tuples with each tuple of the format
(<IP>:<PORT>, weight)
"""
# Set the default weight if weight isn't passed.
self.servers = [memcache._Host(
s if isinstance(s, tuple) else (s, self.DEFAULT_SERVER_WEIGHT),
self.debug, dead_retry=self.dead_retry,
socket_timeout=self.socket_timeout,
flush_on_reconnect=self.flush_on_reconnect) for s in servers]
# Place all the servers on rings based on the slot allocation
# specifications.
[self._place_server_on_ring(s) for s in self.servers]
def _place_server_on_ring(self, server):
"""
Place given server on the ring.
:param server: An instance of :class:~`memcache._Host`.
"""
server_slots = self._get_server_slots_on_ring(server)
for slot in server_slots:
if slot not in self._ketama_server_ring:
self._ketama_server_ring[slot] = server
self._ketama_server_slots.append(slot)
else:
# There is a key collision (<<1% chance).
# Ignoring this scenario for now.
# TODO: Handle it.
pass
# Keep the server slot list sorted.
self._ketama_server_slots.sort()
def _get_server_slots_on_ring(self, server):
"""
Returns a list of slots on the ring for the given server.
This makes sure that the slots won't collide with other servers' slots.
:param server: An object of :class:~`memcache._Host`.
:return: list of slots on the ring.
"""
server_slots = []
for i in range(0, server.weight):
server_key = "{}_{}".format("{}:{}".format(server.ip,
server.port), i)
server_slots.append(self._generate_ring_slot(server_key))
return server_slots
def _generate_ring_slot(self, key):
"""
Hash function which gives well-spread slots on the ring. The hash
function makes sure that the key distribution is as even as possible.
:param key: Key which need to be mapped to the hash space.
:type key: str
:return: hash key corresponding to `key`
"""
# Simple hash method using python's internal hash algorithm.
#h_key = hash(key) & 0xffff
# crc32 based hashing
#h_key = ((crc32(key) & 0xffffffff) >> 16) & 0xffff
# For better randomness
h_key = ((crc32(key) & 0xffffffff)) & 0xffff
return h_key
client = KetamaMemcacheClient(servers)
# This change in the number of servers only causes a few key misses.
client.add_server('127.0.0.1:11218')
I haven't added a remove_server method to remove a dead server from the configured server list. That is pretty easy to do by keeping an inverted server mapping and removing the slots allocated to that server.
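For illustration only (this is not part of the original answer), a remove_server along those lines might look roughly like the sketch below, reusing the same slot generation as _place_server_on_ring():

def remove_server(self, server_obj):
    """
    Sketch: drop a server and free the slots it occupies on the ring.
    :param server_obj: the memcache._Host instance that was added earlier.
    """
    for slot in self._get_server_slots_on_ring(server_obj):
        # Only remove slots that really belong to this server; a colliding
        # slot may have been kept by another server when it was added.
        if self._ketama_server_ring.get(slot) is server_obj:
            del self._ketama_server_ring[slot]
            self._ketama_server_slots.remove(slot)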
Enjoy!

Related

How to send the encrypted data to the client?

Here is my client code.
import socket, pickle,time
from encryption import *
def Main():
host = '127.0.0.1'
port = 5006
s = socket.socket()
s.connect((host, port))
m= encryption()
pri_key,pub_key,n=m.generating_keys(1)
filename = input("Filename? -> ")
if filename != 'q':
data=[filename,pub_key,n]
msg=pickle.dumps(data)
s.send(msg)
data = s.recv(1024)
data=data.decode('utf-8')
if data == '1':
size = s.recv(1024)
size = int(size.decode('utf-8'))
filesize = size
message = input("File exists, " + str(filesize) +"Bytes, download? (Y/N)? -> ")
if message == 'Y':
s.send(b'1')
count=0
f = open('new_'+filename, 'wb')
data = s.recv(1024)
data=int.from_bytes(data,byteorder="little")
msg=m.decrypt(data,pri_key,n)
totalRecv = len(msg)
f.write(msg)
#count=0
while totalRecv<filesize:
#time.sleep(.300)
decipher = s.recv(1024)
decipher=int.from_bytes(decipher,byteorder="little")
print(decipher)
if(decipher==0):
break
msg=m.decrypt(decipher,pri_key,n)
totalRecv += len(msg)
f.write(msg)
print ("{0:.2f}".format((totalRecv/float(filesize))*100)+ "% Done")
print ("Download Complete!")
f.close()
else:
print ("File Does Not Exist!")
s.close()
if __name__ == '__main__':
Main()
Here is my server code.
import socket,threading,os,pickle
from encryption import *
def RetrFile(name, sock):
m=encryption()
filename = sock.recv(1024)
dat=pickle.loads(filename)
if os.path.isfile(dat[0]):
s='1'
s=s.encode('utf-8')
sock.send(s)
k=str(os.path.getsize(dat[0]))
k=k.encode('utf-8')
sock.send(k)
count=8
userResponse = sock.recv(1024)
if userResponse[:2] == (b'1'):
with open(dat[0],'rb') as f:
bytesToSend = f.read(1024)
#print(type(bytesToSend))
#print('1')
#print(bytesToSend)
msg= m.encrypt(bytesToSend,dat[1],dat[2])
#print(msg)
#print(1)
k=msg.bit_length()
if(k%8>=1):
k=k+1
msg=msg.to_bytes(k,byteorder="little")
#print (msg)
#msg=msg.encode('utf-8')
#print(msg)
sock.send(msg)
s=''
s=s.encode('utf-8')
while bytesToSend != s:
bytesToSend = f.read(1024)
msg= m.encrypt(bytesToSend,dat[1],dat[2])
k=msg.bit_length()
if(k%8>=1):
k=k//8+1
msg=msg.to_bytes(k,byteorder="little")
sock.send(msg)
#count=count.to_bytes(1,byteorder="little")
#sock.send(count)
else:
sock.send(b'ERR')
sock.close()
def Main():
host = '127.0.0.1'
port = 5006
s = socket.socket()
s.bind((host,port))
s.listen(5)
print ("Server Started.")
while True:
c, addr = s.accept()
print ("client connedted ip:<" + str(addr) + ">")
t = threading.Thread(target=RetrFile, args=("RetrThread", c))
t.start()
s.close()
if __name__ == '__main__':
Main()
Now my problem is that the decipher = s.recv(1024) call on the client side is not receiving the message. What should I do?
On the server side, change the code to:
while bytesToSend != s:
    bytesToSend = f.read(1024)
    length = len(bytesToSend)
    leng = length.to_bytes(4, 'little')
    sock.sendall(leng)
    msg = m.encrypt(bytesToSend, dat[1], dat[2])
    k = msg.bit_length()
    if k % 8 >= 1:
        k = k // 8 + 1
    else:
        k = k // 8
    msg = msg.to_bytes(k, byteorder='little')
    sock.sendall(msg)
And on the client side:
while True:
    length = s.recv(4)
    length = int.from_bytes(length, byteorder='little')
    decipher = s.recv(length)
    decipher = int.from_bytes(decipher, byteorder='little')
    if not decipher:
        break
    msg = m.decrypt(decipher, pri_key, n)
    f.write(msg)
f.close()
It is rather difficult to check your code without seeing the encryption module it references. With that functionality absent, testing to find out where the problem is becomes impossible. As such, the following programs are provided along with the implementation of another encryption module.
The server should be run from the command line and requires a port number and password to be supplied upon execution. The only form of authentication or authorization used is proper understanding of the client. The client must use the same password to be understood by the server.
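For example, if the server program below were saved as server.py (a file name chosen here purely for illustration), it could be started with any free port and a password of your choosing:

python3 server.py 5006 "my shared password"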
Server
#! /usr/bin/env python3
import argparse
import pathlib
import pickle
import pickletools
import random
import socket
import socketserver
import zlib
import encryption
BYTES_USED = bytes(range(1 << 8))
CHAIN_SIZE = 1 << 8
def main():
"""Start a file server and serve clients forever."""
parser = argparse.ArgumentParser(description='Execute a file server demo.')
parser.add_argument('port', type=int, help='location where server listens')
parser.add_argument('password', type=str, help='key to use on secure line')
arguments = parser.parse_args()
server_address = socket.gethostbyname(socket.gethostname()), arguments.port
server = CustomServer(server_address, CustomHandler, arguments.password)
server.serve_forever()
class CustomServer(socketserver.ThreadingTCPServer):
"""Provide server support for the management of encrypted data."""
def __init__(self, server_address, request_handler_class, password):
"""Initialize the server and keep a set of security credentials."""
super().__init__(server_address, request_handler_class, True)
self.key = encryption.Key.new_client_random(
BYTES_USED,
CHAIN_SIZE,
random.Random(password)
)
self.primer = encryption.Primer.new_client_random(
self.key,
random.Random(password)
)
class CustomHandler(socketserver.StreamRequestHandler):
"""Allow forwarding of data to all connected clients."""
def __init__(self, request, client_address, server):
"""Initialize the handler with security translators."""
self.decoder = encryption.Decrypter(server.key, server.primer)
self.encoder = encryption.Encrypter(server.key, server.primer)
super().__init__(request, client_address, server)
def handle(self):
"""Run the code to handle clients while dealing with errors."""
try:
self.process_file_request()
except (ConnectionResetError, EOFError):
pass
def process_file_request(self):
"""Deal with clients that wish to download a file."""
segment = self.load()
path = pathlib.Path(segment)
if path.is_file():
size = path.stat().st_size
self.dump(size)
accepted = self.load()
if accepted:
with path.open('rb') as file:
while True:
buffer = file.read(1 << 15)
self.dump(buffer)
if not buffer:
break
else:
error = 'The given path does not specify a file.'
self.dump(error)
def load(self):
"""Read the client's connection with blocking."""
data = self.decoder.load_16bit_frame(self.rfile)
bytes_object = zlib.decompress(data)
return pickle.loads(bytes_object)
def dump(self, obj):
"""Send an object securely over to the client if possible."""
pickle_string = pickle.dumps(obj, pickle.HIGHEST_PROTOCOL)
bytes_object = pickletools.optimize(pickle_string)
data = zlib.compress(bytes_object, zlib.Z_BEST_COMPRESSION)
self.encoder.dump_16bit_frame(data, self.wfile)
if __name__ == '__main__':
main()
The client should also be run from the command line and requires the host name, port number, and password for the server. Communications are encrypted with the password and cannot be decrypted properly if it is different. Please note that very little checking for errors is present in the two programs.
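Assuming the client code below is saved as client.py (again an illustrative name) and the server runs on the same machine, an invocation could look like this; the host argument must resolve to the address the server bound to (it binds to gethostbyname(gethostname()) above), and the password must match the server's:

python3 client.py $(hostname) 5006 "my shared password"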
Client
#! /usr/bin/env python3
import argparse
import pathlib
import pickle
import pickletools
import random
import socket
import zlib
import encryption
BYTES_USED = bytes(range(1 << 8))
CHAIN_SIZE = 1 << 8
# These are possible answers accepted for yes/no style questions.
POSITIVE = tuple(map(str.casefold, ('yes', 'true', '1')))
NEGATIVE = tuple(map(str.casefold, ('no', 'false', '0')))
def main():
"""Connect a file client to a server and process incoming commands."""
parser = argparse.ArgumentParser(description='Execute a file client demo.')
parser.add_argument('host', type=str, help='name of server on the network')
parser.add_argument('port', type=int, help='location where server listens')
parser.add_argument('password', type=str, help='key to use on secure line')
arguments = parser.parse_args()
connection = socket.create_connection((arguments.host, arguments.port))
try:
talk_to_server(*make_dump_and_load(connection, arguments.password))
finally:
connection.shutdown(socket.SHUT_RDWR)
connection.close()
def make_dump_and_load(connection, password):
"""Create objects to help with the encrypted communications."""
reader = connection.makefile('rb', -1)
writer = connection.makefile('wb', 0)
chaos = random.Random(password)
key = encryption.Key.new_client_random(BYTES_USED, CHAIN_SIZE, chaos)
chaos = random.Random(password)
primer = encryption.Primer.new_client_random(key, chaos)
decoder = encryption.Decrypter(key, primer)
encoder = encryption.Encrypter(key, primer)
def dump(obj):
"""Write an object to the writer file in an encoded form."""
pickle_string = pickle.dumps(obj, pickle.HIGHEST_PROTOCOL)
bytes_object = pickletools.optimize(pickle_string)
data = zlib.compress(bytes_object, zlib.Z_BEST_COMPRESSION)
encoder.dump_16bit_frame(data, writer)
def load():
"""Read an object from the reader file and decode the results."""
data = decoder.load_16bit_frame(reader)
bytes_object = zlib.decompress(data)
return pickle.loads(bytes_object)
return dump, load
def talk_to_server(dump, load):
"""Converse with the serve while trying to get a file."""
segment = input('Filename: ')
dump(segment)
size = load()
if isinstance(size, int):
print('File exists and takes', size, 'bytes to download.')
response = get_response('Continue? ')
dump(response)
if response:
location = input('Where should the new file be created? ')
with pathlib.Path(location).open('wb') as file:
written = 0
while True:
buffer = load()
if not buffer:
break
written += file.write(buffer)
print('Progress: {:.1%}'.format(written / size))
print('Download complete!')
else:
print(size)
def get_response(query):
"""Ask the user yes/no style questions and return the results."""
while True:
answer = input(query).casefold()
if answer:
if any(option.startswith(answer) for option in POSITIVE):
return True
if any(option.startswith(answer) for option in NEGATIVE):
return False
print('Please provide a positive or negative answer.')
if __name__ == '__main__':
main()
Since access to the encryption module was not provided, an alternative implementation has been included below. No guarantee is made for its suitability in any capacity or for any purpose. It may be somewhat slow as the software is currently configured but works well if obfuscation is desired.
encryption
"""Provide an implementation of Markov Encryption for simplified use.
This module exposes primitives useful for executing Markov Encryption
processes. ME was inspired by a combination of Markov chains with the
puzzles of Sudoku. This implementation has undergone numerous changes
and optimizations since its original design. Please see documentation."""
###############################################################################
# Import several functions needed later in the code.
from collections import deque
from math import ceil
from random import Random, SystemRandom
from struct import calcsize, pack, unpack
from inspect import currentframe
__author__ = 'Stephen "Zero" Chappell <Noctis.Skytower@gmail.com>'
__date__ = '18 August 2016'
__version__ = 2, 0, 8
###############################################################################
# Create some tools to use in the classes down below.
_CHAOS = SystemRandom()
def slots(names=''):
"""Set the __slots__ variable in the calling context with private names.
This function allows a convenient syntax when specifying the slots
used in a class. Simply call it in a class definition context with
the needed names. Locals are modified with private slot names."""
currentframe().f_back.f_locals['__slots__'] = \
tuple('__' + name for name in names.replace(',', ' ').split())
###############################################################################
# Implement a Key primitive data type for Markov Encryption.
class Key:
"""Key(data) -> Key instance
This class represents a Markov Encryption Key primitive. It allows for
easy key creation, checks for proper data construction, and helps with
encoding and decoding indexes based on cached internal tables."""
slots('data dimensions base size encoder axes order decoder')
@classmethod
def new(cls, bytes_used, chain_size):
"""Return a Key instance created from bytes_used and chain_size.
Creating a new key is easy with this method. Call this class method
with the bytes you want the key to recognize along with the size of
the chains you want the encryption/decryption processes to use."""
selection, blocks = list(set(bytes_used)), []
for _ in range(chain_size):
_CHAOS.shuffle(selection)
blocks.append(bytes(selection))
return cls(tuple(blocks))
@classmethod
def new_deterministic(cls, bytes_used, chain_size):
"""Automatically create a key with the information provided."""
selection, blocks, chaos = list(set(bytes_used)), [], Random()
chaos.seed(chain_size.to_bytes(ceil(
chain_size.bit_length() / 8), 'big') + bytes(range(256)))
for _ in range(chain_size):
chaos.shuffle(selection)
blocks.append(bytes(selection))
return cls(tuple(blocks))
@classmethod
def new_client_random(cls, bytes_used, chain_size, chaos):
"""Create a key using chaos as the key's source of randomness."""
selection, blocks = list(set(bytes_used)), []
for _ in range(chain_size):
chaos.shuffle(selection)
blocks.append(bytes(selection))
return cls(tuple(blocks))
def __init__(self, data):
"""Initialize the Key instance's variables after testing the data.
Keys are created with tuples of carefully constructed bytes arrays.
This method tests the given data before going on to build internal
tables for efficient encoding and decoding methods later on."""
self.__test_data(data)
self.__make_vars(data)
@staticmethod
def __test_data(data):
"""Test the data for correctness in its construction.
The data must be a tuple of at least two byte arrays. Each byte
array must have at least two bytes, all of which must be unique.
Furthermore, all arrays should share the exact same byte set."""
if not isinstance(data, tuple):
raise TypeError('Data must be a tuple object!')
if len(data) < 2:
raise ValueError('Data must contain at least two items!')
item = data[0]
if not isinstance(item, bytes):
raise TypeError('Data items must be bytes objects!')
length = len(item)
if length < 2:
raise ValueError('Data items must contain at least two bytes!')
unique = set(item)
if len(unique) != length:
raise ValueError('Data items must contain unique byte sets!')
for item in data[1:]:
if not isinstance(item, bytes):
raise TypeError('Data items must be bytes objects!')
next_length = len(item)
if next_length != length:
raise ValueError('All data items must have the same size!')
next_unique = set(item)
if len(next_unique) != next_length:
raise ValueError('Data items must contain unique byte sets!')
if next_unique ^ unique:
raise ValueError('All data items must use the same byte set!')
def __make_vars(self, data):
"""Build various internal tables for optimized calculations.
Encoding and decoding rely on complex relationships with the given
data. This method caches several of these key relationships for use
when the encryption and decryption processes are being executed."""
self.__data = data
self.__dimensions = len(data)
base, *mutations = data
self.__base = base = tuple(base)
self.__size = size = len(base)
offset = -sum(base.index(block[0]) for block in mutations[:-1]) % size
self.__encoder = base[offset:] + base[:offset]
self.__axes = tuple(reversed([tuple(base.index(byte) for byte in block)
for block in mutations]))
self.__order = key = tuple(sorted(base))
grid = []
for rotation in range(size):
block, row = base[rotation:] + base[:rotation], [None] * size
for byte, value in zip(block, key):
row[key.index(byte)] = value
grid.append(tuple(row))
self.__decoder = tuple(grid[offset:] + grid[:offset])
def test_primer(self, primer):
"""Raise an error if the primer is not compatible with this key.
Key and primers have a certain relationship that must be maintained
in order for them to work together. Since the primer understands
the requirements, it is asked to check this key for compatibility."""
primer.test_key(self)
def encode(self, index):
"""Encode index based on internal tables and return byte code.
An index probes into the various axes of the multidimensional,
virtual grid that a key represents. The index is evaluated, and
the value at its coordinates is returned by running this method."""
assert len(index) == self.__dimensions, \
'Index size is not compatible with key dimensions!'
*probes, current = index
return self.__encoder[(sum(
table[probe] for table, probe in zip(self.__axes, probes)
) + current) % self.__size]
def decode(self, index):
"""Decode index based on internal tables and return byte code.
Decoding does the exact same thing as encoding, but it indexes
into a virtual grid that represents the inverse of the encoding
grid. Tables are used to make the process fast and efficient."""
assert len(index) == self.__dimensions, \
'Index size is not compatible with key dimensions!'
*probes, current = index
return self.__decoder[sum(
table[probe] for table, probe in zip(self.__axes, probes)
) % self.__size][current]
@property
def data(self):
"""Data that the instance was initialized with.
This is the tuple of byte arrays used to create this key and can
be used to create an exact copy of this key at some later time."""
return self.__data
@property
def dimensions(self):
"""Dimensions that the internal, virtual grid contains.
The virtual grid has a number of axes that can be referenced when
indexing into it, and this number is the count of its dimensions."""
return self.__dimensions
@property
def base(self):
"""Base value that the internal grid is built from.
The Sudoku nature of the grid comes from rotating this value by
offsets, keeping values unique along any axis while traveling."""
return self.__base
@property
def order(self):
"""Order of base after its values have been sorted.
A sorted base is important when constructing inverse rows and when
encoding raw bytes for use in updating an encode/decode index."""
return self.__order
###############################################################################
# Implement a Primer primitive data type for Markov Encryption.
class Primer:
"""Primer(data) -> Primer instance
This class represents a Markov Encryption Primer primitive. It is very
important for starting both the encryption and decryption processes. A
method is provided for their easy creation with a related key."""
slots('data')
@classmethod
def new(cls, key):
"""Return a Primer instance from a parent Key.
Primers must be compatible with the keys they are used with. This
method takes a key and constructs a cryptographically sound primer
that is ready to use in the beginning stages of encryption."""
base = key.base
return cls(bytes(_CHAOS.choice(base)
for _ in range(key.dimensions - 1)))
@classmethod
def new_deterministic(cls, key):
"""Automatically create a primer with the information provided."""
base, chain_size, chaos = key.base, key.dimensions, Random()
chaos.seed(chain_size.to_bytes(ceil(
chain_size.bit_length() / 8), 'big') + bytes(range(256)))
return cls(bytes(chaos.choice(base) for _ in range(chain_size - 1)))
@classmethod
def new_client_random(cls, key, chaos):
"""Create a primer using chaos as the primer's source of randomness."""
base = key.base
return cls(
bytes(chaos.choice(base) for _ in range(key.dimensions - 1))
)
def __init__(self, data):
"""Initialize the Primer instance after testing validity of data.
Though not as complicated in its requirements as keys, primers do
need some simple structure in the data they are given. A checking
method is run before saving the data to the instance's attribute."""
self.__test_data(data)
self.__data = data
@staticmethod
def __test_data(data):
"""Test the data for correctness and test the data.
In order for the primer to be compatible with the nature of the
Markov Encryption processes, the data must be an array of bytes;
and to act as a primer, it must contain at least some information."""
if not isinstance(data, bytes):
raise TypeError('Data must be a bytes object!')
if not data:
raise ValueError('Data must contain at least one byte!')
def test_key(self, key):
"""Raise an error if the key is not compatible with this primer.
Primers provide needed data to start encryption and decryption. For
it be compatible with a key, it must contain one byte less than the
key's dimensions and must be a subset of the base in the key."""
if len(self.__data) != key.dimensions - 1:
raise ValueError('Key size must be one more than the primer size!')
if not set(self.__data).issubset(key.base):
raise ValueError('Key data must be a superset of primer data!')
@property
def data(self):
"""Data that the instance was initialized with.
This is the byte array used to create this primer and can be used
if desired to create a copy of this primer at some later time."""
return self.__data
###############################################################################
# Create an abstract processing class for use in encryption and decryption.
class _Processor:
"""_Processor(key, primer) -> NotImplementedError exception
This class acts as a base for the encryption and decryption processes.
The given key is saved, and several tables are created along with an
index. Since it is abstract, calling the class will raise an exception."""
slots('key into index from')
def __init__(self, key, primer):
"""Initialize the _Processor instance if it is from a child class.
After passing several tests for creating a valid processing object,
the key is saved, and the primer is used to start an index. Tables
are also formed for converting byte values between systems."""
if type(self) is _Processor:
raise NotImplementedError('This is an abstract class!')
key.test_primer(primer)
self.__key = key
self.__into = table = dict(map(reversed, enumerate(key.order)))
self.__index = deque(map(table.__getitem__, primer.data),
key.dimensions)
self.__from = dict(map(reversed, table.items()))
def process(self, data):
"""Process the data and return its transformed state.
A cache for the data transformation is created and an internal
method is run to quickly encode or decode the given bytes. The
cache is finally converted to immutable bytes when returned."""
cache = bytearray()
self._run(data, cache.append, self.__key, self.__into, self.__index)
return bytes(cache)
@staticmethod
def _run(data, cache_append, key, table, index):
"""Run the processing algorithm in an overloaded method.
Since this is only an abstract base class for encoding/decoding,
this method will raise an exception when run. Inheriting classes
should implement whatever is appropriate for the intended function."""
raise NotImplementedError('This is an abstract method!')
@property
def primer(self):
"""Primer representing the state of the internal index.
The index can be retrieved as a primer, useful for initializing
another processor in the same starting state as the current one."""
index = self.__index
index.append(None)
index.pop()
return Primer(bytes(map(self.__from.__getitem__, index)))
###############################################################################
# Inherit from _Processor and implement the ME encoding algorithm.
class Encrypter(_Processor):
"""Encrypter(key, primer) -> Encrypter instance
This class represents a state-aware encryption engine that can be fed
data and will return a stream of coherent cipher-text. An index is
maintained, and a state-continuation primer can be retrieved at will."""
slots()
@staticmethod
def _run(data, cache_append, key, table, index):
"""Encrypt the data with the given arguments.
To run the encryption process as fast as possible, methods are
cached as names. As the algorithm operates, only recognized bytes
are encoded while running through the selective processing loop."""
encode, index_append = key.encode, index.append
for byte in data:
if byte in table:
index_append(table[byte])
cache_append(encode(index))
else:
cache_append(byte)
def dump_16bit_frame(self, data, file):
"""Write the data to the file using a guaranteed frame size."""
size = len(data)
if not 1 <= size <= 1 << 16:
raise ValueError('data has an unsupported length')
packed = self.process(pack('<H{}s'.format(size), size - 1, data))
if file.write(packed) != len(packed):
raise IOError('frame was not properly written to file')
###############################################################################
# Inherit from _Processor and implement the ME decoding algorithm.
class Decrypter(_Processor):
"""Decrypter(key, primer) -> Decrypter instance
This class represents a state-aware decryption engine that can be fed
data and will return a stream of coherent plain-text. An index is
maintained, and a state-continuation primer can be retrieved at will."""
slots()
SIZE = '<H'
DATA = '{}s'
@staticmethod
def _run(data, cache_append, key, table, index):
"""Decrypt the data with the given arguments.
To run the decryption process as fast as possible, methods are
cached as names. As the algorithm operates, only recognized bytes
are decoded while running through the selective processing loop."""
decode, index_append = key.decode, index.append
for byte in data:
if byte in table:
index_append(table[byte])
value = decode(index)
cache_append(value)
index[-1] = table[value]
else:
cache_append(byte)
def load_16bit_frame(self, file):
"""Read some data from the file using a guaranteed frame size."""
size = unpack(self.SIZE, self.process(self.read_all(
file,
calcsize(self.SIZE)
)))[0] + 1
return unpack(self.DATA.format(size), self.process(self.read_all(
file,
size
)))[0]
@staticmethod
def read_all(file, size):
"""Get all the data that has been requested from the file."""
if not 1 <= size <= 1 << 16:
raise ValueError('size has an unsupported value')
buffer = bytearray()
while size > 0:
data = file.read(size)
if not data:
raise EOFError('file has unexpectedly reached the end')
buffer.extend(data)
size -= len(data)
if size < 0:
raise IOError('more data was read than was required')
return bytes(buffer)

Get nested LDAP group members with python-ldap

I'm trying to find the best way to get a list of all LDAP user accounts that belong to groups which are members of a groupOfNames using python-ldap. This is on an OpenLDAP server, not AD. I wrote the function below, which does the job but takes forever to run. I'm hoping either python-ldap has some builtin function that I'm not aware of, or there's something I can modify to make this run more quickly. If not, hopefully someone else will find this code useful. Thanks in advance for any help!
def get_nested_members(con, dn):
"""
Parameters
----------
con : LDAPObject
An authenticated python-ldap connection object
dn : string
The dn of the groupOfNames to be checked
Returns
-------
members : list
A list of all accounts that are members of the given dn
"""
members = []
searched = []
to_search = [dn]
while len(to_search) > 0:
current_dn = to_search.pop()
cn = current_dn.split(',')[0]
r = con.search_s(base_dn, ldap.SCOPE_SUBTREE, cn, [])[0][1]
if 'groupOfNames' in r['objectClass']:
if 'member' in r:
for i in r['member']:
if((i != current_dn) and (i not in searched)):
to_search.append(i)
searched.append(current_dn)
elif 'posixGroup' in r['objectClass']:
if 'memberUid' in r:
for i in r['memberUid']:
members.append(i)
searched.append(current_dn)
elif 'posixAccount' in r['objectClass']:
if 'uid' in r:
members.append(r['uid'][0])
else:
print('ERROR: encountered record of unknown type:')
pprint(str([current_dn, r]))
return list(set(members))
I realized that issuing LDAP searches repeatedly was the limiting factor, so I made a new version which builds a dictionary of ALL group (posixGroup) and groupOfNames records first. It uses a bit more memory than the old solution, but it is less taxing on the LDAP server and runs significantly faster (down from ~15 minutes to under 1 second for my application). The original, slow code stays in the question above as a reference for what not to do. Credit for the merge_dicts() function goes to Aaron Hall.
import logging

import ldap
def merge_dicts(*dict_args):
"""Given any number of dicts, shallow copy and merge into a new dict,
precedence goes to key value pairs in latter dicts.
"""
result = {}
for dictionary in dict_args:
result.update(dictionary)
return result
def get_nested_members(con, dn, base_dn='dc=example'):
"""Search a groupOfNames and return all posixAccount members from all its subgroups
Parameters
----------
con: LDAPObject
An authenticated LDAP connection object
dn: string
The dn of the groupOfNames to be searched for members
(optional) base_dn: string
The base dn to search on. Make sure to change the default value to fit your LDAP server
Returns
-------
members: list
A list of all nested members from the provided groupOfNames
"""
logging.info('Getting nested members of ' + str(dn))
print('Getting nested members of ' + str(dn))
if type(dn) is list:
to_search = [] + dn
elif type(dn) is str:
to_search = [dn]
else:
print('ERROR: Invalid dn value. Please supply either a string or a list of strings.')
return []
members = []
searched = []
groupOfNames_list = con.search_s(base_dn, ldap.SCOPE_SUBTREE, 'objectClass=groupOfNames', ['dn', 'member', 'cn'])
groupOfNames_dict = {}
for g in range(len(groupOfNames_list)):
groupOfNames_dict[groupOfNames_list[g][0]] = groupOfNames_list[g][1]
groupOfNames_list = None #To free up memory
group_list = con.search_s(base_dn, ldap.SCOPE_SUBTREE, 'objectClass=posixGroup', ['dn', 'memberUid', 'cn'])
group_dict = {}
for g in range(len(group_list)):
group_dict[group_list[g][0]] = group_list[g][1]
group_list = None #To free up memory
all_groups = merge_dicts(groupOfNames_dict, group_dict)
group_dict = None #To free up memory
groupOfNamesdict = None #To free up memory
while len(to_search) > 0:
search_dn = to_search.pop()
try:
g = all_groups[search_dn]
if 'memberUid' in g:
members += g['memberUid']
searched.append(search_dn)
elif 'member' in g:
m = g['member']
for i in m:
if i.startswith('uid='):
members.append((i.split(',')[0]).split('=')[1])
elif i.startswith('cn='):
if i not in searched:
to_search.append(i)
searched.append(search_dn)
else:
searched.append(search_dn)
except:
searched.append(search_dn)
return list(set(members))
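For completeness, a hypothetical way to call this; the server URI, bind credentials, group DN and base DN below are placeholders, and the bind style depends on your directory:

import ldap

con = ldap.initialize('ldap://ldap.example.com')
con.simple_bind_s('cn=admin,dc=example', 'secret')

members = get_nested_members(con, 'cn=all-staff,ou=groups,dc=example',
                             base_dn='dc=example')
print(sorted(members))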

How to use the DICT protocol client in Python?

I use the command-line DICT client named dict like this: dict <some word>, which shows me the meaning from the dictionary servers I have configured.
I'd like to interact with the dict servers from Python, for example to reimplement that command-line client in Python.
I found a Python module for the DICT protocol in the Ubuntu repositories and installed it (apt-get install python-dictclient), but unfortunately I couldn't find any documentation for it. I tried to understand the module's built-in help, help('dictclient'), but didn't succeed. I could only make a connection to a dict server and see a partial definition; here's my attempt:
import dictclient
c = dictclient.Connection('localhost', 2628)
If anyone has experience with this module please explain to me how to use it.
dict-like definition fetching using dictclient in Python:
from dictclient import Connection, Database
from sys import argv
con = Connection("dict.org") #or whatever your server is
db = Database(con, "*") #replace * with ! to get only the first result
def_list = db.define(argv[1]) #list containing Definition objects
for x in def_list:
    print x.getdefstr() + '\n'
Here is a Python 3 version of dictclient:
# -*- coding: UTF-8 -*-
# Client for the DICT protocol (RFC2229)
#
# Copyright (C) 2002 John Goerzen
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
# A few small hacks to make it work on Python3 - KrisvanderMerwe 25 April 2015
import socket, re
version = '1.0.2.1'
def dequote(teks):
"""Will remove single or double quotes from the start and end of a string
and return the result."""
quotechars = "'\""
while len(teks) and teks[0] in quotechars:
teks = teks[1:]
while len(teks) and teks[-1] in quotechars:
teks = teks[0:-1]
return teks
def enquote(teks):
"""This function will put a string in double quotes, properly
escaping any existing double quotes with a backslash. It will
return the result."""
return '"' + teks.replace('"', "\\\"") + '"'
class Connection:
"""This class is used to establish a connection to a database server.
You will usually use this as the first call into the dictclient library.
Instantiating it takes two optional arguments: a hostname (a string)
and a port (an int). The hostname defaults to localhost
and the port to 2628, the port specified in RFC."""
def __init__(self, hostname = 'localhost', port = 2628):
self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
self.sock.connect((hostname, port))
self.rfile = self.sock.makefile()
self.wfile = self.sock.makefile("wb", 0)
self.saveconnectioninfo()
def getresultcode(self):
"""Generic function to get a result code. It will return a list
consisting of two items: the integer result code and the text
following. You will not usually use this function directly."""
line = self.rfile.readline().strip()
code, text = line.split(' ', 1)
return [int(code), text]
def get200result(self):
"""Used when expecting a single line of text -- a 200-class
result. Returns [intcode, remaindertext]"""
code, text = self.getresultcode()
if code < 200 or code >= 300:
raise Exception ("Got '%s' when 200-class response expected %s " % (code,text) )
return [code, text]
def get100block(self):
"""Used when expecting multiple lines of text -- gets the block
part only. Does not get any codes or anything! Returns a string."""
data = []
while 1:
line = self.rfile.readline().strip()
if line == '.':
break
data.append(line)
return "\n".join(data)
def get100result(self):
"""Used when expecting multiple lines of text, terminated by a period
and a 200 code. Returns: [initialcode, [bodytext_1lineperentry],
finalcode]"""
code, text = self.getresultcode()
if code < 100 or code >= 200:
raise Exception ("Got '%s' when 100-class response expected" % code )
bodylines = self.get100block().split("\n")
code2 = self.get200result()[0]
return [code, bodylines, code2]
def get100dict(self):
"""Used when expecting a dictionary of results. Will read from
the initial 100 code, to a period and the 200 code."""
dicl = {}
for line in self.get100result()[1]:
key, val = line.split(' ', 1)
dicl[key] = dequote(val)
return dicl
def saveconnectioninfo(self):
"""Called by __init__ to handle the initial connection. Will
save off the capabilities and messageid."""
code, string = self.get200result()
assert code == 220
capstr, msgid = re.search('<(.*)> (<.*>)$', string).groups()
self.capabilities = capstr.split('.')
self.messageid = msgid
def getcapabilities(self):
"""Returns a list of the capabilities advertised by the server."""
return self.capabilities
def getmessageid(self):
"""Returns the message id, including angle brackets."""
return self.messageid
def getdbdescs(self):
"""Gets a dict of available databases. The key is the db name
and the value is the db description. This command may generate
network traffic!"""
if hasattr(self, 'dbdescs'):
return self.dbdescs
self.sendcommand("SHOW DB")
self.dbdescs = self.get100dict()
return self.dbdescs
def getstratdescs(self):
"""Gets a dict of available strategies. The key is the strat
name and the value is the strat description. This call may
generate network traffic!"""
if hasattr(self, 'stratdescs'):
return self.stratdescs
self.sendcommand("SHOW STRAT")
self.stratdescs = self.get100dict()
return self.stratdescs
def getdbobj(self, dbname):
"""Gets a Database object corresponding to the database name passed
in. This function explicitly will *not* generate network traffic.
If you have not yet run getdbdescs(), it will fail."""
if not hasattr(self, 'dbobjs'):
self.dbobjs = {}
if dbname in self.dbobjs:
return self.dbobjs[dbname]
# We use self.dbdescs explicitly since we don't want to
# generate net traffic with this request!
if dbname != '*' and dbname != '!' and \
not dbname in self.dbdescs.keys():
raise Exception( "Invalid database name '%s'" % dbname )
self.dbobjs[dbname] = Database(self, dbname)
return self.dbobjs[dbname]
def sendcommand(self, command):
"""Takes a command, without a newline character, and sends it to
the server."""
self.wfile.write(command.encode() + b"\n")
def define(self, database, word):
"""Returns a list of Definition objects for each matching
definition. Parameters are the database name and the word
to look up. This is one of the main functions you will use
to interact with the server. Returns a list of Definition
objects. If there are no matches, an empty list is returned.
Note: database may be '*' which means to search all databases,
or '!' which means to return matches from the first database that
has a match."""
self.getdbdescs() # Prime the cache
if database != '*' and database != '!' and \
not database in self.getdbdescs():
raise Exception ( "Invalid database '%s' specified" % database )
self.sendcommand("DEFINE " + enquote(database) + " " + enquote(word))
code = self.getresultcode()[0]
retval = []
if code == 552:
# No definitions.
return []
if code != 150:
raise Exception ("Unknown code %d" % code )
while 1:
code, text = self.getresultcode()
if code != 151:
break
resultword, resultdb = re.search('^"(.+)" (\S+)', text).groups()
defstr = self.get100block()
retval.append(Definition(self, self.getdbobj(resultdb),
resultword, defstr))
return retval
def match(self, database, strategy, word):
"""Gets matches for a query. Arguments are database name,
the strategy (see available ones in getstratdescs()), and the
pattern/word to look for. Returns a list of Definition objects.
If there is no match, an empty list is returned.
Note: database may be '*' which means to search all databases,
or '!' which means to return matches from the first database that
has a match."""
self.getstratdescs() # Prime the cache
self.getdbdescs() # Prime the cache
if not strategy in self.getstratdescs().keys():
raise Exception ( "Invalid strategy '%s'" % strategy )
if database != '*' and database != '!' and not database in self.getdbdescs().keys():
raise Exception ( "Invalid database name '%s'" % database )
self.sendcommand("MATCH %s %s %s" % (enquote(database),
enquote(strategy),
enquote(word)))
code = self.getresultcode()[0]
if code == 552:
# No Matches
return []
if code != 152:
raise Exception ( "Unexpected code %d" % code )
retval = []
for matchline in self.get100block().split("\n"):
matchdict, matchword = matchline.split(" ", 1)
retval.append(Definition(self, self.getdbobj(matchdict),
dequote(matchword)))
if self.getresultcode()[0] != 250:
raise Exception ( "Unexpected end-of-list code %d" % code )
return retval
class Database:
"""An object corresponding to a particular database in a server."""
def __init__(self, dictconn, dbname):
"""Initialize the object -- requires a Connection object and
a database name."""
self.conn = dictconn
self.name = dbname
def getname(self):
"""Returns the short name for this database."""
return self.name
def getdescription(self):
if hasattr(self, 'description'):
return self.description
if self.getname() == '*':
self.description = 'All Databases'
elif self.getname() == '!':
self.description = 'First matching database'
else:
self.description = self.conn.getdbdescs()[self.getname()]
return self.description
def getinfo(self):
"""Returns a string of info describing this database."""
if hasattr(self, 'info'):
return self.info
if self.getname() == '*':
self.info = "This special database will search all databases on the system."
elif self.getname() == '!':
self.info = "This special database will return matches from the first matching database."
else:
self.conn.sendcommand("SHOW INFO " + self.name)
self.info = "\n".join(self.conn.get100result()[1])
return self.info
def define(self, word):
"""Get a definition from within this database.
The argument, word, is the word to look up. The return value is the
same as from Connection.define()."""
return self.conn.define(self.getname(), word)
def match(self, strategy, word):
"""Get a match from within this database.
The argument, word, is the word to look up. The return value is
the same as from Connection.define()."""
return self.conn.match(self.getname(), strategy, word)
class Definition:
"""An object corresponding to a single definition."""
def __init__(self, dictconn, db, word, defstr = None):
"""Instantiate the object. Requires: a Connection object,
a Database object (NOT corresponding to '*' or '!' databases),
a word. Optional: a definition string. If not supplied,
it will be fetched if/when it is requested."""
self.conn = dictconn
self.db = db
self.word = word
self.defstr = defstr
def getdb(self):
"""Get the Database object corresponding to this definition."""
return self.db
def getdefstr(self):
"""Get the definition string (the actual content) of this
definition."""
if not self.defstr:
self.defstr = self.conn.define(self.getdb().getname(), self.word)[0].getdefstr()
return self.defstr
def getword(self):
"""Get the word this object describes."""
return self.word
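For reference, here is a minimal usage sketch of these classes. It is hypothetical: it assumes the surrounding Connection class is constructed with just the DICT server's hostname, and that the server offers a 'prefix' match strategy; adjust both to what the full listing actually provides.
# Hypothetical usage sketch -- constructor signature and strategy name are assumptions.
conn = Connection('dict.org')

print(conn.getdbdescs())                    # available databases
for d in conn.define('!', 'python'):        # first database with a match
    print('%s (%s):' % (d.getword(), d.getdb().getname()))
    print(d.getdefstr())

# 'prefix' is a common DICT strategy, but check getstratdescs() first
for m in conn.match('*', 'prefix', 'pyth'):
    print('%s -> %s' % (m.getdb().getname(), m.getword()))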

Dictionary+Queue Data Structure with Active Removal of Old Messages

I would like to create a data structure which represents a set of queues (ideally a hash-, map-, or dict-like lookup) where messages in the queues are actively removed after they've reached a certain age. The TTL value would be global; messages would not need, nor have, individual TTLs. The resolution for the TTL doesn't need to be terribly accurate - only within a second or so.
I'm not even sure what to search for here. I could create a separate global queue that a background thread monitors, peeking at and pulling pointers to messages off the global queue that tell it to remove items from the individual queues, but the behavior needs to go both ways: if an item gets removed from an individual queue, it needs to be removed from the global queue as well.
I would like for this data structure to be implemented in Python, ideally, and as always, speed is of the utmost importance (more so than memory usage). Any suggestions for where to start?
I'd start by just modeling the behavior you're looking for in a single class, expressed as simply as possible. Performance can come later on through iterative optimization, but only if necessary (you may not need it).
The class below does something roughly like what you're describing. Queues are simply lists that are named and stored in a dictionary. Each message is timestamped and inserted at the front of its list (FIFO). Messages are reaped by checking the timestamp of the message at the end of the list and popping it, repeating until a message is found that is still below the age threshold.
If you plan to access this from several threads you'll need to add some fine-grained locking to squeeze the most performance out of it. For example, the reap() method should only lock one queue at a time, rather than locking all queues (method-level synchronization), so you'd also need to keep a lock for each named queue; a rough sketch of that appears after the usage example below.
Updated -- Now uses a global set of buckets (by timestamp, 1 second resolution) to keep track of which queues have messages from that time. This reduces the number of queues to be checked on each pass.
import time
from collections import defaultdict
class QueueMap(object):
def __init__(self):
self._expire = defaultdict(lambda *n: defaultdict(int))
self._store = defaultdict(list)
self._oldest_key = int(time.time())
def get_queue(self, name):
return self._store.get(name, [])
def pop(self, name):
queue = self.get_queue(name)
if queue:
key, msg = queue.pop()
self._expire[key][name] -= 1
return msg
return None
def set(self, name, message):
key = int(time.time())
# increment count of messages in this bucket/queue
self._expire[key][name] += 1
self._store[name].insert(0, (key, message))
def reap(self, age):
now = time.time()
threshold = int(now - age)
oldest = self._oldest_key
# iterate over buckets we need to check
for key in range(oldest, threshold + 1):
# for each queue with items, expire the oldest ones
for name, count in self._expire[key].iteritems():
if count <= 0:
continue
queue = self.get_queue(name)
while queue:
if queue[-1][0] > threshold:
break
queue.pop()
del self._expire[key]
# set oldest_key for next pass
self._oldest_key = threshold
Usage:
qm = QueueMap()
qm.set('one', 'message 1')
qm.set('one', 'message 2')
qm.set('two', 'message 3')
print qm.pop('one')
print qm.get_queue('one')
print qm.get_queue('two')
# call this on a background thread which sleeps
time.sleep(2)
# reap messages older than 1 second
qm.reap(1)
# queues should be empty now
print qm.get_queue('one')
print qm.get_queue('two')
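Following up on the locking note above, one possible shape (a sketch only, not tested) is to keep one threading.Lock per named queue and hold just that lock while touching the queue:
import threading
from collections import defaultdict

class LockingQueueMap(QueueMap):
    """Sketch: per-queue locks around the QueueMap operations above."""
    def __init__(self):
        super(LockingQueueMap, self).__init__()
        # note: lazily creating locks like this is itself slightly racy,
        # but it is good enough to show the idea
        self._locks = defaultdict(threading.Lock)   # name -> Lock

    def set(self, name, message):
        with self._locks[name]:
            super(LockingQueueMap, self).set(name, message)

    def pop(self, name):
        with self._locks[name]:
            return super(LockingQueueMap, self).pop(name)
reap() would need the same treatment: acquire the lock for each queue only while popping that queue's expired messages, which means reworking its inner loop rather than wrapping the whole method in one lock.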
Consider checking the TTLs whenever you access the queues instead of using a thread that is constantly checking. I'm not sure what you mean about the hash/map/dict (what is the key?), but how about something like this:
import time
class EmptyException(Exception): pass
class TTLQueue(object):
TTL = 60 # seconds
def __init__(self):
self._queue = []
def push(self, msg):
self._queue.append((time.time()+self.TTL, msg))
def pop(self):
self._queue = [(t, msg) for (t, msg) in self._queue if t > time.time()]
if len(self._queue) == 0:
raise EmptyException()
return self._queue.pop(0)[1]
queues = [TTLQueue(), TTLQueue(), TTLQueue()] # this could be a dict or set or
# whatever if I knew what keys
# you expected
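If the dict-like lookup is by queue name, one option (a small sketch on top of the class above) is to let a defaultdict create a TTLQueue per name on first use:
from collections import defaultdict

queues_by_name = defaultdict(TTLQueue)   # name -> TTLQueue, created on first access

queues_by_name['alerts'].push('disk almost full')
queues_by_name['alerts'].push('disk full')
try:
    print(queues_by_name['alerts'].pop())
except EmptyException:
    print('queue is empty')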

Expressing multiple columns in berkeley db in python?

Say I have a simple table that contains username, firstname, lastname.
How do I express this in Berkeley DB?
I'm currently using bsddb as the interface.
Cheers.
You have to pick one "column" as the key (it must be unique; I imagine that would be "username" in your case) -- that key is the only thing searches will ever be able to use. The other columns can be packed into the single string value for that key any way you like, from pickling to simply joining them with a character that's guaranteed never to occur in any of the columns, such as '\0' for many kinds of "readable text strings".
If you need to be able to search by different keys you'll need other, supplementary and separate bsddb databases set up as "indices" into your main table -- it's lots of work, and there's lots of literature on the subject. (Alternatively, you move to a higher-abstraction technology, such as sqlite, which handles the indexing neatly on your behalf;-).
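As a concrete illustration of the joining approach (a sketch only, using Python 2's bsddb module; the file names and the lastname "index" are invented for the example):
import bsddb

SEP = '\0'   # safe separator for readable text fields

users = bsddb.btopen('users.db', 'c')               # username -> "firstname\0lastname"
by_lastname = bsddb.btopen('by_lastname.db', 'c')   # supplementary "index" database

def put_user(username, firstname, lastname):
    users[username] = SEP.join([firstname, lastname])
    # composite key keeps the index entries unique per user
    by_lastname[lastname + SEP + username] = username

def get_user(username):
    firstname, lastname = users[username].split(SEP)
    return firstname, lastname

put_user('jdoe', 'John', 'Doe')
print(get_user('jdoe'))

users.close()
by_lastname.close()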
tl;dr: To express multiple columns in an ordered key/value store like Berkeley DB you need to learn about key composition. Look up my other answers about bsddb to learn more.
There are several ways to do that using an ordered key/value store.
The simplest solution is to store documents as JSON values under a well-chosen key.
Now you probably want to build indexes over those columns so you can retrieve documents without having to iterate over the whole hashmap to find the correct object. For that you can use a secondaryDB, which will build the index for you automatically, or you can build the index yourself.
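For instance, with bsddb3 those two ideas (JSON documents plus an automatically maintained secondary index) might look roughly like this; it's an untested sketch, and the field and file names are invented. DB.associate is the call that keeps the secondary database in sync:
from json import dumps, loads
from bsddb3.db import DB
from bsddb3.db import DB_BTREE
from bsddb3.db import DB_CREATE
from bsddb3.db import DB_DUP
from bsddb3.db import DB_DONOTINDEX

documents = DB()
documents.open('documents.db', None, DB_BTREE, DB_CREATE)

by_username = DB()
by_username.set_flags(DB_DUP)   # several documents may share a username
by_username.open('by_username.db', None, DB_BTREE, DB_CREATE)

def username_of(key, value):
    # called by Berkeley DB to compute the secondary key for each record
    doc = loads(value)
    return str(doc['username']) if 'username' in doc else DB_DONOTINDEX

documents.associate(by_username, username_of)

documents.put('1', dumps({'username': 'amirouche', 'age': 30}))
print(loads(by_username.get('amirouche')))   # fetch the document via the index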
If you don't want to deal with key packing (and that's a reasonable choice when starting out), you can take advantage of DB.set_bt_compare, which will allow you to use cPickle, json or msgpack for both keys and values while still having an ordering that makes sense for creating indices and doing queries. This is a slower method but it introduces the pattern of key composition.
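A rough sketch of that idea (untested; note that set_bt_compare must be called before the database is opened, and cmp here is Python 2):
from json import dumps, loads
from bsddb3.db import DB
from bsddb3.db import DB_BTREE
from bsddb3.db import DB_CREATE

def compare_json_keys(a, b):
    # order keys by their decoded value, not by their raw bytes
    return cmp(loads(a), loads(b))

index = DB()
index.set_bt_compare(compare_json_keys)   # must be set before open()
index.open('json_index.db', None, DB_BTREE, DB_CREATE)

index.put(dumps(['age', 30, 1]), '')
index.put(dumps(['username', 'amirouche', 1]), '')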
To take full advantage of ordered keys, you can use Cursor.set_range(key) to position the cursor at the start of the range you want to query.
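In practice that usually looks like a prefix scan: position the cursor with set_range and walk forward until the keys stop matching. A sketch, reusing the pack()/unpack() helpers from the example further down:
def scan_prefix(bsd_db, *prefix):
    """Yield (unpacked_key, value) pairs whose packed key starts with prefix."""
    cursor = bsd_db.cursor()
    start = pack(*prefix)
    record = cursor.set_range(start)
    while record:
        key, value = record
        if not key.startswith(start):
            break   # left the range covered by the prefix
        yield unpack(key), value
        record = cursor.next()
    # note: only reached once the generator is exhausted
    cursor.close()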
Another pattern, called the EAV pattern, stores tuples that follow the scheme (entity, attribute, value) and then builds various indexes using permutations of that tuple. I learned this pattern studying Datomic.
For a less resource-hungry database, you would go the "statically typed" way: store as much of the common information as possible in a "metadata" table and split documents (which are really RDBMS tables) into their own hashmaps.
To get you started, here is an example database using bsddb (but you could build it with another ordered key/value store like wiredtiger or leveldb) that implements the EAV pattern. In this implementation I swap EAV for IKV, which translates to unique Identifier, Key, Value. The overall result is that you have a fully indexed, schemaless document database. I think it's a good compromise between efficiency and ease of use.
import struct
from json import dumps
from json import loads
from bsddb3.db import DB
from bsddb3.db import DBEnv
from bsddb3.db import DB_BTREE
from bsddb3.db import DB_CREATE
from bsddb3.db import DB_INIT_MPOOL
from bsddb3.db import DB_LOG_AUTO_REMOVE
def pack(*values):
def __pack(value):
if type(value) is int:
return '1' + struct.pack('>q', value)
elif type(value) is str:
return '2' + struct.pack('>q', len(value)) + value
else:
data = dumps(value, encoding='utf-8')
return '3' + struct.pack('>q', len(data)) + data
return ''.join(map(__pack, values))
def unpack(packed):
kind = packed[0]
if kind == '1':
value = struct.unpack('>q', packed[1:9])[0]
packed = packed[9:]
elif kind == '2':
size = struct.unpack('>q', packed[1:9])[0]
value = packed[9:9+size]
packed = packed[size+9:]
else:
size = struct.unpack('>q', packed[1:9])[0]
value = loads(packed[9:9+size])
packed = packed[size+9:]
if packed:
values = unpack(packed)
values.insert(0, value)
else:
values = [value]
return values
class TupleSpace(object):
"""Generic database"""
def __init__(self, path):
self.env = DBEnv()
self.env.set_cache_max(10, 0)
self.env.set_cachesize(5, 0)
flags = (
DB_CREATE |
DB_INIT_MPOOL
)
self.env.log_set_config(DB_LOG_AUTO_REMOVE, True)
self.env.set_lg_max(1024 ** 3)
self.env.open(
path,
flags,
0
)
# create vertices and edges k/v stores
def new_store(name):
flags = DB_CREATE
elements = DB(self.env)
elements.open(
name,
None,
DB_BTREE,
flags,
0,
)
return elements
self.tuples = new_store('tuples')
self.index = new_store('index')
self.txn = None
def get(self, uid):
cursor = self.tuples.cursor()
def __get():
record = cursor.set_range(pack(uid, ''))
if not record:
return
key, value = record
while True:
other, key = unpack(key)
if other == uid:
value = unpack(value)[0]
yield key, value
record = cursor.next()
if record:
key, value = record
continue
else:
break
else:
break
tuples = dict(__get())
cursor.close()
return tuples
def add(self, uid, **properties):
for key, value in properties.items():
self.tuples.put(pack(uid, key), pack(value))
self.index.put(pack(key, value, uid), '')
def delete(self, uid):
# delete item from main table and index
cursor = self.tuples.cursor()
index = self.index.cursor()
record = cursor.set_range(pack(uid, ''))
if record:
key, value = record
else:
cursor.close()
raise Exception('not found')
while True:
other, key = unpack(key)
if other == uid:
# remove tuple from main index
cursor.delete()
# remove it from index
value = unpack(value)[0]
index.set(pack(key, value, uid))
index.delete()
# continue
record = cursor.next()
if record:
key, value = record
continue
else:
break
else:
break
cursor.close()
def update(self, uid, **properties):
self.delete(uid)
self.add(uid, **properties)
def close(self):
self.index.close()
self.tuples.close()
self.env.close()
def debug(self):
for key, value in self.tuples.items():
uid, key = unpack(key)
value = unpack(value)[0]
print(uid, key, value)
def query(self, key, value=''):
"""return `(key, value, uid)` tuples that where
`key` and `value` are expressed in the arguments"""
cursor = self.index.cursor()
match = (key, value) if value else (key,)
record = cursor.set_range(pack(key, value))
if not record:
cursor.close()
return
while True:
key, _ = record
other = unpack(key)
ok = reduce(
lambda previous, x: (cmp(*x) == 0) and previous,
zip(match, other),
True
)
if ok:
yield other
record = cursor.next()
if not record:
break
else:
break
cursor.close()
db = TupleSpace('tmp')
# you can use a tuple to store a counter
db.add(0, counter=0)
# And then have a procedure doing the required work
# to always have a fresh uid
def make_uid():
counter = db.get(0)
counter['counter'] += 1
return counter['counter']
amirouche = make_uid()
db.add(amirouche, username="amirouche", age=30)
print(db.get(amirouche))
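And the index maintained by add() can then be used through query(), for example to look tuples up by attribute (a sketch continuing the usage above):
# every (key, value, uid) triple whose attribute/value is username="amirouche"
for key, value, uid in db.query('username', 'amirouche'):
    print((key, value, uid))
    print(db.get(uid))

db.close()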
