Pickling memmapped np.ndarray copies array buffer into the pickled object - python

I'm trying to subclass an np.ndarray with the following constraints:
the subclass has to be memory mapped
it has to be picklable while keeping the underlying buffer memory mapped.
I was able to implement the subclass; however, when pickling the object, the buffer is copied into the pickled object, so loading the pickled object means loading the underlying memmap into memory.
Here's the snippet:
import numpy as np
from pathlib import Path
import pickle
import mmap

class PersistedArray(np.ndarray):
    suffix: str = 'mmap'
    __array_priority__ = 1.0

    def __new__(cls, input_array, name: str, folder: Path):
        folder.mkdir(parents=True, exist_ok=True)
        path = folder / f'{name}.{cls.suffix}'
        mode = 'r+' if path.is_file() else 'w+'
        obj = np.memmap(path, dtype=input_array.dtype, shape=input_array.shape, mode=mode).view(cls)
        obj[:] = input_array[:]
        obj.folder = folder
        obj.name = name
        obj.path = path
        obj.meta_path = obj.path.with_suffix('.p')
        meta_json = {'dtype': obj.dtype, 'shape': obj.shape}
        with open(obj.meta_path, 'wb') as f:
            pickle.dump(meta_json, f)
        return obj

    def __array_finalize__(self, obj):
        if obj is None: return
        self.name = getattr(obj, 'name', None)
        self.path = getattr(obj, 'path', None)
        self.folder = getattr(obj, 'folder', None)
        self.meta_path = getattr(obj, 'meta_path', None)

    def __reduce__(self):
        object_state = list(super().__reduce__())
        object_state[2] = (tuple(object_state[2]), self.__dict__)
        return tuple(object_state)

    def __setstate__(self, state):
        nd_state, own_state = state
        self.__dict__.update(own_state)
        super().__setstate__(nd_state)
The problem lies in super().__reduce__(): the state it produces carries the array's contents as a byte string in the last item, and I couldn't get around that. I tried shrinking the buffer to a single byte and the shape to 1, then loading a memory map in __setstate__ and swapping the memory-mapped buffer back in, but np.ndarray's __setstate__ still expects a byte string.
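One way around this, offered as a minimal sketch rather than a verified fix, is to bypass ndarray's default state entirely and pickle only the metadata needed to re-open the file on disk; _rebuild_persisted_array is a hypothetical module-level helper and mode='r+' is an assumption:

import numpy as np

def _rebuild_persisted_array(path, dtype, shape, attrs):
    # Re-open the existing memmap file and restore the extra attributes,
    # instead of restoring a serialized copy of the buffer.
    obj = np.memmap(path, dtype=dtype, shape=shape, mode='r+').view(PersistedArray)
    obj.__dict__.update(attrs)
    return obj

class PersistedArray(np.ndarray):
    # ... __new__ and __array_finalize__ unchanged from the snippet above ...

    def __reduce__(self):
        # Pickle a callable plus the arguments needed to rebuild the array;
        # the buffer contents never enter the pickle stream.
        return (_rebuild_persisted_array,
                (self.path, self.dtype, self.shape, self.__dict__))

With this, unpickling simply re-opens the file, so the data stays memory mapped and __setstate__ is no longer needed.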

Related

convert string representation of an object using python

I have the string PyObject [methodCall=1001, attributeName=javax.servlet.jsp.jstl.fmt.request.charset, attributeValue=UTF-8]
and in Python I have the class:
class PyObject:
    def __init__(self, methodCall, attributeName, attributeValue):
        self.methodCall = methodCall
        self.attributeName = attributeName
        self.attributeValue = attributeValue
I am receiving the above string from a socket and I want to convert it to an instance of PyObject so that I can access attributes like pyobj.methodCall.
I have tried json.loads(), but it gives me a dict.
Since you added the requirement for this to be dynamic in the comments (you should add it to the question as well), consider this solution.
It is not the most beautiful or elegant, but it gets the job done.
This solution abuses the ability to dynamically create classes and instances with type.
string = 'PyObject [methodCall=1001, attributeName=javax.servlet.jsp.jstl.fmt.request.charset, attributeValue=UTF-8]'

class_name, string = string.split(' ', maxsplit=1)
data = [substring.strip() for substring in
        string.replace('[', '').replace(']', '').split(',') if substring]
print(data)
# ['methodCall=1001', 'attributeName=javax.servlet.jsp.jstl.fmt.request.charset',
#  'attributeValue=UTF-8']

data_dict = {}
for d in data:
    key, value = d.split('=')
    data_dict[key] = value
print(data_dict)
# {'attributeValue': 'UTF-8',
#  'attributeName': 'javax.servlet.jsp.jstl.fmt.request.charset', 'methodCall': '1001'}

def init(self, **kwargs):
    for k, v in kwargs.items():
        setattr(self, k, v)

obj = type(class_name, (), {'__init__': init})(**data_dict)
print(obj)
# <__main__.PyObject object at 0x00520C70>
print(obj.methodCall)
# 1001
print(obj.attributeName)
# javax.servlet.jsp.jstl.fmt.request.charset

How to instantiate a Python class in Julia?

Here is my Python class:
class RoadSegInr(object):
    def __init__(self, tmc, road_num, shape_length):
        self.tmc = tmc
        self.road_num = road_num
        self.shape_length = shape_length
I have instantiated it to get
road_seg_inr = RoadSegInr(0, 0, 0)
I use the following code to save road_seg_inr:
try:
    import cPickle as pickle
except ImportError:
    import pickle
import gzip

proto = pickle.HIGHEST_PROTOCOL

def zdump(obj, f_name):
    f = gzip.open(f_name, 'wb', proto)
    pickle.dump(obj, f)
    f.close()

def zload(f_name):
    f = gzip.open(f_name, 'rb', proto)
    obj = pickle.load(f)
    f.close()
    return obj

zdump(road_seg_inr, "road_seg_inr.pkz")
How can I use Julia to load road_seg_inr so that I can access its attributes?
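No answer was captured here. One common route is Julia's PyCall package together with Python's pickle and gzip modules, but a simpler, language-neutral workaround is to re-export the attributes from Python as JSON, which Julia can read with its JSON package. A hedged sketch on the Python side, reusing the asker's zload() helper; the file names are illustrative:

import json

# Re-save the object's attributes as plain JSON so that Julia (or anything else)
# can read them without unpickling a Python-specific, gzip-wrapped format.
road_seg_inr = zload("road_seg_inr.pkz")
with open("road_seg_inr.json", "w") as f:
    json.dump(vars(road_seg_inr), f)   # {"tmc": 0, "road_num": 0, "shape_length": 0}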

MessagePack and datetime

I need a fast way to send 300 short messages a second over ZeroMQ between Python multiprocessing processes. Each message needs to contain an ID and time.time().
msgpack seems like the best way to serialize the dict before sending it via ZeroMQ, and conveniently, msgpack has an example of exactly what I need, except it uses datetime.datetime.now().
import datetime
import msgpack

useful_dict = {
    "id": 1,
    "created": datetime.datetime.now(),
}

def decode_datetime(obj):
    if b'__datetime__' in obj:
        obj = datetime.datetime.strptime(obj["as_str"], "%Y%m%dT%H:%M:%S.%f")
    return obj

def encode_datetime(obj):
    if isinstance(obj, datetime.datetime):
        return {'__datetime__': True, 'as_str': obj.strftime("%Y%m%dT%H:%M:%S.%f")}
    return obj

packed_dict = msgpack.packb(useful_dict, default=encode_datetime)
this_dict_again = msgpack.unpackb(packed_dict, object_hook=decode_datetime)
The problem is that their example doesn't work; I get this error:
obj = datetime.datetime.strptime(obj["as_str"], "%Y%m%dT%H:%M:%S.%f")
KeyError: 'as_str'
Maybe it's because I'm on Python 3.4, but I don't know what the issue with strptime is. I would appreciate your help.
Given that msgpack (import msgpack) is good at serializing integers, I created a solution which only uses integers:
import datetime
import gc

import dateutil.tz
import msgpack

_datetime_ExtType = 42

def _unpacker_hook(code, data):
    if code == _datetime_ExtType:
        values = unpack(data)
        if len(values) == 8:  # we have timezone
            return datetime.datetime(*values[:-1], dateutil.tz.tzoffset(None, values[-1]))
        else:
            return datetime.datetime(*values)
    return msgpack.ExtType(code, data)

# This will only get called for unknown types
def _packer_unknown_handler(obj):
    if isinstance(obj, datetime.datetime):
        if obj.tzinfo:
            components = (obj.year, obj.month, obj.day, obj.hour, obj.minute, obj.second,
                          obj.microsecond, int(obj.utcoffset().total_seconds()))
        else:
            components = (obj.year, obj.month, obj.day, obj.hour, obj.minute, obj.second,
                          obj.microsecond)
        # we effectively double pack the values to "compress" them
        data = msgpack.ExtType(_datetime_ExtType, pack(components))
        return data
    raise TypeError("Unknown type: {}".format(obj))

def pack(obj, **kwargs):
    # we don't use a global packer because it wouldn't be re-entrant safe
    return msgpack.packb(obj, use_bin_type=True, default=_packer_unknown_handler, **kwargs)

def unpack(payload):
    try:
        # we temporarily disable gc during unpack to bump up perf: https://pypi.python.org/pypi/msgpack-python
        gc.disable()
        # This must match the _packer parameters above. NOTE: use_list=False is faster
        return msgpack.unpackb(payload, use_list=False, encoding='utf-8', ext_hook=_unpacker_hook)
    finally:
        gc.enable()
Python 3 and Python 2 handle string encoding differently: encoding-and-decoding-strings-in-python-3-x
It is therefore necessary to:
use b'as_str' (instead of 'as_str') as the dictionary key
use encode and decode for the stored value
Modifying the code like this works with both Python 2 and Python 3:
import datetime
import msgpack

useful_dict = {
    "id": 1,
    "created": datetime.datetime.now(),
}

def decode_datetime(obj):
    if b'__datetime__' in obj:
        obj = datetime.datetime.strptime(obj[b'as_str'].decode(), "%Y%m%dT%H:%M:%S.%f")
    return obj

def encode_datetime(obj):
    if isinstance(obj, datetime.datetime):
        obj = {'__datetime__': True, 'as_str': obj.strftime("%Y%m%dT%H:%M:%S.%f").encode()}
    return obj

packed_dict = msgpack.packb(useful_dict, default=encode_datetime)
this_dict_again = msgpack.unpackb(packed_dict, object_hook=decode_datetime)
The documentation says that packb (and pack) encode unicode strings as UTF-8 by default. You could simply check for the unicode string '__datetime__' in the decode_datetime function, instead of the byte object b'__datetime__', and add the encoding='utf-8' parameter to unpackb. Like so:
def decode_datetime(obj):
    if '__datetime__' in obj:
        obj = datetime.datetime.strptime(obj["as_str"], "%Y%m%dT%H:%M:%S.%f")
    return obj

packed_dict = msgpack.packb(useful_dict, default=encode_datetime)
this_dict_again = msgpack.unpackb(packed_dict, object_hook=decode_datetime, encoding='utf-8')

memoize to disk - python - persistent memoization

Is there a way to memoize the output of a function to disk?
I have a function
def getHtmlOfUrl(url):
    ...  # expensive computation
and would like to do something like:
def getHtmlMemoized(url) = memoizeToFile(getHtmlOfUrl, "file.dat")
and then call getHtmlMemoized(url), so as to do the expensive computation only once for each url.
Python offers a very elegant way to do this - decorators. Basically, a decorator is a function that wraps another function to provide additional functionality without changing the function source code. Your decorator can be written like this:
import json

def persist_to_file(file_name):

    def decorator(original_func):

        try:
            cache = json.load(open(file_name, 'r'))
        except (IOError, ValueError):
            cache = {}

        def new_func(param):
            if param not in cache:
                cache[param] = original_func(param)
                json.dump(cache, open(file_name, 'w'))
            return cache[param]

        return new_func

    return decorator
Once you've got that, 'decorate' the function using @-syntax and you're ready.
@persist_to_file('cache.dat')
def html_of_url(url):
    your function code...
Note that this decorator is intentionally simplified and may not work for every situation, for example, when the source function accepts or returns data that cannot be json-serialized.
More on decorators: How to make a chain of function decorators?
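If your cached values are not JSON-serializable, a hedged variation (not part of the original answer; persist_to_pickle_file is an illustrative name) is to swap json for pickle, at the cost of a binary, Python-only cache file:

import os
import pickle

def persist_to_pickle_file(file_name):
    # Same idea as persist_to_file above, but pickled, so cached values only
    # need to be picklable rather than JSON-serializable.
    def decorator(original_func):
        if os.path.exists(file_name):
            with open(file_name, 'rb') as f:
                cache = pickle.load(f)
        else:
            cache = {}

        def new_func(param):
            if param not in cache:
                cache[param] = original_func(param)
                with open(file_name, 'wb') as f:
                    pickle.dump(cache, f)
            return cache[param]

        return new_func

    return decorator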
And here's how to make the decorator save the cache just once, at exit time:
import json, atexit

def persist_to_file(file_name):

    try:
        cache = json.load(open(file_name, 'r'))
    except (IOError, ValueError):
        cache = {}

    atexit.register(lambda: json.dump(cache, open(file_name, 'w')))

    def decorator(func):
        def new_func(param):
            if param not in cache:
                cache[param] = func(param)
            return cache[param]
        return new_func

    return decorator
Check out joblib.Memory. It's a library for doing exactly that.
from joblib import Memory

memory = Memory("cachedir")

@memory.cache
def f(x):
    print('Running f(%s)' % x)
    return x
A cleaner solution is powered by Python's shelve module. The advantage is that the cache gets updated in real time via the well-known dict syntax, and it's exception-proof (no need to handle an annoying KeyError).
import shelve

def shelve_it(file_name):
    d = shelve.open(file_name)

    def decorator(func):
        def new_func(param):
            if param not in d:
                d[param] = func(param)
            return d[param]
        return new_func

    return decorator

@shelve_it('cache.shelve')
def expensive_function(param):
    pass
This ensures the function is computed just once; subsequent calls will return the stored result.
There is also diskcache.
from diskcache import Cache

cache = Cache("cachedir")

@cache.memoize()
def f(x, y):
    print('Running f({}, {})'.format(x, y))
    return x, y
The Artemis library has a module for this (you'll need to pip install artemis-ml).
You decorate your function:
from artemis.fileman.disk_memoize import memoize_to_disk

@memoize_to_disk
def fcn(a, b, c=None):
    results = ...
    return results
Internally, it makes a hash out of input arguments and saves memo-files by this hash.
Check out Cachier. It supports additional cache configuration parameters like TTL etc.
Simple example:
from cachier import cachier
import datetime

@cachier(stale_after=datetime.timedelta(days=3))
def foo(arg1, arg2):
    """foo now has a persistent cache, triggering recalculation for values stored more than 3 days."""
    return {'arg1': arg1, 'arg2': arg2}
Something like this should do:
import json

class Memoize(object):
    def __init__(self, func):
        self.func = func
        self.memo = {}

    def load_memo(self, filename):
        with open(filename) as f:
            self.memo.update(json.load(f))

    def save_memo(self, filename):
        with open(filename, 'w') as f:
            json.dump(self.memo, f)

    def __call__(self, *args):
        # JSON keys must be strings, so key the cache on str(args)
        key = str(args)
        if key not in self.memo:
            self.memo[key] = self.func(*args)
        return self.memo[key]
Basic usage:
your_mem_func = Memoize(your_func)
your_mem_func.load_memo('yourdata.json')
# do your stuff with your_mem_func
If you want to write your "cache" to a file after using it -- to be loaded again in the future:
your_mem_func.save_memo('yournewdata.json')
Assuming that your data is JSON-serializable, this code should work:
import os, json

def json_file(fname):
    def decorator(function):
        def wrapper(*args, **kwargs):
            if os.path.isfile(fname):
                with open(fname, 'r') as f:
                    ret = json.load(f)
            else:
                with open(fname, 'w') as f:
                    ret = function(*args, **kwargs)
                    json.dump(ret, f)
            return ret
        return wrapper
    return decorator
Decorate getHtmlOfUrl and then simply call it; if it has been run previously, you will get your cached data.
Checked with python 2.x and python 3.x
You can use the cache_to_disk package:
from cache_to_disk import cache_to_disk

@cache_to_disk(3)
def my_func(a, b, c, d=None):
    results = ...
    return results
This will cache the results for 3 days, specific to the arguments a, b, c and d. The results are stored in a pickle file on your machine, and unpickled and returned next time the function is called. After 3 days, the pickle file is deleted until the function is re-run. The function will be re-run whenever the function is called with new arguments. More info here: https://github.com/sarenehan/cache_to_disk
Most answers take a decorator approach. But maybe I don't want to cache the result every single time I call the function.
I made one solution using a context manager, so the function can be called as
with DiskCacher('cache_id', myfunc) as myfunc2:
    res = myfunc2(...)
when you need the caching functionality.
The 'cache_id' string is used to distinguish data files, which are named [calling_script]_[cache_id].nc. So if you are doing this in a loop, you will need to incorporate the looping variable into this cache_id, otherwise data will be overwritten (a short loop sketch follows the usage variants below).
Alternatively:
myfunc2 = DiskCacher('cache_id')(myfunc)
res = myfunc2(...)
Alternatively (this is probably not that useful, as the same id is used all the time):
@DiskCacher('cache_id')
def myfunc(*args):
    ...
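Picking up the note about cache_id above, here is a hedged sketch (not from the original answer; data_chunks and process_chunk are illustrative names) of incorporating the loop variable so each iteration gets its own cache file:

# Give each iteration its own cache_id so the cache files do not overwrite one another.
for i, chunk in enumerate(data_chunks):
    with DiskCacher('process_chunk_%d' % i, process_chunk) as cached:
        res = cached(chunk)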
The complete code with examples (I'm using pickle to save/load, but this can be changed to whatever save/read methods you prefer; note that this also assumes the function in question returns only one value):
from __future__ import print_function
import sys, os
import functools

def formFilename(folder, varid):
    '''Compose abspath for cache file

    Args:
        folder (str): cache folder path.
        varid (str): variable id to form file name and used as variable id.
    Returns:
        abpath (str): abspath for cache file, which is using the <folder>
            as folder. The file name is the format:
            [script_file]_[varid].nc
    '''
    script_file = os.path.splitext(sys.argv[0])[0]
    name = '[%s]_[%s].nc' % (script_file, varid)
    abpath = os.path.join(folder, name)

    return abpath


def readCache(folder, varid, verbose=True):
    '''Read cached data

    Args:
        folder (str): cache folder path.
        varid (str): variable id.
    Keyword Args:
        verbose (bool): whether to print some text info.
    Returns:
        results (tuple): a tuple containing data read in from cached file(s).
    '''
    import pickle
    abpath_in = formFilename(folder, varid)
    if os.path.exists(abpath_in):
        if verbose:
            print('\n# <readCache>: Read in variable', varid,
                  'from disk cache:\n', abpath_in)
        with open(abpath_in, 'rb') as fin:
            results = pickle.load(fin)

    return results


def writeCache(results, folder, varid, verbose=True):
    '''Write data to disk cache

    Args:
        results (tuple): a tuple containing data read to cache.
        folder (str): cache folder path.
        varid (str): variable id.
    Keyword Args:
        verbose (bool): whether to print some text info.
    '''
    import pickle
    abpath_out = formFilename(folder, varid)
    if verbose:
        print('\n# <writeCache>: Saving output to:\n', abpath_out)
    with open(abpath_out, 'wb') as fout:
        pickle.dump(results, fout)

    return


class DiskCacher(object):
    def __init__(self, varid, func=None, folder=None, overwrite=False,
                 verbose=True):
        '''Disk cache context manager

        Args:
            varid (str): string id used to save cache.
                function <func> is assumed to return only 1 return value.
        Keyword Args:
            func (callable): function object whose return values are to be
                cached.
            folder (str or None): cache folder path. If None, use a default.
            overwrite (bool): whether to force a new computation or not.
            verbose (bool): whether to print some text info.
        '''
        if folder is None:
            self.folder = '/tmp/cache/'
        else:
            self.folder = folder

        self.func = func
        self.varid = varid
        self.overwrite = overwrite
        self.verbose = verbose

    def __enter__(self):
        if self.func is None:
            raise Exception("Need to provide a callable function to __init__() when used as context manager.")

        return _Cache2Disk(self.func, self.varid, self.folder,
                           self.overwrite, self.verbose)

    def __exit__(self, type, value, traceback):
        return

    def __call__(self, func=None):
        _func = func or self.func
        return _Cache2Disk(_func, self.varid, self.folder, self.overwrite,
                           self.verbose)


def _Cache2Disk(func, varid, folder, overwrite, verbose):
    '''Inner decorator function

    Args:
        func (callable): function object whose return values are to be
            cached.
        varid (str): variable id.
        folder (str): cache folder path.
        overwrite (bool): whether to force a new computation or not.
        verbose (bool): whether to print some text info.
    Returns:
        decorated function: if cache exists, the function is <readCache>
            which will read cached data from disk. If needs to recompute,
            the function is wrapped that the return values are saved to disk
            before returning.
    '''
    def decorator_func(func):
        abpath_in = formFilename(folder, varid)

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            if os.path.exists(abpath_in) and not overwrite:
                results = readCache(folder, varid, verbose)
            else:
                results = func(*args, **kwargs)
                if not os.path.exists(folder):
                    os.makedirs(folder)
                writeCache(results, folder, varid, verbose)
            return results
        return wrapper

    return decorator_func(func)


if __name__ == '__main__':

    data = range(10)  # dummy data

    #--------------Use as context manager--------------
    def func1(data, n):
        '''dummy function'''
        results = [i*n for i in data]
        return results

    print('\n### Context manager, 1st time call')
    with DiskCacher('context_mananger', func1) as func1b:
        res = func1b(data, 10)
    print('res =', res)

    print('\n### Context manager, 2nd time call')
    with DiskCacher('context_mananger', func1) as func1b:
        res = func1b(data, 10)
    print('res =', res)

    print('\n### Context manager, 3rd time call with overwrite=True')
    with DiskCacher('context_mananger', func1, overwrite=True) as func1b:
        res = func1b(data, 10)
    print('res =', res)

    #--------------Return a new function--------------
    def func2(data, n):
        results = [i*n for i in data]
        return results

    print('\n### Wrap a new function, 1st time call')
    func2b = DiskCacher('new_func')(func2)
    res = func2b(data, 10)
    print('res =', res)

    print('\n### Wrap a new function, 2nd time call')
    res = func2b(data, 10)
    print('res =', res)

    #----Decorate a function using the syntax sugar----
    @DiskCacher('pie_dec')
    def func3(data, n):
        results = [i*n for i in data]
        return results

    print('\n### pie decorator, 1st time call')
    res = func3(data, 10)
    print('res =', res)

    print('\n### pie decorator, 2nd time call.')
    res = func3(data, 10)
    print('res =', res)
The outputs:
### Context manager, 1st time call
# <writeCache>: Saving output to:
/tmp/cache/[diskcache]_[context_mananger].nc
res = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]
### Context manager, 2nd time call
# <readCache>: Read in variable context_mananger from disk cache:
/tmp/cache/[diskcache]_[context_mananger].nc
res = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]
### Context manager, 3rd time call with overwrite=True
# <writeCache>: Saving output to:
/tmp/cache/[diskcache]_[context_mananger].nc
res = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]
### Wrap a new function, 1st time call
# <writeCache>: Saving output to:
/tmp/cache/[diskcache]_[new_func].nc
res = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]
### Wrap a new function, 2nd time call
# <readCache>: Read in variable new_func from disk cache:
/tmp/cache/[diskcache]_[new_func].nc
res = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]
### pie decorator, 1st time call
# <writeCache>: Saving output to:
/tmp/cache/[diskcache]_[pie_dec].nc
res = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]
### pie decorator, 2nd time call.
# <readCache>: Read in variable pie_dec from disk cache:
/tmp/cache/[diskcache]_[pie_dec].nc
res = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]
Here's a solution I came up with which can:
memoize mutable objects (memoized functions should have no side effects that change mutable parameters or it won't work as expected)
write to a separate cache file for each wrapped function (easy to delete the file to purge that particular cache)
compress the data to make it much smaller on disk (a LOT smaller)
It will create cache files like:
cache.__main__.function.getApiCall.db
cache.myModule.function.fixDateFormat.db
cache.myOtherModule.function.getOtherApiCall.db
Here's the code. You can use whichever compression library you prefer, but I've found LZMA works best for the pickle storage we are using.
import dbm
import hashlib
import json
import pickle
# import bz2
import lzma

# COMPRESSION = bz2
COMPRESSION = lzma  # better with pickle compression

# Create a @memoize_to_disk decorator that caches results to disk
def memoize_to_disk(function, cache_filename=None):
    uniqueFunctionSignature = f'cache.{function.__module__}.{function.__class__.__name__}.{function.__name__}'
    if cache_filename is None:
        cache_filename = uniqueFunctionSignature
    # print(f'Caching to {cache_file}')

    def wrapper(*args, **kwargs):
        # Convert the dictionary into a JSON object (can't memoize mutable fields,
        # this gives us an immutable, hashable function signature)
        if cache_filename == uniqueFunctionSignature:
            # Cache file is function-specific, so don't include function name in params
            params = {'args': args, 'kwargs': kwargs}
        else:
            # add module.class.function name to params so no collisions occur if user
            # overrides cache_file with the same cache for multiple functions
            params = {'function': uniqueFunctionSignature, 'args': args, 'kwargs': kwargs}

        # key is a hash of the JSON representation of the function signature
        # (avoids unhashable-argument errors)
        params_json = json.dumps(params)
        key = hashlib.sha256(params_json.encode("utf-8")).hexdigest()  # store hash of key

        # Get cache entry or create it if not found
        with dbm.open(cache_filename, 'c') as db:
            # Try to retrieve the result from the cache
            try:
                result = pickle.loads(COMPRESSION.decompress(db[key]))
                # print(f'CACHE HIT: Found {key[1:100]=} in {cache_file=} with value {str(result)[0:100]=}')
                return result
            except KeyError:
                # If the result is not in the cache, call the function and store the result
                result = function(*args, **kwargs)
                db[key] = COMPRESSION.compress(pickle.dumps(result))
                # print(f'CACHE MISS: Stored {key[1:100]=} in {cache_file=} with value {str(result)[0:100]=}')
                return result

    return wrapper
To use the code, use the @memoize_to_disk decorator (with an optional filename parameter if you don't like "cache." as a prefix):
@memoize_to_disk
def expensive_example(n):
    # expensive operation goes here
    return value
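A note on the optional filename: since memoize_to_disk takes the function as its first positional argument, supplying a custom cache file (the name below is illustrative) means wrapping the function explicitly rather than using the bare decorator form:

def expensive_example(n):
    # expensive operation goes here
    return n * n

# Wrap manually to override the default "cache.<module>.<class>.<function>" filename.
expensive_example = memoize_to_disk(expensive_example, cache_filename='my_cache')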

Persisting hashlib state

I'd like to create a hashlib instance, update() it, then persist its state in some way. Later, I'd like to recreate the object using this state data, and continue to update() it. Finally, I'd like to get the hexdigest() of the total cumulative run of data. State persistence has to survive across multiple runs.
Example:
import hashlib

m = hashlib.sha1()
m.update('one')
m.update('two')

# somehow, persist the state of m here

# later, possibly in another process
# recreate m from the persisted state

m.update('three')
m.update('four')

print m.hexdigest()
# at this point, m.hexdigest() should be equal to hashlib.sha1('onetwothreefour').hexdigest()
EDIT:
I did not find a good way to do this with python in 2010 and ended up writing a small helper app in C to accomplish this. However, there are some great answers below that were not available or known to me at the time.
You can do it this way using ctypes; no helper app in C is needed:
rehash.py
#! /usr/bin/env python

''' A resumable implementation of SHA-256 using ctypes with the OpenSSL crypto library
    Written by PM 2Ring 2014.11.13
'''

import os
from ctypes import *

SHA_LBLOCK = 16
SHA256_DIGEST_LENGTH = 32

class SHA256_CTX(Structure):
    _fields_ = [
        ("h", c_long * 8),
        ("Nl", c_long),
        ("Nh", c_long),
        ("data", c_long * SHA_LBLOCK),
        ("num", c_uint),
        ("md_len", c_uint)
    ]

HashBuffType = c_ubyte * SHA256_DIGEST_LENGTH

#crypto = cdll.LoadLibrary("libcrypto.so")
crypto = cdll.LoadLibrary("libeay32.dll" if os.name == "nt" else "libssl.so")

class sha256(object):
    digest_size = SHA256_DIGEST_LENGTH

    def __init__(self, datastr=None):
        self.ctx = SHA256_CTX()
        crypto.SHA256_Init(byref(self.ctx))
        if datastr:
            self.update(datastr)

    def update(self, datastr):
        crypto.SHA256_Update(byref(self.ctx), datastr, c_int(len(datastr)))

    #Clone the current context
    def _copy_ctx(self):
        ctx = SHA256_CTX()
        pointer(ctx)[0] = self.ctx
        return ctx

    def copy(self):
        other = sha256()
        other.ctx = self._copy_ctx()
        return other

    def digest(self):
        #Preserve context in case we get called before hashing is
        # really finished, since SHA256_Final() clears the SHA256_CTX
        ctx = self._copy_ctx()
        hashbuff = HashBuffType()
        crypto.SHA256_Final(hashbuff, byref(self.ctx))
        self.ctx = ctx
        return str(bytearray(hashbuff))

    def hexdigest(self):
        return self.digest().encode('hex')

#Tests
def main():
    import cPickle
    import hashlib

    data = ("Nobody expects ", "the spammish ", "imposition!")

    print "rehash\n"
    shaA = sha256(''.join(data))
    print shaA.hexdigest()
    print repr(shaA.digest())
    print "digest size =", shaA.digest_size
    print

    shaB = sha256()
    shaB.update(data[0])
    print shaB.hexdigest()

    #Test pickling
    sha_pickle = cPickle.dumps(shaB, -1)
    print "Pickle length:", len(sha_pickle)
    shaC = cPickle.loads(sha_pickle)

    shaC.update(data[1])
    print shaC.hexdigest()

    #Test copying. Note that copy can be pickled
    shaD = shaC.copy()

    shaC.update(data[2])
    print shaC.hexdigest()

    #Verify against hashlib.sha256()
    print "\nhashlib\n"

    shaD = hashlib.sha256(''.join(data))
    print shaD.hexdigest()
    print repr(shaD.digest())
    print "digest size =", shaD.digest_size
    print

    shaE = hashlib.sha256(data[0])
    print shaE.hexdigest()

    shaE.update(data[1])
    print shaE.hexdigest()

    #Test copying. Note that hashlib copy can NOT be pickled
    shaF = shaE.copy()
    shaF.update(data[2])
    print shaF.hexdigest()

if __name__ == '__main__':
    main()
resumable_SHA-256.py
#! /usr/bin/env python

''' Resumable SHA-256 hash for large files using the OpenSSL crypto library

    The hashing process may be interrupted by Control-C (SIGINT) or SIGTERM.
    When a signal is received, hashing continues until the end of the
    current chunk, then the current file position, total file size, and
    the sha object is saved to a file. The name of this file is formed by
    appending '.hash' to the name of the file being hashed.

    Just re-run the program to resume hashing. The '.hash' file will be deleted
    once hashing is completed.

    Written by PM 2Ring 2014.11.14
'''

import cPickle as pickle
import os
import signal
import sys

import rehash

quit = False

blocksize = 1<<16   # 64kB
blocksperchunk = 1<<8
chunksize = blocksize * blocksperchunk

def handler(signum, frame):
    global quit
    print "\nGot signal %d, cleaning up." % signum
    quit = True


def do_hash(fname, filesize):
    hashname = fname + '.hash'
    if os.path.exists(hashname):
        with open(hashname, 'rb') as f:
            pos, fsize, sha = pickle.load(f)
        if fsize != filesize:
            print "Error: file size of '%s' doesn't match size recorded in '%s'" % (fname, hashname)
            print "%d != %d. Aborting" % (fsize, filesize)
            exit(1)
    else:
        pos, fsize, sha = 0, filesize, rehash.sha256()

    finished = False
    with open(fname, 'rb') as f:
        f.seek(pos)
        while not (quit or finished):
            for _ in xrange(blocksperchunk):
                block = f.read(blocksize)
                if block == '':
                    finished = True
                    break
                sha.update(block)

            pos += chunksize
            sys.stderr.write(" %6.2f%% of %d\r" % (100.0 * pos / fsize, fsize))
            if finished or quit:
                break

    if quit:
        with open(hashname, 'wb') as f:
            pickle.dump((pos, fsize, sha), f, -1)
    elif os.path.exists(hashname):
        os.remove(hashname)

    return (not quit), pos, sha.hexdigest()


def main():
    if len(sys.argv) != 2:
        print "Resumable SHA-256 hash of a file."
        print "Usage:\npython %s filename\n" % sys.argv[0]
        exit(1)

    fname = sys.argv[1]
    filesize = os.path.getsize(fname)

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

    finished, pos, hexdigest = do_hash(fname, filesize)

    if finished:
        print "%s %s" % (hexdigest, fname)
    else:
        print "sha-256 hash of '%s' incomplete" % fname
        print "%s" % hexdigest
        print "%d / %d bytes processed." % (pos, filesize)

if __name__ == '__main__':
    main()
demo
import rehash
import pickle

sha = rehash.sha256("Hello ")
s = pickle.dumps(sha.ctx)

sha = rehash.sha256()
sha.ctx = pickle.loads(s)
sha.update("World")

print sha.hexdigest()
output
a591a6d40bf420404a011733cfb7b190d62c65bf0bcda32b57b277d9ad9f146e
Note: I would like to thank PM2Ring for his wonderful code.
hashlib.sha1 is a wrapper around a C library, so you won't be able to pickle it.
It would need to implement the __getstate__ and __setstate__ methods for Python to access its internal state.
You could use a pure-Python implementation of SHA-1 if it is fast enough for your requirements.
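For illustration, a minimal sketch (not part of the original answer) of the failure being described; the exact error message varies by Python version:

import hashlib
import pickle

m = hashlib.sha1()
m.update(b'one')
try:
    pickle.dumps(m)
except TypeError as exc:
    # e.g. "cannot pickle '_hashlib.HASH' object" on recent CPython
    print(exc)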
I was facing this problem too, and found no existing solution, so I ended up writing a library that does something very similar to what Devesh Saini described: https://github.com/kislyuk/rehash. Example:
import pickle, rehash
hasher = rehash.sha256(b"foo")
state = pickle.dumps(hasher)
hasher2 = pickle.loads(state)
hasher2.update(b"bar")
assert hasher2.hexdigest() == rehash.sha256(b"foobar").hexdigest()
Hash algorithm for dynamic growing/streaming data?
You can easily build a wrapper object around the hash object which can transparently persist the data.
The obvious drawback is that it needs to retain the hashed data in full in order to restore the state - so depending on the data size you are dealing with, this may not suit your needs. But it should work fine up to some tens of MB.
Unfortunately, hashlib does not expose the hash algorithms as proper classes; it rather provides factory functions that construct the hash objects, so we can't properly subclass those without reaching into reserved symbols, a situation I'd rather avoid. That only means you have to build your wrapper class from scratch, which is not much of an overhead in Python anyway.
Here is some sample code that might even fill your needs:
import hashlib
from cStringIO import StringIO

class PersistentSha1(object):
    def __init__(self, salt=""):
        self.__setstate__(salt)

    def update(self, data):
        self.__data.write(data)
        self.hash.update(data)

    def __getattr__(self, attr):
        return getattr(self.hash, attr)

    def __setstate__(self, salt=""):
        self.__data = StringIO()
        self.__data.write(salt)
        self.hash = hashlib.sha1(salt)

    def __getstate__(self):
        return self.data

    def _get_data(self):
        self.__data.seek(0)
        return self.__data.read()

    data = property(_get_data, __setstate__)
You can access the "data" member itself to get and set the state directly, or you can use Python's pickling functions:
>>> a = PersistentSha1()
>>> a
<__main__.PersistentSha1 object at 0xb7d10f0c>
>>> a.update("lixo")
>>> a.data
'lixo'
>>> a.hexdigest()
'6d6332a54574aeb35dcde5cf6a8774f938a65bec'
>>> import pickle
>>> b = pickle.dumps(a)
>>>
>>> c = pickle.loads(b)
>>> c.hexdigest()
'6d6332a54574aeb35dcde5cf6a8774f938a65bec'
>>> c.data
'lixo'
