I'm trying to make an application using PyQt5, and I'm using QWebEngine for the user interface.
I have successfully built the application, but now I want to add a plugin mechanism so that it will be easier to add features later. I tried doing it with Yapsy, but then I realized that I could not use relative imports from the plugins I made.
So I decided to create it on my own using the importlib module, and then I found this question:
Using importlib to dynamically import module(s) containing relative imports
While the answer itself is not wrong, it doesn't work in my case, where I want to load a module outside of my package directory.
My source code can be found here.
As you can see in my source code, there are two plugins directories: one inside the myapp package and one outside it. The plugins directory inside the myapp package is there so that importing myapp.plugins doesn't raise an ImportError.
Now the problem: when I run it using python3 -m myapp, it gives me ImportError: No module named 'myapp.plugins.extras'. This is where I'm stuck.
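For reference, the layout looks roughly like this (reconstructed here, since the repository itself isn't shown):

myapp/                  # the package, run with python3 -m myapp
    __init__.py
    plugins/            # empty package, exists only so importing myapp.plugins resolves
        __init__.py
plugins/                # external directory holding the actual plugins
    extras/
        extras.plugin   # plugin identity file (see PluginManager.py below)
        main.py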
After googling for a while I found this article:
Dependency Injection with Import Hooks in Python 3
While the concept itself is exactly what I need, the way he provides the module is not what I'm expecting, because he uses a class to provide the module instead of a file.
So I changed it to fit what I need.
DependencyInjector.py
import os
import io
import sys
import _imp
import marshal
import types
import warnings
import importlib
import importlib.abc
import importlib.machinery
import importlib._bootstrap as _bootstrap
from importlib.abc import Loader
_PYCACHE = '__pycache__'
_OPT = 'opt-'
SOURCE_SUFFIXES = ['.py'] # _setup() adds .pyw as needed.
BYTECODE_SUFFIXES = ['.pyc']
MAGIC_NUMBER = (3394).to_bytes(2, 'little') + b'\r\n'
_RAW_MAGIC_NUMBER = int.from_bytes(MAGIC_NUMBER, 'little') # For import.c
def _r_long(int_bytes):
"""Convert 4 bytes in little-endian to an integer."""
return int.from_bytes(int_bytes, 'little')
def _w_long(x):
"""Convert a 32-bit integer to little-endian."""
return (int(x) & 0xFFFFFFFF).to_bytes(4, 'little')
def _calc_mode(path):
"""Calculate the mode permissions for a bytecode file."""
try:
mode = os.stat(path).st_mode
except OSError:
mode = 0o666
# We always ensure write access so we can update cached files
# later even when the source files are read-only on Windows (#6074)
mode |= 0o200
return mode
def _write_atomic(path, data, mode=0o666):
# id() is used to generate a pseudo-random filename.
path_tmp = '{}.{}'.format(path, id(path))
fd = os.open(path_tmp,
os.O_EXCL | os.O_CREAT | os.O_WRONLY, mode & 0o666)
try:
with io.FileIO(fd, 'wb') as file:
file.write(data)
os.replace(path_tmp, path)
except OSError:
try:
os.unlink(path_tmp)
except OSError:
pass
raise
_code_type = type(_write_atomic.__code__)
def cache_from_source(path, debug_override=None, *, optimization=None):
    if debug_override is not None:
        warnings.warn('the debug_override parameter is deprecated; use '
                      "'optimization' instead", DeprecationWarning)
        if optimization is not None:
            message = 'debug_override or optimization must be set to None'
            raise TypeError(message)
        optimization = '' if debug_override else 1
path = os.fspath(path)
head, tail = os.path.split(path)
base, sep, rest = tail.rpartition('.')
tag = sys.implementation.cache_tag
if tag is None:
raise NotImplementedError('sys.implementation.cache_tag is None')
almost_filename = ''.join([(base if base else rest), sep, tag])
if optimization is None:
if sys.flags.optimize == 0:
optimization = ''
else:
optimization = sys.flags.optimize
optimization = str(optimization)
if optimization != '':
if not optimization.isalnum():
raise ValueError('{!r} is not alphanumeric'.format(optimization))
almost_filename = '{}.{}{}'.format(almost_filename, _OPT, optimization)
return os.path.join(head, _PYCACHE, almost_filename + BYTECODE_SUFFIXES[0])
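# For example (the cache tag depends on the interpreter; 'cpython-37' here
# is hypothetical):
#   cache_from_source('/plugins/extras/main.py')
#   -> '/plugins/extras/__pycache__/main.cpython-37.pyc'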
def _classify_pyc(data, name, exc_details):
magic = data[:4]
if magic != MAGIC_NUMBER:
message = f'bad magic number in {name!r}: {magic!r}'
_bootstrap._verbose_message('{}', message)
raise ImportError(message, **exc_details)
if len(data) < 16:
message = f'reached EOF while reading pyc header of {name!r}'
_bootstrap._verbose_message('{}', message)
raise EOFError(message)
flags = _r_long(data[4:8])
# Only the first two flags are defined.
if flags & ~0b11:
message = f'invalid flags {flags!r} in {name!r}'
raise ImportError(message, **exc_details)
return flags
def _validate_timestamp_pyc(data, source_mtime, source_size, name,
exc_details):
if _r_long(data[8:12]) != (source_mtime & 0xFFFFFFFF):
message = f'bytecode is stale for {name!r}'
_bootstrap._verbose_message('{}', message)
raise ImportError(message, **exc_details)
if (source_size is not None and
_r_long(data[12:16]) != (source_size & 0xFFFFFFFF)):
raise ImportError(f'bytecode is stale for {name!r}', **exc_details)
def _validate_hash_pyc(data, source_hash, name, exc_details):
if data[8:16] != source_hash:
raise ImportError(
f'hash in bytecode doesn\'t match hash of source {name!r}',
**exc_details,
)
def _compile_bytecode(data, name=None, bytecode_path=None, source_path=None):
code = marshal.loads(data)
if isinstance(code, _code_type):
_bootstrap._verbose_message('code object from {!r}', bytecode_path)
if source_path is not None:
_imp._fix_co_filename(code, source_path)
return code
else:
raise ImportError('Non-code object in {!r}'.format(bytecode_path),
name=name, path=bytecode_path)
def _code_to_timestamp_pyc(code, mtime=0, source_size=0):
data = bytearray(MAGIC_NUMBER)
data.extend(_w_long(0))
data.extend(_w_long(mtime))
data.extend(_w_long(source_size))
data.extend(marshal.dumps(code))
return data
def _code_to_hash_pyc(code, source_hash, checked=True):
data = bytearray(MAGIC_NUMBER)
flags = 0b1 | checked << 1
data.extend(_w_long(flags))
assert len(source_hash) == 8
data.extend(source_hash)
data.extend(marshal.dumps(code))
return data
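# A quick orientation sketch (not part of the loader): the helpers above read
# and write the 16-byte PEP 552 pyc header: 4 bytes of magic, 4 bytes of
# flags, then either mtime+size (timestamp pycs) or an 8-byte source hash
# (hash-based pycs). For example:
#
#   import struct
#   raw = bytes(_code_to_timestamp_pyc(compile('x = 1', '<demo>', 'exec'),
#                                      mtime=1, source_size=5))
#   magic, flags, mtime, size = struct.unpack('<4sIII', raw[:16])
#   # flags == 0, mtime == 1, size == 5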
class DependencyInjectorFinder(importlib.abc.MetaPathFinder):
def __init__(self, loader):
        # the loader is defined below
self._loader = loader
def find_spec(self, fullname, path, target=None):
if self._loader.provides(fullname):
return self._gen_spec(fullname)
def _gen_spec(self, fullname):
spec = importlib.machinery.ModuleSpec(fullname, self._loader)
return spec
class DependencyInjectorLoader(Loader):
_COMMON_PREFIX = "myapp.plugins."
path = None
def __init__(self):
self._services = {}
self._dummy_module = types.ModuleType(self._COMMON_PREFIX[:-1])
self._dummy_module.__path__ = []
def path_stats(self, path):
st = os.stat(path)
return {'mtime': st.st_mtime, 'size': st.st_size}
def _cache_bytecode(self, source_path, bytecode_path, data):
mode = _calc_mode(source_path)
return self.set_data(bytecode_path, data, _mode=mode)
def set_data(self, path, data, *, _mode=0o666):
parent, filename = os.path.split(path)
path_parts = []
# Figure out what directories are missing.
while parent and not os.path.isdir(parent):
parent, part = os.path.split(parent)
path_parts.append(part)
# Create needed directories.
for part in reversed(path_parts):
parent = os.path.join(parent, part)
try:
os.mkdir(parent)
except FileExistsError:
# Probably another Python process already created the dir.
continue
except OSError as exc:
# Could be a permission error, read-only filesystem: just forget
# about writing the data.
_bootstrap._verbose_message('could not create {!r}: {!r}',
parent, exc)
return
try:
_write_atomic(path, data, _mode)
_bootstrap._verbose_message('created {!r}', path)
except OSError as exc:
# Same as above: just don't write the bytecode.
_bootstrap._verbose_message('could not create {!r}: {!r}', path,
exc)
def get_filename(self, fullname):
"""Return the path to the source file as found by the finder."""
if fullname in self._services:
module = self._services[fullname]
return module.__path__
return None
def get_code(self, fullname):
"""Concrete implementation of InspectLoader.get_code.
Reading of bytecode requires path_stats to be implemented. To write
bytecode, set_data must also be implemented.
"""
source_path = self.get_filename(fullname)
source_mtime = None
source_bytes = None
source_hash = None
hash_based = False
check_source = True
try:
bytecode_path = cache_from_source(source_path)
except NotImplementedError:
bytecode_path = None
else:
try:
st = self.path_stats(source_path)
except OSError:
pass
else:
source_mtime = int(st['mtime'])
try:
data = self.get_data(bytecode_path)
except OSError:
pass
else:
exc_details = {
'name': fullname,
'path': bytecode_path,
}
try:
flags = _classify_pyc(data, fullname, exc_details)
bytes_data = memoryview(data)[16:]
hash_based = flags & 0b1 != 0
if hash_based:
check_source = flags & 0b10 != 0
if (_imp.check_hash_based_pycs != 'never' and
(check_source or
_imp.check_hash_based_pycs == 'always')):
source_bytes = self.get_data(source_path)
source_hash = _imp.source_hash(
_RAW_MAGIC_NUMBER,
source_bytes,
)
_validate_hash_pyc(data, source_hash, fullname,
exc_details)
else:
_validate_timestamp_pyc(
data,
source_mtime,
st['size'],
fullname,
exc_details,
)
except (ImportError, EOFError):
pass
else:
_bootstrap._verbose_message('{} matches {}', bytecode_path,
source_path)
return _compile_bytecode(bytes_data, name=fullname,
bytecode_path=bytecode_path,
source_path=source_path)
if source_bytes is None:
source_bytes = self.get_data(source_path)
code_object = self.source_to_code(source_bytes, source_path)
_bootstrap._verbose_message('code object from {}', source_path)
if (not sys.dont_write_bytecode and bytecode_path is not None and
source_mtime is not None):
if hash_based:
                if source_hash is None:
                    source_hash = _imp.source_hash(_RAW_MAGIC_NUMBER,
                                                   source_bytes)
data = _code_to_hash_pyc(
code_object, source_hash, check_source)
else:
data = _code_to_timestamp_pyc(code_object, source_mtime,
len(source_bytes))
try:
self._cache_bytecode(source_path, bytecode_path, data)
_bootstrap._verbose_message('wrote {!r}', bytecode_path)
except NotImplementedError:
pass
return code_object
def source_to_code(self, data, path, *, _optimize=-1):
"""Return the code object compiled from source.
The 'data' argument can be any object type that compile() supports.
"""
return _bootstrap._call_with_frames_removed(compile, data, path, 'exec',
dont_inherit=True, optimize=_optimize)
def get_data(self, path):
"""Return the data from path as raw bytes."""
        # TODO: raise an ImportError if the file is not found
        # If path is a directory, try the __init__.py file inside it.
if os.path.isdir(path):
init_path = os.path.join(path, '__init__.py')
if os.path.exists(init_path):
with io.FileIO(init_path, 'r') as file:
return file.read()
with io.FileIO(path, 'r') as file:
return file.read()
def provide(self, service_name, module):
"""Register a service as provided via the given module
A service is any Python object in this context - an imported module,
a class, etc."""
self._services[service_name] = module
def provides(self, fullname):
if self._truncate_name(fullname) in self._services:
return True
else:
            # this checks if we should return the dummy module,
            # since this evaluates to True when importing myapp and
            # myapp.plugins
return self._COMMON_PREFIX.startswith(fullname)
def create_module(self, spec):
"""Create the given module from the supplied module spec
        Under the hood, this method returns a service or a dummy module,
depending on whether Python is still importing one of the names listed
in _COMMON_PREFIX.
"""
service_name = self._truncate_name(spec.name)
if service_name not in self._services:
            # return our dummy module since at this point we're loading
            # *something* along the lines of "myapp.plugins" that's not
            # a service
return self._dummy_module
module = self._services[service_name]
return module
def exec_module(self, module):
"""Execute the given module in its own namespace
This method is required to be present by importlib.abc.Loader,
but since we know our module object is already fully-formed,
this method merely no-ops.
"""
if hasattr(module, "__path__"):
self.path = module.__path__
code = self.get_code(module.__name__)
importlib._bootstrap._call_with_frames_removed(
exec, code, module.__dict__)
def _truncate_name(self, fullname):
"""Strip off _COMMON_PREFIX from the given module name
Convenience method when checking if a service is provided.
"""
truncated_name = fullname
if truncated_name.startswith(self._COMMON_PREFIX):
truncated_name = fullname[len(self._COMMON_PREFIX):]
return truncated_name
def is_package(self, fullname):
return self.provides(fullname)
Most of the source code above was taken from importlib.machinery.SourceFileLoader.
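To make the wiring concrete, here is a minimal usage sketch (not taken from the repository). It assumes DependencyInjector.py lives inside the myapp package, as the relative import in PluginManager.py below suggests, and that the real myapp package (including its empty myapp/plugins package) is importable as usual, so only plugin names fall through to the injector; the plugin name and path are hypothetical:

import sys
import types

from myapp.DependencyInjector import DependencyInjectorFinder, DependencyInjectorLoader

loader = DependencyInjectorLoader()
sys.meta_path.append(DependencyInjectorFinder(loader))

# Register a plugin module; note that this loader stores the plugin's source
# file path in __path__ (a plain string, not the usual list).
plugin = types.ModuleType("extras")
plugin.__path__ = "/path/to/plugins/extras/main.py"  # hypothetical path
loader.provide("extras", plugin)

import myapp.plugins.extras  # resolved by the injector, which runs main.py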
PluginManager.py
import os
import re
import ast
import sys
import types
import typing
import importlib
import importlib.util
import configparser
from PyQt5.QtCore import QObject
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEngineScript
from .utils import Signal, findFiles
from .config import change_filter, getInstance
from .DependencyInjector import DependencyInjectorFinder, DependencyInjectorLoader
class PluginInjector:
"""
Convenience wrapper for DependencyInjectorLoader and DependencyInjectorFinder.
"""
def __init__(self):
self._loader = DependencyInjectorLoader()
self._finder = DependencyInjectorFinder(self._loader)
self.installed = False
self.install()
def install(self):
if not self.installed:
self.installed = True
sys.meta_path.append(self._finder)
def provide(self, service_name, module):
self._loader.provide(service_name, module)
class PluginInfo(configparser.ConfigParser):
def __init__(self, filepath):
super().__init__()
self._filepath = filepath
self.read(self._filepath, encoding='utf-8')
def isValid(self) -> bool:
""""""
return self.has_section("plugin") and self.has_option("plugin", "Module")
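# A plugin identity file parsed by PluginInfo is a small INI file. The
# section and option names come from the checks in this module; the values
# below are hypothetical:
#
#   [plugin]
#   Name = extras
#   Module = main.py
#   Resources = ["inject.js", "style.css"]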
class PluginManager(QObject):
pluginAdded = Signal()
pluginRemoved = Signal()
pluginActivated = Signal()
pluginDeactivated = Signal()
loadStarted = Signal()
loadFinished = Signal()
beforeLoad = Signal()
bridgeInitialize = Signal()
    def __init__(self, pluginDirs: typing.Optional[typing.List[str]] = None, parent=None):
super().__init__(parent)
app = QApplication.instance()
from .MyApplication import MyApplication
assert isinstance(app, MyApplication)
self._injector = PluginInjector()
self._plugins = {}
self._loadedPlugins = {}
self._pluginsResources = {}
        self._pluginDirs = pluginDirs if pluginDirs is not None else []  # avoid a shared mutable default
self.loadStarted.connect(self._loadStarted)
self.beforeLoad.connect(self._beforeLoad)
self.loadFinished.connect(self._loadFinished)
self.bridgeInitialize.connect(self._bridgeInitialize)
self._loadPlugins()
def _bridgeInitialize(self, page):
for name, resources in self._pluginsResources.items():
for resource in resources:
scriptName = name + "_" + os.path.basename(resource)
if resource.endswith(".js"):
injectionPoint = QWebEngineScript.DocumentReady
page.injectScript(resource, scriptName, injectionPoint)
elif resource.endswith(".css"):
injectionPoint = QWebEngineScript.DocumentReady
page.injectStylesheet(
resource, scriptName, injectionPoint)
def _beforeLoad(self, channel, page):
for name, plugin in self._plugins.items():
if 'beforeLoad' in dir(plugin):
plugin.beforeLoad(channel, page)
elif 'before_load' in dir(plugin):
plugin.before_load(channel, page)
def _loadStarted(self, page):
for name, plugin in self._plugins.items():
if 'loadStarted' in dir(plugin):
plugin.loadStarted(page)
elif 'load_started' in dir(plugin):
plugin.load_started(page)
    def _loadFinished(self, page):
        for name, plugin in self._plugins.items():
            if 'loadFinished' in dir(plugin):
                plugin.loadFinished(page)
            elif 'load_finished' in dir(plugin):
                plugin.load_finished(page)
def addPluginPath(self, path: str):
assert os.path.isabs(path)
        if path not in self._pluginDirs:
self._pluginDirs.append(path)
self._loadPlugins()
def _loadPlugin(self, pluginName):
if pluginName in self._loadedPlugins.keys():
return self._loadedPlugins[pluginName]
identities_paths = []
for directory in self._pluginDirs:
identities_paths += findFiles("*.plugin", directory)
module = None
for f in identities_paths:
info = PluginInfo(f)
name = f
if info.has_section("plugin") and info.has_option("plugin", "Name"):
name = info.get("plugin", "Name")
else:
continue
if name == pluginName:
if not info.isValid():
print(f"Plugin identity {name} is not valid, please read documentation "
"about how to write plugin.")
else:
parentdir = os.path.dirname(f)
module_path = os.path.join(
parentdir, info.get("plugin", "Module"))
                    if not module_path.endswith(".py"):
module_path += ".py"
if os.path.exists(module_path):
try:
module_name = info.get(
"plugin", "Module").replace(".py", "")
parentdir = os.path.dirname(module_path)
print(f"creating namespace for plugin {name}")
# create a fake module for this plugin namespace
package = f"{name}"
module = types.ModuleType(package)
module.__path__ = parentdir
self._injector.provide(package, module)
                            # try to load every Python file except the main file and __init__.py
for f in findFiles("*.py", parentdir):
basename = os.path.splitext(
os.path.basename(f))[0]
if basename == module_name or basename == '__init__':
continue
tail = f[len(
parentdir + '/'):].replace(os.path.sep, '.').replace('.py', '')
package = f"{name}.{tail}"
m_path = f
print(
f"load external module for plugin {name} with name {module.__name__}")
spec = importlib.util.spec_from_file_location(
package, m_path)
module = importlib.util.module_from_spec(spec)
module.__path__ = m_path
self._injector.provide(package, module)
package = f"{name}.{module_name}"
spec = importlib.util.spec_from_file_location(
package, module_path)
module = importlib.util.module_from_spec(spec)
module.__path__ = module_path
self._injector.provide(package, module)
spec.loader.exec_module(module)
self._loadedPlugins[name] = module
except ImportError:
print(
f"Unable to load plugin module {name}")
break
else:
                        print(
                            f"module specified in {name} doesn't exist; it will be ignored.")
return module
def _loadPlugins(self):
""""""
identities_paths = []
for directory in self._pluginDirs:
identities_paths += findFiles("*.plugin", directory)
plugins: typing.List[PluginInfo] = []
for f in identities_paths:
info = PluginInfo(f)
name = f
if info.has_section("plugin") and info.has_option("plugin", "Name"):
name = info.get("plugin", "Name")
            # if it already exists, it means the user just added a new plugins directory
if name in self._loadedPlugins.keys():
continue
if not info.isValid():
print(f"Plugin identity {name} is not valid, please read documentation "
"about how to write plugin.")
else:
parentdir = os.path.dirname(f)
module_path = os.path.join(
parentdir, info.get("plugin", "Module"))
                if not module_path.endswith(".py"):
module_path += ".py"
if os.path.exists(module_path):
info.set("plugin", "Path", module_path)
plugins.append(info)
else:
                    print(
                        f"module specified in {f} doesn't exist; it will be ignored.")
print(f"{len(plugins)} plugins found.")
for plugin in plugins:
try:
name = plugin.get("plugin", "Name")
module_name = plugin.get("plugin", "Module").replace(".py", "")
module_path = plugin.get("plugin", "Path")
parentdir = os.path.dirname(module_path)
print(f"creating namespace for plugin {name}")
                # create a fake module for this plugin namespace
package = f"{name}"
module = types.ModuleType(package)
module.__path__ = parentdir
self._injector.provide(package, module)
                # try to load every Python file except the main file and __init__.py
for f in findFiles("*.py", parentdir):
basename = os.path.splitext(os.path.basename(f))[0]
if basename == module_name or basename == '__init__':
continue
tail = f[len(parentdir + '/'):].replace(os.path.sep, '.').replace('.py', '')
package = f"{name}.{tail}"
m_path = f
print(
f"load external module for plugin {name} with name {module.__name__}")
spec = importlib.util.spec_from_file_location(
package, m_path)
module = importlib.util.module_from_spec(spec)
module.__path__ = m_path
self._injector.provide(package, module)
print(f"importing main plugin module for plugin {name}")
package = f"{name}.{module_name}"
spec = importlib.util.spec_from_file_location(
package, module_path)
module = importlib.util.module_from_spec(spec)
module.__path__ = module_path
self._injector.provide(package, module)
spec.loader.exec_module(module)
self._loadedPlugins[name] = module
"""
By default plugin will be enabled if there was no plugin configuration.
"""
cfg = getInstance().get(f"plugins.{name}")
shouldLoad = True
if cfg is None:
shouldLoad = False
cfg = dict()
cfg['enabled'] = True
getInstance().set(f"plugins.{name}.enabled", True)
                # if this is the first time the plugin is registered, the code
                # above will trigger _pluginsStateChanged and activate it, so
                # we don't need to activate it again here
if cfg['enabled'] and shouldLoad:
if 'activate' in dir(module):
module.activate()
self._plugins[name] = module
if plugin.has_option("plugin", "Resources"):
resources = ast.literal_eval(
plugin.get("plugin", "Resources"))
base_path = os.path.dirname(module_path)
def to_abspath(path: str):
if not os.path.isabs(path):
return os.path.join(base_path, path)
return path
resources = list(map(to_abspath, resources))
self._pluginsResources[name] = resources
except ImportError as e:
name = plugin.get("plugin", "Name")
            print(
                f"Unable to load plugin module {name}: {e.msg}")
#change_filter("plugins")
def _pluginsStateChanged(self, key: str, value):
"""We only interested with the name and the value"""
res = re.findall("plugins\\.(.*)\\.enabled", key)
if key.endswith("enabled") and len(res) > 0:
name = res[0]
if not value:
self.disablePlugin(name)
elif value:
self.enablePlugin(name)
def enablePlugin(self, name: str):
print(f"enabling plugin {name}")
if not name in self._plugins.keys():
module = self._loadPlugin(name)
if module is not None:
if "activate" in dir(module):
module.activate()
self.pluginActivated.emit(name)
self._plugins[name] = module
self.pluginAdded.emit(name)
else:
print(f"Unable activate plugin {name}")
def disablePlugin(self, name: str):
""""""
print(f"disabling plugin {name}")
if name in self._plugins.keys():
module = self._plugins[name]
if "deactivate" in dir(module):
module.deactivate()
self.pluginDeactivated.emit(name)
self._plugins.pop(name, None)
self.pluginRemoved.emit(name)
While the code above is not perfect yet, it already fulfills what I need.
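For completeness, a plugin module that this manager can drive looks roughly like the sketch below. Every hook is optional, since the manager probes for them with dir(module); the hook names come from the code above, but the bodies are placeholders:

# main.py -- minimal plugin sketch (hypothetical)

def activate():
    # called when the plugin is enabled
    print("extras plugin activated")

def deactivate():
    # called when the plugin is disabled
    print("extras plugin deactivated")

def loadStarted(page):
    # the snake_case variant load_started(page) is recognized too
    pass

def beforeLoad(channel, page):
    # runs before a page load; loadFinished(page) works the same way
    pass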
Code is given below.
import time
import unittest
import logging as log
from loggenerator import set_log_params,move_latest_log_to_persistent_file
from parse_test_json import get_test_params
from scrapping import SC
class ScrapeTest(unittest.TestCase):
    # unittest only calls these hooks with this exact camelCase spelling
    def setUp(self):
        self.start_time = time.time()

    def tearDown(self):
        self.end_time = time.time()
def test_SC_scrape(self):
try:
test_name="test scrape"
set_log_params(log_file=test_name,level=log.INFO)
log.info("step1:set all test params")
test_params = get_test_params(self.__class__.__name__, test_name=test_name)
log.info("step 2:set")
ik=SC()
log.info("step3:calling scrapper for")
log.debug(ik.parseURL(party_name=test_params["party_name"],start_date=test_params["start_date"],end_date=test_params["end_date"]))
except Exception as e:
raise e
move_latest_log_to_persistent_file(log_file=test_name)
####
import json, os
from builtins import *
def get_test_params(test_class_name = None, test_name = None):
dir_path = os.path.dirname(os.path.realpath(__file__))
file_path = "/test_param_jsons/" + test_class_name + "params.json"
json_file = dir_path + file_path
with open(json_file) as test_data:
test_json = json.load(test_data)
return test_json[test_class_name][test_name]
This function is raising a KeyError.
This should work as long as you have a JSON file available at <SCRIPT_PATH>/test_param_jsons/MyClass_params.json.
Also, in order to avoid a KeyError, you'll need to ensure that your input JSON file nests the keys as test_json[test_class_name][test_name].
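For example, with test_class_name == 'ScrapeTest' and test_name == 'test scrape' (the names used in the question), the JSON file would need a nested structure along these lines (the inner values are placeholders):

{
    "ScrapeTest": {
        "test scrape": {
            "party_name": "...",
            "start_date": "...",
            "end_date": "..."
        }
    }
}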
import json, os
from builtins import *
class MyClass:
def get_test_params (self, test_class_name = None, test_name = None):
        with open(os.path.join(os.path.dirname(__file__), "test_param_jsons", "params.json"), 'r') as test_data:
test_json = json.load (test_data)
try:
return test_json[test_class_name][test_name]
except KeyError as e:
print ('KeyError: {}'.format (e))
def mymain(self, test_name):
        ''' mymain is defined to accommodate your requirement to use __class__ as a parameter '''
test_params = self.get_test_params (self.__class__.__name__, test_name = test_name)
return test_params
if __name__ == '__main__':
test_name = 'sth'
myclass = MyClass ()
result = myclass.mymain (test_name)
print (result)
I'd like to create a hashlib instance, update() it, then persist its state in some way. Later, I'd like to recreate the object using this state data, and continue to update() it. Finally, I'd like to get the hexdigest() of the total cumulative run of data. State persistence has to survive across multiple runs.
Example:
import hashlib
m = hashlib.sha1()
m.update('one')
m.update('two')
# somehow, persist the state of m here
#later, possibly in another process
# recreate m from the persisted state
m.update('three')
m.update('four')
print m.hexdigest()
# at this point, m.hexdigest() should be equal to hashlib.sha1('onetwothreefour').hexdigest()
EDIT:
I did not find a good way to do this with Python in 2010 and ended up writing a small helper app in C to accomplish it. However, there are some great answers below that were not available or known to me at the time.
You can do it this way using ctypes; no helper app in C is needed:
rehash.py
#! /usr/bin/env python
''' A resumable implementation of SHA-256 using ctypes with the OpenSSL crypto library
Written by PM 2Ring 2014.11.13
'''
import os
from ctypes import *
SHA_LBLOCK = 16
SHA256_DIGEST_LENGTH = 32
class SHA256_CTX(Structure):
    # OpenSSL's SHA_LONG is a 32-bit unsigned int, so use c_uint32 here
    # (c_long is 8 bytes on most 64-bit Unix platforms)
    _fields_ = [
        ("h", c_uint32 * 8),
        ("Nl", c_uint32),
        ("Nh", c_uint32),
        ("data", c_uint32 * SHA_LBLOCK),
        ("num", c_uint),
        ("md_len", c_uint)
    ]
HashBuffType = c_ubyte * SHA256_DIGEST_LENGTH
# The SHA256_* functions live in libcrypto (libeay32.dll on Windows)
crypto = cdll.LoadLibrary("libeay32.dll" if os.name == "nt" else "libcrypto.so")
class sha256(object):
digest_size = SHA256_DIGEST_LENGTH
def __init__(self, datastr=None):
self.ctx = SHA256_CTX()
crypto.SHA256_Init(byref(self.ctx))
if datastr:
self.update(datastr)
def update(self, datastr):
crypto.SHA256_Update(byref(self.ctx), datastr, c_int(len(datastr)))
#Clone the current context
def _copy_ctx(self):
ctx = SHA256_CTX()
pointer(ctx)[0] = self.ctx
return ctx
def copy(self):
other = sha256()
other.ctx = self._copy_ctx()
return other
def digest(self):
#Preserve context in case we get called before hashing is
# really finished, since SHA256_Final() clears the SHA256_CTX
ctx = self._copy_ctx()
hashbuff = HashBuffType()
crypto.SHA256_Final(hashbuff, byref(self.ctx))
self.ctx = ctx
return str(bytearray(hashbuff))
def hexdigest(self):
return self.digest().encode('hex')
#Tests
def main():
import cPickle
import hashlib
data = ("Nobody expects ", "the spammish ", "imposition!")
print "rehash\n"
shaA = sha256(''.join(data))
print shaA.hexdigest()
print repr(shaA.digest())
print "digest size =", shaA.digest_size
print
shaB = sha256()
shaB.update(data[0])
print shaB.hexdigest()
#Test pickling
sha_pickle = cPickle.dumps(shaB, -1)
print "Pickle length:", len(sha_pickle)
shaC = cPickle.loads(sha_pickle)
shaC.update(data[1])
print shaC.hexdigest()
    #Test copying. Note that the copy can be pickled
    shaD = shaC.copy()
    shaD.update(data[2])
    print shaD.hexdigest()
#Verify against hashlib.sha256()
print "\nhashlib\n"
shaD = hashlib.sha256(''.join(data))
print shaD.hexdigest()
print repr(shaD.digest())
print "digest size =", shaD.digest_size
print
shaE = hashlib.sha256(data[0])
print shaE.hexdigest()
shaE.update(data[1])
print shaE.hexdigest()
#Test copying. Note that hashlib copy can NOT be pickled
shaF = shaE.copy()
shaF.update(data[2])
print shaF.hexdigest()
if __name__ == '__main__':
main()
resumable_SHA-256.py
#! /usr/bin/env python
''' Resumable SHA-256 hash for large files using the OpenSSL crypto library
The hashing process may be interrupted by Control-C (SIGINT) or SIGTERM.
When a signal is received, hashing continues until the end of the
current chunk, then the current file position, total file size, and
the sha object is saved to a file. The name of this file is formed by
appending '.hash' to the name of the file being hashed.
Just re-run the program to resume hashing. The '.hash' file will be deleted
once hashing is completed.
Written by PM 2Ring 2014.11.14
'''
import cPickle as pickle
import os
import signal
import sys
import rehash
quit = False
blocksize = 1<<16 # 64kB
blocksperchunk = 1<<8
chunksize = blocksize * blocksperchunk
def handler(signum, frame):
global quit
print "\nGot signal %d, cleaning up." % signum
quit = True
def do_hash(fname, filesize):
hashname = fname + '.hash'
if os.path.exists(hashname):
with open(hashname, 'rb') as f:
pos, fsize, sha = pickle.load(f)
if fsize != filesize:
print "Error: file size of '%s' doesn't match size recorded in '%s'" % (fname, hashname)
print "%d != %d. Aborting" % (fsize, filesize)
exit(1)
else:
pos, fsize, sha = 0, filesize, rehash.sha256()
finished = False
with open(fname, 'rb') as f:
f.seek(pos)
while not (quit or finished):
for _ in xrange(blocksperchunk):
block = f.read(blocksize)
if block == '':
finished = True
break
sha.update(block)
pos += chunksize
sys.stderr.write(" %6.2f%% of %d\r" % (100.0 * pos / fsize, fsize))
if finished or quit:
break
if quit:
with open(hashname, 'wb') as f:
pickle.dump((pos, fsize, sha), f, -1)
elif os.path.exists(hashname):
os.remove(hashname)
return (not quit), pos, sha.hexdigest()
def main():
if len(sys.argv) != 2:
print "Resumable SHA-256 hash of a file."
print "Usage:\npython %s filename\n" % sys.argv[0]
exit(1)
fname = sys.argv[1]
filesize = os.path.getsize(fname)
signal.signal(signal.SIGINT, handler)
signal.signal(signal.SIGTERM, handler)
finished, pos, hexdigest = do_hash(fname, filesize)
if finished:
print "%s %s" % (hexdigest, fname)
else:
print "sha-256 hash of '%s' incomplete" % fname
print "%s" % hexdigest
print "%d / %d bytes processed." % (pos, filesize)
if __name__ == '__main__':
main()
demo
import rehash
import pickle
sha=rehash.sha256("Hello ")
s=pickle.dumps(sha.ctx)
sha=rehash.sha256()
sha.ctx=pickle.loads(s)
sha.update("World")
print sha.hexdigest()
output
a591a6d40bf420404a011733cfb7b190d62c65bf0bcda32b57b277d9ad9f146e
Note: I would like to thank PM2Ring for his wonderful code.
hashlib.sha1 is a wrapper around a C library, so you won't be able to pickle it.
It would need to implement the __getstate__ and __setstate__ methods for Python to access its internal state.
You could use a pure Python implementation of SHA-1 if it is fast enough for your requirements.
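You can see the first point for yourself with a quick sketch (the exact error message varies across Python versions):

import hashlib
import pickle

m = hashlib.sha1(b'one')
try:
    pickle.dumps(m)
except TypeError as e:
    print(e)  # e.g. "cannot pickle '_hashlib.HASH' object"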
I was facing this problem too, and found no existing solution, so I ended up writing a library that does something very similar to what Devesh Saini described: https://github.com/kislyuk/rehash. Example:
import pickle, rehash
hasher = rehash.sha256(b"foo")
state = pickle.dumps(hasher)
hasher2 = pickle.loads(state)
hasher2.update(b"bar")
assert hasher2.hexdigest() == rehash.sha256(b"foobar").hexdigest()
Hash algorithm for dynamic growing/streaming data?
You can easily build a wrapper object around the hash object which can transparently persist the data.
The obvious drawback is that it needs to retain the hashed data in full in order to restore the state, so depending on the data size you are dealing with, this may not suit your needs. But it should work fine up to some tens of MB.
Unfortunately hashlib does not expose the hash algorithms as proper classes; it rather gives factory functions that construct the hash objects, so we can't properly subclass those without touching reserved symbols, a situation I'd rather avoid. That only means you have to build your wrapper class from scratch, which is not much of an overhead in Python anyway.
Here is sample code that might fill your needs:
import hashlib
from cStringIO import StringIO
class PersistentSha1(object):
def __init__(self, salt=""):
self.__setstate__(salt)
def update(self, data):
self.__data.write(data)
self.hash.update(data)
def __getattr__(self, attr):
return getattr(self.hash, attr)
def __setstate__(self, salt=""):
self.__data = StringIO()
self.__data.write(salt)
self.hash = hashlib.sha1(salt)
def __getstate__(self):
return self.data
def _get_data(self):
self.__data.seek(0)
return self.__data.read()
data = property(_get_data, __setstate__)
You can access the "data" member itself to get and set the state directly, or you can use Python's pickling functions:
>>> a = PersistentSha1()
>>> a
<__main__.PersistentSha1 object at 0xb7d10f0c>
>>> a.update("lixo")
>>> a.data
'lixo'
>>> a.hexdigest()
'6d6332a54574aeb35dcde5cf6a8774f938a65bec'
>>> import pickle
>>> b = pickle.dumps(a)
>>>
>>> c = pickle.loads(b)
>>> c.hexdigest()
'6d6332a54574aeb35dcde5cf6a8774f938a65bec'
>>> c.data
'lixo'