Persisting hashlib state - python

I'd like to create a hashlib instance, update() it, then persist its state in some way. Later, I'd like to recreate the object using this state data, and continue to update() it. Finally, I'd like to get the hexdigest() of the total cumulative run of data. State persistence has to survive across multiple runs.
Example:
import hashlib
m = hashlib.sha1()
m.update('one')
m.update('two')
# somehow, persist the state of m here
# later, possibly in another process
# recreate m from the persisted state
m.update('three')
m.update('four')
print m.hexdigest()
# at this point, m.hexdigest() should be equal to
# hashlib.sha1('onetwothreefour').hexdigest()
# (update() returns None, so it cannot be chained with hexdigest())
EDIT:
I did not find a good way to do this with python in 2010 and ended up writing a small helper app in C to accomplish this. However, there are some great answers below that were not available or known to me at the time.

You can do it this way using ctypes, no helper app in C is needed:-
rehash.py
#! /usr/bin/env python
''' A resumable implementation of SHA-256 using ctypes with the OpenSSL crypto library
Written by PM 2Ring 2014.11.13
'''
from ctypes import *
SHA_LBLOCK = 16
SHA256_DIGEST_LENGTH = 32


class SHA256_CTX(Structure):
    """ctypes mirror of OpenSSL's ``SHA256_CTX`` structure.

    OpenSSL declares every word in this structure as ``SHA_LONG``, which is
    ``unsigned int``.  The fields are therefore declared as ``c_uint`` rather
    than ``c_long``: on LP64 platforms ``c_long`` is 8 bytes wide, which makes
    the Python-side layout disagree with the C one (and roughly doubles the
    size of a pickled context).
    """
    _fields_ = [
        ("h", c_uint * 8),               # intermediate hash state
        ("Nl", c_uint),                  # message bit count, low word
        ("Nh", c_uint),                  # message bit count, high word
        ("data", c_uint * SHA_LBLOCK),   # buffered partial input block
        ("num", c_uint),                 # number of bytes held in `data`
        ("md_len", c_uint),              # digest length in bytes
    ]
import os  # required for the os.name check below; it was used without being imported

# Output buffer type that receives the raw digest bytes from SHA256_Final().
HashBuffType = c_ubyte * SHA256_DIGEST_LENGTH

# The SHA256_* functions are implemented in libcrypto (libeay32.dll on
# Windows).  Loading libssl only works indirectly, through its dependency
# on libcrypto, so load libcrypto directly.
crypto = cdll.LoadLibrary("libeay32.dll" if os.name == "nt" else "libcrypto.so")
class sha256(object):
    """Resumable SHA-256 hash object with a hashlib-like interface.

    The whole hashing state lives in ``self.ctx`` (a ``SHA256_CTX`` ctypes
    structure), so the object can be copied or pickled and later resumed
    with further ``update()`` calls.
    """
    digest_size = SHA256_DIGEST_LENGTH

    def __init__(self, datastr=None):
        """Create a fresh context and optionally hash *datastr* right away."""
        self.ctx = SHA256_CTX()
        crypto.SHA256_Init(byref(self.ctx))
        if datastr:
            self.update(datastr)

    def update(self, datastr):
        """Feed the byte string *datastr* into the running hash."""
        # SHA256_Update() takes its length as size_t; passing a C int would
        # overflow for inputs of 2 GiB or more.
        crypto.SHA256_Update(byref(self.ctx), datastr, c_size_t(len(datastr)))

    def _copy_ctx(self):
        """Return an independent copy of the current context structure."""
        ctx = SHA256_CTX()
        pointer(ctx)[0] = self.ctx
        return ctx

    def copy(self):
        """Return a new sha256 object that shares no state with this one."""
        other = sha256()
        other.ctx = self._copy_ctx()
        return other

    def digest(self):
        """Return the digest of all data hashed so far as a byte string.

        SHA256_Final() clears the context it is given, so it is run on a
        throw-away copy; the live context stays usable for further updates.
        """
        ctx = self._copy_ctx()
        hashbuff = HashBuffType()
        crypto.SHA256_Final(hashbuff, byref(ctx))
        return str(bytearray(hashbuff))

    def hexdigest(self):
        """Return the digest as a hexadecimal string."""
        return self.digest().encode('hex')
#Tests
def main():
    """Exercise the resumable hash and print hashlib's output for comparison."""
    import cPickle
    import hashlib

    data = ("Nobody expects ", "the spammish ", "imposition!")
    whole_message = ''.join(data)

    print("rehash\n")
    shaA = sha256(whole_message)
    print(shaA.hexdigest())
    print(repr(shaA.digest()))
    print("digest size = %s" % shaA.digest_size)
    print("")

    shaB = sha256()
    shaB.update(data[0])
    print(shaB.hexdigest())

    # Round-trip the partially fed hash through pickle, then keep feeding it.
    sha_pickle = cPickle.dumps(shaB, -1)
    print("Pickle length: %s" % len(sha_pickle))
    shaC = cPickle.loads(sha_pickle)
    shaC.update(data[1])
    print(shaC.hexdigest())

    # Copying. Note that the copy can be pickled as well.
    shaD = shaC.copy()
    shaC.update(data[2])
    print(shaC.hexdigest())

    # Same sequence with hashlib.sha256(), for verification.
    print("\nhashlib\n")
    shaD = hashlib.sha256(whole_message)
    print(shaD.hexdigest())
    print(repr(shaD.digest()))
    print("digest size = %s" % shaD.digest_size)
    print("")

    shaE = hashlib.sha256(data[0])
    print(shaE.hexdigest())
    shaE.update(data[1])
    print(shaE.hexdigest())

    # Copying. Note that a hashlib copy can NOT be pickled.
    shaF = shaE.copy()
    shaF.update(data[2])
    print(shaF.hexdigest())


if __name__ == '__main__':
    main()
resumable_SHA-256.py
#! /usr/bin/env python
''' Resumable SHA-256 hash for large files using the OpenSSL crypto library
The hashing process may be interrupted by Control-C (SIGINT) or SIGTERM.
When a signal is received, hashing continues until the end of the
current chunk, then the current file position, total file size, and
the sha object is saved to a file. The name of this file is formed by
appending '.hash' to the name of the file being hashed.
Just re-run the program to resume hashing. The '.hash' file will be deleted
once hashing is completed.
Written by PM 2Ring 2014.11.14
'''
import cPickle as pickle
import os
import signal
import sys
import rehash
quit = False
blocksize = 1<<16 # 64kB
blocksperchunk = 1<<8
chunksize = blocksize * blocksperchunk
def handler(signum, frame):
    """Signal handler: announce the signal and request a clean stop.

    It only sets the module-level ``quit`` flag; the hashing loop checks the
    flag between chunks.
    """
    global quit
    quit = True
    print("\nGot signal %d, cleaning up." % signum)
def do_hash(fname, filesize):
    """Hash *fname* with resumable SHA-256, resuming from '<fname>.hash' if present.

    Args:
        fname: path of the file to hash.
        filesize: current size of the file in bytes; it must match the size
            recorded in an existing state file.

    Returns:
        A tuple ``(finished, pos, hexdigest)``: whether the whole file was
        hashed, the number of bytes hashed so far, and the hex digest of
        those bytes.

    Exits with status 1 if the recorded file size does not match *filesize*.
    """
    hashname = fname + '.hash'
    if os.path.exists(hashname):
        # NOTE(security): the state file is unpickled; only resume from state
        # files this program wrote itself.
        with open(hashname, 'rb') as f:
            pos, fsize, sha = pickle.load(f)
        if fsize != filesize:
            print("Error: file size of '%s' doesn't match size recorded in '%s'" % (fname, hashname))
            print("%d != %d. Aborting" % (fsize, filesize))
            sys.exit(1)
    else:
        pos, fsize, sha = 0, filesize, rehash.sha256()

    finished = False
    with open(fname, 'rb') as f:
        f.seek(pos)
        while not (quit or finished):
            for _ in xrange(blocksperchunk):
                block = f.read(blocksize)
                if not block:
                    finished = True
                    break
                sha.update(block)
                # Count the bytes actually read, so that pos never overshoots
                # the file size on the final, partial chunk.
                pos += len(block)
            # An empty file is trivially complete; avoid dividing by zero.
            percent = 100.0 * pos / fsize if fsize else 100.0
            sys.stderr.write(" %6.2f%% of %d\r" % (percent, fsize))

    if finished:
        # Completed, even if a signal arrived during the last chunk: there is
        # nothing left to resume, so drop any stale state file.
        if os.path.exists(hashname):
            os.remove(hashname)
    else:
        with open(hashname, 'wb') as f:
            pickle.dump((pos, fsize, sha), f, -1)
    return finished, pos, sha.hexdigest()
def main():
    """Command-line entry point: hash the file named by the single argument."""
    if len(sys.argv) != 2:
        print("Resumable SHA-256 hash of a file.")
        print("Usage:\npython %s filename\n" % sys.argv[0])
        exit(1)

    fname = sys.argv[1]
    filesize = os.path.getsize(fname)

    # Route both interrupt and termination requests through the same handler.
    for signum in (signal.SIGINT, signal.SIGTERM):
        signal.signal(signum, handler)

    finished, pos, hexdigest = do_hash(fname, filesize)
    if not finished:
        print("sha-256 hash of '%s' incomplete" % fname)
        print("%s" % hexdigest)
        print("%d / %d bytes processed." % (pos, filesize))
        return
    print("%s %s" % (hexdigest, fname))


if __name__ == '__main__':
    main()
demo
import rehash
import pickle
# Hash the first part of the message.
sha=rehash.sha256("Hello ")
# Persist only the context structure; it carries the whole hashing state.
s=pickle.dumps(sha.ctx)
# Later (possibly in another process): start from a fresh object and
# replace its context with the restored one.
sha=rehash.sha256()
sha.ctx=pickle.loads(s)
# Continue hashing where the first object left off.
sha.update("World")
print sha.hexdigest()
output
a591a6d40bf420404a011733cfb7b190d62c65bf0bcda32b57b277d9ad9f146e
Note: I would like to thank PM2Ring for his wonderful code.

hashlib.sha1 is a wrapper around a C library so you won't be able to pickle it.
It would need to implement the __getstate__ and __setstate__ methods for Python to access its internal state
You could use a pure Python implementation of sha1 if it is fast enough for your requirements

I was facing this problem too, and found no existing solution, so I ended up writing a library that does something very similar to what Devesh Saini described: https://github.com/kislyuk/rehash. Example:
import pickle, rehash
# Hash the first part, then serialise the hasher itself.
hasher = rehash.sha256(b"foo")
state = pickle.dumps(hasher)
# Restore the hasher from its pickled state and keep updating it.
hasher2 = pickle.loads(state)
hasher2.update(b"bar")
# The resumed hash must equal a one-shot hash of the concatenated data.
assert hasher2.hexdigest() == rehash.sha256(b"foobar").hexdigest()

Hash algorithm for dynamic growing/streaming data?

You can easily build a wrapper object around the hash object which can transparently persist the data.
The obvious drawback is that it needs to retain the hashed data in full in order to restore the state - so depending on the data size you are dealing with, this may not suit your needs. But it should work fine up to some tens of MB.
Unfortunately, hashlib does not expose the hash algorithms as proper classes; instead it provides factory functions that construct the hash objects, so we can't properly subclass them without relying on reserved symbols - a situation I'd rather avoid. That only means you have to build your wrapper class from scratch, which is not much of an overhead in Python anyway.
here is a sample code that might even fill your needs:
import hashlib
from cStringIO import StringIO
class PersistentSha1(object):
    """SHA-1 wrapper that keeps every byte it hashed so it can be pickled.

    The full input is retained in an in-memory buffer; restoring the object
    simply re-hashes that buffer.  Attributes not defined here (``digest``,
    ``hexdigest``, ...) are forwarded to the underlying hashlib object.
    """

    def __init__(self, salt=""):
        self.__setstate__(salt)

    def update(self, data):
        """Hash *data* and remember it for later persistence."""
        self.__data.write(data)
        self.hash.update(data)

    def __getattr__(self, attr):
        # Only reached when normal lookup fails.  If `hash` itself is missing
        # (object not initialised yet), fail cleanly instead of recursing
        # through self.hash forever.
        if attr == 'hash':
            raise AttributeError(attr)
        return getattr(self.hash, attr)

    def __setstate__(self, salt=""):
        """Reset the object so that its state is exactly the bytes in *salt*."""
        self.__data = StringIO()
        self.__data.write(salt)
        self.hash = hashlib.sha1(salt)

    def __getstate__(self):
        return self.data

    def __reduce__(self):
        # pickle skips __setstate__ when __getstate__ returns a false value,
        # which is the case for an object that has hashed nothing yet ('').
        # Rebuilding through the constructor works for every state.  Pickles
        # written by the older __getstate__ path still load via __setstate__.
        return (self.__class__, (self.data,))

    def _get_data(self):
        """Return everything hashed so far as a single string."""
        return self.__data.getvalue()

    # Reading `data` returns the retained input; assigning to it resets the
    # object to that input.
    data = property(_get_data, __setstate__)
You can access the "data" member itself to get and set the state straight, or you can use python pickling functions:
>>> a = PersistentSha1()
>>> a
<__main__.PersistentSha1 object at 0xb7d10f0c>
>>> a.update("lixo")
>>> a.data
'lixo'
>>> a.hexdigest()
'6d6332a54574aeb35dcde5cf6a8774f938a65bec'
>>> import pickle
>>> b = pickle.dumps(a)
>>>
>>> c = pickle.loads(b)
>>> c.hexdigest()
'6d6332a54574aeb35dcde5cf6a8774f938a65bec'
>>> c.data
'lixo'

Related

Python multicore CSV short program, advice/help needed

I'm a hobby coder started with AHK, then some java and now I try to learn Python. I have searched and found some tips but I have yet not been able to implement it into my own code.
Hopefully someone here can help me, it's a very short program.
I'm using .txt csv database with ";" as a separator.
DATABASE EXAMPLE:
Which color is normally a cat?;Black
How tall was the longest man on earth?;272 cm
Is the earth round?;Yes
The database now consists of 20.000 lines which makes the program "to slow", only using 25% CPU (1 core).
If I can make it use all 4 cores (100%) I guess it would perform the task alot faster. The task is basically to compare the CLIPBOARD with the database and if there is a match, it should give me an answer as a return. Perhaps also I can separate the database into 4 pieces?
The code right now looks like this! Not more then 65 lines and its doing its job (but to slow). Advice on how I can make this process into multi core needed.
import time
import pyperclip as pp
import pandas as pd
import pymsgbox as pmb
from fuzzywuzzy import fuzz
import numpy
ratio_threshold = 90
fall_back_time = 1
db_file_path = 'database.txt'
db_separator = ';'
db_encoding = 'latin-1'
def load_db():
    """Load the question/answer database, retrying until it can be read.

    Returns:
        The CSV at ``db_file_path`` as a DataFrame with duplicate rows removed.
    """
    while True:
        try:
            # Read and create database
            db = pd.read_csv(db_file_path, sep=db_separator, encoding=db_encoding)
            return db.drop_duplicates()
        except Exception:
            # `except Exception` instead of a bare `except`, so Ctrl-C and
            # SystemExit can still break out of the retry loop.
            print("Error in load_db(). Will sleep for %i seconds..." % fall_back_time)
            time.sleep(fall_back_time)
def top_answers(db, question):
    """Rank the database rows by fuzzy similarity to *question*.

    Writes the score into a 'ratio' column on *db* itself, then returns the
    rows scoring at least ``ratio_threshold``, best match first.
    """
    def score(candidate):
        return fuzz.ratio(candidate, question)

    db['ratio'] = db['question'].apply(score)
    ranked = db.sort_values(by='ratio', ascending=False)
    good_enough = ranked['ratio'] >= ratio_threshold
    return ranked[good_enough]
def write_txt(top):
    """Write the 'answer' column of *top* to svar.txt and clear the clipboard.

    Answers are written one per line.
    """
    # Read the column directly instead of a row-wise apply(); this also
    # copes with an empty frame.
    answers = ["%s" % answer for answer in top['answer']]
    # The with-block closes the file even if the write fails.
    with open("svar.txt", "w") as out:
        out.write('\n'.join(answers))
    pp.copy("")
def main():
    """Poll the clipboard and write matching answers until an error occurs."""
    try:
        db = load_db()
        while True:
            # Get contents of clipboard
            question = pp.paste()
            # Rank answer
            top = top_answers(db, question)
            # If answer was found, show results
            if len(top) > 0:
                write_txt(top)
            time.sleep(fall_back_time)
    except Exception:
        # Not a bare `except`: Ctrl-C must still be able to stop the program.
        print("Error in main(). Will sleep for %i seconds..." % fall_back_time)
        time.sleep(fall_back_time)


if __name__ == '__main__':
    main()
If you could divide the db into four equally large you could process them in parallel like this:
import time
import pyperclip as pp
import pandas as pd
import pymsgbox as pmb
from fuzzywuzzy import fuzz
import numpy
import threading
ratio_threshold = 90
fall_back_time = 1
db_file_path = 'database.txt'
db_separator = ';'
db_encoding = 'latin-1'
def worker(thread_id, question):
    """Match *question* against one database part and write the answers.

    The part is read from ``db_file_path + str(thread_id)`` and matching
    answers are written to ``svar<thread_id>.txt``, one per line.
    """
    thread_id = str(thread_id)
    db = pd.read_csv(db_file_path + thread_id, sep=db_separator, encoding=db_encoding)
    db = db.drop_duplicates()
    db['ratio'] = db['question'].apply(lambda q: fuzz.ratio(q, question))
    db_sorted = db.sort_values(by='ratio', ascending=False)
    top = db_sorted[db_sorted['ratio'] >= ratio_threshold]
    answers = ["%s" % answer for answer in top['answer']]
    # The with-block closes the file even if the write fails.
    with open("svar" + thread_id + ".txt", "w") as out:
        out.write('\n'.join(answers))
    pp.copy("")
def main():
    """Match the clipboard text against the four database parts in parallel.

    NOTE(review): threads share CPython's GIL, so CPU-bound scoring is
    unlikely to run on several cores at once - confirm with a measurement,
    or use processes instead.
    """
    question = pp.paste()
    # Four parts -> four workers (range(1, 4) only produced three).
    threads = [threading.Thread(target=worker, args=(i, question))
               for i in range(1, 5)]
    # Start every worker before joining any of them; joining inside the
    # start loop would run them one after another.
    for t in threads:
        t.start()
    for t in threads:
        t.join()


if __name__ == '__main__':
    main()
The solution with multiprocessing:
import time
import pyperclip as pp
import pandas as pd
#import pymsgbox as pmb
from fuzzywuzzy import fuzz
import numpy as np
# pathos uses better pickle to tranfer more complicated objects
from pathos.multiprocessing import Pool
from functools import reduce
import sys
import os
from contextlib import closing
ratio_threshold = 70
fall_back_time = 1
db_file_path = 'database.txt'
db_separator = ';'
db_encoding = 'latin-1'
chunked_db = []
NUM_PROCESSES = os.cpu_count()
def load_db():
    """Load the database and split it into one chunk per worker process.

    Returns:
        A list of DataFrames with columns ``question`` and ``answer``.  The
        chunks are contiguous, near-equal in size and together cover every
        row.  At most ``NUM_PROCESSES`` chunks are produced, and never more
        chunks than rows.
    """
    while True:
        try:
            # Read and create database
            db = pd.read_csv(db_file_path, sep=db_separator, encoding=db_encoding)
            db.columns = ['question', 'answer']
            break
        except (IOError, OSError, ValueError):
            # Only retry on read/parse problems; anything else (including
            # Ctrl-C) propagates instead of looping forever.
            print("Error in load_db(). Will sleep for %i seconds..." % fall_back_time)
            time.sleep(fall_back_time)

    # np.split(db, [NUM_PROCESSES]) cuts once, at row NUM_PROCESSES, giving
    # two unequal pieces.  Slice into near-equal chunks explicitly instead.
    # NUM_PROCESSES may be None when the CPU count cannot be determined.
    n_chunks = max(1, min(NUM_PROCESSES or 1, len(db)))
    base, extra = divmod(len(db), n_chunks)
    chunks = []
    start = 0
    for i in range(n_chunks):
        # The first `extra` chunks take one additional row each.
        stop = start + base + (1 if i < extra else 0)
        # Copy so that adding columns to a chunk never touches `db`.
        chunks.append(db.iloc[start:stop].copy())
        start = stop
    return chunks
def top_answers_multiprocessed(question, chunked_db):
    """Score *question* against every chunk in parallel and combine the hits.

    Args:
        question: text to match against the 'question' column.
        chunked_db: list of DataFrames as returned by load_db().

    Returns:
        One DataFrame holding the rows from all chunks whose ratio is at
        least ``ratio_threshold``, best match first.

    Notes carried over from the original author: the pool is recreated on
    every call, which is not efficient; long-lived worker processes fed
    through a queue would avoid that but are harder to implement.  Transfer
    between processes goes through pickling, so a small database may not
    benefit from multiprocessing at all.
    """
    def top_answers(idx):
        # `question` comes from the enclosing scope; only the chunk index is
        # sent to the worker.
        chunk = chunked_db[idx]
        scored = chunk.assign(
            ratio=chunk['question'].apply(lambda q: fuzz.ratio(q, question)))
        return scored[scored['ratio'] >= ratio_threshold]

    with closing(Pool(processes=NUM_PROCESSES)) as pool:
        res = list(pool.imap_unordered(top_answers, range(len(chunked_db))))

    if not res:
        return pd.DataFrame(columns=['question', 'answer', 'ratio'])
    # The per-chunk results are disjoint row sets, so they are stacked.
    # pd.merge(on='ratio') would join them column-wise instead, and yield
    # nothing as soon as one chunk has no match.
    return pd.concat(res).sort_values(by='ratio', ascending=False)
def write_txt(top):
    """Write the 'answer' column of *top* to svar.txt and clear the clipboard.

    Answers are written one per line.
    """
    # Read the column directly instead of a row-wise apply(); this also
    # copes with an empty frame.
    answers = ["%s" % answer for answer in top['answer']]
    # The with-block closes the file even if the write fails.
    with open("svar.txt", "w") as out:
        out.write('\n'.join(answers))
    pp.copy("")
def mainfunc():
    """Watch the clipboard and print the best answers for each new text."""
    global chunked_db
    chunked_db = load_db()
    print('db loaded')
    last_clip = ""
    while True:
        # Get contents of clipboard
        try:
            new_clip = pp.paste()
        except Exception:
            # Back off instead of spinning at full speed on a failing clipboard.
            time.sleep(fall_back_time)
            continue
        if new_clip == last_clip or len(new_clip) == 0:
            # Nothing new: sleep between polls rather than busy-waiting.
            time.sleep(fall_back_time)
            continue
        print(new_clip)
        last_clip = new_clip
        question = new_clip.strip()
        # Rank answer
        top = top_answers_multiprocessed(question, chunked_db)
        # If answer was found, show results
        if len(top) > 0:
            #write_txt(top)
            print(top)


if __name__ == '__main__':
    mainfunc()

Return Events from DispatchWithEvents using third party COM

My code prints out what I need from the print statement in Events. But, I have no idea how to return the data because of the way the class is instantiated. Further, the print statement only works if pythoncom.PumpWaitingMessages() is included, but it doesn't return the data that's printed, or anything.
I'd like to be able to use what's printed as a return value to be accessed by other functions.
(If worse comes to worse, I could capture stdout (which is a last resort).)
Code:
# Standard Lib
import time
# Third Party
from win32com.client import DispatchWithEvents
import pythoncom
# Local Lib
import scan_var
class Events(object):
    """Event sink class handed to DispatchWithEvents."""

    def OnBarcodeEvent(self, eventType=pythoncom.Empty, pscanData=pythoncom.Empty):
        # Show the scan data delivered with the event and hand it back.
        print(pscanData)
        return pscanData


zebra = DispatchWithEvents("CoreScanner.CoreScanner", Events)

# open api
open_status = zebra.Open(0, [1], 1)
print("Open status: {}".format(open_status))

# get scanners
get_scanners = zebra.GetScanners(0, [1])
print("get_scanners: {}".format(get_scanners))

# Register for events
register = zebra.ExecCommand(1001, scan_var.register_for_events)
print("register: {}".format(register))

# PEWPEWPEW (pull trigger)
fire_result = zebra.ExecCommand(2011, scan_var.pull_trigger)
print("PEWPEWPEW {}".format(fire_result))

time.sleep(5)
# Pump pending COM messages once per second, forever.
while True:
    time.sleep(1)
    pythoncom.PumpWaitingMessages()
Output:
Open status: 0
get_scanners: (1, (1,),504</VID> <PID>6400</PID> ...
register: (u'', 0)
PEWPEWPEW (u'', 0)
<?xml version="1.0" encoding="UTF-8"?>
<outArgs>
<scannerID>1</scannerID>
<arg-xml>
<scandata>
<modelnumber>new_hotness </modelnumber>
<serialnumber>1522501a0501156 </serialnumber>
<GUID>2A4BE99CFCEFD047837ADF0082aD51AD5</GUID>
<datatype>27</datatype>
<datalabel>0x39 0x32 0x304 ...</datalabel>
<rawdata>0x22 0x03 0x00 ... </rawdata>
</scandata>
</arg-xml>
</outArgs>
My solution is as follows. This may be ugly and wrong but it got me what I need. If anyone has a better way I'm happy to edit.
class Events(object):
    """Event sink that remembers the most recent barcode scan."""

    # Class-level default, so get_barcode() returns None instead of raising
    # AttributeError when no barcode event has arrived yet.
    pscanData = None

    def get_barcode(self):
        """Return the data of the last barcode event, or None if there was none."""
        return self.pscanData

    def OnBarcodeEvent(self, eventType=1, pscanData=pythoncom.Empty):
        # Keep the scan data on the instance so it can be read after the
        # message pump has run.
        self.pscanData = pscanData
        print(self.pscanData)
def save_serial_to_cache():
    """Open the scanner API, trigger a scan and return the captured scan data.

    COM messages are pumped once per second for five seconds so that the
    barcode event can be delivered before the data is read back.
    """
    zebra = DispatchWithEvents("CoreScanner.CoreScanner", Events)

    # open api
    open_status = zebra.Open(0, [1], 1)
    print("Open status: {}".format(open_status))

    # get scanners
    get_scanners = zebra.GetScanners(0, [1])
    print("get_scanners: {}".format(get_scanners))

    # Register for events
    register = zebra.ExecCommand(1001, scan_var.register_for_events)
    print("register: {}".format(register))

    # PEWPEWPEW (pull trigger)
    fire_result = zebra.ExecCommand(2011, scan_var.pull_trigger)
    print("PEWPEWPEW {}".format(fire_result))

    for _ in xrange(5):
        time.sleep(1)
        pythoncom.PumpWaitingMessages()

    # Reach the event-sink instance through its bound method and read the
    # data that OnBarcodeEvent stored on it.
    return zebra._obj_.get_barcode.__self__.pscanData

Why is the python zipfile module faster than C?

I am writing a module that needs to be able to deal with a large number of zip file pretty fast. As such, I was going to use something implemented in C rather than Python (from which I'll be calling the extractor). To try and test which method would be fastest, I wrote a test script comparing linux's 'unzip' command vs the czipfile python module (wrapper around a c zip extractor). As a control, I used the native python zipfile module.
The script creates a zipfile that's around 100MB out of 100 ~1MB files. It looks at 3 scenarios. A) The files are all just random bytestrings. B)The files are just random hex characters C)The files are uniform random sentences with line breaks.
In all cases, the performance of zipfile (implemented in python) was on par with or significantly better than the two extractors implemented in c.
Any ideas why this could be happening? The script is attached. Requires czipfile and the 'unzip' command available in the shell.
from datetime import datetime
import zipfile
import czipfile
import os, binascii, random
class ZipTestError(Exception):
    """Raised by ZipTest when it is asked for an unknown test-file type."""
class ZipTest:
    """Benchmark zip extraction with zipfile, czipfile and the ``unzip`` command.

    For each content type a zip archive of 100 generated files is built in
    the current directory, extracted once per method in ``procs``, timed,
    and then removed again.  Instantiating the class runs the whole benchmark.
    """
    procs = ['zipfile', 'czipfile', 'os']
    type_map = {'r':'Random', 'h':'Random Hex', 's':'Sentences'}
    # three types. t=='r' is random noise files directly out of urandom.
    # t=='h' is urandom noise converted to ascii characters. t=='s' are
    # randomly constructed sentences with line breaks.

    def __init__(self):
        print("Testing Random Byte Files:\n")
        self.test('r')
        self.test('h')
        self.test('s')

    @staticmethod
    def rand_name():
        """Return a random 20-character hexadecimal file name."""
        return binascii.b2a_hex(os.urandom(10))

    @staticmethod
    def _elapsed_ms(start):
        """Return the whole milliseconds elapsed since the datetime *start*."""
        total = datetime.now() - start
        return (total.microseconds + total.seconds * 1000000) // 1000

    def make_file(self, t):
        """Create one test file of type *t* and return its name.

        Raises:
            ZipTestError: if *t* is not one of 'r', 'h' or 's'.
        """
        # Validate before creating anything, so an invalid type does not
        # leave an empty file (and an open handle) behind.
        if t not in self.type_map:
            raise ZipTestError('Invalid Type')
        f_name = self.rand_name()
        # Binary mode: the random payload is raw bytes.
        with open(f_name, 'wb') as f:
            if t == 'r':
                f.write(os.urandom(1048576))
            elif t == 'h':
                f.write(binascii.b2a_hex(os.urandom(1048576)))
            else:
                ops = ['dog', 'cat', 'rat']
                ops2 = ['meat', 'wood', 'fish']
                for _ in range(76260):
                    # random.choice picks uniformly; the previous
                    # int(random.random()*10) % 3 favoured the first word.
                    sentence = "The {0} eats {1}\n".format(
                        random.choice(ops), random.choice(ops2))
                    f.write(sentence.encode('ascii'))
        return f_name

    # create a ~100MB zip file to test extraction on.
    def create_zip_test(self, t):
        """Generate 100 files of type *t* and pack them into a new archive."""
        self.file_names = []
        self.output_names = []
        for _ in range(100):
            self.file_names.append(self.make_file(t))
        self.zip_name = self.rand_name()
        output = zipfile.ZipFile(self.zip_name, 'w', zipfile.ZIP_DEFLATED)
        try:
            for f in self.file_names:
                output.write(f)
        finally:
            output.close()

    def clean_up(self, rem_zip = False):
        """Delete generated and extracted files; also the archive if *rem_zip*."""
        for f in self.file_names:
            os.remove(f)
        self.file_names = []
        for f in self.output_names:
            os.remove(f)
        self.output_names = []
        if rem_zip:
            if getattr(self, 'zip_name', False):
                os.remove(self.zip_name)
                self.zip_name = False

    def display_res(self, res, t):
        """Print the timings in *res* (method name -> milliseconds) for type *t*."""
        print("\n{0} results:\n".format(self.type_map[t]))
        for p in self.procs:
            print("\n{0} = {1} milliseconds".format(p, str(res[p])))

    def test(self, t):
        """Run the complete create / extract / report / clean-up cycle for *t*."""
        self.create_zip_test(t)
        res = self.unzip()
        self.display_res(res, t)
        self.clean_up(rem_zip = True)

    def unzip(self):
        """Time every extraction method and return {method name: milliseconds}."""
        res = dict()
        for p in self.procs:
            # Remove the previous method's output so every run starts clean.
            self.clean_up()
            res[p] = getattr(self, "unzip_with_{0}".format(p))()
        return res

    def unzip_with_zipfile(self):
        return self.unzip_with_python(zipfile)

    def unzip_with_czipfile(self):
        return self.unzip_with_python(czipfile)

    def unzip_with_python(self, mod):
        """Extract the archive with *mod*.ZipFile; return elapsed milliseconds."""
        # Zip archives are binary; the with-block also closes the handle,
        # which used to be leaked.
        with open(self.zip_name, 'rb') as f:
            zf = mod.ZipFile(f)
            start = datetime.now()
            for name in zf.namelist():
                zf.extract(name, './')
                self.output_names.append(name)
            return self._elapsed_ms(start)

    def unzip_with_os(self):
        """Extract with the external ``unzip`` command; return elapsed milliseconds."""
        with open(self.zip_name, 'rb') as f:
            start = datetime.now()
            # The entry names are only needed for the later clean-up.
            zf = zipfile.ZipFile(f)
            self.output_names.extend(zf.namelist())
            os.system("unzip -qq {0}".format(f.name))
            return self._elapsed_ms(start)


if __name__ == '__main__':
    ZipTest()
As was pointed out above, the decryption is done in python, not the decompression. So zipfile is just using a c implementation like the other two.
Even if C is generally faster than interpreted languages, assuming the algorithm is the same, different buffering strategies can make a difference. Here's some evidence:
I made a couple of changes to your script. The diff is below.
I made the stopwatch start just before os.system. This change is not noticeable, since reading off entries from the Central Directory is quick. So I saved the zip files and measured unzip time with the time shell builtin, outside Python. The result shows that the overhead of firing up new processes doesn't matter so much.
A more interesting change is the addition of libarchive. The results I obtained are like so (milliseconds):
Random Hex Sentences
zipfile 368 1909 604
czipfile 241 1600 2313
os 707 2225 784
shell-measured 797 2272 737
libarchive 248 1513 451
EXTRACTION METHOD
Note that results vary by some milliseconds every time. The shell measures real, user, and sys time (see What do 'real', 'user' and 'sys' mean in the output of time(1)?). The figures above reflect real time, for consistency with other measurements.
A better analysis of what system calls unzip issues can be achieved by strace -c -w. It shows a spike of reads for Hex:
Random Hex Sentences
read 805 14597 12816
write 2600 3200 1600
SYSTEM CALLS ISSUED BY unzip
Now for the diff (it assumes the original script is named ziptest.py in the same directory where you run patch < _diff_, see patch, diff)
--- ziptest.py.orig 2017-05-25 10:36:03.106994889 +0200
+++ ziptest.py 2017-05-25 11:30:42.032598259 +0200
@@ -2,6 +2,7 @@
import zipfile
import czipfile
import os, binascii, random
+import libarchive.public
class ZipTestError(Exception):
pass
@@ -10,7 +11,7 @@
class ZipTest:
- procs = ['zipfile', 'czipfile', 'os']
+ procs = ['zipfile', 'czipfile', 'os', 'libarchive']
type_map = {'r':'Random', 'h':'Random Hex', 's':'Sentences'}
# three types. t=='r' is random noise files directly out of urandom. t=='h' is urandom noise converted to ascii characters. t=='s' are randomly constructed sentences with line breaks.
@@ -119,10 +120,10 @@
def unzip_with_os(self):
f = open(self.zip_name)
- start = datetime.now()
zf = zipfile.ZipFile(f)
for name in zf.namelist():
self.output_names.append(name)
+ start = datetime.now()
os.system("unzip -qq {0}".format(f.name))
end = datetime.now()
total = end-start
@@ -130,7 +131,15 @@
ms += (total.seconds) * 1000000
return ms /1000
-
+ def unzip_with_libarchive(self):
+ start = datetime.now()
+ for entry in libarchive.public.file_pour(self.zip_name):
+ self.output_names.append(str(entry))
+ end = datetime.now()
+ total = end-start
+ ms = total.microseconds
+ ms += (total.seconds) * 1000000
+ return ms /1000

Python/Multiprocessing : Processes does not seem to start

I have a function which reads a binary file and converts each byte into a corresponding sequence of characters. For example, 0x05 becomes 'AACC', 0x2A becomes 'AGGG' etc...The function which reads the file and converts the bytes is currently a linear one and since the files to convert are anywhere between 25kb and 2Mb, this can take quite a while.
Therefore, I'm trying to use multiprocessing to divide the task and hopefully improve speed. However, I just can't get it to work. Below is the linear function, which works, albeit slowly;
def fileToRNAString(_file):
    """Convert the binary file *_file* into a list of RNA blocks.

    The file is read in 2048-byte blocks; each block is passed to
    decSequenceToRNA() together with its index and the result list.

    Returns:
        The list filled by decSequenceToRNA(), or an empty list when *_file*
        is empty/None or does not name an existing file.
    """
    # Created up front, so the error path below has something to return.
    rnaSequences = []
    if not (_file and os.path.isfile(_file)):
        # %-formatting instead of "+" so that a None argument is logged
        # rather than raising TypeError.
        printAndLog("-", "Could not find the specified file. Please verify that the file exists:%s" % _file)
        return rnaSequences

    blockSize = 2048
    printAndLog("!", "Converting %s into RNA string (%d bytes/block)" % (_file, blockSize))
    with open(_file, "rb") as hFile:
        # iter(callable, sentinel) keeps reading until an empty block is returned.
        for blockCount, buf in enumerate(iter(lambda: hFile.read(blockSize), b"")):
            decSequenceToRNA(blockCount, buf, rnaSequences)
    return rnaSequences
Note: The function 'decSequenceToRNA' takes the buffer read and converts each byte to the required string. Upon execution, the function returns a tuple which contain the block number and the string, e.g. (1, 'ACCGTAGATTA...') and at the end, I have an array of these tuples available.
I've tried to convert the function to use the multiprocessing of Python;
def fileToRNAString(_file):
    """Convert *_file* into RNA blocks, using one child process per block.

    Each 2048-byte block is handed to decSequenceToRNA() in its own process.
    The results are collected through a multiprocessing.Manager list: a
    plain list is copied into every child, so appends made there never
    reach the parent.

    Returns:
        A list of the items appended by decSequenceToRNA(), sorted (i.e. by
        block index), or an empty list if *_file* is missing.
    """
    from multiprocessing import Manager

    if not (_file and os.path.isfile(_file)):
        printAndLog("-", "Could not find the specified file. Please verify that the file exists:%s" % _file)
        return []

    blockSize = 2048
    printAndLog("!", "Converting %s into RNA string (%d bytes/block)" % (_file, blockSize))

    manager = Manager()
    try:
        shared = manager.list()
        workers = []
        # NOTE(review): this starts one process per block, all at once; for
        # large files a fixed-size multiprocessing.Pool is the better choice.
        with open(_file, "rb") as hFile:
            for blockCount, buf in enumerate(iter(lambda: hFile.read(blockSize), b"")):
                p = Process(target=decSequenceToRNA, args=(blockCount, buf, shared))
                p.start()
                workers.append(p)
        for p in workers:
            p.join()
        # Children finish in arbitrary order; sort to restore block order.
        # shared[:] copies the proxy's contents into a real list.
        rnaSequences = sorted(shared[:])
    finally:
        manager.shutdown()
    return rnaSequences
However, no processes seem to even start: when this function is run, an empty array is returned. Any message printed to the console in 'decSequenceToRNA' is not displayed;
>>>fileToRNAString(testfile)
[!] Converting /root/src/amino56/M1H2.bin into RNA string (2048 bytes/block).
Unlike this question here, I'm running Linux shiva 3.14-kali1-amd64 #1 SMP Debian 3.14.5-1kali1 (2014-06-07) x86_64 GNU/Linux and using PyCrust to test the functions on Python Version: 2.7.3. I'm using the following packages:
import os
import re
import sys
import urllib2
import requests
import logging
import hashlib
import argparse
import tempfile
import shutil
import feedparser
from multiprocessing import Process
I'd like help to figure out why my code does not work, or if I'm missing something elsewhere to make the Process work. Also open to suggestions for improving the code. Below is 'decSequenceToRNA' for reference:
def decSequenceToRNA(_idxSeq, _byteSequence, _rnaSequences):
    """Convert one block of bytes to RNA and append it to *_rnaSequences*.

    Args:
        _idxSeq: index of the block within the file.
        _byteSequence: the block's raw bytes; each element is passed through
            ord() and then base10ToRNA().
        _rnaSequences: list that receives the tuple ``(_idxSeq, rnaSequence)``.
    """
    printAndLog("!", "Processing block %d (%d bytes)" % (_idxSeq, len(_byteSequence)))
    # join() builds the result in one pass; repeated "+" concatenation is
    # quadratic in the block size.
    rnaSequence = ''.join(base10ToRNA(ord(b)) for b in _byteSequence)
    printAndLog("+", "Block %d completed. RNA of %d nucleotides generated." % (_idxSeq, len(rnaSequence)))
    _rnaSequences.append((_idxSeq, rnaSequence))
decSequenceToRNA is running in its own process, which means it gets its own, separate copy of every data structure in the main process. That means that when you append to _rnaSequences in decSequenceToRNA, it has no effect on rnaSequences in the parent process. That would explain why an empty list is being returned.
You have two options to address this. First, is to create a list that can be shared between processes using multiprocessing.Manager. For example:
import multiprocessing
def f(shared_list):
    """Append 1 to the list that was passed in (executed in the child process)."""
    shared_list.append(1)


if __name__ == "__main__":
    # A plain list is copied into the child, so the append is lost.
    normal_list = []
    child = multiprocessing.Process(target=f, args=(normal_list,))
    child.start()
    child.join()
    print(normal_list)

    # A Manager list is a proxy, so the child's append reaches the parent.
    m = multiprocessing.Manager()
    shared_list = m.list()
    child = multiprocessing.Process(target=f, args=(shared_list,))
    child.start()
    child.join()
    print(shared_list)
Output:
[] # Normal list didn't work, the appended '1' didn't make it to the main process
[1] # multiprocessing.Manager() list works fine
Applying this to your code would just require replacing
rnaSequences = []
With
m = multiprocessing.Manager()
rnaSequences = m.list()
Alternatively, you could (and probably should) use a multiprocessing.Pool instead of creating individual Process for each chunk. I'm not sure how large hFile is or how big the chunks you're reading are, but if there are more than multiprocessing.cpu_count() chunks, you're going to hurt performance by spawning processes for every chunk. Using a Pool, you can keep your process count constant, and easily create your rnaSequence list:
def decSequenceToRNA(_idxSeq, _byteSequence):
    """Convert one block of bytes to RNA and return ``(_idxSeq, rnaSequence)``.

    Args:
        _idxSeq: index of the block within the file.
        _byteSequence: the block's raw bytes; each element is passed through
            ord() and then base10ToRNA().
    """
    printAndLog("!", "Processing block %d (%d bytes)" % (_idxSeq, len(_byteSequence)))
    # join() builds the result in one pass; repeated "+" concatenation is
    # quadratic in the block size.
    rnaSequence = ''.join(base10ToRNA(ord(b)) for b in _byteSequence)
    printAndLog("+", "Block %d completed. RNA of %d nucleotides generated." % (_idxSeq, len(rnaSequence)))
    return _idxSeq, rnaSequence
def fileToRNAString(_file):
    """Convert *_file* into RNA blocks using a pool of worker processes.

    Each 2048-byte block is submitted to decSequenceToRNA() through a
    multiprocessing.Pool.

    Returns:
        The workers' results in block order (one per block), or an empty
        list if *_file* is missing.
    """
    import multiprocessing

    rnaSequences = []
    if not (_file and os.path.isfile(_file)):
        printAndLog("-", "Could not find the specified file. Please verify that the file exists:%s" % _file)
        return rnaSequences

    blockSize = 2048
    printAndLog("!", "Converting %s into RNA string (%d bytes/block)" % (_file, blockSize))
    pool = multiprocessing.Pool()  # Creates a pool of cpu_count() processes
    try:
        with open(_file, "rb") as hFile:
            # apply_async takes the positional arguments as ONE tuple.
            results = [pool.apply_async(decSequenceToRNA, (blockCount, buf))
                       for blockCount, buf in enumerate(iter(lambda: hFile.read(blockSize), b""))]
        # Collect in submission order, so the list is ordered by block index.
        rnaSequences = [r.get() for r in results]
    finally:
        pool.close()
        pool.join()
    return rnaSequences
Note that we no longer pass the rnaSequences list to the child. Instead, we just return the result we would have appended back to the parent (which we can't do with Process), and build the list there.
Try writing this (comma at the end of the parameter list)
p = Process(target=decSequenceToRNA, args=(blockCount, buf, rnaSequences,))

Unpickling mid-stream (python)

I am writing scripts to process (very large) files by repeatedly unpickling objects until EOF. I would like to partition the file and have separate processes (in the cloud) unpickle and process separate parts.
However my partitioner is not intelligent, it does not know about the boundaries between pickled objects in the file (since those boundaries depend on the object types being pickled, etc.).
Is there a way to scan a file for a "start pickled object" sentinel? The naive way would be to attempt unpickling at successive byte offsets until an object is successfully unpickled, but that yields unexpected errors. It seems that for certain combinations of input, the unpickler falls out of sync and returns nothing for the rest of the file (see code below).
import cPickle
import os
def stream_unpickle(file_obj):
    """Yield every object that can be unpickled from file_obj.

    Loading starts at the current file position. When a load fails with one
    of the "corrupt data" errors below, the file is repositioned one byte
    past the offset where that attempt started and loading is retried.
    EOFError or KeyboardInterrupt ends the generator.
    """
    resync_errors = (cPickle.UnpicklingError, ValueError, KeyError,
                     TypeError, ImportError)
    while True:
        # Remember where this attempt began so a failure can retry one byte on.
        offset = file_obj.tell()
        try:
            yield cPickle.load(file_obj)
        except (EOFError, KeyboardInterrupt):
            return
        except resync_errors:
            file_obj.seek(offset + 1, os.SEEK_SET)
if __name__ == '__main__':
import random
from StringIO import StringIO
# create some data
sio = StringIO()
[cPickle.dump(random.random(), sio, cPickle.HIGHEST_PROTOCOL) for _ in xrange(1000)]
sio.flush()
# read from subsequent offsets and find discontinuous jumps in object count
size = sio.tell()
last_count = None
for step in xrange(size):
sio.seek(step, os.SEEK_SET)
count = sum(1 for _ in stream_unpickle(file_obj))
if last_count is None or count == last_count - 1:
last_count = count
elif count != last_count:
# if successful, these should never print (but they do...)
print '%d elements read from byte %d' % (count, step)
print '(%d elements read from byte %d)' % (last_count, step-1)
last_count = count
The pickletools module has a dis function that shows the opcodes. It shows that there is a STOP opcode that you may be able to scan for:
>>> import pickle, pickletools, StringIO
>>> s = StringIO.StringIO()
>>> pickle.dump('abc', s)
>>> p = s.getvalue()
>>> pickletools.dis(p)
0: S STRING 'abc'
7: p PUT 0
10: . STOP
highest protocol among opcodes = 0
Note, using the STOP opcode is a bit tricky because the codes are of variable length, but it may serve as a useful hint about where the cutoffs are.
If you control the pickling step on the other end, then you can improve the situation by adding your own unambiguous alternative separator:
>>> sep = '\xDE\xAD\xBE\xEF'
>>> s = StringIO.StringIO()
>>> pickle.dump('abc', s)
>>> s.write(sep)
>>> pickle.dump([10, 20], s)
>>> s.write(sep)
>>> pickle.dump('def', s)
>>> s.write(sep)
>>> pickle.dump([30, 40], s)
>>> p = s.getvalue()
Before unpacking, split into separate pickles using the known separator:
>>> for pick in p.split(sep):
print pickle.loads(pick)
abc
[10, 20]
def
[30, 40]
In the pickled file, some opcodes have an argument -- a data value that follows the opcode. The data values vary in length, and can contain bytes identical to opcodes. Therefore, if you start reading the file from an arbitrary position, you have no way of knowing if you are looking at an opcode or in the middle of an argument. You must read the file from the beginning and parse the opcodes.
I cooked up this function that skips one pickle from a file, i.e. reads it and parses opcodes, but does not construct the objects. It seems slightly faster than cPickle.loads on some files I have. You could rewrite this in C for more speed. (after testing this properly)
Then, you can make one pass over the whole file to get the seek position of each pickle.
from pickletools import code2op, UP_TO_NEWLINE, TAKEN_FROM_ARGUMENT1, TAKEN_FROM_ARGUMENT4
from marshal import loads as mloads
def skip_pickle(f):
    """Skip one pickle from file.

    'f' is a file-like object containing the pickle, positioned at the
    first opcode. Opcodes and their arguments are read and discarded, without
    building any objects, up to and including the STOP opcode ('.').

    Raises EOFError if the file ends where an opcode is expected.
    """
    while True:
        code = f.read(1)
        if not code:
            raise EOFError
        arg = code2op[code].arg
        if arg is not None:
            n = arg.n
            if n > 0:
                # Fixed-size argument.
                f.read(n)
            elif n == UP_TO_NEWLINE:
                f.readline()
                # GLOBAL and INST carry two newline-terminated strings
                # (module, then name); consume the second one too so it is
                # not mistaken for opcodes.
                if arg.name == 'stringnl_noescape_pair':
                    f.readline()
            elif n == TAKEN_FROM_ARGUMENT1:
                # One-byte length prefix followed by that many bytes.
                f.read(ord(f.read(1)))
            elif n == TAKEN_FROM_ARGUMENT4:
                # Four-byte little-endian length prefix; marshal's 'i' type
                # code decodes it as a signed 32-bit integer.
                f.read(mloads('i' + f.read(4)))
        if code == '.':
            break
Sorry to answer my own question, and thanks to @RaymondHettinger for the idea of adding sentinels.
Here's what worked for me. I created readers and writers that use a sentinel '#S' followed by a data block length at the beginning of each 'record'. The writer has to take care to find any occurrences of '#' in the data being written and double them (into '##'). The reader then uses a look-behind regex to find sentinels, distinct from any matching values that might be in the original stream, and also verify the number of bytes between this sentinel and the subsequent one.
RecordWriter is a context manager (so multiple calls to write() can be encapsulated into a single record if needed). RecordReader is a generator.
Not sure how this is on performance. Any faster/elegant-er solutions are welcome.
import re
import cPickle
from functools import partial
from cStringIO import StringIO
# Marker written in front of every record.
SENTINEL = '#S'

# Pattern sources:
#   sentinel_pattern -- '#S' that is NOT preceded by another '#'
#                       (negative look-behind), i.e. matches #S but not ##S
#   write_pattern    -- a single '#' in outgoing data
#   read_pattern     -- a doubled '##' in incoming data
sentinel_pattern = '(?<!#)#S'
write_pattern = '#'
read_pattern = '##'

# Compiled once at import time.
sentinel_re = re.compile(sentinel_pattern)
write_re = re.compile(write_pattern)
read_re = re.compile(read_pattern)

# find_sentinel(text) -> match object for the first sentinel, or None
find_sentinel = sentinel_re.search
# fix_write(text) -> text with every '#' replaced by '##'
fix_write = partial(write_re.sub, '##')
# fix_read(text) -> text with every '##' replaced by '#'
fix_read = partial(read_re.sub, '#')
class RecordWriter(object):
    """Context manager that writes one sentinel-delimited record to a stream.

    All data passed to write() inside a single ``with`` block is buffered and
    emitted on exit as one record:

        SENTINEL, pickled length of the original data, the data itself

    Both the pickled length and the data have every '#' doubled, so that a
    literal '#S' in either of them is not taken for a sentinel. Nothing is
    written when no data was buffered.
    """

    def __init__(self, stream):
        # stream: file-like object with a write() method receiving the records
        self._stream = stream
        self._write_buffer = None

    def __enter__(self):
        self._write_buffer = StringIO()
        return self

    def __exit__(self, et, ex, tb):
        payload = self._write_buffer.getvalue()
        self._write_buffer = None
        if payload:
            # byte length of user's original (unescaped) data
            header = cPickle.dumps(len(payload), cPickle.HIGHEST_PROTOCOL)
            self._stream.write(SENTINEL)  # start
            # The binary header may itself contain '#' bytes; escape it like
            # the payload, since the reader unescapes the whole block.
            self._stream.write(fix_write(header))
            self._stream.write(fix_write(payload))
        # Never suppress an exception raised inside the with block.
        return False

    def write(self, data):
        """Buffer data for the current record.

        Raises ValueError when called outside a ``with`` block.
        """
        if self._write_buffer is None:
            raise ValueError("Must use RecordWriter as a context manager")
        self._write_buffer.write(data)
class BadBlock(Exception):
    """Raised when a candidate record block fails validation."""


def verify_length(block):
    """Validate one record block and return its data.

    block -- a pickled length immediately followed by the record data.

    Returns the data that follows the pickled length.
    Raises BadBlock when the length cannot be unpickled (including an empty
    or truncated block) or when it does not equal the size of the data.
    """
    fobj = StringIO(block)
    try:
        # NOTE: unpickling can execute arbitrary code; only use on streams
        # from a trusted writer.
        stated_length = cPickle.load(fobj)
    except (EOFError, ValueError, IndexError, KeyError, TypeError,
            ImportError, AttributeError, cPickle.UnpicklingError):
        # Garbage or truncated header: report it as a bad block rather than
        # letting an unpickling error escape to the caller.
        raise BadBlock
    data = fobj.read()
    if len(data) != stated_length:
        raise BadBlock
    return data
def RecordReader(stream):
    """Generate the records found in stream, starting at its current position.

    Everything before the first sentinel is discarded, so reading may begin
    in the middle of a record. The text between two sentinels (or between the
    last sentinel and end of file) is unescaped with fix_read and checked
    with verify_length; blocks that fail the check are skipped silently.

    stream -- file-like object with a read() method.
    """
    accum = StringIO()          # text of the record currently being collected
    seen_sentinel = False       # False until the first sentinel is found
    data = ''                   # text read from the stream but not yet scanned

    while True:
        m = find_sentinel(data)
        if m:
            if seen_sentinel:
                # The sentinel ends the record collected so far.
                accum.write(data[:m.start()])
                try:
                    record = verify_length(fix_read(accum.getvalue()))
                except BadBlock:
                    pass
                else:
                    yield record
                accum = StringIO()
            else:
                seen_sentinel = True
            data = data[m.end():]  # toss
            continue

        chunk = stream.read(80)
        if not chunk:
            # End of stream: whatever is left belongs to the last record.
            if seen_sentinel:
                accum.write(data)
            if accum.tell():
                try:
                    record = verify_length(fix_read(accum.getvalue()))
                except BadBlock:
                    pass
                else:
                    yield record
            return

        # A trailing run of '#' cannot be classified yet: the next chunk may
        # complete a sentinel ('#' + 'S') or an escape pair. Keep that run in
        # front of the new chunk instead of committing it to the record.
        settled = data.rstrip('#')
        if seen_sentinel:
            accum.write(settled)
        data = data[len(settled):] + chunk
if __name__ == '__main__':
import random
stream = StringIO()
data = [str(random.random()) for _ in xrange(3)]
# test with a string containing sentinel and almost-sentinel
data.append('abc12#jeoht38#SoSooihetS#')
count = len(data)
for i in data:
with RecordWriter(stream) as r:
r.write(i)
size = stream.tell()
start_pos = random.random() * size
stream.seek(start_pos, os.SEEK_SET)
read_data = [s for s in RecordReader(stream)]
print 'Original data: ', data
print 'After seeking to %d, RecordReader returned: %s' % (start_pos, read_data)

Categories

Resources