I've written a subclass of file that a) provides methods to conveniently lock it (using fcntl, so it only supports unix, which is however OK for me atm) and b) when reading or writing asserts that the file is appropriately locked.
Now I'm not an expert at such stuff (I've just read one paper [de] about it) and would appreciate some feedback: Is it secure, are there race conditions, are there other things that could be done better … Here is the code:
from fcntl import flock, LOCK_EX, LOCK_SH, LOCK_UN, LOCK_NB
class LockedFile(file):
"""
A wrapper around `file` providing locking. Requires a shared lock to read
and a exclusive lock to write.
Main differences:
* Additional methods: lock_ex, lock_sh, unlock
* Refuse to read when not locked, refuse to write when not locked
exclusivly.
* mode cannot be `w` since then the file would be truncated before
it could be locked.
You have to lock the file yourself, it won't be done for you implicitly.
Only you know what lock you need.
Example usage::
def get_config():
f = LockedFile(CONFIG_FILENAME, 'r')
f.lock_sh()
config = parse_ini(f.read())
f.close()
def set_config(key, value):
f = LockedFile(CONFIG_FILENAME, 'r+')
f.lock_ex()
config = parse_ini(f.read())
config[key] = value
f.truncate()
f.write(make_ini(config))
f.close()
"""
def __init__(self, name, mode='r', *args, **kwargs):
if 'w' in mode:
raise ValueError('Cannot open file in `w` mode')
super(LockedFile, self).__init__(name, mode, *args, **kwargs)
self.locked = None
def lock_sh(self, **kwargs):
"""
Acquire a shared lock on the file. If the file is already locked
exclusively, do nothing.
:returns: Lock status from before the call (one of 'sh', 'ex', None).
:param nonblocking: Don't wait for the lock to be available.
"""
if self.locked == 'ex':
return # would implicitly remove the exclusive lock
return self._lock(LOCK_SH, **kwargs)
def lock_ex(self, **kwargs):
"""
Acquire an exclusive lock on the file.
:returns: Lock status from before the call (one of 'sh', 'ex', None).
:param nonblocking: Don't wait for the lock to be available.
"""
return self._lock(LOCK_EX, **kwargs)
def unlock(self):
"""
Release all locks on the file.
Flushes if there was an exclusive lock.
:returns: Lock status from before the call (one of 'sh', 'ex', None).
"""
if self.locked == 'ex':
self.flush()
return self._lock(LOCK_UN)
def _lock(self, mode, nonblocking=False):
flock(self, mode | bool(nonblocking) * LOCK_NB)
before = self.locked
self.locked = {LOCK_SH: 'sh', LOCK_EX: 'ex', LOCK_UN: None}[mode]
return before
def _assert_read_lock(self):
assert self.locked, "File is not locked"
def _assert_write_lock(self):
assert self.locked == 'ex', "File is not locked exclusively"
def read(self, *args):
self._assert_read_lock()
return super(LockedFile, self).read(*args)
def readline(self, *args):
self._assert_read_lock()
return super(LockedFile, self).readline(*args)
def readlines(self, *args):
self._assert_read_lock()
return super(LockedFile, self).readlines(*args)
def xreadlines(self, *args):
self._assert_read_lock()
return super(LockedFile, self).xreadlines(*args)
def __iter__(self):
self._assert_read_lock()
return super(LockedFile, self).__iter__()
def next(self):
self._assert_read_lock()
return super(LockedFile, self).next()
def write(self, *args):
self._assert_write_lock()
return super(LockedFile, self).write(*args)
def writelines(self, *args):
self._assert_write_lock()
return super(LockedFile, self).writelines(*args)
def flush(self):
self._assert_write_lock()
return super(LockedFile, self).flush()
def truncate(self, *args):
self._assert_write_lock()
return super(LockedFile, self).truncate(*args)
def close(self):
self.unlock()
return super(LockedFile, self).close()
(the example in the docstring is also my current use case for this)
Thanks for having read until down here, and possibly even answering :)
I am also not an expert, but there is one thing you should change, and a couple others to consider:
First off, using assert this way is a bad idea: if python is run with -O or -OO asserts are turned off and your two assert_*_lock() methods would always return True.
Second -- you need some tests. :) I took the liberty of added a custom error class and writing a couple tests. The first four pass, the last fails; which brings up the question, what should happen if the file is opened normally (as some other non-LockedFile object) and data is written to it?
Oh, and lastly -- the name LockableFile makes more sense to me, since the file can be in an unlocked state.
Here are the changes I made:
class LockedFileError(OSError): # might want IOError instead
pass
if __name__ == '__main__':
import unittest
import tempfile
import shutil
import os
class TestLockedFile(unittest.TestCase):
def setUp(self):
self.dir = tempfile.mkdtemp()
self.testfile = testfile = os.path.join(self.dir, 'opened.txt')
temp = open(testfile, 'w')
temp.write('[global]\nsetting1=99\nsetting2=42\n')
temp.close()
def tearDown(self):
shutil.rmtree(self.dir, ignore_errors=True)
def test_01(self):
"writes fail if not locked exclusively"
testfile = self.testfile
temp = LockedFile(testfile, 'r+')
self.assertRaises(LockedFileError, temp.write, 'arbitrary data')
temp.lock_sh()
self.assertRaises(LockedFileError, temp.write, 'arbitrary data')
def test_02(self):
"reads fail if not locked"
testfile = self.testfile
temp = LockedFile(testfile, 'r')
self.assertRaises(LockedFileError, temp.read)
def test_03(self):
"writes succeed if locked exclusively"
testfile = self.testfile
temp = LockedFile(testfile, 'r+')
temp.lock_ex()
temp.write('arbitrary data\n')
def test_04(self):
"reads succeed if locked"
testfile = self.testfile
temp = LockedFile(testfile, 'r')
temp.lock_sh()
temp.readline()
temp.lock_ex()
temp.readline()
def test_05(self):
"other writes fail if locked exclusively"
testfile = self.testfile
temp = LockedFile(testfile, 'r')
temp.lock_ex()
testing = open(testfile, 'r+')
# not sure if this should be OSError, IOError, or something else...
self.assertRaises(OSError, testing.write, 'this should fail\n')
unittest.main()
Many more tests should be written to cover the various combinations of a LockedFile with reads, writes, and other non-LockedFile file objects trying to read/write to the same actual file.
Related
class PacketContext:
capture_tstamp = None
def __init__(self, capture_tstamp=None):
self.capture_tstamp = capture_tstamp
class SubParserMixin():
def __init__(self):
self.context = PacketContext()
def parser_er_data(packet):
x = packet.payload
def on_packet(self, packet):
self.context.capture_tstamp = packet.capture_timestamp
self.parse_er_data(packet.payload)
#this mock test lives in another python file
from exchanges.impl.tse.mixins import PacketContext
def test_receive_timestamp(self):
packet = MagicMock(capture_timestamp=123456)
self.on_packet(packet)
assert packet.context.capture_tstamp == 123456
I have the above test that runs as I want it to, but there is one small issue- when self.on_packet(packet) is called with the mock packet, it of course will cause errors, in particular, due to the line
self.parse_er_data(packet.payload)
Is there anyway to stop the code from running past the line
self.context.capture_tstamp = packet.capture_timestamp
which is what we want to assert.
You could alter your
def on_packet(self, packet):
self.context.capture_tstamp = packet.capture_timestamp
self.parse_er_data(packet.payload)
to
def on_packet(self, packet):
self.context.capture_tstamp = packet.capture_timestamp
return None # to be removed after debugging the line above
self.parse_er_data(packet.payload)
How do I overload __init__() in Python? I'm used to C/C++ where the compiler can see the difference in data type, but since there are no data types in Python, how can I make sure the third method gets called instead of the second when I give a string as parameter instead of an int?
class Handle:
def __init__(self):
self.pid = -1
def __init__(self, pid):
self.pid = pid
def __init__(self, procname):
print(procname)
self.pid = -1 # find pid by name
In C++ you can define multiple functions/methods with the same name and different signatures. In Python, everytime you define a new function with the same name of a previously defined function, you are replacing it.
To achieve what you want, you have to use optional arguments and/or explicit checks.
Here are a few possible solutions:
class Handle:
def __init__(self, pid=-1):
if isinstance(pid, str):
self.procname = pid
self.pid = -1
else:
self.procname = None
self.pid = pid
class Handle:
# Both pid and procname may be specified at the same time.
def __init__(self, pid=-1, procname=None):
self.procname = procname
self.pid = pid
class Handle:
# Either pid or procname, not both, may be specified.
def __init__(self, pid=-1, procname=None):
if pid >= 0 and procname is not None:
raise ValueError('you can specify either pid or procname, not both')
self.procname = procname
self.pid = pid
class Handle:
def __init__(self, pid=-1):
self.pid = pid
# A separate constructor, with a different name,
# because "explicit is better than implicit" and
# "sparse is better than dense" (cit. The Zen of
# Python).
# In my opinion, this is the most Pythonic solution.
#classmethod
def from_process_name(cls, procname):
pid = get_pid(procname)
return cls(pid)
By the way, I would not recommend using -1 for "not specified". I'd rather use None.
Couldn't the PEP 443 : Single-dispatch generic functions be useful in your case ?
With something like (very basic example) :
from functools import singledispatch
class Handle:
def __init__(self, arg=None):
self.pid = init_from_arg(arg)
#singledispatch
def init_from_arg(arg):
# If None or neither an integer nor a string, return None
return None
# If the argument is an integer (ie the pid)
#init_from_arg.register(int)
def _(arg):
return arg
# If the argument provided is a string (ie the procname)
#init_from_arg.register(str)
def _(arg):
return get_pid_from_procname(arg)
Edit : Following the advice in comments, these functions could probably be made staticmethods, although I had trouble with them so I reverted to the code of my original answer (see edit history if needed)
Use isinstance to check the type:
class Handle:
def __init__(self, arg):
if isinstance(arg, str):
self.pid = arg
else:
self.pid = -1
If you want to do one action or another you might be better setting the arg to None and using it based on whether it is set or not:
class Handle:
def __init__(self, by_name=None):
self.by_name = by_name
Then just check later if it is None:
if self.by_name is not None:
# find by name
else:
# by pid
Of if you want the user to have a choice to pass either a pid or name:
class Handle:
def __init__(self, by_name=None,by_pid=None):
self.by_name = by_name
self.by_pid = by_pid
The simplest solution would be to take a single arg and move the logic wherever you call it, just because something is not None does not make it an int or string so you should also check for that or your program will error:
class Handle:
def __init__(self, proc):
self.proc = proc
if not isinstance(self.proc, (int,str)):
raise ValueError("Must be process name or pid.")
def find_proc(self):
if isinstance(self.proc, str):
# find by name
else:
# by pid
EDIT: I'm stupid
Sorry everyone. The reason my loop wasn't working is a profoundly stupid one; I had the arguments in the wrong order for pickle.dump().
OP
First time pickling objects, sorry if this is a nub question; have looked at various other EOFError posts but none either work or seem to apply.
Additionally, though I close the .pkl file, when the script has run it always has a zero-byte file size. I don't understand why.
I'm on OS X (Mavericks); here's my code (slightly reduced for the purposes of readability; the to_file_inspections() def is a class method and a couple of var names have been made more useful to a general audience, otherwise is verbatim. And FileInspection class is verbatim!
def to_file_inspections(data_store)
jar = open("jar.pkl","wb")
for f in data_store:
f_ob = FileInspection(f) #custom class, not very elaborate
cPickle.dump(jar,f_ob, -1)
jar.close()
#now just to test it...
data = cPickle.load(open("jar.pkl", "rb"))
print repr(data)
Results in:
Traceback (most recent call last):
File "main.py", line 400, in <module>
to_file_inspections()
File "main.py", line 355, in to_file_inspections
data = cPickle.load(open("jar.pkl", "rb"))
EOFError
In case it matters, this is my FileInspection class:
class FileInspection:
def __init__(self, path):
self._path = path.rstrip('\n')
self._hash = self.hash_file()
self._inspectable = True
stats = os.stat(self._path)
self._size = stats.st_size
self._last_mod = stats.st_mtime
def hash_file(self):
read_size = 1024 # You can make this bigger
checksum = hashlib.md5()
with open(self._path) as f:
data = f.read(read_size)
while data:
checksum.update(data)
data = f.read(read_size)
checksum = checksum.hexdigest()
return checksum
def toString(self):
return '{0} = {1}"checked":false, "path":"{2}", "hash":{3} "inspectable":{4}, "filesize":{5}, "lastmod":"{6}"{7}'.format(
self._hash, "{", self._path, self._inspectable, self._hash, self._last_mod, "}\n")
def toJSON(self):
return '\t"{0}":{1}\n\t\t"checked":false,\n\t\t"path":"{2}",\n\t\t"hash":"{3}",\n\t\t"inspectable":"{4}",\n\t\t"filesize":"{5}",\n\t\t"last_mod":"{6}"\n\t{7}'.format(
self._hash, "{", self._path, self._hash, self._inspectable, self._size, self._last_mod, "}")
#property
def path(self):
return self._path
#property
def hash(self):
return self._hash
#property
def inspectable(self):
return self._inspectable
#property
def size(self):
return self._size
#property
def last_mod(self):
return self._last_mod
Sorry everyone. The reason my loop wasn't working is a profoundly stupid one; I have the arguments in the wrong order for pickle.dump().
I've been trying to optimize my application, and although I made the function run on average 10.06 seconds when I profiled it by itself, when it is put on a QThread, it takes around 17-22 seconds.
It's 2x slower in the QThread. How do I fix that?
The function is actually initializing a class called DocxDocument, which is a document that I read from a Word file and parsed it for my needs.
I have a QThread that creates this class and uses Qt signals to send progress information back to the GUI. Here is the code from that class:
class DocxImporterThread(QThread):
'''
This thread is used to import a .docx document, report its progress, and
then return the document that it parsed.
'''
reportProgress = pyqtSignal(int)
reportError = pyqtSignal(Exception, str)
reportText = pyqtSignal(str)
def __init__(self, filePath):
QThread.__init__(self)
self.setPriority(QThread.HighestPriority)
self._filePath = filePath
self._docx = None
self._html = ''
self._bookmarks = None
self._pages = None
self._stop = False
def run(self):
def myProgressHook(percentage):
self.reportProgress.emit(percentage)
def myCancelHook():
return self._stop
try:
self._docx = DocxDocument(self._filePath, myProgressHook, myCancelHook)
if not self._stop:
self._html = self._docx.getMainPage()
self._bookmarks = self._docx.getHeadings()
self._pages = self._docx.getPages()
except Exception as ex2:
print 'ERROR: The .docx document didn\'t import.'
self.reportError.emit(ex2, traceback.format_exc())
The getMainPage(), getHeadings(), and getPages() are instantaneous because they just return a reference to something that the constructor already created. Here is the code I used to profile my DocxDocument class:
testFile = 'some_file.docx'
statSave = 'profile.out'
def progress(percent):
print ' --', percent, '--'
cProfile.run('DocxDocument(testFile)', filename=statSave)
myStats = pstats.Stats(statSave)
myStats.sort_stats('cumulative', 'name')
myStats.print_stats()
Thanks for your time in looking at this!
I have written an iterator class that opens a file in it's __init__.
def __init__(self, path):
self.file = open(path, "r")
How do I close that file automatically when the iteration is finished?
Complete class:
class Parse(object):
"""A generator that iterates through a CC-CEDICT formatted file, returning
a tuple of parsed results (Traditional, Simplified, Pinyin, English)"""
def __init__(self, path):
self.file = open(path, "r")
def __iter__(self):
return self
def __is_comment(self, line):
return line.startswith("#")
def next(self):
#This block ignores comments.
line = self.file.readline()
while line and self.__is_comment(line):
line = self.file.readline()
if line:
working = line.rstrip().split(" ")
trad, simp = working[0], working[1]
working = " ".join(working[2:]).split("]")
pinyin = working[0][1:]
english = working[1][1:]
return trad, simp, pinyin, english
else:
raise StopIteration()
A better way to write the whole thing would be to keep the opening and the iteration in one place:
class Parse(object):
"""A generator that iterates through a CC-CEDICT formatted file, returning
a tuple of parsed results (Traditional, Simplified, Pinyin, English)"""
def __init__(self, path):
self.path = path
def __is_comment(self, line):
return line.startswith("#")
def __iter__(self):
with open(self.path) as f:
for line in f:
if self.__is_comment(line):
continue
working = line.rstrip().split(" ")
trad, simp = working[0], working[1]
working = " ".join(working[2:]).split("]")
pinyin = working[0][1:]
english = working[1][1:]
yield trad, simp, pinyin, english
This will wait to open the file until you really need it, and will automatically close it when done. It's also less code.
If you really want to get into the "generators are awesome!" mindset:
def skip_comments(f):
for line in f:
if not.startswith('#'):
yield line
...
def __iter__(self):
with open(self.path) as f:
for line in skip_comments(f):
working = ....
You need to explicitly close it as soon as StopIteration is raised. In this case, simply call .close() when you raise StopIteration yourself.
def next(self):
#This block ignores comments.
line = self.file.readline()
while line and self.__is_comment(line):
line = self.file.readline()
if line:
working = line.rstrip().split(" ")
trad, simp = working[0], working[1]
working = " ".join(working[2:]).split("]")
pinyin = working[0][1:]
english = working[1][1:]
return trad, simp, pinyin, english
else:
self.file.close()
raise StopIteration()
Since no other code in your .next() method could trigger a StopIteration this suffices.
If you did use next() on another iterator inside your own .next() you'd have to catch StopIteration with an except StopIteration: handler and reraise the exception.
This only handles the StopIteration case. If you want to handle other situations (not exhausting the iterator) you'll need to handle that situation separately. Making your class a Context Manager as well could help with that. Users of your iterator would then use the object in a with statement before iterating over it, and when the with suite is exited the file could be closed regardless. You may want to mark your iterator as 'done' as well in that case:
_closed = False
def next(self):
if self._closed:
raise StopIteration
line = self.file.readline()
while line and self.__is_comment(line):
line = self.file.readline()
if line:
working = line.rstrip().split(" ")
trad, simp = working[0], working[1]
working = " ".join(working[2:]).split("]")
pinyin = working[0][1:]
english = working[1][1:]
return trad, simp, pinyin, english
else:
self.file.close()
self._closed = True
raise StopIteration()
def __enter__(self):
return self
def __exit__(self, type_, value, tb):
self.file.close() # multiple calls to .close() are fine
self._closed = True