cPickle EOFError when loading pickled data - python

EDIT: I'm stupid
Sorry everyone. The reason my loop wasn't working is a profoundly stupid one; I had the arguments in the wrong order for pickle.dump().
OP
First time pickling objects, sorry if this is a nub question; have looked at various other EOFError posts but none either work or seem to apply.
Additionally, though I close the .pkl file, when the script has run it always has a zero-byte file size. I don't understand why.
I'm on OS X (Mavericks); here's my code (slightly reduced for the purposes of readability; the to_file_inspections() def is a class method and a couple of var names have been made more useful to a general audience, otherwise is verbatim. And FileInspection class is verbatim!
def to_file_inspections(data_store)
jar = open("jar.pkl","wb")
for f in data_store:
f_ob = FileInspection(f) #custom class, not very elaborate
cPickle.dump(jar,f_ob, -1)
jar.close()
#now just to test it...
data = cPickle.load(open("jar.pkl", "rb"))
print repr(data)
Results in:
Traceback (most recent call last):
File "main.py", line 400, in <module>
to_file_inspections()
File "main.py", line 355, in to_file_inspections
data = cPickle.load(open("jar.pkl", "rb"))
EOFError
In case it matters, this is my FileInspection class:
class FileInspection:
def __init__(self, path):
self._path = path.rstrip('\n')
self._hash = self.hash_file()
self._inspectable = True
stats = os.stat(self._path)
self._size = stats.st_size
self._last_mod = stats.st_mtime
def hash_file(self):
read_size = 1024 # You can make this bigger
checksum = hashlib.md5()
with open(self._path) as f:
data = f.read(read_size)
while data:
checksum.update(data)
data = f.read(read_size)
checksum = checksum.hexdigest()
return checksum
def toString(self):
return '{0} = {1}"checked":false, "path":"{2}", "hash":{3} "inspectable":{4}, "filesize":{5}, "lastmod":"{6}"{7}'.format(
self._hash, "{", self._path, self._inspectable, self._hash, self._last_mod, "}\n")
def toJSON(self):
return '\t"{0}":{1}\n\t\t"checked":false,\n\t\t"path":"{2}",\n\t\t"hash":"{3}",\n\t\t"inspectable":"{4}",\n\t\t"filesize":"{5}",\n\t\t"last_mod":"{6}"\n\t{7}'.format(
self._hash, "{", self._path, self._hash, self._inspectable, self._size, self._last_mod, "}")
#property
def path(self):
return self._path
#property
def hash(self):
return self._hash
#property
def inspectable(self):
return self._inspectable
#property
def size(self):
return self._size
#property
def last_mod(self):
return self._last_mod

Sorry everyone. The reason my loop wasn't working is a profoundly stupid one; I have the arguments in the wrong order for pickle.dump().

Related

How can I just test the one line of code in an argument after mocking an argument

class PacketContext:
capture_tstamp = None
def __init__(self, capture_tstamp=None):
self.capture_tstamp = capture_tstamp
class SubParserMixin():
def __init__(self):
self.context = PacketContext()
def parser_er_data(packet):
x = packet.payload
def on_packet(self, packet):
self.context.capture_tstamp = packet.capture_timestamp
self.parse_er_data(packet.payload)
#this mock test lives in another python file
from exchanges.impl.tse.mixins import PacketContext
def test_receive_timestamp(self):
packet = MagicMock(capture_timestamp=123456)
self.on_packet(packet)
assert packet.context.capture_tstamp == 123456
I have the above test that runs as I want it to, but there is one small issue- when self.on_packet(packet) is called with the mock packet, it of course will cause errors, in particular, due to the line
self.parse_er_data(packet.payload)
Is there anyway to stop the code from running past the line
self.context.capture_tstamp = packet.capture_timestamp
which is what we want to assert.
You could alter your
def on_packet(self, packet):
self.context.capture_tstamp = packet.capture_timestamp
self.parse_er_data(packet.payload)
to
def on_packet(self, packet):
self.context.capture_tstamp = packet.capture_timestamp
return None # to be removed after debugging the line above
self.parse_er_data(packet.payload)

"Sub-classes" and self in Python

Note: I see that I need to more clearly work out what it is that I want each property/descriptor/class/method to do before I ask how to do it! I don't think my question can be answered at this time. Thanks all for helping me out.
Thanks to icktoofay and BrenBarn, I'm starting to understand discriptors and properties, but now I have a slightly harder question to ask:
I see now how these work:
class Blub(object):
def __get__(self, instance, owner):
print('Blub gets ' + instance._blub)
return instance._blub
def __set__(self, instance, value):
print('Blub becomes ' + value)
instance._blub = value
class Quish(object):
blub = Blub()
def __init__(self, value):
self.blub = value
And how a = Quish('one') works (produces "Blub becomes one") but take a gander at this code:
import os
import glob
class Index(object):
def __init__(self, dir=os.getcwd()):
self.name = dir #index name is directory of indexes
# index is the list of indexes
self.index = glob.glob(os.path.join(self.name, 'BatchStarted*'))
# which is the pointer to the index (index[which] == BatchStarted_12312013_115959.txt)
self.which = 0
# self.file = self.File(self.index[self.which])
def get(self):
return self.index[self.which]
def next(self):
self.which += 1
if self.which < len(self.index):
return self.get()
else:
# loop back to the first
self.which = 0
return None
def back(self):
if self.which > 0:
self.which -= 1
return self.get()
class File(object):
def __init__(self, file):
# if the file exists, we'll use it.
if os.path.isfile(file):
self.name = file
# otherwise, our name is none and we return.
else:
self.name = None
return None
# 'file' attribute is the actual file object
self.file = open(self.name, 'r')
self.line = Lines(self.file)
class Lines(object):
# pass through the actual file object (not filename)
def __init__(self, file):
self.file = file
# line is the list if this file's lines
self.line = self.file.readlines()
self.which = 0
self.extension = Extension(self.line[self.which])
def __get__(self):
return self.line[self.which]
def __set__(self, value):
self.which = value
def next(self):
self.which += 1
return self.__get__()
def back(self):
self.which -= 1
return self.__get__()
class Extension(object):
def __init__(self, lineStr):
# check to make sure a string is passed
if lineStr:
self.lineStr = lineStr
self.line = self.lineStr.split('|')
self.pathStr = self.line[0]
self.path = self.pathStr.split('\\')
self.fileStr = self.path[-1]
self.file = self.fileStr.split('.')
else:
self.lineStr = None
def __get__(self):
self.line = self.lineStr.split('|')
self.pathStr = self.line[0]
self.path = self.pathStr.split('\\')
self.fileStr = self.path[-1]
self.file = self.fileStr.split('.')
return self.file[-1]
def __set__(self, ext):
self.file[-1] = ext
self.fileStr = '.'.join(self.file)
self.path[-1] = fileStr
self.pathStr = '\\'.join(self.path)
self.line[0] = self.pathStr
self.lineStr = '|'.join(self.line)
Firstly, there may be some typos in here because I've been working on it and leaving it half-arsed. That's not my point. My point is that in icktoofay's example, nothing gets passed to Blub(). Is there any way to do what I'm doing here, that is set some "self" attributes and after doing some processing, taking that and passing it to the next class? Would this be better suited for a property?
I would like to have it so that:
>>> i = Index() # i contains list of index files
>>> f = File(i.get()) # f is now one of those files
>>> f.line
'\\\\server\\share\\folder\\file0.txt|Name|Sean|Date|10-20-2000|Type|1'
>>> f.line.extension
'txt'
>>> f.line.extension = 'rtf'
>>> f.line
'\\\\server\\share\\folder\\file0.rtf|Name|Sean|Date|10-20-2000|Type|1'
You can do that, but the issue there is less about properties/descriptors and more about creating classes that give the behavior you want.
So, when you do f.line, that is some object. When you do f.line.extension, that is doing (f.line).extension --- that is, it first evalautes f.line and then gets the extension attribute of whatever f.line is.
The important thing here is that f.line cannot know whether you are later going to try to access its extension. So you can't have f.line do one thing for "plain" f.line and another thing for f.line.extension. The f.line part has to be the same in both, and the extension part can't change that.
The solution for what you seem to want to do is to make f.line return some kind of object that in some way looks or works like a string, but also allows setting attributes and updating itself accordingly. Exactly how you do this depends on how much you need f.lines to behave like a string and how much you need it to do other stuff. Basically you need f.line to be a "gatekeeper" object that handles some operations by acting like a string (e.g., you apparently want it to display as a string), and handles other objects in custom ways (e.g., you apparently want to be able to set an extension attribute on it and have that update its contents).
Here's a simplistic example:
class Line(object):
def __init__(self, txt):
self.base, self.extension = txt.split('.')
def __str__(self):
return self.base + "." + self.extension
Now you can do:
>>> line = Line('file.txt')
>>> print line
file.txt
>>> line.extension
'txt'
>>> line.extension = 'foo'
>>> print line
file.foo
However, notice that I did print line, not just line. By writing a __str__ method, I defined the behavior that happens when you do print line. But if you evaluate it "raw" without printing it, you'll see it's not really a string:
>>> line
<__main__.Line object at 0x000000000233D278>
You could override this behavior as well (by defining __repr__), but do you want to? That depends on how you want to use line. The point is that you need to decide what you want your line to do in what situations, and then craft a class that does that.

Python Custom Iterator: Close a file on StopIteration

I have written an iterator class that opens a file in it's __init__.
def __init__(self, path):
self.file = open(path, "r")
How do I close that file automatically when the iteration is finished?
Complete class:
class Parse(object):
"""A generator that iterates through a CC-CEDICT formatted file, returning
a tuple of parsed results (Traditional, Simplified, Pinyin, English)"""
def __init__(self, path):
self.file = open(path, "r")
def __iter__(self):
return self
def __is_comment(self, line):
return line.startswith("#")
def next(self):
#This block ignores comments.
line = self.file.readline()
while line and self.__is_comment(line):
line = self.file.readline()
if line:
working = line.rstrip().split(" ")
trad, simp = working[0], working[1]
working = " ".join(working[2:]).split("]")
pinyin = working[0][1:]
english = working[1][1:]
return trad, simp, pinyin, english
else:
raise StopIteration()
A better way to write the whole thing would be to keep the opening and the iteration in one place:
class Parse(object):
"""A generator that iterates through a CC-CEDICT formatted file, returning
a tuple of parsed results (Traditional, Simplified, Pinyin, English)"""
def __init__(self, path):
self.path = path
def __is_comment(self, line):
return line.startswith("#")
def __iter__(self):
with open(self.path) as f:
for line in f:
if self.__is_comment(line):
continue
working = line.rstrip().split(" ")
trad, simp = working[0], working[1]
working = " ".join(working[2:]).split("]")
pinyin = working[0][1:]
english = working[1][1:]
yield trad, simp, pinyin, english
This will wait to open the file until you really need it, and will automatically close it when done. It's also less code.
If you really want to get into the "generators are awesome!" mindset:
def skip_comments(f):
for line in f:
if not.startswith('#'):
yield line
...
def __iter__(self):
with open(self.path) as f:
for line in skip_comments(f):
working = ....
You need to explicitly close it as soon as StopIteration is raised. In this case, simply call .close() when you raise StopIteration yourself.
def next(self):
#This block ignores comments.
line = self.file.readline()
while line and self.__is_comment(line):
line = self.file.readline()
if line:
working = line.rstrip().split(" ")
trad, simp = working[0], working[1]
working = " ".join(working[2:]).split("]")
pinyin = working[0][1:]
english = working[1][1:]
return trad, simp, pinyin, english
else:
self.file.close()
raise StopIteration()
Since no other code in your .next() method could trigger a StopIteration this suffices.
If you did use next() on another iterator inside your own .next() you'd have to catch StopIteration with an except StopIteration: handler and reraise the exception.
This only handles the StopIteration case. If you want to handle other situations (not exhausting the iterator) you'll need to handle that situation separately. Making your class a Context Manager as well could help with that. Users of your iterator would then use the object in a with statement before iterating over it, and when the with suite is exited the file could be closed regardless. You may want to mark your iterator as 'done' as well in that case:
_closed = False
def next(self):
if self._closed:
raise StopIteration
line = self.file.readline()
while line and self.__is_comment(line):
line = self.file.readline()
if line:
working = line.rstrip().split(" ")
trad, simp = working[0], working[1]
working = " ".join(working[2:]).split("]")
pinyin = working[0][1:]
english = working[1][1:]
return trad, simp, pinyin, english
else:
self.file.close()
self._closed = True
raise StopIteration()
def __enter__(self):
return self
def __exit__(self, type_, value, tb):
self.file.close() # multiple calls to .close() are fine
self._closed = True

How to watch for a variable change in python without dunder setattr or pdb

There is large python project where one attribute of one class just have wrong value in some place.
It should be sqlalchemy.orm.attributes.InstrumentedAttribute, but when I run tests it is constant value, let's say string.
There is some way to run python program in debug mode, and run some check (if variable changed type) after each step throught line of code automatically?
P.S. I know how to log changes of attribute of class instance with help of inspect and property decorator. Possibly here I can use this method with metaclasses...
But sometimes I need more general and powerfull solution...
Thank you.
P.P.S. I need something like there: https://stackoverflow.com/a/7669165/816449, but may be with more explanation of what is going on in that code.
Well, here is a sort of slow approach. It can be modified for watching for local variable change (just by name). Here is how it works: we do sys.settrace and analyse the value of obj.attr each step. The tricky part is that we receive 'line' events (that some line was executed) before line is executed. So, when we notice that obj.attr has changed, we are already on the next line and we can't get the previous line frame (because frames aren't copied for each line, they are modified ). So on each line event I save traceback.format_stack to watcher.prev_st and if on the next call of trace_command value has changed, we print the saved stack trace to file. Saving traceback on each line is quite an expensive operation, so you'd have to set include keyword to a list of your projects directories (or just the root of your project) in order not to watch how other libraries are doing their stuff and waste cpu.
watcher.py
import traceback
class Watcher(object):
def __init__(self, obj=None, attr=None, log_file='log.txt', include=[], enabled=False):
"""
Debugger that watches for changes in object attributes
obj - object to be watched
attr - string, name of attribute
log_file - string, where to write output
include - list of strings, debug files only in these directories.
Set it to path of your project otherwise it will take long time
to run on big libraries import and usage.
"""
self.log_file=log_file
with open(self.log_file, 'wb'): pass
self.prev_st = None
self.include = [incl.replace('\\','/') for incl in include]
if obj:
self.value = getattr(obj, attr)
self.obj = obj
self.attr = attr
self.enabled = enabled # Important, must be last line on __init__.
def __call__(self, *args, **kwargs):
kwargs['enabled'] = True
self.__init__(*args, **kwargs)
def check_condition(self):
tmp = getattr(self.obj, self.attr)
result = tmp != self.value
self.value = tmp
return result
def trace_command(self, frame, event, arg):
if event!='line' or not self.enabled:
return self.trace_command
if self.check_condition():
if self.prev_st:
with open(self.log_file, 'ab') as f:
print >>f, "Value of",self.obj,".",self.attr,"changed!"
print >>f,"###### Line:"
print >>f,''.join(self.prev_st)
if self.include:
fname = frame.f_code.co_filename.replace('\\','/')
to_include = False
for incl in self.include:
if fname.startswith(incl):
to_include = True
break
if not to_include:
return self.trace_command
self.prev_st = traceback.format_stack(frame)
return self.trace_command
import sys
watcher = Watcher()
sys.settrace(watcher.trace_command)
testwatcher.py
from watcher import watcher
import numpy as np
import urllib2
class X(object):
def __init__(self, foo):
self.foo = foo
class Y(object):
def __init__(self, x):
self.xoo = x
def boom(self):
self.xoo.foo = "xoo foo!"
def main():
x = X(50)
watcher(x, 'foo', log_file='log.txt', include =['C:/Users/j/PycharmProjects/hello'])
x.foo = 500
x.goo = 300
y = Y(x)
y.boom()
arr = np.arange(0,100,0.1)
arr = arr**2
for i in xrange(3):
print 'a'
x.foo = i
for i in xrange(1):
i = i+1
main()
There's a very simple way to do this: use watchpoints.
Basically you only need to do
from watchpoints import watch
watch(your_object.attr)
That's it. Whenever the attribute is changed, it will print out the line that changed it and how it's changed. Super easy to use.
It also has more advanced features, for example, you can call pdb when the variable is changed, or use your own callback functions instead of print it to stdout.
A simpler way to watch for an object's attribute change (which can also be a module-level variable or anything accessible with getattr) would be to leverage hunter library, a flexible code tracing toolkit. To detect state changes we need a predicate which can look like the following:
import traceback
class MutationWatcher:
def __init__(self, target, attrs):
self.target = target
self.state = {k: getattr(target, k) for k in attrs}
def __call__(self, event):
result = False
for k, v in self.state.items():
current_value = getattr(self.target, k)
if v != current_value:
result = True
self.state[k] = current_value
print('Value of attribute {} has chaned from {!r} to {!r}'.format(
k, v, current_value))
if result:
traceback.print_stack(event.frame)
return result
Then given a sample code:
class TargetThatChangesWeirdly:
attr_name = 1
def some_nested_function_that_does_the_nasty_mutation(obj):
obj.attr_name = 2
def some_public_api(obj):
some_nested_function_that_does_the_nasty_mutation(obj)
We can instrument it with hunter like:
# or any other entry point that calls the public API of interest
if __name__ == '__main__':
obj = TargetThatChangesWeirdly()
import hunter
watcher = MutationWatcher(obj, ['attr_name'])
hunter.trace(watcher, stdlib=False, action=hunter.CodePrinter)
some_public_api(obj)
Running the module produces:
Value of attribute attr_name has chaned from 1 to 2
File "test.py", line 44, in <module>
some_public_api(obj)
File "test.py", line 10, in some_public_api
some_nested_function_that_does_the_nasty_mutation(obj)
File "test.py", line 6, in some_nested_function_that_does_the_nasty_mutation
obj.attr_name = 2
test.py:6 return obj.attr_name = 2
... return value: None
You can also use other actions that hunter supports. For instance, Debugger which breaks into pdb (debugger on an attribute change).
Try using __setattr__ to override the function that is called when an attribute assignment is attempted. Documentation for __setattr__
You can use the python debugger module (part of the standard library)
To use, just import pdb at the top of your source file:
import pdb
and then set a trace wherever you want to start inspecting the code:
pdb.set_trace()
You can then step through the code with n, and investigate the current state by running python commands.
def __setattr__(self, name, value):
if name=="xxx":
util.output_stack('xxxxx')
super(XXX, self).__setattr__(name, value)
This sample code helped me.

Utility that helps in file locking - expert tips wanted

I've written a subclass of file that a) provides methods to conveniently lock it (using fcntl, so it only supports unix, which is however OK for me atm) and b) when reading or writing asserts that the file is appropriately locked.
Now I'm not an expert at such stuff (I've just read one paper [de] about it) and would appreciate some feedback: Is it secure, are there race conditions, are there other things that could be done better … Here is the code:
from fcntl import flock, LOCK_EX, LOCK_SH, LOCK_UN, LOCK_NB
class LockedFile(file):
"""
A wrapper around `file` providing locking. Requires a shared lock to read
and a exclusive lock to write.
Main differences:
* Additional methods: lock_ex, lock_sh, unlock
* Refuse to read when not locked, refuse to write when not locked
exclusivly.
* mode cannot be `w` since then the file would be truncated before
it could be locked.
You have to lock the file yourself, it won't be done for you implicitly.
Only you know what lock you need.
Example usage::
def get_config():
f = LockedFile(CONFIG_FILENAME, 'r')
f.lock_sh()
config = parse_ini(f.read())
f.close()
def set_config(key, value):
f = LockedFile(CONFIG_FILENAME, 'r+')
f.lock_ex()
config = parse_ini(f.read())
config[key] = value
f.truncate()
f.write(make_ini(config))
f.close()
"""
def __init__(self, name, mode='r', *args, **kwargs):
if 'w' in mode:
raise ValueError('Cannot open file in `w` mode')
super(LockedFile, self).__init__(name, mode, *args, **kwargs)
self.locked = None
def lock_sh(self, **kwargs):
"""
Acquire a shared lock on the file. If the file is already locked
exclusively, do nothing.
:returns: Lock status from before the call (one of 'sh', 'ex', None).
:param nonblocking: Don't wait for the lock to be available.
"""
if self.locked == 'ex':
return # would implicitly remove the exclusive lock
return self._lock(LOCK_SH, **kwargs)
def lock_ex(self, **kwargs):
"""
Acquire an exclusive lock on the file.
:returns: Lock status from before the call (one of 'sh', 'ex', None).
:param nonblocking: Don't wait for the lock to be available.
"""
return self._lock(LOCK_EX, **kwargs)
def unlock(self):
"""
Release all locks on the file.
Flushes if there was an exclusive lock.
:returns: Lock status from before the call (one of 'sh', 'ex', None).
"""
if self.locked == 'ex':
self.flush()
return self._lock(LOCK_UN)
def _lock(self, mode, nonblocking=False):
flock(self, mode | bool(nonblocking) * LOCK_NB)
before = self.locked
self.locked = {LOCK_SH: 'sh', LOCK_EX: 'ex', LOCK_UN: None}[mode]
return before
def _assert_read_lock(self):
assert self.locked, "File is not locked"
def _assert_write_lock(self):
assert self.locked == 'ex', "File is not locked exclusively"
def read(self, *args):
self._assert_read_lock()
return super(LockedFile, self).read(*args)
def readline(self, *args):
self._assert_read_lock()
return super(LockedFile, self).readline(*args)
def readlines(self, *args):
self._assert_read_lock()
return super(LockedFile, self).readlines(*args)
def xreadlines(self, *args):
self._assert_read_lock()
return super(LockedFile, self).xreadlines(*args)
def __iter__(self):
self._assert_read_lock()
return super(LockedFile, self).__iter__()
def next(self):
self._assert_read_lock()
return super(LockedFile, self).next()
def write(self, *args):
self._assert_write_lock()
return super(LockedFile, self).write(*args)
def writelines(self, *args):
self._assert_write_lock()
return super(LockedFile, self).writelines(*args)
def flush(self):
self._assert_write_lock()
return super(LockedFile, self).flush()
def truncate(self, *args):
self._assert_write_lock()
return super(LockedFile, self).truncate(*args)
def close(self):
self.unlock()
return super(LockedFile, self).close()
(the example in the docstring is also my current use case for this)
Thanks for having read until down here, and possibly even answering :)
I am also not an expert, but there is one thing you should change, and a couple others to consider:
First off, using assert this way is a bad idea: if python is run with -O or -OO asserts are turned off and your two assert_*_lock() methods would always return True.
Second -- you need some tests. :) I took the liberty of added a custom error class and writing a couple tests. The first four pass, the last fails; which brings up the question, what should happen if the file is opened normally (as some other non-LockedFile object) and data is written to it?
Oh, and lastly -- the name LockableFile makes more sense to me, since the file can be in an unlocked state.
Here are the changes I made:
class LockedFileError(OSError): # might want IOError instead
pass
if __name__ == '__main__':
import unittest
import tempfile
import shutil
import os
class TestLockedFile(unittest.TestCase):
def setUp(self):
self.dir = tempfile.mkdtemp()
self.testfile = testfile = os.path.join(self.dir, 'opened.txt')
temp = open(testfile, 'w')
temp.write('[global]\nsetting1=99\nsetting2=42\n')
temp.close()
def tearDown(self):
shutil.rmtree(self.dir, ignore_errors=True)
def test_01(self):
"writes fail if not locked exclusively"
testfile = self.testfile
temp = LockedFile(testfile, 'r+')
self.assertRaises(LockedFileError, temp.write, 'arbitrary data')
temp.lock_sh()
self.assertRaises(LockedFileError, temp.write, 'arbitrary data')
def test_02(self):
"reads fail if not locked"
testfile = self.testfile
temp = LockedFile(testfile, 'r')
self.assertRaises(LockedFileError, temp.read)
def test_03(self):
"writes succeed if locked exclusively"
testfile = self.testfile
temp = LockedFile(testfile, 'r+')
temp.lock_ex()
temp.write('arbitrary data\n')
def test_04(self):
"reads succeed if locked"
testfile = self.testfile
temp = LockedFile(testfile, 'r')
temp.lock_sh()
temp.readline()
temp.lock_ex()
temp.readline()
def test_05(self):
"other writes fail if locked exclusively"
testfile = self.testfile
temp = LockedFile(testfile, 'r')
temp.lock_ex()
testing = open(testfile, 'r+')
# not sure if this should be OSError, IOError, or something else...
self.assertRaises(OSError, testing.write, 'this should fail\n')
unittest.main()
Many more tests should be written to cover the various combinations of a LockedFile with reads, writes, and other non-LockedFile file objects trying to read/write to the same actual file.

Categories

Resources