I am using a JSON serializer helper function for easy access to dictionary objects (basically received as JSON).
jsondict.py
"""Utilities for working with JSON and json-like structures - deeply nested Python dicts and lists
This lets us iterate over child nodes and access elements with a dot-notation.
"""
import sys

isPy3 = sys.version_info[0] == 3
if isPy3:
    def __alt_str__(v, enc='utf8'):
        return v if isinstance(v, bytes) else v.encode(enc)
    __strTypes__ = (str, bytes)
else:
    __alt_str__ = unicode
    __strTypes__ = (str, unicode)
class MyLocals(object):
    pass
mylocals = MyLocals()

def setErrorCollect(collect):
    mylocals.error_collect = collect

setErrorCollect(False)

def errorValue(x):
    if isinstance(x, __strTypes__):
        return repr(x) if ' ' in x else x
    return 'None' if x is None else str(x)

def condJSON(v, __name__=''):
    return JSONDict(v, __name__=__name__) if isinstance(v, dict) else JSONList(v, __name__=__name__) if isinstance(v, list) else v

def condJSONSafe(v, __name__=''):
    return JSONDictSafe(v, __name__=__name__) if isinstance(v, dict) else JSONListSafe(v, __name__=__name__) if isinstance(v, list) else v
class JSONListIter(object):
    def __init__(self, lst, conv):
        self.lst = lst
        self.i = -1
        self.conv = conv

    def __iter__(self):
        return self

    def next(self):
        if self.i < len(self.lst) - 1:
            self.i += 1
            return self.conv(self.lst[self.i])
        else:
            raise StopIteration

    if isPy3:
        __next__ = next
        del next
class JSONList(list):
    def __init__(self, v, __name__=''):
        list.__init__(self, v)
        self.__name__ = __name__

    def __getitem__(self, x):
        return condJSON(list.__getitem__(self, x), __name__='%s\t%s' % (self.__name__, errorValue(x)))

    def __iter__(self):
        return JSONListIter(self, condJSON)

class JSONListSafe(JSONList):
    def __getitem__(self, x):
        __name__ = '%s\t%s' % (self.__name__, errorValue(x))
        try:
            return condJSONSafe(list.__getitem__(self, x), __name__=__name__)
        except:
            if mylocals.error_collect:
                mylocals.error_collect(__name__)
            return JSONStrSafe('')

    def __iter__(self):
        return JSONListIter(self, condJSONSafe)
class JSONStrSafe(str):
    def __getattr__(self, attr):
        return self
    __getitem__ = __getattr__
class JSONDict(dict):
    "Allows dotted access"
    def __new__(cls, *args, **kwds):
        __name__ = kwds.pop('__name__')
        self = dict.__new__(cls, *args, **kwds)
        self.__name__ = __name__
        return self

    def __init__(self, *args, **kwds):
        kwds.pop('__name__', '')
        dict.__init__(self, *args, **kwds)

    def __getattr__(self, attr, default=None):
        if attr in self:
            return condJSON(self[attr], __name__='%s\t%s' % (self.__name__, errorValue(attr)))
        elif __alt_str__(attr) in self:
            return condJSON(self[__alt_str__(attr)], __name__='%s\t%s' % (self.__name__, errorValue(attr)))
        elif attr == '__safe__':
            return JSONDictSafe(self, __name__=self.__name__)
        else:
            raise AttributeError("No attribute or key named '%s'" % attr)

    def sorted_items(self, accept=None, reject=lambda i: i[0] == '__name__'):
        if accept or reject:
            if not accept:
                f = lambda i: not reject(i)
            elif not reject:
                f = accept
            else:  # both
                f = lambda i: accept(i) and not reject(i)
            return sorted(((k, condJSON(v, __name__=k)) for k, v in self.items() if f((k, v))))
        else:
            return sorted(((k, condJSON(v, __name__=k)) for k, v in self.items()))

    def sorted_keys(self):
        return sorted(self.keys())
class JSONDictSafe(JSONDict):
    "Allows dotted access"
    def __getattr__(self, attr, default=None):
        if attr in self:
            return condJSONSafe(self[attr], __name__='%s\t%s' % (self.__name__, errorValue(attr)))
        elif __alt_str__(attr) in self:
            return condJSONSafe(self[__alt_str__(attr)], __name__='%s\t%s' % (self.__name__, errorValue(attr)))
        elif attr == '__safe__':
            return self
        else:
            return JSONStrSafe('')

    def __getitem__(self, x):
        __name__ = '%s\t%s' % (self.__name__, errorValue(x))
        try:
            return condJSONSafe(dict.__getitem__(self, x), __name__=__name__)
        except KeyError:
            if mylocals.error_collect:
                mylocals.error_collect(__name__)
            return JSONStrSafe('')

    def sorted_items(self, accept=None, reject=lambda i: i[0] == '__name__'):
        if accept or reject:
            if not accept:
                f = lambda i: not reject(i)
            elif not reject:
                f = accept
            else:  # both
                f = lambda i: accept(i) and not reject(i)
            return sorted(((k, condJSONSafe(v, __name__=k)) for k, v in self.items() if f((k, v))))
        else:
            return sorted(((k, condJSONSafe(v, __name__=k)) for k, v in self.items()))
If a JSON object is passed like below:
data = {'name': 'john', 'age': 20, 'address': {'city':'xyz', 'country':'XZ', 'zip': 1223}}
json_obj = condJSONSafe(data)
I am able to access the data with dot notation:
print(json_obj.name) --> john
print(json_obj.address.country) --> XZ
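Missing keys do not raise with the Safe variants; per JSONDictSafe.__getattr__ above they fall back to an empty JSONStrSafe string, so chained lookups stay safe:
print(repr(json_obj.nickname)) --> '' (a JSONStrSafe, not a KeyError)
print(repr(json_obj.address.street)) --> ''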
It was working well until I implemented multiprocessing in my code to improve performance.
I extracted a certain amount of data from the JSON (after making it dot-notation accessible with the above helper function) and stored it in separate lists, e.g. lists a, b and c.
Then I passed them into a multiprocessing pool:
with mp.Pool(processes=mp.cpu_count()) as pool:
    res = pool.starmap(self.process_records, zip(self.a, self.b, self.c))
    pool.join()
and ended up with:
TypeError: 'JSONStrSafe' object is not callable
I tried this answer, but it did not work for me. I appreciate your help. Thanks in advance.
EDIT:
Reproducible example:
test.py
import jsondict
import multiprocessing as mp
import itertools

def process_records(data, metadata):
    print(data.name)
    print(metadata)
    # code to requirement
if __name__ == '__main__':
    data = {
        "metadata": "test_data",
        "cust_list": [
            {
                'name': 'john',
                'age': 20,
                'address': {
                    'city': 'xyz',
                    'country': 'XZ',
                    'zip': 1223
                }
            },
            {
                'name': 'michal',
                'age': 25,
                'address': {
                    'city': 'abc',
                    'country': 'CX',
                    'zip': 3435
                }
            },
            {
                'name': 'david',
                'age': 30,
                'address': {
                    'city': 'mnl',
                    'country': 'TD',
                    'zip': 6767
                }
            }
        ]
    }

    json_obj = jsondict.condJSONSafe(data)

    print(json_obj.metadata)  # will print 'test_data'
    print(json_obj.cust_list[0].name)  # will print 'john'
    print(json_obj.cust_list[2].address.city)  # will print 'mnl'

    with mp.Pool(processes=mp.cpu_count()) as pool:
        res = pool.starmap(process_records, zip(json_obj.cust_list, itertools.repeat(json_obj.metadata)))  # --> not working
        # res = pool.map(process_records, zip(json_obj.cust_list, itertools.repeat(json_obj.metadata)))  # --> not working
        # res = [pool.apply_async(process_records, d, json_obj.metadata) for d in json_obj.cust_list]  # --> not working
        # apply --> not working
        pool.join()
Output:
test_data
john
mnl
Traceback (most recent call last):
File "c:/Users/mohanlal/Desktop/Mock/json_err/test_app.py", line 53, in <module>
res = pool.starmap(process_records, zip(json_obj.cust_list, itertools.repeat(json_obj.metadata))) # --> not working
File "C:\Users\mohanlal\AppData\Local\Programs\Python\Python36\lib\multiprocessing\pool.py", line 268, in starmap
return self._map_async(func, iterable, starmapstar, chunksize).get()
File "C:\Users\mohanlal\AppData\Local\Programs\Python\Python36\lib\multiprocessing\pool.py", line 608, in get
raise self._value
File "C:\Users\mohanlal\AppData\Local\Programs\Python\Python36\lib\multiprocessing\pool.py", line 385, in _handle_tasks
put(task)
File "C:\Users\mohanlal\AppData\Local\Programs\Python\Python36\lib\multiprocessing\connection.py", line 206, in send
self._send_bytes(_ForkingPickler.dumps(obj))
File "C:\Users\mohanlal\AppData\Local\Programs\Python\Python36\lib\multiprocessing\reduction.py", line 51, in dumps
cls(buf, protocol).dump(obj)
TypeError: 'JSONStrSafe' object is not callable
I tried starmap, map, apply_async and apply; all give the same error.
I tried the solution given in the similar question linked above, modifying JSONStrSafe as below:
import re

dunder_pattern = re.compile("__.*__")
protected_pattern = re.compile("_.*")

class JSONStrSafe(str):
    def __getattr__(self, attr):
        if dunder_pattern.match(attr) or protected_pattern.match(attr):
            return super().__getattr__(attr)
        return self

    def __getstate__(self): return self.__dict__
    def __setstate__(self, d): self.__dict__.update(d)

    __getitem__ = __getattr__
But the issue persists.
As suggested in the comments, I made this change in all 3 places where __getattr__ is defined and tried again. I get a different error, shown below (the traceback is interleaved because it comes from several worker processes):
Process SpawnPoolWorker-1:
Traceback (most recent call last):
File "C:\Users\mohanlal\AppData\Local\Programs\Python\Python36\lib\multiprocessing\process.py", line 249, in _bootstrap
self.run()
File "C:\Users\mohanlal\AppData\Local\Programs\Python\Python36\lib\multiprocessing\process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "C:\Users\mohanlal\AppData\Local\Programs\Python\Python36\lib\multiprocessing\pool.py", line 108, in worker
task = get()
File "C:\Users\mohanlal\AppData\Local\Programs\Python\Python36\lib\multiprocessing\queues.py", line 345, in get
return _ForkingPickler.loads(res)
File "c:\Users\mohanlal\Desktop\Mock\json_err\jsondict.py", line 89, in __new__
__name__ = kwds.pop('__name__')
Process SpawnPoolWorker-2:
Process SpawnPoolWorker-4:
Traceback (most recent call last):
Traceback (most recent call last):
KeyError: '__name__'
File "C:\Users\mohanlal\AppData\Local\Programs\Python\Python36\lib\multiprocessing\process.py", line 249, in _bootstrap
self.run()
File "C:\Users\mohanlal\AppData\Local\Programs\Python\Python36\lib\multiprocessing\process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "C:\Users\mohanlal\AppData\Local\Programs\Python\Python36\lib\multiprocessing\pool.py", line 108, in worker
task = get()
File "C:\Users\mohanlal\AppData\Local\Programs\Python\Python36\lib\multiprocessing\queues.py", line 345, in get
return _ForkingPickler.loads(res)
File "C:\Users\mohanlal\AppData\Local\Programs\Python\Python36\lib\multiprocessing\process.py", line 249, in _bootstrap
self.run()
File "c:\Users\mohanlal\Desktop\Mock\json_err\jsondict.py", line 89, in __new__
__name__ = kwds.pop('__name__')
File "C:\Users\mohanlal\AppData\Local\Programs\Python\Python36\lib\multiprocessing\process.py", line 93, in run
self._target(*self._args, **self._kwargs)
KeyError: '__name__'
File "C:\Users\mohanlal\AppData\Local\Programs\Python\Python36\lib\multiprocessing\pool.py", line 108, in worker
task = get()
File "C:\Users\mohanlal\AppData\Local\Programs\Python\Python36\lib\multiprocessing\queues.py", line 345, in get
return _ForkingPickler.loads(res)
File "c:\Users\mohanlal\Desktop\Mock\json_err\jsondict.py", line 89, in __new__
__name__ = kwds.pop('__name__')
KeyError: '__name__'
The problem is you are in a "pickle". Forgive the pun -- you have a pickle problem. When you are doing multiprocessing, the arguments to your worker functions/methods are pickled. Usually, the defaults used to serialize and de-serialize states are OK, but not in your case. See Pickling Class Instances. The default save and load operations for serializing and de-serializing an object are:
def save(obj):
    return (obj.__class__, obj.__dict__)

def load(cls, attributes):
    obj = cls.__new__(cls)
    obj.__dict__.update(attributes)
    return obj
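To see the failure in isolation, here is a minimal sketch (NeedsName is a hypothetical stand-in for JSONDict, for illustration only): dumping succeeds, but the default load calls cls.__new__(cls) with no keyword arguments and fails with the same KeyError: '__name__' as in the traceback above.

import pickle

class NeedsName(dict):
    # Stand-in for JSONDict: __new__ insists on a '__name__' keyword.
    def __new__(cls, *args, **kwds):
        __name__ = kwds.pop('__name__')  # KeyError when pickle calls __new__(cls)
        self = dict.__new__(cls, *args, **kwds)
        self.__name__ = __name__
        return self
    def __init__(self, *args, **kwds):
        kwds.pop('__name__', '')
        dict.__init__(self, *args, **kwds)

blob = pickle.dumps(NeedsName({'a': 1}, __name__='root'))  # serializing is fine
pickle.loads(blob)  # raises KeyError: '__name__' inside __new__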
Note that when de-serializing the object, the object's __init__ method is not called, but its __new__ method is, and therein lies the problem. I had to modify the __new__ method of class JSONDict to recognize that it may be called during de-serialization, in which case '__name__' is not present among the keyword arguments. I also had to add customized __getstate__ and __setstate__ methods to that class to override the default way it saves and restores the object's attributes (method __init__ remains unmodified):
class JSONDict(dict):
    "Allows dotted access"
    def __new__(cls, *args, **kwds):
        self = dict.__new__(cls, *args, **kwds)
        if kwds and '__name__' in kwds:
            __name__ = kwds.pop('__name__')
            self.__name__ = __name__
        return self

    def __init__(self, *args, **kwds):
        kwds.pop('__name__', '')
        dict.__init__(self, *args, **kwds)

    def __getstate__(self):
        return self.__dict__

    def __setstate__(self, d):
        self.__dict__ = d

    """ The other methods remain unmodified """
Prints:
test_data
john
mnl
john
test_data
michal
david
test_data
test_data
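To make the two construction paths explicit, here is a small sketch (illustrative only) of how the modified __new__ behaves:

# Normal construction: '__name__' is supplied and popped, as before.
d = JSONDict({'a': 1}, __name__='root')

# Unpickling path: pickle calls cls.__new__(cls) with no arguments;
# thanks to the guard this no longer raises KeyError: '__name__'.
d2 = JSONDict.__new__(JSONDict)
# __setstate__ then restores the instance __dict__, including __name__.
d2.__setstate__({'__name__': 'root'})
print(d2.__name__)  # root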
Update
I was scratching my head figuring out why it should be necessary to provide the __getstate__ and __setstate__ pickle methods, since what they do should be the default action anyway. You can test the pickling without even running the Pool methods by modifying the program to insert the following line:
json_obj = condJSONSafe(data)
# insert this line:
import pickle; print(pickle.dumps(json_obj)); sys.exit(0)
It prints:
Traceback (most recent call last):
File "test.py", line 205, in <module>
import pickle; print('pickle'); print(pickle.dumps(json_obj)); sys.exit(0)
TypeError: 'JSONStrSafe' object is not callable
After adding a print statement in the right place, it became clear that the problem was in the __getattr__ method of class JSONDictSafe. When pickle checks whether the class implements methods __getstate__ and __setstate__ and there are no implementations, __getattr__ is ultimately called and returns a JSONStrSafe instance as the default value for these attributes. So instead of providing these attributes by defining the methods as I did above, one can alternatively add a simple check as follows:
class JSONDictSafe(JSONDict):
    "Allows dotted access"
    def __getattr__(self, attr, default=None):
        if attr in ('__getstate__', '__setstate__'):
            raise AttributeError(f'Missing attribute: {attr}')
        """ rest of the method is unmodified """
I've managed to create an object that can exist in shared memory with BaseManager and NamespaceProxy; however, all the examples I've seen require me to create the object with the proxy. For example:
import time
from multiprocessing.managers import BaseManager, NamespaceProxy

class Foo:
    def __init__(self):
        self._a = 1000

    def get_a(self):
        return self._a

class SharedFoo(NamespaceProxy):
    _exposed_ = ('__getattribute__', '__getattr__', '__setattr__', '__init__', 'get_a')

    def get_a(self):
        callmethod = object.__getattribute__(self, '_callmethod')
        return callmethod('get_a', ())

class FooManager(BaseManager):
    pass

def test():
    FooManager.register('Foo', Foo, SharedFoo)
    with FooManager() as manager:
        ls = []
        t = time.time()
        for i in range(100):
            ls.append(manager.Foo())
        print(time.time() - t)
which prints out:
0.44 (and some other numbers that I omitted)
Since I would likely create millions of Foo objects, this is too slow for the task. I tried to make it faster like this:
import multiprocessing as mp
from multiprocessing import Process

def do_stuff(obj):
    obj.ls[4].set_a(300)

def test():
    FooManager.register('Foo', Foo, SharedFoo)
    with FooManager() as manager:
        if manager._Server is not None:
            manager._Server.mutex = NoLock()  # NoLock: my own no-op lock (definition omitted)
        ls = []
        t = time.time()
        for i in range(100000):
            ls.append(Foo())
        foos = manager.Foo()
        foos.ls = mp.Manager().list(ls)
        print(time.time() - t)
        processes = [Process(target=do_stuff, args=(foos,)) for _ in range(3)]
        for process in processes:
            process.start()
        for process in processes:
            process.join()
        print(foos.ls[4].get_a())
which gave me this error:
Traceback (most recent call last):
File "/path/to/lib/python3.7/multiprocessing/managers.py", line 788, in _callmethod
conn = self._tls.connection
AttributeError: 'ForkAwareLocal' object has no attribute 'connection'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/path/to/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
self.run()
File "/path/to/lib/python3.7/multiprocessing/process.py", line 99, in run
self._target(*self._args, **self._kwargs)
File "/path/to/myproject/test.py", line 121, in do_stuff
obj.ls[4].set_a(300)
File "/path/to/lib/python3.7/multiprocessing/managers.py", line 1099, in __getattr__
return callmethod('__getattribute__', (key,))
File "/path/to/lib/python3.7/multiprocessing/managers.py", line 792, in _callmethod
self._connect()
File "/path/to/lib/python3.7/multiprocessing/managers.py", line 779, in _connect
conn = self._Client(self._token.address, authkey=self._authkey)
File "/path/to/lib/python3.7/multiprocessing/connection.py", line 492, in Client
c = SocketClient(address)
File "/path/to/lib/python3.7/multiprocessing/connection.py", line 619, in SocketClient
s.connect(address)
FileNotFoundError: [Errno 2] No such file or directory
Is what I'm trying to do possible? If so, what should I use? (I'm not looking for a complete solution, just some resources on how to do it.) I'm using Python 3.7 on Linux, if that's relevant.
Thanks
Edit: is this feasible with mmap, or am I going in a completely wrong direction? It looks promising, but the documentation seems to say that it's more for files (again, not looking for a complete solution, just whether mmap would work with custom objects).
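(For what it's worth, my understanding of the mmap idea: an mmap region is only a shared bytes buffer, so custom objects would still have to be serialized into it, e.g. with pickle, rather than living there as objects. A rough sketch of that pattern, illustration only, using the Foo class from above:)

import mmap
import pickle

buf = mmap.mmap(-1, 4096)  # anonymous shared memory region (Linux), 4 KiB

payload = pickle.dumps(Foo())
buf.seek(0)
buf.write(payload)

# a forked child inheriting buf could read the bytes back out:
buf.seek(0)
restored = pickle.loads(buf.read(len(payload)))
print(restored.get_a())    # 1000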
I am trying to mock data1 and data2 and provide a return value for each.
I have the following code:
test1.py
import pandas

def main():
    data1 = pandas.read_excel('path1')
    data2 = pandas.read_excel('path2')

if __name__ == '__main__':
    main()
import test1
from unittest.mock import patch
import pandas

class Testdata(unittest.TestCase):
    @patch('test1.main.data1')
    @patch('test1.main.data2')
    def test_main(self, mock_data1, mock_data2):
        mock_data1.return_value = pandas.DataFrame([some dataframe])
        mock_data2.return_value = pandas.DataFrame([some dataframe])
        test.main()
        data1.assert_called_once()
        data2.assert_called_once()

if __name__ == '__main__':
    unittest.main()
I am getting the following error:
Error
Traceback (most recent call last):
File "C:\apps\python\3.6.2\lib\unittest\case.py", line 59, in testPartExecutor
yield
File "C:\apps\python\3.6.2\lib\unittest\case.py", line 605, in run
testMethod()
File "C:\apps\python\3.6.2\lib\unittest\mock.py", line 1171, in patched
arg = patching.__enter__()
File "C:\apps\python\3.6.2\lib\unittest\mock.py", line 1227, in __enter__
self.target = self.getter()
File "C:\apps\python\3.6.2\lib\unittest\mock.py", line 1397, in <lambda>
getter = lambda: _importer(target)
File "C:\apps\python\3.6.2\lib\unittest\mock.py", line 1080, in _importer
thing = __import__(import_path)
ModuleNotFoundError: No module named 'main'
How do I resolve this issue, and how do I mock data1 and data2 and provide return values for them?
I can't say much without seeing the full code, but I think adding import unittest at the top should do the job.
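For reference, a sketch of a variant that runs end to end under my assumptions: since data1 and data2 are local variables inside main and cannot be patched directly, I patch pandas.read_excel as seen from test1 instead; this target change is my assumption, not something stated in the question.

import unittest
from unittest.mock import patch

import pandas
import test1

class Testdata(unittest.TestCase):
    @patch('test1.pandas.read_excel')  # assumption: patch the function main() actually calls
    def test_main(self, mock_read_excel):
        mock_read_excel.return_value = pandas.DataFrame({'col': [1, 2]})
        test1.main()
        self.assertEqual(mock_read_excel.call_count, 2)  # once for data1, once for data2

if __name__ == '__main__':
    unittest.main()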
I am writing a simple server program. There will inevitably be typos and other errors in fresh code, and usually the Python interpreter prints a ValueError/AttributeError traceback and exits; the traceback points to the exact position of the error. Under the Twisted framework, however, these errors are not printed, as in the following example:
from twisted.internet import reactor, protocol, task
#from twisted.internet.defer import setDebugging
#setDebugging(True)

class MyProtocol(protocol.Protocol):
    def dataReceived(self, data):
        try:
            set_position(int(data))
        except ValueError:
            pass

    def connectionMade(self):
        self.factory.clientConnectionMade(self)

    def connectionLost(self, reason):
        self.factory.clientConnectionLost(self)

class MyFactory(protocol.Factory):
    protocol = MyProtocol

    def __init__(self):
        self.clients = []
        self.lc = task.LoopingCall(self.announce)
        self.lc.start(1)

    def announce(self):
        pos = A_GREAT_TYPO_HERE()
        for client in self.clients:
            client.transport.write("Position is {0}\n".format(pos).encode('utf-8'))

    def clientConnectionMade(self, client):
        self.clients.append(client)

    def clientConnectionLost(self, client):
        self.clients.remove(client)

def get_position():
    return position[0]

def set_position(pos):
    position[0] = pos

def main():
    global position
    position = [0]
    myfactory = MyFactory()
    reactor.listenTCP(5362, myfactory)
    reactor.run()

if __name__ == "__main__":
    main()
A_GREAT_TYPO_HERE() in MyFactory.announce is meant to be get_position(), but it is a typo.
When the server is run, the terminal only outputs:
Unhandled error in Deferred:
and nothing else. Even if I enable Deferred debugging (uncomment the 2nd and 3rd lines), the terminal outputs:
Unhandled error in Deferred:
(debug: C: Deferred was created:
C: File "nodes/test.py", line 48, in <module>
C: main()
C: File "nodes/test.py", line 43, in main
C: myfactory = MyFactory()
C: File "nodes/test.py", line 21, in __init__
C: self.lc.start(1)
C: File "/home/sgsdxzy/anaconda3/lib/python3.6/site-packages/twisted/internet/task.py", line 189, in start
C: deferred = self._deferred = defer.Deferred()
I: First Invoker was:
I: File "nodes/test.py", line 48, in <module>
I: main()
I: File "nodes/test.py", line 43, in main
I: myfactory = MyFactory()
I: File "nodes/test.py", line 21, in __init__
I: self.lc.start(1)
I: File "/home/sgsdxzy/anaconda3/lib/python3.6/site-packages/twisted/internet/task.py", line 194, in start
I: self()
I: File "/home/sgsdxzy/anaconda3/lib/python3.6/site-packages/twisted/internet/task.py", line 241, in __call__
I: d.addErrback(eb)
I: File "/home/sgsdxzy/anaconda3/lib/python3.6/site-packages/twisted/internet/defer.py", line 332, in addErrback
I: errbackKeywords=kw)
I: File "/home/sgsdxzy/anaconda3/lib/python3.6/site-packages/twisted/internet/defer.py", line 310, in addCallbacks
I: self._runCallbacks()
I: File "/home/sgsdxzy/anaconda3/lib/python3.6/site-packages/twisted/internet/defer.py", line 653, in _runCallbacks
I: current.result = callback(current.result, *args, **kw)
I: File "/home/sgsdxzy/anaconda3/lib/python3.6/site-packages/twisted/internet/task.py", line 236, in eb
I: d.errback(failure)
)
It points to the error only as closely as self.lc.start(1), not to A_GREAT_TYPO_HERE(). How can I debug my program so that tracebacks point to the actual error?
The "C" and "I" lines you're seeing are due to the fact that you've enabled Deferred debugging. The "C" lines give you the stack where the Deferred was created. The "I" lines give you the stack where the Deferred was "invoked" (its callback or errback method was called).
Neither of those is what you're looking for, it seems. If you want to see the stack associated with the Failure the Deferred has been fired with, the most straightforward solution is to make sure the Failure gets logged (and that you have a log observer so that you can actually see that log event).
You should add this to your main:
from sys import stdout
from twisted.logger import globalLogBeginner, textFileLogObserver
globalLogBeginner.beginLoggingTo([textFileLogObserver(stdout)])
This directs the log stream to stdout as text. It is most likely sufficient to get you the information you want. However, to be really safe, you also want to explicitly log failures instead of relying on the garbage collector to do it for you. So you also want to change:
self.lc.start(1)
To:
# Module scope
from twisted.logger import Logger
logger = Logger()
...
# in __init__
d = self.lc.start(1)
d.addErrback(lambda f: logger.failure("Loop thing problem", f))
(Also, you may want to consider taking this code out of __init__ and putting it in startFactory instead; also consider not using a global reactor, but instead passing it around as a parameter.)
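Here is a sketch of that suggestion (the loop start moved out of __init__ into startFactory, and the reactor passed in as a parameter; names as in the question, details illustrative):

from twisted.internet import protocol, task
from twisted.logger import Logger

logger = Logger()

class MyFactory(protocol.Factory):
    protocol = MyProtocol

    def __init__(self, reactor):
        self.reactor = reactor
        self.clients = []
        self.lc = task.LoopingCall(self.announce)
        self.lc.clock = reactor  # schedule on the injected reactor

    def startFactory(self):
        # runs when the factory starts listening, not at construction time
        d = self.lc.start(1)
        d.addErrback(lambda f: logger.failure("Loop thing problem", f))

def main():
    from twisted.internet import reactor
    myfactory = MyFactory(reactor)
    reactor.listenTCP(5362, myfactory)
    reactor.run()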
This will give you output like:
2017-04-25T06:53:14-0400 [__main__.MyFactory#critical] Foo
Traceback (most recent call last):
File "debugging2.py", line 52, in main
myfactory = MyFactory()
File "debugging2.py", line 28, in __init__
d = self.lc.start(1)
File "/tmp/debugging/local/lib/python2.7/site-packages/twisted/internet/task.py", line 194, in start
self()
File "/tmp/debugging/local/lib/python2.7/site-packages/twisted/internet/task.py", line 239, in __call__
d = defer.maybeDeferred(self.f, *self.a, **self.kw)
--- <exception caught here> ---
File "/tmp/debugging/local/lib/python2.7/site-packages/twisted/internet/defer.py", line 150, in maybeDeferred
result = f(*args, **kw)
File "debugging2.py", line 32, in announce
pos = A_GREAT_TYPO_HERE()
exceptions.NameError: global name 'A_GREAT_TYPO_HERE' is not defined