I want to share a dict of thread-objects between 2 processes. I have also another dict of objects which seems to work at the moment.
The problem is that it raises an exception when I try to add key/value pairs to the dict (key is an integer and value is the thread-object):
Exception with manager.dict()
TypeError: can't pickle _thread.lock objects
I try to switch from manager.dict() to manager.list(), it does not work either:
Exception with manager.list()
TypeError: can't pickle _thread.lock objects
The readFiles() function is working correctly.
I use python 3.5.1 (Anaconda)
def startAlgorithm(fNameGraph, fNameEnergyDistribution, fNameRouteTables):
global _manager, _allTiesets, _allNodes, _stopDistribution
_manager = Manager()
_allTiesets = _manager.dict()
_allNodes = _manager.dict()
_stopDistribution = Value(c_bool, False)
readFiles(fNameGraph, fNameEnergyDistribution, fNameRouteTables)
initializeAlgorithm()
procTADiC = Process(target=TADiC, args=(_stopDistribution, _allNodes))
procTA = Process(target=TIESET_AGENT, args=(_stopDistribution, _allNodes, _allTiesets))
procTADiC.start()
procTA.start()
procTADiC.join()
procTA.join()
def initializeAlgorithm():
global _graphNX, _routingTable, _energyDistribution, _energyMeanValue
#Init all Nodes
allNodeIDs = _graphNX.nodes()
energySum = 0
for node in allNodeIDs:
nodeEnergyLoad = float(_energyDistribution.get(str(node)))
nodeObj = Node(node, nodeEnergyLoad)
_allNodes[node] = nodeObj
energySum = energySum + nodeEnergyLoad
#Calculate the mean value from the whole energy in the graph
_energyMeanValue = energySum / len(allNodeIDs)
#Init all Tieset-Threads
for tieset in _routingTable:
tiesetID = int(tieset['TiesetID'])
connNodes = list(tieset['Nodes'])
connEdges = list(tieset['Edges'])
adjTiesets = list(tieset['AdjTiesets'])
tiesetThread = Tieset(tiesetID, connNodes, connEdges, adjTiesets)
_allTiesets[tiesetID] = tiesetThread # Raise Exception!!!!!!!!!!
class Node:
'Node-Class that hold information about a node in a tieset'
def __init__(self, nodeID, energyLoad):
self.nodeID = nodeID
self.energyLoad = energyLoad
self.tiesetFlag = False
class Tieset(threading.Thread):
'Tieset-Class as Thread to distribute the load within the tieset'
def __init__(self, tiesetID, connectedNodes, connectedEdges, adjTiesets):
threading.Thread.__init__(self)
self.tiesetID = tiesetID
self.connectedNodes = connectedNodes
self.connectedEdges = connectedEdges
self.adjTiesets = adjTiesets
self.leaderNodeID = min(int(n) for n in connectedNodes)
self.measureCnt = 0
def run(self):
print('start Thread')
What I can say that you can't share threads between processes, you can share arguments for those threads if you want to start them in different processes, or you can share some results. The problem you are seeing caused by nature of that process creation, in python all the parameters will be serialized in your current process, then passed to a new one, and then python will deserialize them there to run the "target". Apparently, thread object is not serializable (you can check this interesting thread to understand serialization problem debugging pickle).
Related
I would like to parallelize a process in python which needs read access to several large, non-array data structures. What would be a recommended way to do this without copying all of the large data structures into every new process?
Thank you
The multiprocessing package provides two ways of sharing state: shared memory objects and server process managers. You should use server process managers as they support arbitrary object types.
The following program makes use of a server process manager:
#!/usr/bin/env python3
from multiprocessing import Process, Manager
# Simple data structure
class DataStruct:
data_id = None
data_str = None
def __init__(self, data_id, data_str):
self.data_id = data_id
self.data_str = data_str
def __str__(self):
return f"{self.data_str} has ID {self.data_id}"
def __repr__(self):
return f"({self.data_id}, {self.data_str})"
def set_data_id(self, data_id):
self.data_id = data_id
def set_data_str(self, data_str):
self.data_str = data_str
def get_data_id(self):
return self.data_id
def get_data_str(self):
return self.data_str
# Create function to manipulate data
def manipulate_data_structs(data_structs, find_str):
for ds in data_structs:
if ds.get_data_str() == find_str:
print(ds)
# Create manager context, modify the data
with Manager() as manager:
# List of DataStruct objects
l = manager.list([
DataStruct(32, "Andrea"),
DataStruct(45, "Bill"),
DataStruct(21, "Claire"),
])
# Processes that look for DataStructs with a given String
procs = [
Process(target = manipulate_data_structs, args = (l, "Andrea")),
Process(target = manipulate_data_structs, args = (l, "Claire")),
Process(target = manipulate_data_structs, args = (l, "David")),
]
for proc in procs:
proc.start()
for proc in procs:
proc.join()
For more information, see Sharing state between processes in the documentation.
I am attempting to dynamically open and parse through several text files (~10) to extract a particular value from key, for which I am utilizing multi-processing within Python to do this. My issue is that the function that I am calling writes particular data to a class list which I can see in the method, however outside the method that list is empty. Refer to the following:
class:
class MyClass(object):
__id_list = []
def __init__(self):
self.process_wrapper()
Caller Method:
def process_wrapper(self):
from multiprocessing import Pool
import multiprocessing
info_file = 'info*'
file_list = []
p = Pool(processes = multiprocessing.cpu_count() - 1)
for file_name in Path('c:/').glob('**/*/' + info_file):
file_list.append(str(os.path.join('c:/', file_name)))
p.map_async(self.get_ids, file_list)
p.close()
p.join()
print(self.__id_list) # this is showing as empty
Worker method:
def get_ids(self, file_name):
try:
with open(file_name) as data:
for line in data:
temp_split = line.split()
for item in temp_split:
value_split = str(item).split('=')
if 'id' == value_split[0].lower():
if int(value_split[1]) not in self._id_list:
self.__id_list.append(int(value_split[1]))
except:
raise FileReadError(f'There was an issue parsing "{file_name}".')
print(self.__id_list) # here the list prints fine
The map call returns a AysncResult class object. you should use that to wait for the processing to finish before checking self.__id_list. also you might consider returning a local list, collected those lists and aggregating them into the final list.
1. It looks like you have a typo in your get_ids method (self._id_list instead of self.__id_list). You can see it if you wait for the result:
result = p.map_async(self.get_ids, file_list)
result.get()
2. When a new child process is created, it gets a copy of the parent's address space however any subsequent changes (either by parent or child) are not reflected in the memory of the other process. They each have their own private address space.
Example:
$ cat fork.py
import os
l = []
l.append('global')
# Return 0 in the child and the child’s process id in the parent
pid = os.fork()
if pid == 0:
l.append('child')
print(f'Child PID: {os.getpid()}, {l}')
else:
l.append('parent')
print(f'Parent PID: {os.getpid()}, {l}')
print(l)
$ python3 fork.py
Parent PID: 9933, ['global', 'parent']
['global', 'parent']
Child PID: 9934, ['global', 'child']
['global', 'child']
Now back to your problem, you can use multiprocessing.Manager.list to create an object that is shared between processes:
from multiprocessing import Manager, Pool
m = Manager()
self.__id_list = m.list()
Docs: Sharing state between processes
or use threads as your workload seems to be I/O bound anyway:
from multiprocessing.dummy import Pool as ThreadPool
p = ThreadPool(processes = multiprocessing.cpu_count() - 1)
Alternatively check concurrent.futures
I'm having issues with using r2pipe, Radare2's API, with the multiprocessing Pool.map function in python. The problem I am facing is the application hangs on pool.join().
My hope was to use multithreading via the multiprocessing.dummy class in order to evaluate functions quickly through r2pipe. I have tried passing my r2pipe object as a namespace using the Manager class. I have attempted using events as well, but none of these seem to work.
class Test:
def __init__(self, filename=None):
if filename:
self.r2 = r2pipe.open(filename)
else:
self.r2 = r2pipe.open()
self.r2.cmd('aaa')
def t_func(self, args):
f = args[0]
r2_ns = args[1]
print('afbj # {}'.format(f['name']))
try:
bb = r2_ns.cmdj('afbj # {}'.format(f['name']))
if bb:
return bb[0]['addr']
else:
return None
except Exception as e:
print(e)
return None
def thread(self):
funcs = self.r2.cmdj('aflj')
mgr = ThreadMgr()
ns = mgr.Namespace()
ns.r2 = self.r2
pool = ThreadPool(2)
results = pool.map(self.t_func, product(funcs, [ns.r2]))
pool.close()
pool.join()
print(list(results))
This is the class I am using. I make a call to the Test.thread function in my main function.
I expect the application to print out the command it is about to run in r2pipe afbj # entry0, etc. Then to print out the list of results containing the first basic block address [40000, 50000, ...].
The application does print out the command about to run, but then hangs before printing out the results.
ENVIRONMENT
radare2: radare2 4.2.0-git 23712 # linux-x86-64 git.4.1.1-97-g5a48a4017
commit: 5a48a401787c0eab31ecfb48bebf7cdfccb66e9b build: 2020-01-09__21:44:51
r2pipe: 1.4.2
python: Python 3.6.9 (default, Nov 7 2019, 10:44:02)
system: Ubuntu 18.04.3 LTS
SOLUTION
This may be due to passing the same instance of r2pipe.open() to every call of t_func in the pool. One solution is to move the following lines of code into t_func:
r2 = r2pipe.open('filename')
r2.cmd('aaa')
This works, however its terribly slow to reanalyze for each thread/process.
Also, it is often faster to allow radare2 to do as much of the work as possible and limit the number of commands we need to send using r2pipe.
This problem is solved by using the command: afbj ##f
afbj # List basic blocks of given function and show results in json
##f # Execute the command for each function
EXAMPLE
Longer Example
import r2pipe
R2: r2pipe.open_sync = r2pipe.open('/bin/ls')
R2.cmd("aaaa")
FUNCS: list = R2.cmd('afbj ##f').split("\n")[:-1]
RESULTS: list = []
for func in FUNCS:
basic_block_info: list = eval(func)
first_block: dict = basic_block_info[0]
address_first_block: int = first_block['addr']
RESULTS.append(hex(address_first_block))
print(RESULTS)
'''
['0x4a56', '0x1636c', '0x3758', '0x15690', '0x15420', '0x154f0', '0x15420',
'0x154f0', '0x3780', '0x3790', '0x37a0', '0x37b0', '0x37c0', '0x37d0', '0x0',
...,
'0x3e90', '0x6210', '0x62f0', '0x8f60', '0x99e0', '0xa860', '0xc640', '0x3e70',
'0xd200', '0xd220', '0x133a0', '0x14480', '0x144e0', '0x145e0', '0x14840', '0x15cf0']
'''
Shorter Example
import r2pipe
R2 = r2pipe.open('/bin/ls')
R2.cmd("aaaa")
print([hex(eval(func)[0]['addr']) for func in R2.cmd('afbj ##f').split("\n")[:-1]])
I am making a web scraper to build a database. The site I plan to use has index pages each containing 50 links. The amount of pages to be parsed is estimated to be around 60K and up, this is why I want to implement multiprocessing.
Here is some pseudo-code of what I want to do:
def harvester(index):
main=dict()
....
links = foo.findAll ( 'a')
for link in links:
main.append(worker(link))
# or maybe something like: map_async(worker(link))
def worker(url):
''' this function gather the data from the given url'''
return dictionary
Now what I want to do with that is to have a certain number of worker function to gather data in parallel on different pages. This data would then be appended to a big dictionary located in harvester or written directly in a csv file by the worker function.
I'm wondering how I can implement parallelism. I have done a faire
amount of research on using gevent, threading and multiprocessing but
I am not sure how to implement it.
I am also not sure if appending data to a large dictionary or writing
directly in a csv using DictWriter will be stable with that many input at the same time.
Thanks
I propose you to split your work into separate workers which communicate via Queues.
Here you mostly have IO wait time (crawling, csv writing)
So you can do the following (not tested, just see the idea):
import threading
import Queue
class CsvWriter(threading.Thread):
def __init__(self, resultq):
super(CsvWriter, self).__init__()
self.resultq = resultq
self.writer = csv.DictWriter(open('results.csv', 'wb'))
def run(self):
done = False
while not done:
row = self.requltq.get()
if row != -1:
self.writer.writerow(row)
else:
done = True
class Crawler(threading.Thread):
def __init__(self, inputqueue, resultq):
super(Crawler, self).__init__()
self.iq = inputq
self.oq = resultq
def run(self):
done = False
while not done:
link = self.iq.get()
if link != -1:
result = self.extract_data(link)
self.oq.put(result)
else:
done = True
def extract_data(self, link):
# crawl and extract what you need and return a dict
pass
def main():
linkq = Queue.Queue()
for url in your_urls:
linkq.put(url)
resultq = Queue.Queue()
writer = CsvWriter(resultq)
writer.start()
crawlers = [Crawler(linkq, resultq) for _ in xrange(10)]
[c.start() for c in crawlers]
[linkq.put(-1) for _ in crawlers]
[c.join() for c in crawlers]
resultq.put(-1)
writer.join()
This code should work (fix possible typos) and make it to exit when all the urls are finished
for testing reasons I start only 1 process. One given argument is an array that shall be changed from that process.
class Engine():
Ready = Value('i', False)
def movelisttoctypemovelist(self, movelist):
ctML = []
for zug in movelist:
ctZug = ctypeZug()
ctZug.VonReihe = zug.VonReihe
ctZug.VonLinie = zug.VonLinie
ctZug.NachReihe = zug.NachReihe
ctZug.NachLinie = zug.NachLinie
ctZug.Bewertung = zug.Bewertung
ctML.append(ctZug)
return ctML
def findbestmove(self, board, settings, enginesettings):
print ("Computer using", multiprocessing.cpu_count(),"Cores.")
movelist = Array(ctypeZug, [], lock = True)
movelist = self.movelisttoctypemovelist(board.movelist)
bd = board.boardtodictionary()
process = []
for i in range(1):
p = Process(target=self.calculatenullmoves, args=(bd, movelist, i, self.Ready))
process.append(p)
p.start()
for p in process:
p.join()
self.printctypemovelist(movelist, settings)
print ("Ready:", self.Ready.value)
def calculatenullmoves(self, boarddictionary, ml, processindex, ready):
currenttime = time()
print ("Process", processindex, "begins to work...")
board = Board()
board.dictionarytoboard(boarddictionary)
...
ml[processindex].Bewertung = 2.4
ready.value = True
print ("Process", processindex, "finished work in", time()-currenttime, "sec")
def printctypemovelist(self, ml):
for zug in ml:
print (zug.VonReihe, zug.VonLinie, zug.NachReihe, zug.NachLinie, zug.Bewertung)
I try to write 2.4 directly in the list, but no changing is shown when calling "printctypemovelist".
I set "Ready" to True and it works.
I used information from http://docs.python.org/2/library/multiprocessing.html#module-multiprocessing.sharedctypes
I hope someone can find my mistake, if it is too difficult to read, please let me know.
The problem is that you're trying to share a plain Python list:
ctML = []
Use a proxy object instead:
from multiprocessing import Manager
ctML = Manager().list()
See Python doc on Sharing state between processes for more detail.