python multiprocessing with multiple arguments

I'm trying to multiprocess a function that does multiple actions on a large file, but I'm getting the well-known pickling error even though I'm using partial.
The function looks something like this:
def process(r, intermediate_file, record_dict, record_id):
    res = 0
    record_str = str(record_dict[record_id]).upper()
    start = record_str[0:100]
    end = record_str[len(record_str)-100:len(record_str)]
    print sample, record_id
    if r == "1":
        if something:
            res = something...
            intermediate_file.write("...")
        if something:
            res = something
            intermediate_file.write("...")
    if r == "2":
        if something:
            res = something...
            intermediate_file.write("...")
        if something:
            res = something
            intermediate_file.write("...")
    return res
The way I'm calling it is the following, in another function:
def call_func():
    intermediate_file = open("inter.txt", "w")
    record_dict = get_record_dict()  ### get infos about each record as a dict based on the record_id
    results_dict = {}
    pool = Pool(10)
    for a in ["a", "b", "c", ...]:
        if not results_dict.has_key(a):
            results_dict[a] = {}
        for b in ["1", "2", "3", ...]:
            if not results_dict[a].has_key(b):
                results_dict[a][b] = {}
                results_dict[a][b]['res'] = []
            infile = open(a + b + ".txt", "r")
            ...parse the file and return values in a list called "record_ids"...
            ### now call the function for each record_id in record_ids
            if b == "1":
                func = partial(process, "1", intermediate_file, record_dict)
                res = pool.map(func, record_ids)
                ## append the results for each pair (a,b) for EACH RECORD in the results_dict
                results_dict[a][b]['res'].append(res)
            if b == "2":
                func = partial(process, "2", intermediate_file, record_dict)
                res = pool.map(func, record_ids)
                ## append the results for each pair (a,b) for EACH RECORD in the results_dict
                results_dict[a][b]['res'].append(res)
    ... do something with results_dict...
The idea is that for each record inside the record_ids, I want to save the results for each pair (a,b).
I'm not sure what is giving me this error:
File "/code/Python/Python-2.7.9/Lib/multiprocessing/pool.py", line 251, in map
return self.map_async(func, iterable, chunksize).get()
File "/code/Python/Python-2.7.9/Lib/multiprocessing/pool.py", line 558, in get
raise self._value
cPickle.PicklingError: Can't pickle <type 'function'>: attribute lookup __builtin__.function faile
d

func is not defined at the top level of the code, so it can't be pickled.
You can use pathos.multiprocessing, which is not a standard module, but it will work.
Or use something different from Pool.map, maybe a Queue of workers?
https://docs.python.org/2/library/queue.html
At the end there is an example you can use; it's for threading, but it is very similar to multiprocessing, which also has Queues:
https://docs.python.org/2/library/multiprocessing.html#pipes-and-queues
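For illustration, a minimal sketch of the restructuring both suggestions point toward: keep the worker at the top level of the module, pass it only picklable arguments (strings and ids rather than the open file handle), and do all the file writing in the parent process. Names like process_record and the placeholder return value are assumptions standing in for the asker's real code, not a drop-in fix:
from multiprocessing import Pool

record_dict = get_record_dict()   # assumed cheap enough to build once per process

def process_record(args):
    # Top-level function taking one picklable tuple, so Pool.map can pickle it.
    r, record_id = args
    record_str = str(record_dict[record_id]).upper()
    res = 0
    # ... the per-record logic from process(), minus the intermediate_file writes ...
    return record_id, res, "...line for inter.txt...\n"

def call_func(record_ids):
    pool = Pool(10)
    intermediate_file = open("inter.txt", "w")
    try:
        tasks = [("1", record_id) for record_id in record_ids]
        for record_id, res, line in pool.map(process_record, tasks):
            intermediate_file.write(line)   # only the parent touches the file
    finally:
        intermediate_file.close()
        pool.close()
        pool.join()
The Queue-of-workers approach from the linked docs works the same way: put (r, record_id) tuples on a multiprocessing.Queue and have top-level worker functions pull from it, sending results back on a second Queue.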

Related

'DataFrame' object is not callable in an ApplyResult reference

I want to start by stating that I am aware that this error message has been posted multiple times, but I cannot seem to understand how those posts apply to me, so I want to try my luck:
I have a DataFrame "df" and I am trying to perform parallel processing on subsets of that DataFrame:
lst = []
for i in range(1, 2):
    pool = ThreadPool(processes=4)
    async_result = pool.apply_async(helper.Helper.transform(df.copy(), i))
    lst.append(async_result)

results = []
for item in lst:
    currentitem = item.get()
    results.append(currentitem)
Helper Method:
@staticmethod
def transform(df, i):
    return df
I usually code in Java, and for a class I need to do some stuff in Python. I just don't understand why in this case I get the error:
Traceback (most recent call last):
  File "C:/Users/Barry/file.py", line 28, in <module>
    currentitem = item.get()
  File "C:\Users\Barry\AppData\Local\Programs\Python\Python38-32\lib\multiprocessing\pool.py", line 768, in get
    raise self._value
  File "C:\Users\Barry\AppData\Local\Programs\Python\Python38-32\lib\multiprocessing\pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
TypeError: 'DataFrame' object is not callable
A print in the thread function or before creating the thread results in proper output.
The issue is with the line:
async_result = pool.apply_async(helper.Helper.transform(df.copy(), i))
The catch: you're calling the function 'transform' before passing it to apply_async. As a result, apply_async receives a DataFrame, "thinks" it's a function, and tries to call it asynchronously. The result is the exception you're seeing, and this result is saved as part of the AsyncResult object.
To fix it just change this line to:
async_result = pool.apply_async(helper.Helper.transform, (df.copy(), i))
Note that apply_async gets two arguments - the function and the parameters to the function.
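To make the distinction concrete, here is a small self-contained sketch (the toy transform below is an assumption standing in for helper.Helper.transform):
from multiprocessing.pool import ThreadPool
import pandas as pd

def transform(df, i):
    return df.head(i)

if __name__ == "__main__":
    df = pd.DataFrame({"a": range(10)})
    pool = ThreadPool(processes=4)

    # Wrong: transform() runs right here, and its DataFrame return value is
    # what apply_async then tries to call, giving "'DataFrame' object is not callable".
    # bad = pool.apply_async(transform(df.copy(), 3))

    # Right: pass the callable and its argument tuple separately;
    # apply_async schedules the call and get() waits for the result.
    async_result = pool.apply_async(transform, (df.copy(), 3))
    print(async_result.get())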

Multiprocessing with text scraping

I want to scrape <p> tags from pages, and since there will be a couple of thousand of them I want to use multiprocessing. However, it doesn't work when I try to append the result to some variable.
I want to append the results of the scraping to data = [].
I made url_common for the base website since some pages don't start with http, etc.
from tqdm import tqdm
import faster_than_requests as requests  # 20% faster on average in my case than urllib.request
import bs4 as bs

def scrape(link, data):
    for i in tqdm(link):
        if i[:3] != 'htt':
            url_common = 'https://www.common_url.com/'
        else:
            url_common = ''
        try:
            ht = requests.get2str(url_common + str(i))
        except:
            pass
        parsed = bs.BeautifulSoup(ht, 'lxml')
        paragraphs = parsed.find_all('p')
        for p in paragraphs:
            data.append(p.text)
The above doesn't work, since map() doesn't accept a two-argument function like the one above.
I tried to use it another way:
def scrape(link):
    for i in tqdm(link):
        if i[:3] != 'htt':
            url_common = 'https://www.common_url.com/'
        else:
            url_common = ''
        try:
            ht = requests.get2str(url_common + str(i))
        except:
            pass
        parsed = bs.BeautifulSoup(ht, 'lxml')
        paragraphs = parsed.find_all('p')
        for p in paragraphs:
            print(p.text)

from multiprocessing import Pool
p = Pool(10)
links = ['link', 'other_link', 'another_link']
data = p.map(scrape, links)
I get this error while using the above function:
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\process.py", line 297, in _bootstrap
    self.run()
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\pool.py", line 110, in worker
    task = get()
  File "C:\ProgramData\Anaconda3\lib\multiprocessing\queues.py", line 354, in get
    return _ForkingPickler.loads(res)
AttributeError: Can't get attribute 'scrape' on <module '__main__' (built-in)>
I have not figured out a way to do this so that it uses Pool while also appending the results of the scraping to a given variable.
EDIT
I changed it a little bit to see where it stops:
def scrape(link):
    for i in tqdm(link):
        if i[:3] != 'htt':
            url_common = 'https://www.investing.com/'
        else:
            url_common = ''
        try:  # tries are always helpful with urls as you never know
            ht = requests.get2str(url_common + str(i))
        except:
            pass
        print('works1')
        parsed = bs.BeautifulSoup(ht, 'lxml')
        paragraphs = parsed.find_all('p')
        print('works2')
        for p in paragraphs:
            print(p.text)

links = ['link', 'other_link', 'another_link']
scrape(links)
# WORKS PROPERLY AND PRINTS EVERYTHING

if __name__ == '__main__':
    p = Pool(5)
    print(p.map(scrape, links))
    # DOESN'T WORK, NOTHING PRINTS. Error like above
You are using the map function incorrectly.
It iterates over each element of the iterable and calls the function on each element.
You can see the map function as doing something like the following:
to_be_mapped = [1, 2, 3]
mapped = []

def mapping(x):  # <-- note that the mapping accepts a single value
    return x**2

for item in to_be_mapped:
    res = mapping(item)
    mapped.append(res)
So to solve your problem, remove the outermost for-loop, as iterating is handled by the map function:
def scrape(link):
    if link[:3] != 'htt':
        url_common = 'https://www.common_url.com/'
    else:
        url_common = ''
    try:
        ht = requests.get2str(url_common + str(link))
    except:
        pass
    parsed = bs.BeautifulSoup(ht, 'lxml')
    paragraphs = parsed.find_all('p')
    for p in paragraphs:
        print(p.text)
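Since worker processes don't share the parent's data list, the usual way to get the scraped text back is to return it from scrape() and flatten what Pool.map hands back. A minimal sketch using the same libraries as the question (common_url.com and the example links are placeholders):
from multiprocessing import Pool
import bs4 as bs
import faster_than_requests as requests

def scrape(link):
    url_common = '' if link[:3] == 'htt' else 'https://www.common_url.com/'
    ht = requests.get2str(url_common + str(link))
    parsed = bs.BeautifulSoup(ht, 'lxml')
    return [p.text for p in parsed.find_all('p')]   # return the texts instead of printing

if __name__ == '__main__':
    links = ['link', 'other_link', 'another_link']
    with Pool(10) as pool:
        per_link = pool.map(scrape, links)           # one list of paragraph texts per link
    data = [text for texts in per_link for text in texts]   # flatten into a single list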

Dict in AutoProxy object from remote Manager is not subscriptable

This is my code.
from multiprocessing.managers import BaseManager
from threading import Thread

def manager1():
    my_dict = {}
    my_dict['key'] = "value"
    print(my_dict['key'])  # this works
    class SyncManager(BaseManager): pass
    SyncManager.register('get_my_dict', callable=lambda: my_dict)
    n = SyncManager(address=('localhost', 50001), authkey=b'secret')
    t = n.get_server()
    t.serve_forever()

def get_my_dict_from_the_manager():
    class SyncManager(BaseManager): pass
    SyncManager.register('get_my_dict')
    n = SyncManager(address=('localhost', 50001), authkey=b'secret')
    n.connect()
    my_dict = n.get_my_dict()
    return my_dict

thread1 = Thread(target=manager1)
thread1.daemon = True
thread1.start()

my_dict = get_my_dict_from_the_manager()
print(my_dict.keys())   # this works
print(my_dict['key'])   # DOES NOT WORK
On the last line of the script, I try to access a value in the dictionary my_dict by subscripting with a key. This throws an error. This is my terminal output:
value
['key']
Traceback (most recent call last):
  File "/home/magnus/PycharmProjects/docker-falcon/app/so_test.py", line 31, in <module>
    print(my_dict['key'])
TypeError: 'AutoProxy[get_my_dict]' object is not subscriptable

Process finished with exit code 1
It seems the AutoProxy object sort of behaves like the dict it is supposed to proxy, but not quite. Is there a way to make it subscriptable?
The problem is that the AutoProxy object does not expose the __getitem__ method that a dict normally has. An answer to my similar question allows you to access items by their key: simply replace print(my_dict['key']) with print(my_dict.get('key'))
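In other words, the AutoProxy forwards the dict's public methods but not its dunder methods, so everything except subscripting keeps working through the proxy. A short sketch using the my_dict proxy returned above:
my_dict = get_my_dict_from_the_manager()

print(my_dict.keys())         # works: keys() is a public method, so the proxy forwards it
print(my_dict.get('key'))     # works: get() is forwarded too
my_dict.update({'other': 1})  # works, and the change is made in the manager process

# print(my_dict['key'])       # fails: __getitem__ is a dunder, which AutoProxy does not expose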

concurrent.futures job deleting itself from a list -> IndexError in <listcomp>

I've encountered an error that I cannot explain while trying to retrieve the results of futures submitted to a process pool. I've stored the future objects in a list, and my best guess is that the future object reference is being deleted somehow, so that the list comprehension fails.
The error is at results = [j.result() for j in jobs] in async_jobs below. The traceback:
  in <listcomp>
    results = [j.result() for j in jobs]
  File "lib/python3.6/concurrent/futures/_base.py", line 405, in result
    return self.__get_result()
  File "lib/python3.6/concurrent/futures/_base.py", line 357, in __get_result
    raise self._exception
IndexError: list index out of range
non-MVCE code
def _job(*args, **kwargs):
    """Does work with thread pool and returns True"""
    def _thread_job(*args, **kwargs):
        """Can define here because we are using threading and don't need to pickle"""
        ...
        return None

    with futures.ThreadPoolExecutor(max_workers=4) as t_executor:
        jobs = []
        for i in range(...):
            f = t_executor.submit(_thread_job, ..., ...)
            jobs.append(f)
        results = [j.results() for j in jobs]
    return True

def async_jobs():
    with futures.ProcessPoolExecutor(max_workers=8) as p_executor:
        jobs = []
        for i in range(...):
            f = p_executor.submit(_job, ..., ...)
            jobs.append(f)
        results = [j.result() for j in jobs]

if __name__ == '__main__':
    async_jobs()
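For context on what that traceback means: Future.result() re-raises whatever exception the submitted job raised in the worker, so the IndexError reported at the list comprehension can originate inside _job / _thread_job rather than from the jobs list itself. A minimal sketch with a deliberately failing toy job (nothing from the asker's code) that reproduces the same pattern:
from concurrent import futures

def job(i):
    return [0, 1, 2][i]          # raises IndexError for i >= 3

if __name__ == '__main__':
    with futures.ProcessPoolExecutor(max_workers=2) as executor:
        jobs = [executor.submit(job, i) for i in range(5)]
        # The IndexError raised inside job() is stored on the future and
        # re-raised here, producing a traceback very much like the one above.
        results = [j.result() for j in jobs]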

Python multiprocessing "Bad file descriptor" error (not repeatable)

Apologies in advance, but I am unable to post a fully working example (too much overhead in this code to distill to a runnable snippet). I will post as much explanatory detail as I can, and please do let me know if anything critical seems missing.
Running Python 2.7.5 through IDLE
I am writing a program to compare two text files. Since the files can be large (~500MB) and each row comparison is independent, I would like to implement multiprocessing to speed up the comparison. This is working pretty well, but I am getting stuck on a pseudo-random Bad file descriptor error. I am new to multiprocessing, so I guess there is a technical problem with my implementation. Can anyone point me in the right direction?
Here is the code causing the trouble (specifically the pool.map):
# open files
csvReaderTest = csv.reader(open(testpath, 'r'))
csvReaderProd = csv.reader(open(prodpath, 'r'))
compwriter = csv.writer(open(outpath, 'wb'))

pool = Pool()
num_chunks = 3
chunksTest = itertools.groupby(csvReaderTest, keyfunc)
chunksProd = itertools.groupby(csvReaderProd, keyfunc)

while True:
    # make a list of num_chunks chunks
    groupsTest = [list(chunk) for key, chunk in itertools.islice(chunksTest, num_chunks)]
    groupsProd = [list(chunk) for key, chunk in itertools.islice(chunksProd, num_chunks)]
    # merge the two lists (pair off comparison rows)
    groups_combined = zip(groupsTest, groupsProd)
    if groups_combined:
        # http://stackoverflow.com/questions/5442910/python-multiprocessing-pool-map-for-multiple-arguments
        a_args = groups_combined  # a list - set of combinations to be tested
        second_arg = True
        worker_result = pool.map(worker_mini_star, itertools.izip(itertools.repeat(second_arg), a_args))
Here is the full error output. (This error sometimes occurs, and other times the comparison runs to finish without problems):
Traceback (most recent call last):
  File "H:/<PATH_SNIP>/python_csv_compare_multiprocessing_rev02_test2.py", line 407, in <module>
    main(fileTest, fileProd, fileout, stringFields, checkFileLengths)
  File "H:/<PATH_SNIP>/python_csv_compare_multiprocessing_rev02_test2.py", line 306, in main
    worker_result = pool.map(worker_mini_star, itertools.izip(itertools.repeat(second_arg),a_args))
  File "C:\Python27\lib\multiprocessing\pool.py", line 250, in map
    return self.map_async(func, iterable, chunksize).get()
  File "C:\Python27\lib\multiprocessing\pool.py", line 554, in get
    raise self._value
IOError: [Errno 9] Bad file descriptor
If it helps, here are the functions called by pool.map:
def worker_mini(flag, chunk):
    row_comp = []
    for entry, entry2 in zip(chunk[0][0], chunk[1][0]):
        if entry == entry2:
            temp_comp = entry
        else:
            temp_comp = '%s|%s' % (entry, entry2)
        row_comp.append(temp_comp)
    return True, row_comp

# takes a single tuple argument and unpacks the tuple to multiple arguments
def worker_mini_star(flag_chunk):
    """Convert `f([1,2])` to `f(1,2)` call."""
    return worker_mini(*flag_chunk)
def main():
