How do I parse 1.5 billion lines of data efficiently? - Python

I have 1,500,000,000 rows of data saved across multiple txt files. The data is formatted as follows:
key1 key2
where key1 is a URL and key2 is a MySQL record row_id.
I wrote the following Python code to parse the data, but it is slow.
For example:
import os
import Queue
import threading


class CheckThread(threading.Thread):
    def __init__(self, queue, src_folder, dest_folder='check_result'):
        super(CheckThread, self).__init__()
        self._queue = queue
        self.daemon = True

    def run(self):
        while True:
            file_name = self._queue.get()
            try:
                self._prepare_check(file_name)
            except:
                self._queue.task_done()
                continue
            self._queue.task_done()


def Check(src_folder, workers=12, dest_folder='check_result'):
    queue = Queue.Queue()
    for (dirpath, dirnames, filelist) in os.walk(src_folder):
        for name in filelist:
            if name[0] == '.':
                continue
            queue.put(os.path.join(dirpath, name))
    for worker in xrange(workers):
        worker = str(worker + 1)
        t = CheckThread(queue, src_folder, dest_folder)
        t.start()
    queue.join()


def main(folder, worker=12, out='check_result'):
    try:
        Check(folder, worker, out)
    except:
        return 1
    return 0
Each thread parses one file from the queue.
How do I improve the parsing speed of each file?

Some suggestions:
Returning 1 on error and 0 on success is not pythonic.
Never use except:, always specify which exceptions you want to catch.
Your first try: ... except: ... is not the proper structure to use here; you should use a try: ... finally: ..., which executes the finally: part even if the try: part raises an exception.
Some of the parameters of CheckThread.__init__() are not being used.
What does CheckThread._prepare_check() do?
You are not using worker.
The changed code would be:
import os
import Queue
import threading


class CheckThread(threading.Thread):
    def __init__(self, queue, src_folder, dest_folder='check_result'):
        super(CheckThread, self).__init__()
        self._queue = queue
        self.daemon = True
        # Do something with src_folder and dest_folder or delete them from the parameter list

    def run(self):
        while True:
            file_name = self._queue.get()
            try:
                self._prepare_check(file_name)
            finally:
                self._queue.task_done()


def Check(src_folder, workers=12, dest_folder='check_result'):
    queue = Queue.Queue()
    for (dirpath, dirnames, filelist) in os.walk(src_folder):
        for name in filelist:
            if name[0] == '.':
                continue
            queue.put(os.path.join(dirpath, name))
    for worker in xrange(workers):
        worker = str(worker + 1)  # Do something with worker or delete this line
        t = CheckThread(queue, src_folder, dest_folder)
        t.start()
    queue.join()


def main(folder, worker=12, out='check_result'):
    Check(folder, worker, out)
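The answer above tidies the structure but does not change per-file parsing speed. If the work inside _prepare_check() is CPU-bound, CPython threads are serialized by the GIL, so worker processes usually scale better for this kind of job, and iterating a file object is already buffered and fast. Below is a minimal sketch under those assumptions; the split-and-count body is only a placeholder for whatever _prepare_check() actually does with each "key1 key2" line.
import os
import multiprocessing


def parse_file(path):
    """Parse one file of 'key1 key2' lines; returns the number of rows seen."""
    rows = 0
    with open(path) as f:
        for line in f:                   # buffered line iteration, no readlines()
            parts = line.split()
            if len(parts) == 2:
                url, row_id = parts      # key1 = URL, key2 = MySQL row_id
                rows += 1                # real per-row work would go here
    return rows


def parse_folder(src_folder, workers=12):
    paths = [os.path.join(d, n)
             for d, _, names in os.walk(src_folder)
             for n in names if not n.startswith('.')]
    pool = multiprocessing.Pool(workers)  # processes sidestep the GIL for CPU-bound parsing
    return sum(pool.map(parse_file, paths))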


Avoid waiting for threads to finish in Python

I've written this script to read data from a txt file and process it. But it seems that if I give it a big file and a high number of threads, the more it reads from the list, the slower the script gets.
Is there a way to avoid waiting for all the threads to finish and start a new one whenever a thread is done with the work?
Also it seems that when it finishes processing, the script doesn't exit.
import threading, Queue, time


class Work(threading.Thread):
    def __init__(self, jobs):
        threading.Thread.__init__(self)
        self.Lock = threading.Lock()
        self.jobs = jobs

    def myFunction(self):
        # simulate work
        self.Lock.acquire()
        print("Firstname: " + self.firstname + " Lastname: " + self.lastname)
        self.Lock.release()
        time.sleep(3)

    def run(self):
        while True:
            self.item = self.jobs.get().rstrip()
            self.firstname = self.item.split(":")[0]
            self.lastname = self.item.split(":")[1]
            self.myFunction()
            self.jobs.task_done()


def main(file):
    jobs = Queue.Queue()
    myList = open(file, "r").readlines()
    MAX_THREADS = 10
    pool = [Work(jobs) for i in range(MAX_THREADS)]
    for thread in pool:
        thread.start()
    for item in myList:
        jobs.put(item)
    for thread in pool:
        thread.join()


if __name__ == '__main__':
    main('list.txt')
The script probably seems to take longer on larger inputs because there's a 3-second pause between each batch of printing.
The issue with the script not finishing is that, since you are using a Queue, you need to call join() on the Queue, not on the individual threads. To make sure that the script returns when the jobs have stopped running, you should also set daemon = True.
The Lock will also not work in the current code because each Work instance creates its own threading.Lock(); all the workers need to share the same lock.
If you want to use this in Python 3 (which you should), the Queue module has been renamed to queue.
import threading, Queue, time

lock = threading.Lock()  # One lock


class Work(threading.Thread):
    def __init__(self, jobs):
        threading.Thread.__init__(self)
        self.daemon = True  # set daemon
        self.jobs = jobs

    def myFunction(self):
        # simulate work
        lock.acquire()  # All jobs share the one lock
        print("Firstname: " + self.firstname + " Lastname: " + self.lastname)
        lock.release()
        time.sleep(3)

    def run(self):
        while True:
            self.item = self.jobs.get().rstrip()
            self.firstname = self.item.split(":")[0]
            self.lastname = self.item.split(":")[1]
            self.myFunction()
            self.jobs.task_done()


def main(file):
    jobs = Queue.Queue()
    with open(file, 'r') as fp:  # Close the file when we're done
        myList = fp.readlines()
    MAX_THREADS = 10
    pool = [Work(jobs) for i in range(MAX_THREADS)]
    for thread in pool:
        thread.start()
    for item in myList:
        jobs.put(item)
    jobs.join()  # Join the Queue


if __name__ == '__main__':
    main('list.txt')
Simpler example (based on an example from the Python docs)
import threading
import time
from Queue import Queue  # Py2
# from queue import Queue  # Py3

lock = threading.Lock()


def worker():
    while True:
        item = jobs.get()
        if item is None:
            break
        firstname, lastname = item.split(':')
        lock.acquire()
        print("Firstname: " + firstname + " Lastname: " + lastname)
        lock.release()
        time.sleep(3)
        jobs.task_done()


jobs = Queue()
pool = []
MAX_THREADS = 10
for i in range(MAX_THREADS):
    thread = threading.Thread(target=worker)
    thread.start()
    pool.append(thread)

with open('list.txt') as fp:
    for line in fp:
        jobs.put(line.rstrip())

# block until all tasks are done
jobs.join()

# stop workers
for i in range(MAX_THREADS):
    jobs.put(None)
for thread in pool:
    thread.join()
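Following up on the Python 3 note above, here is a hedged sketch of the simpler example under Python 3 (same list.txt file name and firstname:lastname line format assumed from the question; only the queue import, the with lock: block, and the daemon=True keyword change):
import threading
import time
from queue import Queue  # the Queue module is named queue in Python 3

jobs = Queue()
lock = threading.Lock()


def worker():
    while True:
        item = jobs.get()
        if item is None:           # sentinel: no more work
            break
        firstname, lastname = item.split(':')
        with lock:                 # context manager instead of acquire()/release()
            print("Firstname: " + firstname + " Lastname: " + lastname)
        time.sleep(3)
        jobs.task_done()


threads = [threading.Thread(target=worker, daemon=True) for _ in range(10)]
for t in threads:
    t.start()

with open('list.txt') as fp:
    for line in fp:
        jobs.put(line.rstrip())

jobs.join()                        # wait until every line has been processed
for t in threads:
    jobs.put(None)                 # one sentinel per worker
for t in threads:
    t.join()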

Shared variables in Python

I am trying to create multiple threads of a bot that share some variables, but I am failing miserably at getting the shared variables to work.
Here is the code:
import requests
import sys
import threading
import signal
import time


class bot(threading.Thread):
    terminate = False

    #def __init__(self):
    #    threading.Thread.__init__(self)
    #    self.terminate = False

    def getCode():
        code_lock.acquire()
        work_code = code
        try:
            code += 1
        finally:
            code_lock.release()
        return work_code

    def checkCode(code):
        try:
            #if(code % 1000000 == 0):
            print("Code "+str(code)+" is being checked...\n")
            html = requests.get(url+str(code))
            html.encoding = 'utf-8'
            return not 'Page Not Found' in html.text
        except requests.exceptions.ConnectionError:
            print("Connection Error! Retrying...\n")
            time.sleep(0.5)
        except KeyboardInterrupt:
            logCode(code)
            sys.exit()

    def storeCode(code):
        file_lock.acquire()
        try:
            file.write(code+'\n')
        finally:
            file_lock.release()

    def logCode(code):
        log_lock.acquire()
        try:
            log.write(code+'\n')
        finally:
            log_lock.release()

    #def run(self):
    #    global bots
    #    global url
    #    global file
    #    global log
    #    global code_lock
    #    global file_lock
    #    global log_lock
    while(not terminate):
        code = getCode()
        if(checkCode(code)):
            storeCode(code)
            logCode(code)


def main(code = 0, threads = 16):
    #bots = [threading.Thread(target=bot) for bot in range(threads)]
    bots = []
    url = 'https://test.ing/codes/'
    file = open("valid-codes.txt", "a")
    log = open("log.txt", "a")
    code_lock = threading.Lock()
    file_lock = threading.Lock()
    log_lock = threading.Lock()

    def signal_handler(signal, frame):
        print('Exiting...\n')
        log_lock.acquire()
        try:
            log.write("\n\n"+str(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))+"\n")
        finally:
            log_lock.release()
        for bot in bots:
            bot.terminate = True
        for bot in bots:
            bot.join()
        sys.exit(0)

    #for bot in bots:
    #    bot.start()
    for i in range(threads):
        t = bot()
        bots.append(t)
        t.start()
    signal.signal(signal.SIGINT, signal_handler)
    while True:
        signal.pause()


main(736479509787350, 1)
With this code I get this error:
Traceback (most recent call last):
  File "bot.py", line 7, in <module>
    class bot(threading.Thread):
  File "bot.py", line 59, in bot
    code = getCode()
  File "bot.py", line 14, in getCode
    code_lock.acquire()
NameError: name 'code_lock' is not defined
I don't know if I should override the run(self) method of bot, but when I tried that, it never actually ran the run method, and I also received the same error from all the threads created: that int is not callable (and I can't see where I could possibly be calling an int).
Additionally, I don't know if I am handling the keyboard exit signal correctly; as you can see, I am trying to deal with that using a terminate variable, but I don't think that this is the problem...
One last thing: the ConnectionError exception is not handled appropriately, as it says "Retrying..." but in fact will not retry; I am aware of that and it should be OK, I'll fix it later.
It's worth mentioning that I'm not very used to dealing with multi-threading, and when I do, it's in C or C++.
Edit
I can make the code work by using global variables, but I would prefer to avoid globals. My attempts at passing the variables directly to the instances of the bot class, or at passing a data object to it, have not been successful so far: whenever I pass the variables or the auxiliary object to bot, I am unable to access them as attributes using self., and without self. Python claims that the variable is not defined.
Here is the updated code, which still does not work:
import requests
import sys
import threading
import signal
import time


class Shared:
    def __init__(self, code, url, file, log, code_lock, file_lock, log_lock):
        self.code = code
        self.url = url
        self.file = file
        self.log = log
        self.code_lock = code_lock
        self.file_lock = file_lock
        self.log_lock = log_lock


class bot(threading.Thread):
    def __init__(self, data):
        threading.Thread.__init__(self)
        self.terminate = False
        self.data = data

    #classmethod
    def getCode(self):
        self.data.code_lock.acquire()
        work_code = self.data.code
        try:
            self.data.code += 1
        finally:
            self.data.code_lock.release()
        return work_code

    #classmethod
    def checkCode(self, work_code):
        try:
            #if(code % 1000000 == 0):
            print("Code "+str(work_code)+" is being checked...\n")
            html = requests.get(self.data.url+str(work_code))
            html.encoding = 'utf-8'
            return not 'Page Not Found' in html.text
        except requests.exceptions.ConnectionError:
            print("Connection Error! Retrying...\n")
            time.sleep(0.5)
        except KeyboardInterrupt:
            self.logCode(work_code)
            sys.exit()

    #classmethod
    def storeCode(self, work_code):
        self.data.file_lock.acquire()
        try:
            self.data.file.write(work_code+'\n')
        finally:
            self.data.file_lock.release()

    #classmethod
    def logCode(self, work_code):
        self.data.log_lock.acquire()
        try:
            self.data.log.write(work_code+'\n')
        finally:
            self.data.log_lock.release()

    #classmethod
    def run(self):
        while(not self.terminate):
            work_code = self.getCode()
            if(self.checkCode(work_code)):
                self.storeCode(work_code)
                self.logCode(work_code)


def main(code = 0, threads = 16):
    #bots = [threading.Thread(target=bot) for bot in range(threads)]
    bots = []
    url = 'https://www.test.ing/codes/'
    file = open("valid-codes.txt", "a")
    log = open("log.txt", "a")
    code_lock = threading.Lock()
    file_lock = threading.Lock()
    log_lock = threading.Lock()
    data = Shared(code, url, file, log, code_lock, file_lock, log_lock)

    def signal_handler(signal, frame):
        print('Exiting...\n')
        log_lock.acquire()
        try:
            log.write("\n\n"+str(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))+"\n")
        finally:
            log_lock.release()
        for bot in bots:
            bot.terminate = True
        for bot in bots:
            bot.join()
        sys.exit(0)

    #for bot in bots:
    #    bot.start()
    for i in range(threads):
        t = bot(data)
        bots.append(t)
        t.start()
    signal.signal(signal.SIGINT, signal_handler)
    while True:
        signal.pause()


main(736479509787350, 4)
Finally, here is the working code with global variables:
import requests
import sys
import threading
import signal
import time

code = 736479509787350
url = 'https://www.test.ing/codes/'
file = open("valid-codes.txt", "a")
log = open("log.txt", "a")
code_lock = threading.Lock()
file_lock = threading.Lock()
log_lock = threading.Lock()
terminate = False


class bot(threading.Thread):
    def __init__(self):
        threading.Thread.__init__(self)

    #classmethod
    def getCode(self):
        global code
        code_lock.acquire()
        work_code = code
        try:
            code += 1
        finally:
            code_lock.release()
        return work_code

    #classmethod
    def checkCode(self, work_code):
        try:
            if(code % 1000000 == 0):
                print("Code "+str(work_code)+" is being checked...\n")
            html = requests.get(url+str(work_code))
            html.encoding = 'utf-8'
            if(not 'Page Not Found' in html.text):
                time.sleep(0.5)
                html = requests.get(url+str(work_code)+":999999999")
                html.encoding = 'utf-8'
                return 'Page Not Found' in html.text
        except requests.exceptions.ConnectionError:
            #print("Connection Error! Retrying...\n")
            time.sleep(1)
            return self.checkCode(work_code)
        except KeyboardInterrupt:
            self.logCode(work_code)
            sys.exit()

    #classmethod
    def storeCode(self, work_code):
        global file_lock
        global file
        file_lock.acquire()
        try:
            file.write(str(work_code)+'\n')
        finally:
            file_lock.release()

    #classmethod
    def logCode(self, work_code):
        global log_lock
        global log
        log_lock.acquire()
        try:
            log.write(str(work_code)+'\n')
        finally:
            log_lock.release()

    #classmethod
    def run(self):
        global terminate
        while(not terminate):
            work_code = self.getCode()
            if(self.checkCode(work_code)):
                print("Code "+str(work_code)+" is a valid code!\n")
                self.storeCode(work_code)
                self.logCode(work_code)


def main(threads = 16):
    #bots = [threading.Thread(target=bot) for bot in range(threads)]
    bots = []
    #url = 'https://www.facebook.com/leticia.m.demenezes/posts/'
    #file = open("valid-codes.txt", "a")
    #log = open("log.txt", "a")
    #code_lock = threading.Lock()
    #file_lock = threading.Lock()
    #log_lock = threading.Lock()

    def signal_handler(signal, frame):
        global terminate
        print('Exiting...\n')
        log_lock.acquire()
        try:
            log.write("\n\n"+str(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))+"\n")
        finally:
            log_lock.release()
        terminate = True
        for bot in bots:
            bot.join()
        sys.exit(0)

    #for bot in bots:
    #    bot.start()
    for i in range(threads):
        t = bot()
        bots.append(t)
        t.start()
    signal.signal(signal.SIGINT, signal_handler)
    while True:
        signal.pause()


main()
You could make code_lock global as you're trying to do, but why not just pass it into each bot instance?
t = bot(code_lock)
Next create a constructor for your class:
class bot(threading.Thread):
    def __init__(self, code_lock):
        threading.Thread.__init__(self)
        self.code_lock = code_lock
Now, whenever you try to use code_lock inside your bot class, always prefix it with self (self.code_lock).
If you really insist on using global variables, then look into the global keyword.
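To make that concrete, here is a minimal, self-contained sketch of the pattern: shared objects are passed into each thread's constructor and accessed through self. (the counter list and the busy loop are placeholders, not part of the original bot):
import threading


class bot(threading.Thread):
    def __init__(self, code_lock, counter):
        threading.Thread.__init__(self)
        self.code_lock = code_lock   # shared lock, passed in
        self.counter = counter       # shared mutable state, passed in

    def run(self):
        for _ in range(1000):
            with self.code_lock:     # always reach shared state through self.
                self.counter[0] += 1


code_lock = threading.Lock()
counter = [0]                        # one-element list as cheap shared storage
bots = [bot(code_lock, counter) for _ in range(4)]
for b in bots:
    b.start()
for b in bots:
    b.join()
print(counter[0])                    # 4000: every increment was lock-protected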
It's clear that you are trying to access code_lock outside of its scope; maybe you can follow @MartinKonecny's suggestion to fix that.
I can see that even after fixing the code_lock problem, your code has a lot of problems. As soon as you fix the code_lock problem, you'll face a similar issue with the variable code in the same function, getCode.
After fixing all those compile-time issues, you'll face issues with your whole class implementation; this is not the way to implement Python classes.
It's better that you go through this to learn more about Python namespaces and classes.

Python Queue and Threading Module - Impose a extra custom lock?

Using Linux and Python 2.7.6, I have a script that uploads lots of files at one time. I am using multi-threading with the Queue and Threading modules.
I have an object that keeps track of the files that have been successfully uploaded and decrements a counter after each successful upload. I need to make this operation atomic/thread-safe. Since the Queue module is high level and has its own mutex at the lower level, can I impose my own lock/acquire in addition to it? I tried doing this and had no errors (at the bottom of the last code block, where file_quantity.deduct() is). But I am not sure if it is truly working as it should. Here is the shortened version for readability:
class FileQuantity(object):
    """Keeps track of files that have been uploaded and how many are left"""
    def __init__(self, file_quantity):
        self.quantity = file_quantity
        self.total = file_quantity

    def deduct(self):
        self.quantity -= 1


kill_received = False
lock = threading.Lock()


class CustomQueue(Queue.Queue):
    # Can not use .join() because it would block any processing
    # for SIGINT until threads are done. To counter this,
    # wait() is given a timeout along with while not kill_received
    # to be checked
    def join(self):
        self.all_tasks_done.acquire()
        try:
            while not kill_received and self.unfinished_tasks:
                self.all_tasks_done.wait(10.0)
        finally:
            self.all_tasks_done.release()


def do_the_uploads(file_list, file_quantity,
                   retry_list, authenticate):
    """The uploading engine"""
    value = raw_input(
        "\nPlease enter how many concurrent "
        "uploads you want at one time (example: 200)> ")
    value = int(value)
    logger.info('{} concurrent uploads will be used.'.format(value))
    confirm = raw_input(
        "\nProceed to upload files? Enter [Y/y] for yes: ").upper()
    if confirm == "Y":
        kill_received = False
        sys.stdout.write("\x1b[2J\x1b[H")
        q = CustomQueue()

        def worker():
            global kill_received
            while not kill_received:
                item = q.get()
                upload_file(item, file_quantity, retry_list, authenticate, q)
                q.task_done()

        for i in range(value):
            t = Thread(target=worker)
            t.setDaemon(True)
            t.start()

        for item in file_list:
            q.put(item)
        q.join()

        print "Finished. Cleaning up processes...",
        # Allowing the threads to cleanup
        time.sleep(4)
        print "done."


def upload_file(file_obj, file_quantity, retry_list, authenticate, q):
    """Uploads a file. One file per its own thread. No batch style. This way if one upload
    fails no others are affected."""
    absolute_path_filename, filename, dir_name, token, url = file_obj
    url = url + dir_name + '/' + filename
    try:
        with open(absolute_path_filename) as f:
            r = requests.put(url, data=f, headers=header_collection, timeout=20)
    except requests.exceptions.ConnectionError as e:
        pass
    if src_md5 == r.headers['etag']:
        lock.acquire()
        file_quantity.deduct()
        lock.release()
Well, the code you posted doesn't define lock anywhere, so hard to say for sure. It would be more common to protect the code that actually needs protecting:
def deduct(self):
    with lock:
        self.quantity -= 1
Sanest is to allocate a lock in the structure that needs it, like so:
class FileQuantity(object):
    """Keeps track of files that have been uploaded and how many are left"""
    def __init__(self, file_quantity):
        self.quantity = file_quantity
        self.total = file_quantity
        self.lock = threading.Lock()

    def deduct(self):
        with self.lock:
            self.quantity -= 1
and use self.lock similarly for any other mutations of FileQuantity data members that may be invoked simultaneously by multiple threads.
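As a quick sanity check (not part of the original answer), you can hammer deduct() from several threads and the counter stays consistent, because every decrement holds the lock:
import threading


class FileQuantity(object):
    """Keeps track of files that have been uploaded and how many are left"""
    def __init__(self, file_quantity):
        self.quantity = file_quantity
        self.total = file_quantity
        self.lock = threading.Lock()

    def deduct(self):
        with self.lock:
            self.quantity -= 1


def hammer(fq, n):
    for _ in range(n):
        fq.deduct()


fq = FileQuantity(10000)
threads = [threading.Thread(target=hammer, args=(fq, 1000)) for _ in range(10)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print(fq.quantity)  # 0 on every run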

Multithreaded proxy change, but only once

Let's say I have code like this:
def func1(a, b, c):
    try:
        p = pycurl.Curl()
        p.setopt(pycurl.PROXY, "127.0.0.1")
        p.setopt(pycurl.PROXYPORT, 9050)
        p.setopt(pycurl.PROXYTYPE, pycurl.PROXYTYPE_SOCKS5)
        p.perform()
        p.close()
    except pycurl.error as error:
        if error[0] == 28:  # timeout - change proxy
            print "Tor timeout, need to change"
            queue.put((a, b, c))
            new_tor()
            return


def new_tor():
    # send_signal_for_new_ident_is_here
I start this code in 7 threads.
And when a thread receives error 28, it changes the identity.
But what happens is that ALL 7 THREADS send the signal to change the identity.
How to do this:
If a thread receives error 28, it should call new_tor() while the other 6 threads don't; instead they wait for the result and only then proceed with their work. How do I synchronize this?
Just put an error "id" into the queue and if you encounter it, put the value back into the queue, and then handle as needed.
You may not wish to end the thread, which is what I did in the example below.
So, you can give each thread a unique identifier, so that once a thread encounters an error it also records its identifier to say it has seen this error before; once all threads have encountered the error, it is removed from the queue.
Code:
import threading
import Queue

y = 0


def f1():
    global y
    y += 1
    if y > 100:
        raise ValueError('trial')


def f2():
    return


class Test(threading.Thread):
    def __init__(self, func, name):
        threading.Thread.__init__(self)
        self.func = func
        self.name = name

    def run(self):
        while True:
            x = ''
            if not queue.empty():
                x = queue.get()
                if x == 'error':
                    queue.put(x)
                    print 'Stopping %s' % (self.name,)
                    return
            try:
                self.func()
            except Exception as e:
                queue.put('error')


queue = Queue.Queue()
thread1 = Test(f1, '1')
thread2 = Test(f2, '2')
thread1.start()
thread2.start()
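The question itself asks for something slightly different: one thread should renew the Tor identity while the other six wait and then carry on. A common way to coordinate that (this is a sketch of an alternative, not the queue-based answer above; new_tor() stands in for the question's placeholder) is a lock plus an epoch counter, so only the first thread that saw the timeout performs the renewal and late arrivals notice it already happened:
import threading
import time

tor_lock = threading.Lock()
tor_epoch = 0                        # bumped each time the identity is renewed


def new_tor():
    # Placeholder for "send_signal_for_new_ident_is_here" from the question.
    print("renewing Tor identity")
    time.sleep(1)


def renew_tor_once(seen_epoch):
    """Renew the identity unless another thread already did it for this epoch."""
    global tor_epoch
    with tor_lock:                   # the other threads block here until renewal is done
        if tor_epoch == seen_epoch:  # nobody has renewed since we saw the timeout
            new_tor()
            tor_epoch += 1
        return tor_epoch


def worker(i):
    epoch = tor_epoch
    # ... perform the pycurl request here; on error 28 (timeout) do:
    renew_tor_once(epoch)


threads = [threading.Thread(target=worker, args=(i,)) for i in range(7)]
for t in threads:
    t.start()
for t in threads:
    t.join()                         # "renewing Tor identity" prints only once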

python threading in a loop

I have a project that requires a bunch of large matrices, which are stored in ~200 MB files, to be cross-correlated (i.e. FFT * conj(FFT)) with each other. The number of files is such that I can't just load them all up and then do my processing. On the other hand, reading in each file as I need it is slower than I'd like.
What I have so far is something like:
result = 0
for i in xrange(N_files):
    f1 = file_reader(file_list[i])
    ############################################################################
    # here I want to have file_reader go start reading the next file I'll need #
    ############################################################################
    in_place_processing(f1)
    for j in xrange(i+1, N_files):
        f2 = file_reader(file_list[j])
        ##################################################################
        # here I want to have file_reader go start reading the next file #
        ##################################################################
        in_place_processing(f2)
        result += processing_function(f1, f2)
So basically, I just want to have two threads that will each read a file, give it to me when I ask for it (or as soon as it's done after I ask for it), and then go start reading the next file for when I ask for it. The object the file_reader returns is rather large and complicated, so I'm not sure if multiprocessing is the way to go here...
I've read about threading and queues but can't seem to figure out the part where I ask the thread to go read the file and can proceed with the program while it does. I don't want the threads to simply go about their business in the background -- am I missing a detail here, or is threading not the way to go?
Below is an example of using the multiprocessing module that will spawn off child processes to call your file_reader method and queue up their results. The queue blocks when full, so you can control the number of read-aheads you'd like to perform with the QUEUE_SIZE constant.
This uses a standard producer/consumer model of multiprocess communication, with the child processes acting as producers and the main thread as the consumer. The join method call in the class destructor ensures the child process resources are cleaned up properly. There are some print statements interspersed for demonstration purposes.
Additionally, I added the ability for the QueuedFileReader class to offload work to a worker thread or run in the main thread, rather than using a child process, for comparison. This is done by setting the mode parameter at class initialization to MODE_THREADS or MODE_SYNCHRONOUS, respectively.
import multiprocessing as mp
import Queue
import threading
import time

QUEUE_SIZE = 2  # buffer size of queue

## Placeholder for your functions and variables
N_files = 10
file_list = ['file %d' % i for i in range(N_files)]


def file_reader(filename):
    time.sleep(.1)
    result = (filename, 'processed')
    return result


def in_place_processing(f):
    time.sleep(.2)


def processing_function(f1, f2):
    print f1, f2
    return id(f1) & id(f2)


MODE_SYNCHRONOUS = 0  # file_reader called in main thread synchronously
MODE_THREADS = 1      # file_reader executed in worker thread
MODE_PROCESS = 2      # file_reader executed in child process


##################################################
## Class to encapsulate multiprocessing objects.
class QueuedFileReader():
    def __init__(self, idlist, mode=MODE_PROCESS):
        self.mode = mode
        self.idlist = idlist
        if mode == MODE_PROCESS:
            self.queue = mp.Queue(QUEUE_SIZE)
            self.process = mp.Process(target=QueuedFileReader.worker,
                                      args=(self.queue, idlist))
            self.process.start()
        elif mode == MODE_THREADS:
            self.queue = Queue.Queue(QUEUE_SIZE)
            self.thread = threading.Thread(target=QueuedFileReader.worker,
                                           args=(self.queue, idlist))
            self.thread.start()

    @staticmethod
    def worker(queue, idlist):
        for i in idlist:
            queue.put((i, file_reader(file_list[i])))
            print id(queue), 'queued', file_list[i]
        queue.put('done')

    def __iter__(self):
        if self.mode == MODE_SYNCHRONOUS:
            self.index = 0
        return self

    def next(self):
        if self.mode == MODE_SYNCHRONOUS:
            if self.index == len(self.idlist): raise StopIteration
            q = (self.idlist[self.index],
                 file_reader(file_list[self.idlist[self.index]]))
            self.index += 1
        else:
            q = self.queue.get()
            if q == 'done': raise StopIteration
        return q

    def __del__(self):
        if self.mode == MODE_PROCESS:
            self.process.join()
        elif self.mode == MODE_THREADS:
            self.thread.join()


#mode = MODE_PROCESS
mode = MODE_THREADS
#mode = MODE_SYNCHRONOUS

result = 0
for i, f1 in QueuedFileReader(range(N_files), mode):
    in_place_processing(f1)
    for j, f2 in QueuedFileReader(range(i+1, N_files), mode):
        in_place_processing(f2)
        result += processing_function(f1, f2)
If your intermediate values are too large to pass through the Queue, you can execute each iteration of the outer loop in its own process. A handy way to do that would be using the Pool class in multiprocessing as in the example below.
import multiprocessing as mp
import time

## Placeholder for your functions and variables
N_files = 10
file_list = ['file %d' % i for i in range(N_files)]


def file_reader(filename):
    time.sleep(.1)
    result = (filename, 'processed')
    return result


def in_place_processing(f):
    time.sleep(.2)


def processing_function(f1, f2):
    print f1, f2
    return id(f1) & id(f2)


def file_task(file_index):
    print file_index
    f1 = file_reader(file_list[file_index])
    in_place_processing(f1)
    task_result = 0
    for j in range(file_index+1, N_files):
        f2 = file_reader(file_list[j])
        in_place_processing(f2)
        task_result += processing_function(f1, f2)
    return task_result


pool = mp.Pool(processes=None)  # processes defaults to mp.cpu_count()
result = 0
for file_result in pool.map(file_task, range(N_files)):
    result += file_result
print 'result', result
#or simply
#result = sum(pool.map(file_task, range(N_files)))
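If the per-task results are themselves large, one possible variation (an assumption on my part, not from the answer) is to consume them lazily with imap_unordered, summing each result as it arrives instead of collecting them all in a list first:
# Streaming variation of the last few lines above.
result = sum(pool.imap_unordered(file_task, range(N_files)))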
