I'm trying to run my code with a multiprocessing function but mongo keep returning
"MongoClient opened before fork. Create MongoClient with
connect=False, or create client after forking."
I really doesn't understand how i can adapt my code to this.
Basically the structure is:
db = MongoClient().database
db.authenticate('user', 'password', mechanism='SCRAM-SHA-1')
collectionW = db['words']
collectionT = db['sinMemo']
collectionL = db['sinLogic']
def findW(word):
rows = collectionw.find({"word": word})
ind = 0
for row in rows:
ind += 1
id = row["_id"]
if ind == 0:
a = ind
else:
a = id
return a
def trainAI(stri):
...
if findW(word) == 0:
_id = db['words'].insert(
{"_id": getNextSequence(db.counters, "nodeid"), "word": word})
story = _id
else:
story = findW(word)
...
def train(index):
# searching progress
progFile = "./train/progress{0}.txt".format(index)
trainFile = "./train/small_file_{0}".format(index)
if os.path.exists(progFile):
f = open(progFile, "r")
ind = f.read().strip()
if ind != "":
pprint(ind)
i = int(ind)
else:
pprint("No progress saved or progress lost!")
i = 0
f.close()
else:
i = 0
#get the number of line of the file
rangeC = rawbigcount(trainFile)
#fix unicode
non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)
files = io.open(trainFile, "r", encoding="utf8")
str1 = ""
str2 = ""
filex = open(progFile, "w")
with progressbar.ProgressBar(max_value=rangeC) as bar:
for line in files:
line = line.replace("\n", "")
if i % 2 == 0:
str1 = line.translate(non_bmp_map)
else:
str2 = line.translate(non_bmp_map)
bar.update(i)
trainAI(str1 + " " + str2)
filex.seek(0)
filex.truncate()
filex.write(str(i))
i += 1
#multiprocessing function
maxProcess = 3
def f(l, i):
l.acquire()
train(i + 1)
l.release()
if __name__ == '__main__':
lock = Lock()
for num in range(maxProcess):
pprint("start " + str(num))
Process(target=f, args=(lock, num)).start()
This code is made for reading 4 different file in 4 different process and at the same time insert the data in the database.
I copied only part of the code for make you understand the structure of it.
I've tried to add connect=False to this code but nothing...
db = MongoClient(connect=False).database
db.authenticate('user', 'password', mechanism='SCRAM-SHA-1')
collectionW = db['words']
collectionT = db['sinMemo']
collectionL = db['sinLogic']
then i've tried to move it in the f function (right before train() but what i get is that the program doesn't find collectionW,collectionT and collectionL.
I'm not very expert of python or mongodb so i hope that this is not a silly question.
The code is running under Ubuntu 16.04.2 with python 2.7.12
db.authenticate will have to connect to mongo server and it will try to make a connection. So, even though connect=False is being used, db.authenticate will require a connection to be open.
Why don't you create the mongo client instance after fork? That's look like the easiest solution.
Since db.authenticate must open the MongoClient and connect to the server, it creates connections which won't work in the forked subprocess. Hence, the error message. Try this instead:
db = MongoClient('mongodb://user:password#localhost', connect=False).database
Also, delete the Lock l. Acquiring a lock in one subprocess has no effect on other subprocesses.
Here is how I did it for my problem:
import pathos.pools as pp
import time
import db_access
class MultiprocessingTest(object):
def __init__(self):
pass
def test_mp(self):
data = [[form,'form_number','client_id'] for form in range(5000)]
pool = pp.ProcessPool(4)
pool.map(db_access.insertData, data)
if __name__ == '__main__':
time_i = time.time()
mp = MultiprocessingTest()
mp.test_mp()
time_f = time.time()
print 'Time Taken: ', time_f - time_i
Here is db_access.py:
from pymongo import MongoClient
def insertData(form):
client = MongoClient()
db = client['TEST_001']
db.initialization.insert({
"form": form[0],
"form_number": form[1],
"client_id": form[2]
})
This is happening to your code because you are initiating MongoCLient() once for all the sub-processes. MongoClient is not fork safe. So, initiating inside each function works and let me know if there are other solutions.
Related
I'm a hobby coder started with AHK, then some java and now I try to learn Python. I have searched and found some tips but I have yet not been able to implement it into my own code.
Hopefully someone here can help me, it's a very short program.
I'm using .txt csv database with ";" as a separator.
DATABASE EXAMPLE:
Which color is normally a cat?;Black
How tall was the longest man on earth?;272 cm
Is the earth round?;Yes
The database now consists of 20.000 lines which makes the program "to slow", only using 25% CPU (1 core).
If I can make it use all 4 cores (100%) I guess it would perform the task alot faster. The task is basically to compare the CLIPBOARD with the database and if there is a match, it should give me an answer as a return. Perhaps also I can separate the database into 4 pieces?
The code right now looks like this! Not more then 65 lines and its doing its job (but to slow). Advice on how I can make this process into multi core needed.
import time
import pyperclip as pp
import pandas as pd
import pymsgbox as pmb
from fuzzywuzzy import fuzz
import numpy
ratio_threshold = 90
fall_back_time = 1
db_file_path = 'database.txt'
db_separator = ';'
db_encoding = 'latin-1'
def load_db():
while True:
try:
# Read and create database
db = pd.read_csv(db_file_path, sep=db_separator, encoding=db_encoding)
db = db.drop_duplicates()
return db
except:
print("Error in load_db(). Will sleep for %i seconds..." % fall_back_time)
time.sleep(fall_back_time)
def top_answers(db, question):
db['ratio'] = db['question'].apply(lambda q: fuzz.ratio(q, question))
db_sorted = db.sort_values(by='ratio', ascending=False)
db_sorted = db_sorted[db_sorted['ratio'] >= ratio_threshold]
return db_sorted
def write_txt(top):
result = top.apply(lambda row: "%s" % (row['answer']), axis=1).tolist()
result = '\n'.join(result)
fileHandle = open("svar.txt", "w")
fileHandle.write(result)
fileHandle.close()
pp.copy("")
def main():
try:
db = load_db()
last_db_reload = time.time()
while True:
# Get contents of clipboard
question = pp.paste()
# Rank answer
top = top_answers(db, question)
# If answer was found, show results
if len(top) > 0:
write_txt(top)
time.sleep(fall_back_time)
except:
print("Error in main(). Will sleep for %i seconds..." % fall_back_time)
time.sleep(fall_back_time)
if name == 'main':
main()'
If you could divide the db into four equally large you could process them in parallel like this:
import time
import pyperclip as pp
import pandas as pd
import pymsgbox as pmb
from fuzzywuzzy import fuzz
import numpy
import threading
ratio_threshold = 90
fall_back_time = 1
db_file_path = 'database.txt'
db_separator = ';'
db_encoding = 'latin-1'
def worker(thread_id, question):
thread_id = str(thread_id)
db = pd.read_csv(db_file_path + thread_id, sep=db_separator, encoding=db_encoding)
db = db.drop_duplicates()
db['ratio'] = db['question'].apply(lambda q: fuzz.ratio(q, question))
db_sorted = db.sort_values(by='ratio', ascending=False)
db_sorted = db_sorted[db_sorted['ratio'] >= ratio_threshold]
top = db_sorted
result = top.apply(lambda row: "%s" % (row['answer']), axis=1).tolist()
result = '\n'.join(result)
fileHandle = open("svar" + thread_id + ".txt", "w")
fileHandle.write(result)
fileHandle.close()
pp.copy("")
return
def main():
question = pp.paste()
for i in range(1, 4):
t = threading.Thread(target=worker, args=(i, question))
t.start()
t.join()
if name == 'main':
main()
The solution with multiprocessing:
import time
import pyperclip as pp
import pandas as pd
#import pymsgbox as pmb
from fuzzywuzzy import fuzz
import numpy as np
# pathos uses better pickle to tranfer more complicated objects
from pathos.multiprocessing import Pool
from functools import reduce
import sys
import os
from contextlib import closing
ratio_threshold = 70
fall_back_time = 1
db_file_path = 'database.txt'
db_separator = ';'
db_encoding = 'latin-1'
chunked_db = []
NUM_PROCESSES = os.cpu_count()
def load_db():
while True:
try:
# Read and create database
db = pd.read_csv(db_file_path, sep=db_separator, encoding=db_encoding)
db.columns = ['question', 'answer']
#db = db.drop_duplicates() # i drop it for experiment
break
except:
print("Error in load_db(). Will sleep for %i seconds..." % fall_back_time)
time.sleep(fall_back_time)
# split database into equal chunks:
# (if you have a lot of RAM, otherwise you
# need to compute ranges in db, something like
# chunk_size = len(db)//NUM_PROCESSES
# ranges[i] = (i*chunk_size, (i+1)*cjunk_size)
# and pass ranges in original db to processes
chunked_db = np.split(db, [NUM_PROCESSES], axis=0)
return chunked_db
def top_answers_multiprocessed(question, chunked_db):
# on unix, python uses 'fork' mode by default
# so the process has 'copy-on-change' access to all global variables
# i.e. if process will change something in db, it will be copied to it
# with a lot of overhead
# Unfortunately, I'fe heard that on Windows only 'spawn' mode with full
# copy of everything is used
# Process pipeline uses pickle, it's quite slow.
# so on small database you may not have benefit from multiprocessing
# If you are going to transfer big objects in or out, look
# in the direction of multiprocessing.Array
# this solution is not fully efficient,
# as pool is recreated each time
# You can create daemon processes which will monitor
# Queue for incoming questions, but it's harder to implement
def top_answers(idx):
# question is in the scope of parent function,
chunked_db[idx]['ratio'] = chunked_db[idx]['question'].apply(lambda q: fuzz.ratio(q, question))
db_sorted = chunked_db[idx].sort_values(by='ratio', ascending=False)
db_sorted = db_sorted[db_sorted['ratio'] >= ratio_threshold]
return db_sorted
with closing(Pool(processes=NUM_PROCESSES)) as pool:
# chunked_db is a list of databases
# they are in global scope, we send only index beacause
# all the data set is pickled
num_chunks = len(chunked_db)
# apply function top_answers across generator range(num_chunks)
res = pool.imap_unordered(top_answers, range(num_chunks))
res = list(res)
# now res is list of dataframes, let's join it
res_final = reduce(lambda left,right: pd.merge(left,right,on='ratio'), res)
return res_final
def write_txt(top):
result = top.apply(lambda row: "%s" % (row['answer']), axis=1).tolist()
result = '\n'.join(result)
fileHandle = open("svar.txt", "w")
fileHandle.write(result)
fileHandle.close()
pp.copy("")
def mainfunc():
global chunked_db
chunked_db = load_db()
last_db_reload = time.time()
print('db loaded')
last_clip = ""
while True:
# Get contents of clipboard
try:
new_clip = pp.paste()
except:
continue
if (new_clip != last_clip) and (len(new_clip)> 0):
print(new_clip)
last_clip = new_clip
question = new_clip.strip()
else:
continue
# Rank answer
top = top_answers_multiprocessed(question, chunked_db)
# If answer was found, show results
if len(top) > 0:
#write_txt(top)
print(top)
if __name__ == '__main__':
mainfunc()
I'm trying to transfer ca. 10GB of json data (tweets in my case) to a collection in arangodb. I'm also trying to use joblib for it:
from ArangoConn import ArangoConn
import Userdata as U
import encodings
from joblib import Parallel,delayed
import json
from glob import glob
import time
def progress(total, prog, start, stri = ""):
if(prog == 0):
print("")
prog = 1;
perc = prog / total
diff = time.time() - start
rem = (diff / prog) * (total - prog)
bar = ""
for i in range(0,int(perc*20)):
bar = bar + "|"
for i in range(int(perc*20),20):
bar = bar + " "
print("\r"+"progress: " + "[" + bar + "] " + str(prog) + " of " +
str(total) + ": {0:.1f}% ".format(perc * 100) + "- " +
time.strftime("%H:%M:%S", time.gmtime(rem)) + " " + stri, end="")
def processfile(filepath):
file = open(filepath,encoding='utf-8')
s = file.read()
file.close()
data = json.loads(s)
Parallel(n_jobs=12, verbose=0, backend="threading"
(map(delayed(ArangoConn.createDocFromObject), data))
files = glob(U.path+'/*.json')
i = 1
j = len(files)
starttime = time.time()
for f in files:
progress(j,i,starttime,f)
i = i+1
processfile(f)
and
from pyArango.connection import Connection
import Userdata as U
import time
class ArangoConn:
def __init__(self,server,user,pw,db,collectionname):
self.server = server
self.user = user
self.pw = pw
self.db = db
self.collectionname = collectionname
self.connection = None
self.dbHandle = self.connect()
if not self.dbHandle.hasCollection(name=self.collectionname):
coll = self.dbHandle.createCollection(name=collectionname)
else:
coll = self.dbHandle.collections[collectionname]
self.collection = coll
def db_createDocFromObject(self, obj):
data = obj.__dict__()
doc = self.collection.createDocument()
for key,value in data.items():
doc[key] = value
doc._key= str(int(round(time.time() * 1000)))
doc.save()
def connect(self):
self.connection = Connection(arangoURL=self.server + ":8529",
username=self.user, password=self.pw)
if not self.connection.hasDatabase(self.db):
db = self.connection.createDatabase(name=self.db)
else:
db = self.connection.databases.get(self.db)
return db
def disconnect(self):
self.connection.disconnectSession()
def getAllData(self):
docs = []
for doc in self.collection.fetchAll():
docs.append(self.doc_to_result(doc))
return docs
def addData(self,obj):
self.db_createDocFromObject(obj)
def search(self,collection,search,prop):
docs = []
aql = """FOR q IN """+collection+""" FILTER q."""+prop+""" LIKE
"%"""+search+"""%" RETURN q"""
results = self.dbHandle.AQLQuery(aql, rawResults=False, batchSize=1)
for doc in results:
docs.append(self.doc_to_result(doc))
return docs
def doc_to_result(self,arangodoc):
modstore = arangodoc.getStore()
modstore["_key"] = arangodoc._key
return modstore
def db_createDocFromJson(self,json):
for d in json:
doc = self.collection.createDocument()
for key,value in d.items():
doc[key] = value
doc._key = str(int(round(time.time() * 1000)))
doc.save()
#staticmethod
def createDocFromObject(obj):
c = ArangoConn(U.url, U.user, U.pw, U.db, U.collection)
data = obj
doc = c.collection.createDocument()
for key, value in data.items():
doc[key] = value
doc._key = doc["id"]
doc.save()
c.connection.disconnectSession()
It kinda works like that. My problem is that the data that lands in the database is somehow mixed up.
as you can see in the screenshot "id" and "id_str" are not the same - as they should be.
what i investigated so far:
I thought that at some points the default keys in the databese may "collide"
because of the threading so I set the key to the tweet id.
I tried to do it without multiple threads. the threading doesn't seem to be
the problem
I looked at the data I send to the database... everything seems to be fine
But as soon as I communicate with the db the data mixes up.
My professor thought that maybe something in pyarango isn't threadsafe and it messes up the data but I don't think so as threading doesn't seem to be the problem.
I have no ideas left where this behavior could come from...
Any ideas?
The screenshot shows the following values:
id : 892886691937214500
id_str : 892886691937214465
It looks like somewhere along the way the value is converted to an IEEE754 double, which cannot safely represent the latter value. So there is potentially some precision loss due to conversion.
A quick example in node.js (JavaScript is using IEEE754 doubles for any number values greater than 0xffffffff) shows that this is likely the problem cause:
$ node
> 892886691937214500
892886691937214500
> 892886691937214465
892886691937214500
So the question is where the conversion does happen. Can you check whether the python client program is correctly sending the expected values to ArangoDB, or does it already send the converted/truncated values?
In general, any integer number that exceeds 0x7fffffffffffffff will be truncated when stored in ArangoDB, or converted to an IEEE754 double. This can be avoided by storing the number values inside a string, but of course comparing two number strings will produce different results than comparing two numbers (e.g. "10" < "9" vs. 10 > 9).
# -*- coding: utf-8 -*-
from __future__ import print_function
import os, codecs, re, string, mysql
import mysql.connector
'''Reading files with txt extension'''
y_ = ""
for root, dirs, files in os.walk("/Users/Documents/source-document/part1"):
for file in files:
if file.endswith(".txt"):
x_ = codecs.open(os.path.join(root,file),"r", "utf-8-sig")
for lines in x_.readlines():
y_ = y_ + lines
#print(tokenized_docs)
'''Tokenizing sentences of the text files'''
from nltk.tokenize import sent_tokenize
raw_docs = sent_tokenize(y_)
tokenized_docs = [sent_tokenize(y_) for sent in raw_docs]
'''Removing stop words'''
stopword_removed_sentences = []
from nltk.corpus import stopwords
stopset = stopwords.words("English")
for i in tokenized_docs[0]:
tokenized_docs = ' '.join([word for word in i.split() if word not in stopset])
stopword_removed_sentences.append(tokenized_docs)
''' Removing punctuation marks'''
regex = re.compile('[%s]' % re.escape(string.punctuation)) #see documentation here: http://docs.python.org/2/library/string.html
nw = []
for review in stopword_removed_sentences:
new_review = ''
for token in review:
new_token = regex.sub(u'', token)
if not new_token == u'':
new_review += new_token
nw.append(new_review)
'''Lowercasing letters after removing puctuation marks.'''
lw = [] #lw stands for lowercase word.
for i in nw:
k = i.lower()
lw.append(k)
'''Removing number with a dummy symbol'''
nr = []
for j in lw:
string = j
regex = r'[^\[\]]+(?=\])'
# let "#" be the dummy symbol
output = re.sub(regex,'#',string)
nr.append(output)
nrfinal = []
for j in nr:
rem = 0
outr = ''
for i in j:
if ord(i)>= 48 and ord(i)<=57:
rem += 1
if rem == 1:
outr = outr+ '#'
else:
rem = 0
outr = outr+i
nrfinal.append(outr)
'''Inserting into database'''
def connect():
for j in nrfinal:
conn = mysql.connector.connect(user = 'root', password = '', unix_socket = "/tmp/mysql.sock", database = 'Thesis' )
cursor = conn.cursor()
cursor.execute("""INSERT INTO splitted_sentences(sentence_id, splitted_sentences) VALUES(%s, %s)""",(cursor.lastrowid,j))
conn.commit()
conn.close()
if __name__ == '__main__':
connect()
I am not getting any error with this code. It is doing well for the text files. The problem is only the execution time as I have a lot of text files (nearly 6Gb) for which the program is taking too much time. On inspection i found that it is CPU-bound. So to solve it, multiprocessing is needed. Please help me to write my code with multiprocessing module so that parallel processing can be done.
Thank you all.
there's an example in the python docs which demonstrates the use of multiprocessing:
from multiprocessing import Pool
def f(x):
return x*x
if __name__ == '__main__':
with Pool(5) as p:
print(p.map(f, [1, 2, 3]))
You can use this to adapt your code. Once you've obtained the text files, you use the map function to execute the rest in parallel. You'd have to define a function encapsulating the code you want to execute on multiple cores.
However, reading files in parallel may decrease the performance. Also, adding content to the database in asynchronously may not work. So you may want to perform these two tasks in the main thread, still
for testing reasons I start only 1 process. One given argument is an array that shall be changed from that process.
class Engine():
Ready = Value('i', False)
def movelisttoctypemovelist(self, movelist):
ctML = []
for zug in movelist:
ctZug = ctypeZug()
ctZug.VonReihe = zug.VonReihe
ctZug.VonLinie = zug.VonLinie
ctZug.NachReihe = zug.NachReihe
ctZug.NachLinie = zug.NachLinie
ctZug.Bewertung = zug.Bewertung
ctML.append(ctZug)
return ctML
def findbestmove(self, board, settings, enginesettings):
print ("Computer using", multiprocessing.cpu_count(),"Cores.")
movelist = Array(ctypeZug, [], lock = True)
movelist = self.movelisttoctypemovelist(board.movelist)
bd = board.boardtodictionary()
process = []
for i in range(1):
p = Process(target=self.calculatenullmoves, args=(bd, movelist, i, self.Ready))
process.append(p)
p.start()
for p in process:
p.join()
self.printctypemovelist(movelist, settings)
print ("Ready:", self.Ready.value)
def calculatenullmoves(self, boarddictionary, ml, processindex, ready):
currenttime = time()
print ("Process", processindex, "begins to work...")
board = Board()
board.dictionarytoboard(boarddictionary)
...
ml[processindex].Bewertung = 2.4
ready.value = True
print ("Process", processindex, "finished work in", time()-currenttime, "sec")
def printctypemovelist(self, ml):
for zug in ml:
print (zug.VonReihe, zug.VonLinie, zug.NachReihe, zug.NachLinie, zug.Bewertung)
I try to write 2.4 directly in the list, but no changing is shown when calling "printctypemovelist".
I set "Ready" to True and it works.
I used information from http://docs.python.org/2/library/multiprocessing.html#module-multiprocessing.sharedctypes
I hope someone can find my mistake, if it is too difficult to read, please let me know.
The problem is that you're trying to share a plain Python list:
ctML = []
Use a proxy object instead:
from multiprocessing import Manager
ctML = Manager().list()
See Python doc on Sharing state between processes for more detail.
I am filtering huge text files using multiprocessing.py. The code basically opens the text files, works on it, then closes it.
Thing is, I'd like to be able to launch it successively on multiple text files. Hence, I tried to add a loop, but for some reason it doesn't work (while the code works on each file). I believe this is an issue with:
if __name__ == '__main__':
However, I am looking for something else. I tried to create a Launcher and a LauncherCount files like this:
LauncherCount.py:
def setLauncherCount(n):
global LauncherCount
LauncherCount = n
and,
Launcher.py:
import os
import LauncherCount
LauncherCount.setLauncherCount(0)
os.system("OrientedFilterNoLoop.py")
LauncherCount.setLauncherCount(1)
os.system("OrientedFilterNoLoop.py")
...
I import LauncherCount.py, and use LauncherCount.LauncherCount as my loop index.
Of course, this doesn't work too as it edits the variable LauncherCount.LauncherCount locally, so it won't be edited in the imported version of LauncherCount.
Is there any way to edit globally a variable in an imported file? Or, is there any way to do this in any other way? What I need is running a code multiple times, in changing one value, and without using any loop apparently.
Thanks!
Edit: Here is my main code if necessary. Sorry for the bad style ...
import multiprocessing
import config
import time
import LauncherCount
class Filter:
""" Filtering methods """
def __init__(self):
print("launching methods")
# Return the list: [Latitude,Longitude] (elements are floating point numbers)
def LatLong(self,line):
comaCount = []
comaCount.append(line.find(','))
comaCount.append(line.find(',',comaCount[0] + 1))
comaCount.append(line.find(',',comaCount[1] + 1))
Lat = line[comaCount[0] + 1 : comaCount[1]]
Long = line[comaCount[1] + 1 : comaCount[2]]
try:
return [float(Lat) , float(Long)]
except ValueError:
return [0,0]
# Return a boolean:
# - True if the Lat/Long is within the Lat/Long rectangle defined by:
# tupleFilter = (minLat,maxLat,minLong,maxLong)
# - False if not
def LatLongFilter(self,LatLongList , tupleFilter) :
if tupleFilter[0] <= LatLongList[0] <= tupleFilter[1] and
tupleFilter[2] <= LatLongList[1] <= tupleFilter[3]:
return True
else:
return False
def writeLine(self,key,line):
filterDico[key][1].write(line)
def filteringProcess(dico):
myFilter = Filter()
while True:
try:
currentLine = readFile.readline()
except ValueError:
break
if len(currentLine) ==0: # Breaks at the end of the file
break
if len(currentLine) < 35: # Deletes wrong lines (too short)
continue
LatLongList = myFilter.LatLong(currentLine)
for key in dico:
if myFilter.LatLongFilter(LatLongList,dico[key][0]):
myFilter.writeLine(key,currentLine)
###########################################################################
# Main
###########################################################################
# Open read files:
readFile = open(config.readFileList[LauncherCount.LauncherCount][1], 'r')
# Generate writing files:
pathDico = {}
filterDico = config.filterDico
# Create outputs
for key in filterDico:
output_Name = config.readFileList[LauncherCount.LauncherCount][0][:-4]
+ '_' + key +'.log'
pathDico[output_Name] = config.writingFolder + output_Name
filterDico[key] = [filterDico[key],open(pathDico[output_Name],'w')]
p = []
CPUCount = multiprocessing.cpu_count()
CPURange = range(CPUCount)
startingTime = time.localtime()
if __name__ == '__main__':
### Create and start processes:
for i in CPURange:
p.append(multiprocessing.Process(target = filteringProcess ,
args = (filterDico,)))
p[i].start()
### Kill processes:
while True:
if [p[i].is_alive() for i in CPURange] == [False for i in CPURange]:
readFile.close()
for key in config.filterDico:
config.filterDico[key][1].close()
print(key,"is Done!")
endTime = time.localtime()
break
print("Process started at:",startingTime)
print("And ended at:",endTime)
To process groups of files in sequence while working on files within a group in parallel:
#!/usr/bin/env python
from multiprocessing import Pool
def work_on(args):
"""Process a single file."""
i, filename = args
print("working on %s" % (filename,))
return i
def files():
"""Generate input filenames to work on."""
#NOTE: you could read the file list from a file, get it using glob.glob, etc
yield "inputfile1"
yield "inputfile2"
def process_files(pool, filenames):
"""Process filenames using pool of processes.
Wait for results.
"""
for result in pool.imap_unordered(work_on, enumerate(filenames)):
#NOTE: in general the files won't be processed in the original order
print(result)
def main():
p = Pool()
# to do "successive" multiprocessing
for filenames in [files(), ['other', 'bunch', 'of', 'files']]:
process_files(p, filenames)
if __name__=="__main__":
main()
Each process_file() is called in sequence after the previous one has been complete i.e., the files from different calls to process_files() are not processed in parallel.