I have code in which I need to read an Excel file and store its information in dictionaries.
I have to use multiprocessing.Manager() to create the dictionaries in order to be able to retrieve calculation output from a function that I run using multiprocessing.Process.
The problem is that creating a dictionary with multiprocessing.Manager() and manager.dict() takes ~400 times longer than using a plain dict() (and dict() is not a shared-memory structure).
Here is sample code to verify the difference:
import xlrd
import multiprocessing
import time

def DictManager(inp1, inp2):
    manager = multiprocessing.Manager()
    Dict = manager.dict()
    Dict['input1'] = inp1
    Dict['input2'] = inp2
    Dict['Output1'] = None
    Dict['Output2'] = None
    return Dict

def DictNoManager(inp1, inp2):
    Dict = dict()
    Dict['input1'] = inp1
    Dict['input2'] = inp2
    Dict['Output1'] = None
    Dict['Output2'] = None
    return Dict

def ReadFileManager(excelfile):
    DictList = []
    book = xlrd.open_workbook(excelfile)
    sheet = book.sheet_by_index(0)
    line = 2
    for line in range(2, sheet.nrows):
        inp1 = sheet.cell(line, 2).value
        inp2 = sheet.cell(line, 3).value
        dictionary = DictManager(inp1, inp2)
        DictList.append(dictionary)
    print 'Done!'

def ReadFileNoManager(excelfile):
    DictList = []
    book = xlrd.open_workbook(excelfile)
    sheet = book.sheet_by_index(0)
    line = 2
    for line in range(2, sheet.nrows):
        inp1 = sheet.cell(line, 2).value
        inp2 = sheet.cell(line, 3).value
        dictionary = DictNoManager(inp1, inp2)
        DictList.append(dictionary)
    print 'Done!'

if __name__ == '__main__':
    excelfile = 'MyFile.xlsx'

    start = time.time()
    ReadFileNoManager(excelfile)
    end = time.time()
    print 'Run time NoManager:', end - start, 's'

    start = time.time()
    ReadFileManager(excelfile)
    end = time.time()
    print 'Run time Manager:', end - start, 's'
Is there a way to improve the performance of multiprocessing.Manager()?
If the answer is No, is there any other shared memory structure that I can use to replace what I am doing and improve performance?
I would appreciate your help!
EDIT:
My main function uses the following code:
def MyFunction(Dictionary, otherdata):
    # Perform calculation and save results in the dictionary
    Dictionary['Output1'] = Value1
    Dictionary['Output2'] = Value2

ListOfProcesses = []
for Dict in DictList:
    p = multiprocessing.Process(target=MyFunction, args=(Dict, otherdata))
    p.start()
    ListOfProcesses.append(p)
for p in ListOfProcesses:
    p.join()
If I do not use the manager, I will not be able to retrieve the Outputs.
As I mentioned in the comments, I recommend using the main process to read in the Excel file, then using multiprocessing for the function calls. Just add your function's body to apply_function and make sure it returns whatever you want; results will contain a list of your results.
Update: I changed map to starmap to include your extra argument.
import multiprocessing
from itertools import repeat

import xlrd

def ReadFileNoManager(excelfile):
    DictList = []
    book = xlrd.open_workbook(excelfile)
    sheet = book.sheet_by_index(0)
    line = 2
    for line in range(2, sheet.nrows):
        inp1 = sheet.cell(line, 2).value
        inp2 = sheet.cell(line, 3).value
        dictionary = DictNoManager(inp1, inp2)
        DictList.append(dictionary)
    print('Done!')
    return DictList

def apply_function(your_dict, otherdata):
    pass

if __name__ == '__main__':
    excelfile = 'MyFile.xlsx'
    dict_list = ReadFileNoManager(excelfile)
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    # otherdata is whatever extra argument your function needs (from your existing code)
    results = pool.starmap(apply_function, zip(dict_list, repeat(otherdata)))
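For example, apply_function could do the calculation and simply return the finished plain dict, since the Pool collects return values and no shared memory is needed. A rough sketch (Value1, Value2, and otherdata are placeholders for your actual calculation and extra data):

def apply_function(your_dict, otherdata):
    # Placeholder calculation: replace with your real computation.
    Value1 = your_dict['input1']
    Value2 = your_dict['input2']
    your_dict['Output1'] = Value1
    your_dict['Output2'] = Value2
    return your_dict  # returned dicts end up in `results` in the main process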
I have the following function:
def match_keywords(reviews_match, nlu_match, keywords_match):
    for j in range(df_NLU_Reviews.shape[0]):
        if((j%1000)==0):
            print(j)
        keywords = df_NLU_Reviews.Keywords.iloc[j]
        for i in range(len(sentences)):
            try:
                counter=0
                for keyword in keywords:
                    if(keyword in sentences[i]):
                        counter+=1
                if( (len(keywords)) == counter ):
                    reviews_match.append(sentences[i])
                    nlu_match.append(df_NLU_Reviews.NLU_Review.iloc[j])
                    keywords_match.append(df_NLU_Reviews.Keywords.iloc[j])
                    sentences.remove(sentences[i])
                    break
            except Exception as e:
                print(i)
                print(j)
                raise e
    df_match = pd.DataFrame()
    df_match['Reviews'] = reviews_match
    df_match['NLU'] = nlu_match
    df_match['Keywords'] = keywords_match
    df_match.to_pickle("Match_Reviews.pkl")
    return df_match
This function takes three empty lists as arguments that are filled during its execution.
I want to parallelize it using multiprocessing.Pool, but I can't figure out how to do it.
I have tried this:
reviews_match = []
nlu_match = []
keywords_match = []
match_list = [reviews_match, nlu_match, keywords_match]

if __name__ == '__main__':
    with Pool(processes = 12) as pool:
        results = pool.map(match_keywords, zip(reviews_match, nlu_match, keywords_match))
        print(results)
this:
reviews_match = []
nlu_match = []
keywords_match = []
match_list = [reviews_match, nlu_match, keywords_match]

if __name__ == '__main__':
    with Pool(processes = 12) as pool:
        results = pool.map(match_keywords, zip(match_list))
        print(results)
and this too:
reviews_match = []
nlu_match = []
keywords_match = []
match_list = [reviews_match, nlu_match, keywords_match]

if __name__ == '__main__':
    with Pool(processes = 12) as pool:
        results = pool.starmap(match_keywords, zip(reviews_match, nlu_match, keywords_match))
        print(results)
But none of these work; they either throw errors or produce empty lists as output. If I run the function without parallelization like this:
match_keywords(reviews_match, nlu_match, keywords_match)
It works just fine. Could someone please show me the right way of doing this and explain to me why this is not working?
Thank you very much in advance
Your last variant looks correct and will not return an empty list; check it again.
But I don't think you can parallelise it this way, because it is not equivalent to running
match_keywords(reviews_match, nlu_match, keywords_match)
in many processes; it is equivalent to running:
match_keywords(reviews_match[0], nlu_match[0], keywords_match[0])
match_keywords(reviews_match[1], nlu_match[1], keywords_match[1])
match_keywords(reviews_match[2], nlu_match[2], keywords_match[2])
match_keywords(reviews_match[3], nlu_match[3], keywords_match[3])
...
and so on, many times.
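A minimal sketch of that unpacking behaviour (toy data and a toy function, not the review-matching code itself):

from multiprocessing import Pool

def f(a, b, c):
    return a + b + c

if __name__ == '__main__':
    xs, ys, zs = [1, 2, 3], [10, 20, 30], [100, 200, 300]
    with Pool(2) as pool:
        # starmap calls f(1, 10, 100), f(2, 20, 200), f(3, 30, 300):
        # one element from each zipped list per call.
        print(pool.starmap(f, zip(xs, ys, zs)))  # [111, 222, 333]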
I'm trying to build a list of parent/comment pairs from the publicly available Reddit data set.
I have a CSV file which I load into a Pandas dataframe which contains rows of the comments with the parent and child id, as well as the child comment. The data is loaded using the following block of code:
import os
import time
import multiprocessing as mp
import numpy as np
import pandas as pd
sourcePATH = r'C:\'
workingFILE = r'\output-pt1.csv'
# filepaths
input_file = sourcePATH + workingFILE
data_df = pd.read_csv(input_file,header=None,names=['PostIDX','ParentIDX','Comment','Score','Controversiality'])
The aim is to scan through each row in the dataframe and use the parent id to search through the rest of the dataframe to see if there is a parent comment present. If there is, I then store the child and parent comments in a tuple with some other information. This is then added to a list, which will be written out to a csv file at the end. To do this I use the following code:
def checkChildParent(ParentIDX_curr, ChildIDX_curr, ChildComment_curr, ChildScore_curr, ChildCont_curr):
    idx = data_df.loc[data_df['PostIDX'] == ParentIDX_curr]
    if idx.empty is False:
        ParentComment = idx.iloc[0,2]
        ParentScore = idx.iloc[0,3]
        ParentCont = idx.iloc[0,4]
        outPut.put([ParentIDX_curr[0], ParentComment, ParentScore, ParentCont, ChildIDX_curr[0], ChildComment_curr[0], ChildScore_curr[0], ChildCont_curr[0]])

if __name__ == '__main__':
    print('Process started')
    t_start_init = time.time()
    t_start = time.time()

    noCores = 1
    #pool = mp.Pool(processes=noCores)
    update_freq = 100
    n = 1000
    #n = round(len(data_df)/8)

    flag_create = 0
    flag_run = 0
    i = 0

    outPut = mp.Queue()

    #parent_child_df = pd.DataFrame()
    #parent_child_df.coumns = ['PostIDX','ParentIDX']

    while i < n:
        #print(i)
        procs = []
        ParentIDX = []
        ParentComment = []
        ParentScore = []
        ParentCont = []
        ChildIDX = []
        ChildComment = []
        ChildScore = []
        ChildCont = []

        for worker in range(0, noCores):
            ParentIDX.append(data_df.iloc[i,1])
            ChildIDX.append(data_df.iloc[i,0])
            ChildComment.append(data_df.iloc[i,2])
            ChildScore.append(data_df.iloc[i,3])
            ChildCont.append(data_df.iloc[i,4])
            i = i + 1

        #when I call the function this way it returns the expected matches
        #checkChildParent(ParentIDX,ChildIDX,ChildComment,
        #                 ChildScore,ChildCont)

        #when I call the function with Process function nothing appears to be happening
        for proc in range(0, noCores):
            p = mp.Process(target = checkChildParent, args=(ParentIDX[proc], ChildIDX[proc], ChildComment[proc], ChildScore[proc], ChildCont[proc]))
            procs.append(p)
            p.start()

        #for p in procs:
        #    p.join()

        if outPut.empty() is False:
            print(outPut.get())
At the top of the file is a function which scans the dataframe for a given row and returns a tuple of the matched parent and child comments if a match is found. If I call this function normally it works fine, but when I call it using Process it doesn't match anything! I'm guessing it's the form of the arguments being passed to the function that is causing the issue, but I have been trying to debug this all afternoon and have failed so far. If anyone has any suggestions, please let me know!
Thanks!
Let's assume that I have a text file with only two rows, as follows:
File.txt:
100022441 #DavidBartonWB Guarding Constitution
100022441 RT #frankgaffney 2nd Amendment Guy.
The first column is the user id and the second column is the user's tweet. I'd like to read the above text file and update the following dictionary:
d = {'100022441': {'#frankgaffney': 0, '#DavidBartonWB': 0}}
Here is my code:
from multiprocessing import Pool

def f(line):
    data = line.split('\t')
    uid = data[0]
    tweet = data[1]
    if uid in d.keys():
        for gn in d[uid].keys():
            if gn in tweet:
                return uid, gn, 1
            else:
                return uid, gn, 0

p = Pool(4)
with open('~/File.txt') as source_file:
    for uid, gn, r in p.map(f, source_file):
        d[uid][gn] += r
So basically I need to read each line of the file and determine whether the user is in my dictionary, and if it is, whether the tweet contains the user's keys in the dictionary (e.g. '#frankgaffney' and '#DavidBartonWB'). So based on the two lines I wrote above, the code should give:
d = {'100022441': {'#frankgaffney': 1, '#DavidBartonWB': 1}}
But it gives:
d = {'100022441': {'#frankgaffney': 1, '#DavidBartonWB': 0}}
For some reason the code always loses one of the keys for all users. Any idea what is wrong with my code?
Your file is tab delimited, and you are always checking the third column for the mention; it works correctly for the first mention because you are passing in the entire file to the function, not each line. So effectively you are doing this:
>>> s = '100022441\t#DavidBartonWB Guarding Constitution\n100022441\tRT#frankgaffney 2nd Amendment Guy.'
>>> s.split('\t')
['100022441', '#DavidBartonWB Guarding Constitution\n100022441', 'RT#frankgaffney 2nd Amendment Guy.']
I recommend two approaches:
1. Map your function to each line in the file.
2. Use regular expressions for a more robust search.
Try this version:
import re

d = {'100022441': {'#frankgaffney': 0, '#DavidBartonWB': 0}}

e = r'(#\w+)'

def parser(line):
    key, tweet = line.split('\t')
    data = d.get(key)
    if data:
        mentions = re.findall(e, tweet)
        for mention in mentions:
            if mention in data.keys():
                d[key][mention] += 1

with open('~/File.txt') as f:
    for line in f:
        parser(line)

print(d)
Once you've confirmed it's working correctly, you can multiprocess it:
import itertools, re
from multiprocessing import Process, Manager

def parse(queue, d, m):
    while True:
        line = queue.get()
        if line is None:
            return  # we are done with this worker
        key, tweet = line.split('\t')
        data = d.get(key)
        e = r'(#\w+)'
        if data:
            mentions = re.findall(e, tweet)
            for mention in mentions:
                if mention in data:
                    if mention not in m:
                        m[mention] = 1
                    else:
                        m[mention] += 1

if __name__ == '__main__':
    workers = 2
    manager = Manager()
    d = manager.dict()
    d2 = manager.dict()
    d = {'100022441': ['#frankgaffney', '#DavidBartonWB']}
    queue = manager.Queue(workers)
    worker_pool = []
    for i in range(workers):
        p = Process(target=parse, args=(queue, d, d2))
        p.start()
        worker_pool.append(p)

    # Fill the queue with data for the workers
    with open(r'tweets2.txt') as f:
        iters = itertools.chain(f, (None,)*workers)
        for line in iters:
            queue.put(line)

    for p in worker_pool:
        p.join()

    for i, data in d.iteritems():
        print('For ID: {}'.format(i))
        for key in data:
            print('  {} - {}'.format(key, d2[key]))
The second column is data[1], not data[2].
The fact that data[2] works means that you are splitting into words, not columns.
If you want to search for the user key as a separate word (as opposed to a substring), you need tweet = data[1:].
If you want to search for a substring, you need to split into exactly two pieces: uid, tweet = line.split(None, 1).
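A minimal illustration of the last two points (the sample line is made up for demonstration):

# Split into exactly two pieces: the user id and the rest of the line.
line = '100022441\t#DavidBartonWB Guarding Constitution'
uid, tweet = line.split(None, 1)
print(uid)    # '100022441'
print(tweet)  # '#DavidBartonWB Guarding Constitution'

# Substring search vs. separate-word search:
print('#DavidBartonWB' in tweet)          # True (substring)
print('#DavidBartonWB' in tweet.split())  # True (whole word)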
For testing purposes I start only one process. One of the arguments is an array that should be changed from within that process.
class Engine():
    Ready = Value('i', False)

    def movelisttoctypemovelist(self, movelist):
        ctML = []
        for zug in movelist:
            ctZug = ctypeZug()
            ctZug.VonReihe = zug.VonReihe
            ctZug.VonLinie = zug.VonLinie
            ctZug.NachReihe = zug.NachReihe
            ctZug.NachLinie = zug.NachLinie
            ctZug.Bewertung = zug.Bewertung
            ctML.append(ctZug)
        return ctML

    def findbestmove(self, board, settings, enginesettings):
        print ("Computer using", multiprocessing.cpu_count(), "Cores.")
        movelist = Array(ctypeZug, [], lock = True)
        movelist = self.movelisttoctypemovelist(board.movelist)
        bd = board.boardtodictionary()
        process = []
        for i in range(1):
            p = Process(target=self.calculatenullmoves, args=(bd, movelist, i, self.Ready))
            process.append(p)
            p.start()
        for p in process:
            p.join()
        self.printctypemovelist(movelist, settings)
        print ("Ready:", self.Ready.value)

    def calculatenullmoves(self, boarddictionary, ml, processindex, ready):
        currenttime = time()
        print ("Process", processindex, "begins to work...")
        board = Board()
        board.dictionarytoboard(boarddictionary)
        ...
        ml[processindex].Bewertung = 2.4
        ready.value = True
        print ("Process", processindex, "finished work in", time()-currenttime, "sec")

    def printctypemovelist(self, ml):
        for zug in ml:
            print (zug.VonReihe, zug.VonLinie, zug.NachReihe, zug.NachLinie, zug.Bewertung)
I try to write 2.4 directly into the list, but no change is shown when calling "printctypemovelist".
Setting "Ready" to True works, though.
I used information from http://docs.python.org/2/library/multiprocessing.html#module-multiprocessing.sharedctypes
I hope someone can find my mistake; if it is too difficult to read, please let me know.
The problem is that you're trying to share a plain Python list:
ctML = []
Use a proxy object instead:
from multiprocessing import Manager
ctML = Manager().list()
See the Python docs on sharing state between processes for more detail.
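A minimal sketch of the difference (a standalone toy example, not the engine code above): a plain list passed to a Process is copied into the child, so the child's appends are lost, while appends to a Manager().list() proxy are visible back in the parent.

from multiprocessing import Manager, Process

def worker(plain, shared):
    plain.append('changed')   # modifies only the child's copy
    shared.append('changed')  # goes through the manager proxy

if __name__ == '__main__':
    manager = Manager()
    plain = []
    shared = manager.list()
    p = Process(target=worker, args=(plain, shared))
    p.start()
    p.join()
    print(plain)         # []
    print(list(shared))  # ['changed']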
I am filtering huge text files using multiprocessing. The code basically opens a text file, works on it, then closes it.
The thing is, I'd like to be able to launch it successively on multiple text files. Hence, I tried to add a loop, but for some reason it doesn't work (even though the code works on each individual file). I believe this is an issue with:
if __name__ == '__main__':
However, I am looking for something else. I tried to create Launcher and LauncherCount files like this:
LauncherCount.py:
def setLauncherCount(n):
    global LauncherCount
    LauncherCount = n
and,
Launcher.py:
import os
import LauncherCount
LauncherCount.setLauncherCount(0)
os.system("OrientedFilterNoLoop.py")
LauncherCount.setLauncherCount(1)
os.system("OrientedFilterNoLoop.py")
...
I import LauncherCount.py and use LauncherCount.LauncherCount as my loop index.
Of course, this doesn't work either, as it edits the variable LauncherCount.LauncherCount locally, so it won't be edited in the imported version of LauncherCount.
Is there any way to globally edit a variable in an imported file? Or is there any other way to do this? What I need is to run the code multiple times, changing one value each time, and apparently without using any loop.
Thanks!
Edit: Here is my main code if necessary. Sorry for the bad style ...
import multiprocessing
import config
import time
import LauncherCount

class Filter:
    """ Filtering methods """
    def __init__(self):
        print("launching methods")

    # Return the list: [Latitude,Longitude] (elements are floating point numbers)
    def LatLong(self, line):
        comaCount = []
        comaCount.append(line.find(','))
        comaCount.append(line.find(',', comaCount[0] + 1))
        comaCount.append(line.find(',', comaCount[1] + 1))
        Lat = line[comaCount[0] + 1 : comaCount[1]]
        Long = line[comaCount[1] + 1 : comaCount[2]]
        try:
            return [float(Lat), float(Long)]
        except ValueError:
            return [0, 0]

    # Return a boolean:
    # - True if the Lat/Long is within the Lat/Long rectangle defined by:
    #   tupleFilter = (minLat,maxLat,minLong,maxLong)
    # - False if not
    def LatLongFilter(self, LatLongList, tupleFilter):
        if tupleFilter[0] <= LatLongList[0] <= tupleFilter[1] and \
           tupleFilter[2] <= LatLongList[1] <= tupleFilter[3]:
            return True
        else:
            return False

    def writeLine(self, key, line):
        filterDico[key][1].write(line)

def filteringProcess(dico):
    myFilter = Filter()
    while True:
        try:
            currentLine = readFile.readline()
        except ValueError:
            break
        if len(currentLine) == 0:   # Breaks at the end of the file
            break
        if len(currentLine) < 35:   # Deletes wrong lines (too short)
            continue
        LatLongList = myFilter.LatLong(currentLine)
        for key in dico:
            if myFilter.LatLongFilter(LatLongList, dico[key][0]):
                myFilter.writeLine(key, currentLine)

###########################################################################
# Main
###########################################################################

# Open read files:
readFile = open(config.readFileList[LauncherCount.LauncherCount][1], 'r')

# Generate writing files:
pathDico = {}
filterDico = config.filterDico

# Create outputs
for key in filterDico:
    output_Name = config.readFileList[LauncherCount.LauncherCount][0][:-4] \
                  + '_' + key + '.log'
    pathDico[output_Name] = config.writingFolder + output_Name
    filterDico[key] = [filterDico[key], open(pathDico[output_Name], 'w')]

p = []
CPUCount = multiprocessing.cpu_count()
CPURange = range(CPUCount)

startingTime = time.localtime()

if __name__ == '__main__':
    ### Create and start processes:
    for i in CPURange:
        p.append(multiprocessing.Process(target = filteringProcess,
                                         args = (filterDico,)))
        p[i].start()

    ### Kill processes:
    while True:
        if [p[i].is_alive() for i in CPURange] == [False for i in CPURange]:
            readFile.close()
            for key in config.filterDico:
                config.filterDico[key][1].close()
                print(key, "is Done!")
            endTime = time.localtime()
            break

    print("Process started at:", startingTime)
    print("And ended at:", endTime)
To process groups of files in sequence while working on files within a group in parallel:
#!/usr/bin/env python
from multiprocessing import Pool

def work_on(args):
    """Process a single file."""
    i, filename = args
    print("working on %s" % (filename,))
    return i

def files():
    """Generate input filenames to work on."""
    #NOTE: you could read the file list from a file, get it using glob.glob, etc
    yield "inputfile1"
    yield "inputfile2"

def process_files(pool, filenames):
    """Process filenames using pool of processes.

    Wait for results.
    """
    for result in pool.imap_unordered(work_on, enumerate(filenames)):
        #NOTE: in general the files won't be processed in the original order
        print(result)

def main():
    p = Pool()

    # to do "successive" multiprocessing
    for filenames in [files(), ['other', 'bunch', 'of', 'files']]:
        process_files(p, filenames)

if __name__=="__main__":
    main()
Each process_files() call runs in sequence after the previous one has completed, i.e., the files from different calls to process_files() are not processed in parallel.