How can I parallelize parsing in Python? - python

I have the following code, which converts a graph from an edge list to an adjacency matrix:
# fragment of a larger function; I and J start out as empty lists
I, J = [], []
for line in open('graph.txt'):
    converted = [sparse_to_dense.get(int(ID)) for ID in line.split()]
    i = converted[0]
    j = converted[1]
    I.append(i)
    J.append(j)
n = max([max(I), max(J)]) + 1
data = [1]*len(I)
return coo_matrix((data, (I,J)), shape=(n,n), dtype='i1')
This code is awfully slow -- on my machine, converting 500k edges takes hours. On the other hand, I/O is obviously not the bottleneck (I can read the whole file into memory almost instantaneously), so I think there is room for parallelism. But I'm not sure how to proceed: should I read the file in parallel, or do something else?

Use multiprocessing. One way to do it is shown below. I did not check it, and it could be improved further:
import multiprocessing
import Queue  # needed for Queue.Empty (Python 2)
from scipy.sparse import coo_matrix

class Worker(multiprocessing.Process):
    def __init__(self, queue, results):
        multiprocessing.Process.__init__(self)
        self.q = queue
        self.results = results

    def run(self):
        while True:
            try:
                lineno, linecontents = self.q.get(block=False)
            except Queue.Empty:
                break
            converted = [sparse_to_dense.get(int(ID)) for ID in linecontents.split()]
            i = converted[0]
            j = converted[1]
            self.results.put((i, j))

def main():
    q = multiprocessing.Queue()
    results = multiprocessing.JoinableQueue()
    for i, l in enumerate(open(fname)):
        q.put((i, l))
    for _ in xrange(4):
        w = Worker(q, results)
        w.start()
    I, J = [], []
    while True:
        try:
            # note: get(block=False) can raise Empty before the workers
            # have produced anything, so this loop may exit too early
            i, j = results.get(block=False)
        except Queue.Empty:
            break
        I.append(i)
        J.append(j)
        results.task_done()
    results.join()
    n = max([max(I), max(J)]) + 1
    data = [1] * len(I)
    coo = coo_matrix((data, (I, J)), shape=(n, n), dtype='i1')
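As an untested alternative sketch (not part of the answer above): a plain multiprocessing.Pool over the lines avoids the hand-rolled worker class. The parse_line and convert helpers are made up here, and the module-level sparse_to_dense mirrors the question's setup.

from multiprocessing import Pool
from scipy.sparse import coo_matrix

sparse_to_dense = {}  # assumed: the same global mapping as in the question
                      # (with the default fork start method the workers inherit it)

def parse_line(line):
    a, b = line.split()
    return sparse_to_dense.get(int(a)), sparse_to_dense.get(int(b))

def convert(fname, workers=4):
    with open(fname) as f, Pool(workers) as pool:
        pairs = pool.map(parse_line, f, chunksize=10000)
    I, J = zip(*pairs)
    n = max(max(I), max(J)) + 1
    return coo_matrix(([1] * len(I), (I, J)), shape=(n, n), dtype='i1')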

Related

How to return a dictionary from a process in Python?

I want to build an inverted index using multiprocessing to speed it up. My idea is to split the files into groups, have each process build its own inverted index, and then merge all these indexes into one. But I don't know how to return them to the main process that will merge them.
import multiprocessing as mp
from pathlib import Path
import re
import time

class InvertedIndex:
    def __init__(self):
        self.index = dict()

    def createIndex(self, path='data', threads_num=4):
        pathList = list(Path(path).glob('**/*.txt'))
        fileNum = len(pathList)
        oneProcessNum = fileNum / threads_num
        processes = []
        for i in range(threads_num):
            startIndex = int(i * oneProcessNum)
            endIndex = int((i + 1) * oneProcessNum)
            currLi = pathList[startIndex:endIndex]
            p = mp.Process(target=self.oneProcessTask, args=(currLi,))
            processes.append(p)
        [x.start() for x in processes]
        [x.join() for x in processes]

    @staticmethod
    def oneProcessTask(listOfDoc):
        # print(f'Start: {listOfDoc[0]}, end: {listOfDoc[-1]}')  # temp
        tempDict = dict()
        for name in listOfDoc:
            with open(name) as f:
                text = f.read()
                li = re.findall(r'\b\w+\b', text)
                for w in li:
                    if tempDict.get(w) is None:
                        tempDict[w] = set()
                    tempDict[w].add(str(name))

    def getListOfDoc(self, keyWord):
        return self.index[keyWord]

if __name__ == '__main__':
    ii = InvertedIndex()
    start_time = time.time()
    ii.createIndex()
    print("--- %s seconds ---" % (time.time() - start_time))
I used a multiprocessing.Manager to write everything into one dictionary, but that solution was too slow. So I went back to the idea of building a separate inverted index in each process and then merging them. But I don't know how to return all the indexes to one process.
Take a look at concurrent.futures (standard library) with either ThreadPoolExecutor or ProcessPoolExecutor. FYI: I wrote about that here. I did not test this, but it is more or less the gist of what I use all the time.
from concurrent.futures import ThreadPoolExecutor, as_completed

def foo(stuff: int) -> dict:
    return {}

things_to_analyze = [1, 2, 3]
threads = []
results = []
with ThreadPoolExecutor() as executor:
    for thing in things_to_analyze:
        threads.append(executor.submit(foo, thing))
    for job in as_completed(threads):
        results.append(job.result())
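For CPU-bound index building specifically, the ProcessPoolExecutor variant looks much the same. Here is a rough, untested sketch where build_partial_index is a hypothetical stand-in for oneProcessTask; each worker returns its own dict and the parent merges them:

from concurrent.futures import ProcessPoolExecutor, as_completed

def build_partial_index(paths):
    # hypothetical stand-in for oneProcessTask: maps word -> set of file names
    index = {}
    for name in paths:
        with open(name) as f:
            for word in f.read().split():
                index.setdefault(word, set()).add(str(name))
    return index

def build_index(path_groups):
    merged = {}
    with ProcessPoolExecutor() as executor:
        futures = [executor.submit(build_partial_index, group) for group in path_groups]
        for future in as_completed(futures):
            for word, names in future.result().items():  # each worker returns a dict
                merged.setdefault(word, set()).update(names)
    return merged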
I found a solution. I used pool.starmap to return a list of indexes.
My code:
class InvertedIndex:
    def __init__(self):
        self.smallIndexes = None
        self.index = dict()

    def createIndex(self, path='data', threads_num=4):
        pathList = list(Path(path).glob('**/*.txt'))  # recursively walk all text files and collect them into a list
        fileNum = len(pathList)
        oneProcessNum = fileNum / threads_num  # how many files one process should handle
        processes_args = []
        for i in range(threads_num):
            startIndex = int(i * oneProcessNum)
            endIndex = int((i + 1) * oneProcessNum)
            processes_args.append((path, startIndex, endIndex))
        pool = mp.Pool(threads_num)
        self.smallIndexes = pool.starmap(self.oneProcessTask, processes_args)
        self.mergeIndex()

    @staticmethod
    def oneProcessTask(path, startIndex, endIndex):
        pathList = list(Path(path).glob('**/*.txt'))
        listOfDoc = pathList[startIndex:endIndex]
        tempDict = dict()
        for name in listOfDoc:
            with open(name) as f:
                text = f.read()
                li = re.findall(r'\b\w+\b', text)
                for w in li:
                    if tempDict.get(w) is None:
                        tempDict[w] = set()
                    tempDict[w].add(str(name))
        return tempDict
Execution time decreased from 200 seconds (when I used shared memory and manager.dict()) to 0.8 seconds (when I used pool.starmap).
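mergeIndex is called above but not shown. Assuming each small index maps a word to a set of file names (as oneProcessTask builds them), a minimal merge sketch as a method of InvertedIndex could look like this:

def mergeIndex(self):
    # Fold every per-process dictionary into the single self.index.
    for small in self.smallIndexes:
        for word, names in small.items():
            if word in self.index:
                self.index[word].update(names)
            else:
                self.index[word] = set(names)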

Multithreading in Python does not give the expected performance

I am building an application to solve Matrices in Linear Algebra. I have this class:
class Matrix:
    def __init__(self):
        self.elements = []
        self.height = 0
        self.width = 0

    def __add__(self, matrix):
        resultant_matrix = []
        row = []
        for i in range(self.height):
            row.clear()
            for j in range(self.width):
                row.insert(j, self.elements[i][j] + matrix.elements[i][j])
            resultant_matrix.insert(i, row)
        return self.list_to_object(resultant_matrix)

    @staticmethod
    def list_to_object(list_matrix):
        matrix_obj = Matrix()
        matrix_obj.elements = list_matrix
        matrix_obj.height = len(list_matrix)
        matrix_obj.width = len(list_matrix[0])
        return matrix_obj

    @classmethod
    def get_matrix(cls, rows=3, columns=3):
        mat = Matrix()
        for i in range(rows):
            row = []
            for j in range(columns):
                row.append(j)
            mat.elements.append(row)
        mat.height = len(mat.elements)
        mat.width = len(mat.elements[0])
        return mat
My focus is on the __add__ method, which adds two matrices.
I have a method, addition, in the following class that splits the matrices into smaller parts, adds them, and then reassembles them and returns the new matrix.
class ThreadingManager:
    threads = []
    maximum_available_threads = 8

    def do_add(self, slice1, slice2, index, _list):
        m1 = Matrix.list_to_object(slice1)
        m2 = Matrix.list_to_object(slice2)
        elements = (m1 + m2).elements
        for i, row in enumerate(elements):
            _list[index + i] = row

    def addition(self, matrix_a, matrix_b):
        m1 = matrix_a.elements
        m2 = matrix_b.elements
        parts_number = self.maximum_available_threads
        sub_matrices_m2 = get_sub_matrix(m1, parts_number)
        sub_matrices_m3 = get_sub_matrix(m2, parts_number)
        new_list = [[] * matrix_a.width] * matrix_a.height
        for i in range(len(sub_matrices_m2)):
            sub_matrices_m2_i = sub_matrices_m2[i]
            sub_matrices_m3_i = sub_matrices_m3[i]
            rows_number_per_thread = i * len(sub_matrices_m2_i)
            thread = threading.Thread(target=self.do_add,
                                      args=(sub_matrices_m2_i, sub_matrices_m3_i,
                                            rows_number_per_thread, new_list))
            thread.start()
            self.threads.append(thread)
        for thread in self.threads:
            thread.join()
            self.threads.remove(thread)
        return Matrix.list_to_object(new_list)
Finally, I have this test case (you can change the 50000 to any number, depending on your computer's resources):
mat_a = Matrix.get_matrix(50000, 5)
mat_b = Matrix.get_matrix(50000, 5)

start = datetime.now()
mat_c = mat_a + mat_b
end = datetime.now()
print('time needed before multithreading is:', end - start)

start = datetime.now()
result = ThreadingManager().addition(mat_a, mat_b)
end = datetime.now()
print('time needed after multithreading is:', end - start)
My problem is that I cannot find any difference between the normal addition and multithreading. How can I use multithreading in a better way?
Note: if I remove the line row.clear() in the Matrix class, I get a wrong answer, but then I can notice a difference with multithreading.
Needed imports:
import math
import threading
from datetime import datetime
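No answer is shown for this question, but since the rest of this page leans on multiprocessing: CPU-bound pure-Python loops do not run concurrently under threads because of the GIL, so a process pool is the usual escape hatch. Below is a minimal, untested sketch (not the author's code; parallel_add and add_rows are made-up helpers) that adds row chunks in worker processes; whether it beats plain addition depends on matrix size versus pickling overhead.

from multiprocessing import Pool

def add_rows(args):
    # Element-wise addition of two equally sized blocks of rows.
    rows_a, rows_b = args
    return [[a + b for a, b in zip(ra, rb)] for ra, rb in zip(rows_a, rows_b)]

def parallel_add(elements_a, elements_b, workers=4):
    n = len(elements_a)
    step = (n + workers - 1) // workers  # ceiling division so every row lands in some chunk
    chunks = [(elements_a[i:i + step], elements_b[i:i + step]) for i in range(0, n, step)]
    with Pool(workers) as pool:
        parts = pool.map(add_rows, chunks)
    return [row for part in parts for row in part]

if __name__ == '__main__':
    a = [[1] * 5 for _ in range(8)]
    b = [[2] * 5 for _ in range(8)]
    print(parallel_add(a, b))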

Improving execution time for multiprocessing

I am trying to improve the performance of this code in terms of time, without success so far. Even running with 32 processes it takes around 5 minutes. Do you have any suggestions for speeding this code up? Here evaluated_f_bool_func_lst is a list with 2**24 elements; each element is a 1-character string with the value '1' or '0' (a "binary list").
from sage.all import *
import time
from multiprocessing import Pool
import multiprocessing

def create_ext_component_function_i(dim, chunk_i, chunk_size, evaluated_f_bool_func_lst):
    sum_y_str = []
    for y in range(chunk_i, chunk_i + chunk_size):
        prod = ""
        for i in range(dim):
            minus1 = ((-1)**(1 & (y >> i)))
            prod += f'(1-{str(minus1)}*x[{str(i)}])*'
        sum_y_str.append(f'{prod}{evaluated_f_bool_func_lst[y]}')
    return "+".join(sum_y_str)

def create_ext_component_function(dim, evaluated_f_bool_func_lst):
    chunk_size = (2**dim) // 32  # integer division so range() receives an int
    pool = Pool(32)
    results = []
    for i in range(0, 2**dim, chunk_size):
        results.append(pool.apply_async(create_ext_component_function_i,
                                        args=(dim, i, chunk_size, evaluated_f_bool_func_lst)))
    pool.close()
    pool.join()
    join_results = [result.get() for result in results]
    print("+".join(join_results))
    return 0

if __name__ == '__main__':
    evaluated_f_bool_func_lst = load("evaluated_f_bool_func_lst.obj")
    dim = 24
    create_ext_component_function(dim, evaluated_f_bool_func_lst)
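No answer is shown here, but one thing to note about the code above: every apply_async call pickles the entire 2**24-element list for its worker. An untested sketch of passing only the slice each chunk needs (same function names as the question, with slightly changed signatures):

from multiprocessing import Pool

def create_ext_component_function_i(dim, chunk_i, chunk):
    sum_y_str = []
    for offset, bit in enumerate(chunk):
        y = chunk_i + offset
        prod = ""
        for i in range(dim):
            minus1 = (-1) ** (1 & (y >> i))
            prod += f'(1-{minus1}*x[{i}])*'
        sum_y_str.append(f'{prod}{bit}')
    return "+".join(sum_y_str)

def create_ext_component_function(dim, evaluated_f_bool_func_lst):
    chunk_size = (2 ** dim) // 32
    with Pool(32) as pool:
        results = [
            pool.apply_async(create_ext_component_function_i,
                             args=(dim, i, evaluated_f_bool_func_lst[i:i + chunk_size]))
            for i in range(0, 2 ** dim, chunk_size)
        ]
        joined = [r.get() for r in results]  # collect inside the with-block, before the pool shuts down
    return "+".join(joined)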

Multiprocessing.Process does not run processes in parallel

I tried to run a very simple multiprocessing example, but the code still runs serially.
I have tried to run it on Mac (macOS 10.13) and Linux (Ubuntu 18.04) with Python 2 and 3, and in both environments I had the same problem.
The function _process has to receive numpy arrays as arguments, so I decided to use multiprocessing.Process instead of multiprocessing.Pool.map() and multiprocessing.Pool.apply_async(), because pickling breaks when pool.map() is used on a method of a class. https://stackoverflow.com/a/21345308/4755986
import time
from multiprocessing import Process, Queue
import numpy as np

class model:
    def __init__(self):
        self.results = []
        self.jobs = []
        self.start = time.time()

    def _process(self, x, y, z):
        j = 0
        for i in range(10**8):
            j = i + j
        return j

    def work(self, X, Y, Z, result_queue):
        start = time.time() - self.start
        result = self._process(X, Y, Z)
        result_queue.put(result)
        print(result)
        end = time.time() - self.start
        print('start time: ', start)
        print('end time:', end)
        # return result_queue

    def fit(self, num):
        for i in range(num):
            X, Y, Z = np.ones([5, 5]), np.ones([3, 3]), np.ones([2, 2])
            result_queue = Queue()
            p = Process(target=self.work, args=(X, Y, Z, result_queue))
            self.jobs.append(p)
            p.start()
            print('ChildProcess...', i)
            result = result_queue.get()
            self.results.append(result)
        for p in self.jobs:
            p.join()
            p.close()
        return self.results

R = model()
k = R.fit(10)
print(k)
The start and end time of each process is printed, and the second process only starts after the first process has finished. This is strange, because each process should automatically be assigned to a different core and run in parallel.
result = result_queue.get()
result_queue.get() will block if the queue is empty. An item is only added when a process finishes, hence the next process is spawned only after the previous one has finished.
Below is a version that does spawn 10 processes at once. I've marked the section I've added:
import time
from multiprocessing import Process, Queue
import numpy as np

class model:
    def __init__(self):
        self.results = []
        self.jobs = []
        self.start = time.time()

    def _process(self, x, y, z):
        j = 0
        for i in range(10**8):
            j = i + j
        return j

    def work(self, X, Y, Z, result_queue):
        start = time.time() - self.start
        result = self._process(X, Y, Z)
        result_queue.put(result)
        print(result)
        end = time.time() - self.start
        print('start time: ', start)
        print('end time:', end)
        # return result_queue

    def fit(self, num):
        result_queue = Queue()  # one shared queue for all workers
        for i in range(num):
            X, Y, Z = np.ones([5, 5]), np.ones([3, 3]), np.ones([2, 2])
            p = Process(target=self.work, args=(X, Y, Z, result_queue))
            self.jobs.append(p)
            p.start()
            print('ChildProcess...', i)
            # result = result_queue.get()  # <--- This blocks
            # self.results.append(result)
        for p in self.jobs:
            p.join()
            p.close()
        for _ in self.jobs:                          # <-----
            self.results.append(result_queue.get())  # <-----
        return self.results

R = model()
k = R.fit(10)
print(k)
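As a side note to the pickling worry in the question: in Python 3, numpy arrays (and bound methods) pickle fine, so a Pool-based version of the same workload is also possible. A rough sketch, not part of the answer above, with the loop shortened so it finishes quickly:

import numpy as np
from multiprocessing import Pool

def work(args):
    x, y, z = args            # numpy arrays arrive via pickle
    j = 0
    for i in range(10 ** 6):  # shortened from the question's 10**8
        j += i
    return j

if __name__ == '__main__':
    jobs = [(np.ones((5, 5)), np.ones((3, 3)), np.ones((2, 2))) for _ in range(10)]
    with Pool(4) as pool:
        results = pool.map(work, jobs)
    print(results)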

Python multiprocessing (pool map) freezing

multi() freezes somewhere in the middle of its activity:
def current_proc():
    print mp.current_process().name, 'started'

def multi(fn, func):
    print 'Process started on', time.strftime('%H:%M:%S')
    count = mp.cpu_count() * 2
    input = nohead(xlsx2array(fn))
    parts = chunks(input, 10)
    pool = mp.Pool(processes=count, initializer=current_proc, maxtasksperchild=1)
    for part in parts:
        with stopwatch() as r:
            pool.map(func, part)
    return r
    pool.close()
    pool.join()
I am using multiprocessing with the following function to get the effective URLs:
def query(i):
    attempts = 2
    while attempts:
        try:
            q = requests.get(i, allow_redirects=True, verify=False, timeout=2)
            match = q.url
            match = str(match)
            break
        except:
            attempts -= 1
            match = 'pattern not found'
            pass
    return [i, match]
Please advise how I can avoid such freezing. Thanks.
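No accepted fix is shown here, but one general way to keep a run from hanging forever on a stuck chunk (a sketch, not a diagnosis of the freeze above) is to swap the blocking pool.map for map_async and bound the wait. Note that a timed-out chunk's workers keep running in the background, which is why the sketch ends with terminate():

from multiprocessing import Pool, TimeoutError

def work(url):
    # placeholder for the real query() from the question
    return [url, 'pattern not found']

if __name__ == '__main__':
    parts = [['http://example.com'], ['http://example.org']]
    pool = Pool(processes=4, maxtasksperchild=1)
    for part in parts:
        async_result = pool.map_async(work, part)
        try:
            print(async_result.get(timeout=30))  # wait at most 30 seconds per chunk
        except TimeoutError:
            print('chunk timed out, moving on')
    pool.terminate()  # don't wait on workers that may still be stuck
    pool.join()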
