I want to add multiple processes to speed up my program, but I found that after adding multiple processes, the program execution time has become longer.My code is as follows.
'''before'''
if __name__ == '__main__':
result = []
start_time = int(time.time())
for i in range(20000000):
result.append(demo3(i, i + 1))
end_time = int(time.time())
print(result)
print(end_time - start_time)
'''Add multiple processes '''
def demo3(i, j):
return int(i) * int(j)
if __name__ == '__main__':
pool = multiprocessing.Pool(processes=10)
result = []
start_time = int(time.time())
for i in range(20000000):
result.append(pool.apply_async(demo3, args=(i, i + 1)).get())
pool.close()
pool.join()
end_time = int(time.time())
print(result)
print(end_time - start_time)
Related
I have been working on developing a synthetic benchmarking program in python in order to stress-test the CPU in various systems for a class project. I have based my approach on Mersenne primality tests (inspired by prime95). The program is intended to test the Mersenne primality of numbers over a working set, defined by the user. I have so far implemented this in python, however once implementing the concurrent.futures module in order to run the task in parallel to maximize CPU utilization, I hit a snag. When testing my program I ran into 2 issues.
CPU utilization is still only ~35%
When testing larger working sets, the program stalls for several minutes before it starts iterating through each prime - I am assuming this has something to do with concurrent.futures's setup.
I was hoping someone could provide some insight into how to maximize system resource usage with this program and iron out the issues with larger sets.
Code below:
sys = platform.uname()
c = wmi.WMI()
winSys = c.Win32_ComputerSystem()[0]
mode1 = "Integer Mode"
mode2 = "Floating-Point Mode"
def lehmer(p: int) -> bool:
s = 4
M = (1 << p) - 1
for i in range(p - 2):
s = ((s * s) - 2) % M
return s == 0
#Initial printout of system information and menu screen schowing benchamrking options
print("_________________________________________________________________________________")
print("------------------------------System Information---------------------------------")
print(f"\tOS: {sys.system} {sys.release} ") #
print(f"\tMachine Name: {sys.node}")
print(f"\tVersion: {sys.version}")
print(f"\tCPU: {sys.processor}")
print("\tNumber of Cores: " + str(psutil.cpu_count()))
print(f"\tRAM: {psutil.virtual_memory()}")
print("---------------------------------------------------------------------------------")
modeSelect = 0;
print("Welcome to ParaBench! Please select what benchmarking mode you would like to use." + '\n')
modeSelect = int(input("[1] -> " + mode1 + '\n' + "[2] -> " + mode2
+ "\n[9] -> Exit\n_________________________________________________________________________________\n"))
#User selects Integer benchmarking mode
if modeSelect == 1:
print("User Selected " + mode1)
#Printout of selection for order of magnitude
print("[1] -> First 1x10^2 Primes\n" + "[2] -> First 1x10^3 Primes\n"
+"[3] -> First 1x10^4 Primes\n" + "[4] -> First 1x10^5 Primes\n" + "[5] -> First 1x10^6 Primes\n")
mersenneOrder = int(input("Please Select an option\n"))
if mersenneOrder == 1:
print("Starting Benchmark...")
with ThreadPoolExecutor(15) as executor:
timeStart = perf_counter()
for result in executor.map(lehmer,range(2,100)):
print(result)
timeStop = perf_counter()
print("1E2 Benchmark Complete in ",timeStop-timeStart)
if mersenneOrder == 2:
print("Starting Benchmark...")
with ThreadPoolExecutor(15) as executor:
timeStart = perf_counter()
for result in executor.map(lehmer,range(2,1000)):
print(result)
timeStop = perf_counter()
print("1E3 Benchmark Complete!!", timeStop-timeStart)
if mersenneOrder == 3:
print("Starting Benchmark...")
with ThreadPoolExecutor() as executor:
timeStart = perf_counter()
for result in executor.map(lehmer,range(2,10000)):
print(result)
timeStop = perf_counter()
print("1E4 Benchmark Complete!!", timeStop-timeStart)
if mersenneOrder == 4:
print("Starting Benchmark...")
with ThreadPoolExecutor(15) as executor:
timeStart = perf_counter()
for result in executor.map(lehmer,range(2,100000)):
print(result)
timeStop = perf_counter()
print("1E5 Benchmark Complete!!", timeStop-timeStart)
if mersenneOrder == 5:
print("Starting Benchmark...")
with ThreadPoolExecutor(15) as executor:
timeStart = perf_counter()
for result in executor.map(lehmer,range(2,1000000)):
print(result)
timeStop = perf_counter()
print("1E6 Benchmark Complete!!", timeStop-timeStart)
#Single-threaded test (DEPRECATED)
#for x in range(2,1000000):
# if lehmer(x):
# print(x)
I tried to run a very simple multiprocessing code, but the code is still serially processed.
I have tried to run it on Mac(macOS 10.13) and Linux(Ubuntu 18.04) with python 2 and 3, but in both environments I had the same problem.
the function _process has to receive numpy array as arguments, so I decided to use Multiprocess.Process instead of Multiprocess.Pool.map() and Multiprocess.Pool.apply_async() because pickle is broken when use pool.map() in a class. https://stackoverflow.com/a/21345308/4755986
import time
from multiprocessing import Process, Queue
import numpy as np
class model:
def __init__(self):
self.results = []
self.jobs = []
self.start = time.time()
def _process(self, x,y,z):
j= 0
for i in range(10**8):
j = i+j
return j
def work(self,X,Y,Z, result_queue):
start = time.time() -self.start
result = self._process(X,Y,Z)
result_queue.put(result)
print(result)
end = time.time() -self.start
print( 'start time: ', start)
print('end time:', end)
# return result_queue
def fit(self,num):
for i in range(num):
X, Y, Z = np.ones([5,5]), np.ones([3,3]), np.ones([2,2])
result_queue = Queue()
p = Process(target=self.work, args = (X,Y,Z, result_queue))
self.jobs.append(p)
p.start()
print( 'ChildProcess...',i)
result = result_queue.get()
self.results.append(result)
for p in self.jobs:
p.join()
p.close()
return self.results
R = model()
k = R.fit(10)
print(k)
The time of start and end of each process is printed, and the second process only starts after the first process is finished. This is strange because each process should be automatically assign to different core and run in parallel.
result = result_queue.get()
result_queue.get() will block if it is empty. An item will only be added when a process finishes, hence the next process will be spawned only if the previous has finished.
Below is a version that does spawn 10 processes at once. I've marked the section I've added:
import time
from multiprocessing import Process, Queue
import numpy as np
class model:
def __init__(self):
self.results = []
self.jobs = []
self.start = time.time()
def _process(self, x,y,z):
j= 0
for i in range(10**8):
j = i+j
return j
def work(self,X,Y,Z, result_queue):
start = time.time() -self.start
result = self._process(X,Y,Z)
result_queue.put(result)
print(result)
end = time.time() -self.start
print( 'start time: ', start)
print('end time:', end)
# return result_queue
def fit(self,num):
for i in range(num):
X, Y, Z = np.ones([5,5]), np.ones([3,3]), np.ones([2,2])
result_queue = Queue()
p = Process(target=self.work, args = (X,Y,Z, result_queue))
self.jobs.append(p)
p.start()
print( 'ChildProcess...',i)
#result = result_queue.get() # <--- This blocks
#self.results.append(result)
for p in self.jobs:
p.join()
p.close()
for result in result_queue: # <-----
self.results.append(result) # <-----
return self.results
R = model()
k = R.fit(10)
print(k)
I've defined a (test) Function in Python, which I am using to understand the different computation time that might be required to execute the code - using normal code (without using multi-processing or multi-threading), and then implementing each of them one by one.
Function (for Basic Usage):
from random import randint as rInt
def highComputationFunction(rangeNumber):
count_ = 0
for i in range(rangeNumber):
count_ = count_*2 + rInt(rangeNumber**2, rangeNumber**3)
count_ = 10**100//count_
return count_
Also, for Multi-Processing & Multi-Threading, I wanted to return the result of the thread to my Parent Function, so modified it like this:
from random import randint as rInt
def highComputationFunction(rangeNumber, result):
count_ = 0
for i in range(rangeNumber):
count_ = count_*2 + rInt(rangeNumber**2, rangeNumber**3)
count_ = 10**100//count_
return count_
Looking into the CPU Usage for each of the main function as below:
import time
if __name__ == '__main__':
startTime = time.time()
rangeNumber = 10000
coumputedNum = float(round(highComputationFunction(rangeNumber)//100**5000, 3))
print('\tFunction of {} Executed in: {} seconds. Result = {}'.format(rangeNumber, round(time.time() - startTime, 2), coumputedNum))
inTime = time.time()
rangeNumber = 100000
coumputedNum = float(round(highComputationFunction(rangeNumber)//100**5000, 3))
print('\tFunction of {} Executed in: {} seconds. Result = {}'.format(rangeNumber, round(time.time() - inTime, 2), coumputedNum))
inTime = time.time()
rangeNumber = 1000000
coumputedNum = float(round(highComputationFunction(rangeNumber)//100**5000, 3))
print('\tFunction of {} Executed in: {} seconds. Result = {}'.format(rangeNumber, round(time.time() - inTime, 2), coumputedNum))
print('Total Execution Time: {}'.format(round(time.time() - startTime, 2)))
This was executed in approximately 46 Seconds in Total. One output is as Below:
# python understandComputation.py
# Function of 10000 Executed in: 0.03 seconds. Result = 0.0
# Function of 100000 Executed in: 0.91 seconds. Result = 0.0
# Function of 1000000 Executed in: 45.49 seconds. Result = 0.0
# Total Execution Time: 46.44
Executed the same thing with Multi-Threading:
import time
import threading
if __name__ == '__main__':
startTime = time.time()
result_ = 0
threadList = []
for i in [10000, 100000, 1000000]:
curThread = threading.Thread(target = highComputationFunction, args = (i, result_))
curThread.start()
print('\tThread for {} Started.'.format(i))
threadList.append(curThread)
result_ += result_
for i in threadList:
i.join()
print('Total Function Executed in: {} seconds. Result = {}'.format(round(time.time() - startTime, 2), result_))
For Multi-Processing:
import time
import multiprocessing
if __name__ == '__main__':
startTime = time.time()
result_ = 0
procList = []
for i in [10000, 100000, 1000000]:
curProc = multiprocessing.Process(target = highComputationFunction, args = (i, result_))
curProc.start()
print('\tProcess for {} Started.'.format(i))
procList.append(curProc)
result_ += result_
for i in procList:
i.join()
print('Total Function Executed in: {} seconds. Result = {}'.format(round(time.time() - startTime, 2), result_))
Implementing this, got the output in much more time than usual.
# python understandComputation.py
# Thread for 10000 Started.
# Thread for 100000 Started.
# Thread for 1000000 Started.
# Total Function Executed in: 47.04 seconds. Result = 0
# python understandComputation.py
# Process for 10000 Started.
# Process for 100000 Started.
# Process for 1000000 Started.
# Total Function Executed in: 47.21 seconds. Result = 0
Please tell me, if it is wrong with the implementation of the code or not. Expected result for multi-threading and multi-processing should be less than 45.5 Seconds, which is the maximum time taken for the execution of the 1000000 number in the actual code, but I'm not getting the desired output.
I have following script:
max_number = 100000
minimums = np.full((max_number), np.inf, dtype=np.float32)
data = np.zeros((max_number, 128, 128, 128), dtype=np.uint8)
if __name__ == '__main__':
main()
def worker(array, start, end):
for in_idx in range(start, end):
value = data[start:end][in_idx] # compute something using this array
minimums[in_idx] = value
def main():
jobs = []
num_jobs = 5
for i in range(num_jobs):
start = int(i * (1000 / num_jobs))
end = int(start + (1000 / num_jobs))
p = multiprocessing.Process(name=('worker_' + str(i)), target=worker, args=(start, end))
jobs.append(p)
p.start()
for proc in jobs:
proc.join()
print(jobs)
How can I ensure that the numpy array is global and can be accessed by each worker? Each worker uses a different part of the numpy array
import numpy as np
import multiprocessing as mp
ar = np.zeros((5,5))
def callback_function(result):
x,y,data = result
ar[x,y] = data
def worker(num):
data = ar[num,num]+3
return num, num, data
def apply_async_with_callback():
pool = mp.Pool(processes=5)
for i in range(5):
pool.apply_async(worker, args = (i, ), callback = callback_function)
pool.close()
pool.join()
print "Multiprocessing done!"
if __name__ == '__main__':
ar = np.ones((5,5)) #This will be used, as local scope comes before global scope
apply_async_with_callback()
Explanation: You set up your data array and your workers and callback functions. The number of processes in the pool set up a number of independent workers, where each worker can do more than one task. The callback writes the result back to the array.
The __name__=='__main__' protects the following line from being run at each import.
I'm experiencing with multiprocessing in python. I know that it can be slower than serialized computation, this is not the point of my post.
I'm just wandering why a single process pool is faster than the serialized computation of my basic problem. Shouldn't these times be the same?
Here is the code:
import time
import multiprocessing as mp
import matplotlib.pyplot as plt
def func(x):
return x*x*x
def multi_proc(nb_procs):
tic = time.time()
pool = mp.Pool(processes=nb_procs)
pool.map_async(func, range(1, 10000000))
toc = time.time()
return toc-tic
def single_core():
tic = time.time()
[func(x) for x in range(1, 10000000)]
toc = time.time()
return toc-tic
if __name__ == '__main__':
sc_times = [0]
mc_times = [0]
print('single core computation')
sc_constant_time = single_core()
print('{} secs'.format(sc_constant_time))
for nb_procs in range(1, 12):
print('computing for {} processes...'.format(nb_procs))
time_elapsed = (multi_proc(nb_procs))
print('{} secs'.format(time_elapsed))
mc_times.append(time_elapsed)
sc_times = [sc_constant_time for _ in mc_times]
plt.plot(sc_times, 'r--')
plt.plot(mc_times, 'b--')
plt.xlabel('nb procs')
plt.ylabel('time (s)')
plt.show()
And the plot of times per number of processes (red = serial computation, blue = multiprocessing):
EDIT 1:
I modified my code as Sidhnarth Gupta indicated, and here is the new code I have. I changed my func for no reason.
import time
import multiprocessing as mp
import matplotlib.pyplot as plt
import random
def func(x):
return random.choice(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
def multi_proc(nb_procs, nb_iter):
tic = time.time()
pool = mp.Pool(processes=nb_procs)
pool.map_async(func, range(1, nb_iter)).get()
toc = time.time()
return toc-tic
def single_core(nb_iter):
tic = time.time()
[func(x) for x in range(1, nb_iter)]
toc = time.time()
return toc-tic
if __name__ == '__main__':
# configure
nb_iter = 100000
max_procs = 16
sc_times = [0]
mc_times = [0]
# multi proc calls
for nb_procs in range(1, max_procs):
print('computing for {} processes...'.format(nb_procs))
time_elapsed = (multi_proc(nb_procs, nb_iter))
print('{} secs'.format(time_elapsed))
mc_times.append(time_elapsed)
# single proc call
print('single core computation')
for nb in range(1, len(mc_times)):
print('{}...'.format(nb))
sc_times.append(single_core(nb_iter))
# average time
average_time = sum(sc_times)/len(sc_times)
print('average time on single core: {} secs'.format(average_time))
# plot
plt.plot(sc_times, 'r--')
plt.plot(mc_times, 'b--')
plt.xlabel('nb procs')
plt.ylabel('time (s)')
plt.show()
Here is the new plot I have:
I think I can now say that I have increased my program's speed by using multiprocessing.
Your current code to calculate the time taken by multiprocessing is actually telling the time taken by the process to submit the task to the pool. The processing is actually happening in asynchronous mode without blocking the thread.
I tried your program with following changes:
def multi_proc(nb_procs):
tic = time.time()
pool = mp.Pool(processes=nb_procs)
pool.map_async(func, range(1, 10000000)).get()
toc = time.time()
return toc-tic
and
def multi_proc(nb_procs):
tic = time.time()
pool = mp.Pool(processes=nb_procs)
pool.map(func, range(1, 10000000))
toc = time.time()
return toc-tic
Both of them take significantly more time than then serialised computation.
Also while creating such graphs, you should also consider calling the single_core() function everytime you want to map the value instead of mapping the same value multiple time. You will see a significant variance in time taken by the same.