I have been trying to use Python's multiprocessing module to achieve parallelism.
I am able to execute my code and it runs in parallel, but after some time only one process finishes its task and the others exit without finishing. I know there is a join() method that waits for all tasks to finish, but it doesn't seem to work properly here. I have been reading multiprocessing's manual page and forums to find out why it isn't working, and I haven't figured it out yet.
I think the problem may be related to something else, such as the database or the Python version. My Python version is 3.10 and I have an 8-core CPU. Any help is appreciated.
Here is my code:
I have 5 processes (20,000 items split into batches of 4,000):
def process_batch(self, batch_index, batched_payloads):
    payloads = self.import_file.get_payloads_for_import()
    imported_rows = []
    total_payload_count = len(payloads)
    batch_size = frappe.conf.data_import_batch_size or 4000

    for i, payload in enumerate(batched_payloads):
        doc = payload.doc
        row_indexes = [row.row_number for row in payload.rows]
        current_index = (i + 1) + (batch_index * batch_size)

        if set(row_indexes).intersection(set(imported_rows)):
            print("Skipping imported rows", row_indexes)
            if total_payload_count > 5:
                frappe.publish_realtime(
                    "data_import_progress",
                    {
                        "current": current_index,
                        "total": total_payload_count,
                        "skipping": True,
                        "data_import": self.data_import.name,
                    },
                    user=frappe.session.user,
                )
            continue

        try:
            start = timeit.default_timer()
            # insert data to database (process_doc method)
            doc = self.process_doc(doc)
            processing_time = timeit.default_timer() - start
            eta = self.get_eta(current_index, total_payload_count, processing_time)

            if self.console:
                update_progress_bar(
                    f"Importing {total_payload_count} records",
                    current_index,
                    total_payload_count,
                )
            elif total_payload_count > 5:
                frappe.publish_realtime(
                    "data_import_progress",
                    {
                        "current": current_index,
                        "total": total_payload_count,
                        "docname": doc.name,
                        "data_import": self.data_import.name,
                        "success": True,
                        "row_indexes": row_indexes,
                        "eta": eta,
                    },
                    user=frappe.session.user,
                )

            # commit after every successful import
            frappe.db.commit()
        except Exception:
            # rollback if exception
            frappe.db.rollback()
Here is my code for the multiprocessing part:
def import_data(self):
    batch_size = frappe.conf.data_import_batch_size or 4000
    workers = []

    for batch_index, batched_payloads in enumerate(
        frappe.utils.create_batch(payloads, batch_size)
    ):
        p = Process(target=self.process_batch, args=(batch_index, batched_payloads))
        p.start()
        workers.append(p)

    for worker in workers:
        worker.join()
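One thing worth checking (a hedged debugging sketch, not part of the original code): join() returns even when a child process has died, so inspecting each worker's exitcode after the join shows whether a process crashed or was killed rather than finishing its batch.

for worker in workers:
    worker.join()
    # 0 means a clean exit; a positive value usually means an uncaught exception,
    # and a negative value means the process was killed by a signal
    # (e.g. -9 if the OOM killer hit it).
    if worker.exitcode != 0:
        print(f"Worker {worker.pid} exited with code {worker.exitcode}")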
The same thing happens if I do:
processes_count = multiprocessing.cpu_count()
pool = multiprocessing.Pool(processes_count)
#pool = multiprocessing.Pool(4)

for batch_index, batched_payloads in enumerate(
    frappe.utils.create_batch(payloads, batch_size)
):
    pool.apply_async(self.process_batch, args=(batch_index, batched_payloads))

pool.close()
pool.join()
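Note that pool.apply_async discards worker exceptions unless you keep the returned AsyncResult objects and call .get() on them, so a failing batch can look exactly like a process that quietly stops. A minimal sketch of that pattern (reusing the payloads and batch_size names from the snippet above):

results = []
for batch_index, batched_payloads in enumerate(
    frappe.utils.create_batch(payloads, batch_size)
):
    results.append(
        pool.apply_async(self.process_batch, args=(batch_index, batched_payloads))
    )
pool.close()
for result in results:
    result.get()  # re-raises any exception that occurred in the worker process
pool.join()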
Edit:
The same thing happens with concurrent.futures.ProcessPoolExecutor:
batches = enumerate(frappe.utils.create_batch(payloads, batch_size))
with concurrent.futures.ProcessPoolExecutor(max_workers=processes_count) as executor:
    executor.map(self.process_batch, batches)
The same thing happens with concurrent.futures.ProcessPoolExecutor and submit():
with concurrent.futures.ProcessPoolExecutor(max_workers=processes_count) as executor:
    future_to_batch = [
        executor.submit(self.process_batch, batch_index, batched_payloads)
        for batch_index, batched_payloads in enumerate(
            frappe.utils.create_batch(payloads, batch_size)
        )
    ]
    for future in concurrent.futures.as_completed(future_to_batch):
        future.result()
Since concurrency is hard to debug, we need more context to help you with your problem. At the very least, show us what the definition of self.process_batch is.
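One generic way to get more visibility (a hypothetical wrapper for illustration, not code from the post) is to have each worker catch and log its own traceback, so a batch that dies quietly still leaves a record:

import traceback

def process_batch_logged(self, batch_index, batched_payloads):
    # hypothetical method on the same class: wrap the real worker so a crashing
    # batch writes its traceback to a file instead of vanishing silently
    try:
        self.process_batch(batch_index, batched_payloads)
    except Exception:
        with open(f"batch_{batch_index}_error.log", "w") as log:
            log.write(traceback.format_exc())
        raise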
Related
I would like to implement a status bar that gives the user feedback on the progress of the script. I have built a script that loops through a video directory and, for every video it detects, runs a process against it. While the initial code does work, the issue I am having is that when the code finishes using a trained TensorFlow model, it fails to release the GPU memory reserved for the process. My solution to this is to run a separate script that has values passed through to it from the initial script. Any help would be appreciated.
Here is the code I referenced for building a multi-status bar output:
import multiprocessing
import random
from concurrent.futures import ProcessPoolExecutor
from time import sleep

#from rich import progress
from rich.progress import Progress


def long_running_fn(progress, task_id):
    len_of_task = random.randint(3, 20)  # take some random length of time
    for n in range(0, len_of_task):
        sleep(1)  # sleep for a bit to simulate work
        progress[task_id] = {"progress": n + 1, "total": len_of_task}


if __name__ == "__main__":
    n_workers = 8  # set this to the number of cores you have on your machine

    with Progress() as progress:
        futures = []  # keep track of the jobs
        with multiprocessing.Manager() as manager:
            # this is the key - we share some state between our
            # main process and our worker functions
            _progress = manager.dict()
            overall_progress_task = progress.add_task("[green]All jobs progress:")

            with ProcessPoolExecutor(max_workers=n_workers) as executor:
                for n in range(0, 20):  # iterate over the jobs we need to run
                    # set visible false so we don't have a lot of bars all at once:
                    task_id = progress.add_task(f"task {n}", visible=False)
                    futures.append(executor.submit(long_running_fn, _progress, task_id))

                # monitor the progress:
                while (n_finished := sum([future.done() for future in futures])) < len(futures):
                    progress.update(
                        overall_progress_task, completed=n_finished, total=len(futures)
                    )
                    for task_id, update_data in _progress.items():
                        latest = update_data["progress"]
                        total = update_data["total"]
                        # update the progress bar for this task:
                        progress.update(
                            task_id,
                            completed=latest,
                            total=total,
                            visible=latest < total,
                        )

                # raise any errors:
                for future in futures:
                    future.result()
Which gives a result that looks like this:
progress bars
In implementing this into my code, the progress bars work, but only until the GPU memory fills up, at which point I am unable to keep the code running. If I run a two-script variant without a progress bar, everything works perfectly.
Here is the code that I currently have. I apologize in advance; it's a WIP.
def detect_frames(progress, task_id, gpu, filename, xl, size):
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)  #"1"
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        for gpu in gpus:
            try:
                tf.config.experimental.set_virtual_device_configuration(
                    gpu,
                    [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024 * size)])  #1024  # starts mem limit at 1 Gig, 5715 is for the R720
            except RuntimeError as e:
                print(e)

    model = tensorflow.keras.models.load_model('SQ-D4096-V7.h5', compile=False)
    labels = ['Corrupt', 'Good']  # needs to be in same order as text file

    start_time = time.time()
    f3 = 0
    T = 0   # time
    T2 = 0  # time2 for testing

    cap = cv.VideoCapture(filename)
    fps = cap.get(cv.CAP_PROP_FPS)
    totalNoFrames = cap.get(cv.CAP_PROP_FRAME_COUNT)  #/30
    durationInSeconds = (totalNoFrames / fps) / fps
    good = []
    corrupt = []
    bad = []
    count = 0

    while cap.isOpened():
        ret, frame = cap.read()
        count += 120
        cap.set(cv.CAP_PROP_POS_FRAMES, count)
        fnum = cap.get(cv.CAP_PROP_POS_FRAMES)
        progress[task_id] = {"progress": fnum, "total": totalNoFrames}
        # ... rest of def ...
if __name__ == "__main__":
    CWD = os.path.dirname(os.path.realpath(__file__))
    nvidia_smi.nvmlInit()
    try:
        deviceCount = nvmlDeviceGetCount()
        for i in range(deviceCount):
            #gpus = len(i)
            #print("Num GPU: ", gpus)
            handle = nvmlDeviceGetHandleByIndex(i)
            info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
            card = nvmlDeviceGetName(handle)
            print("Device", i, ":", nvmlDeviceGetName(handle))
            print("Total memory: %sGB" % (info.total // (1.024 * 10**9)))
            print("Free memory: %sGB" % (info.free // (1.024 * 10**9)))
            print("Used memory: %sGB" % (info.used // (1.024 * 10**9)))
            Num = (info.free // (1.024 * 10**9)) - 1
            print("Max number of scripts at once: ", Num)
            #print("Recommended Max number of scripts at once, per GPU: ", Num - 1)
    except NVMLError as error:
        print(error)

    numgpu = deviceCount
    nvidia_smi.nvmlShutdown()

    # Input area
    #%%
    # Where are the videos located?
    #file = glob.glob("/Data/2_DeInter/*.avi")  # for production
    file = glob.glob("/Data/2_DeInter/Untitled_5.avi")  # for testing
    #file = sys.argv[1]
    #print(file)

    # How many videos do you want to process per GPU?
    #Num = 12
    NumProc = 1

    #nvidia_smi.nvmlInit()
    if (str(card) == "b'NVIDIA GeForce GTX 1050 Ti'") and (NumProc > 2):
        print("Unfortunately the 1050 Ti can only process two streams per card:")
        print("Changing number of processes to 2!")
        NumProc = 2

    #if NumProc == 1:
    #    MemFree = ((Num // NumProc) - 1)
    #else:
    #    MemFree = (Num // NumProc)
    print("NumProc: ", NumProc)
    MemFree = 1 / ((Num * 1.2) / NumProc)
    print("Memory free:", MemFree)
    size = (Num / NumProc)  #MemFree #2 #1.2  # size of the tensorflow model
    print('Memory size Used: ', size)
    #print(size)
    print("\n")
    n_workers = NumProc  #*numgpu #2
    #%%
    gpu = 0
    x = 0
    T = 0

    nvidia_smi.nvmlInit()
    with Progress() as progress:
        futures = []  # keep track of the jobs
        # with multiprocessing.Manager() as manager:
        #     # this is the key - we share some state between our
        #     # main process and our worker functions
        #     _progress = manager.dict()
        #     overall_progress_task = progress.add_task("[green]All jobs progress:")
        with multiprocessing.Manager() as manager:
            # this is the key - we share some state between our
            # main process and our worker functions
            _progress = manager.dict()
            overall_progress_task = progress.add_task("[green]All jobs progress:")
            #for n in range(len(file)):
            with ProcessPoolExecutor(max_workers=n_workers) as executor:
                while True:
                    #for n in range(len(file)):
                    #task_id = progress.add_task(f"task {x}", visible=False)
                    task_id = progress.add_task(f"Processing video: {Path(file[x]).name}", visible=False)
                    #print("X: %s" % x)
                    #for gpu in range(deviceCount):
                    #print("GPU: ", gpu)
                    print("X:", x)
                    # if x > NumProc:  # clear the tf models
                    #     tf.keras.backend.clear_session()
                    #     tf.compat.v1.reset_default_graph()
                    #     del model
                    #     load.model()
                    #print("Mem Free: ", info.free // (size * 10**9) - 1)

                    # wait for a GPU with enough free memory, updating the bars meanwhile
                    while info.free / (size * 10**9) <= MemFree:
                        #print("Mem Free: ", info.free / (1.024 * 10**9))
                        #print("Mem Free: ", info.free / (size * 10**9))
                        #print("No Free Memory!")
                        #print("Checking GPU: %s for space" % gpu)
                        #print("Update Progress Bar")
                        if (n_finished := sum([future.done() for future in futures])) < len(futures):
                            progress.update(overall_progress_task, completed=n_finished, total=len(futures))
                            for task_id, update_data in _progress.items():
                                latest = update_data["progress"]
                                total = update_data["total"]
                                # update the progress bar for this task:
                                progress.update(task_id, completed=latest, total=total, visible=latest < total)
                        time.sleep(1)
                        gpu = gpu + 1
                        if gpu == deviceCount:
                            gpu = 0
                        handle = nvmlDeviceGetHandleByIndex(gpu)
                        info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
                    if info.free / (size * 10**9) > MemFree:
                        #tf.config.experimental.set_virtual_device_configuration(gpu, ...)
                        futures.append(executor.submit(detect_frames, _progress, task_id, gpu, file[x], x, size))
                        if (n_finished := sum([future.done() for future in futures])) < len(futures):
                            progress.update(overall_progress_task, completed=n_finished, total=len(futures))
                            for task_id, update_data in _progress.items():
                                latest = update_data["progress"]
                                total = update_data["total"]
                                # update the progress bar for this task:
                                progress.update(task_id, completed=latest, total=total, visible=latest < total)
                        time.sleep(5)
                        #print("Update Progress Bar")
                        if (n_finished := sum([future.done() for future in futures])) < len(futures):
                            progress.update(overall_progress_task, completed=n_finished, total=len(futures))
                            for task_id, update_data in _progress.items():
                                latest = update_data["progress"]
                                total = update_data["total"]
                                # update the progress bar for this task:
                                progress.update(task_id, completed=latest, total=total, visible=latest < total)
                        x = x + 1
                        gpu = gpu + 1
                        if gpu == deviceCount:
                            gpu = 0
                        #break
                    if x == len(file):
                        print("End of file list")
                        #print("Update Progress Bar")
                        while (n_finished := sum([future.done() for future in futures])) < len(futures):
                            progress.update(overall_progress_task, completed=n_finished, total=len(futures))
                            for task_id, update_data in _progress.items():
                                latest = update_data["progress"]
                                total = update_data["total"]
                                # update the progress bar for this task:
                                progress.update(task_id, completed=latest, total=total, visible=latest < total)
                        break
                    handle = nvmlDeviceGetHandleByIndex(gpu)
                    info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)

    nvidia_smi.nvmlShutdown()
I have the following code, which I would like to convert from ThreadPool to ProcessPoolExecutor, since it is all CPU-intensive calculation, and when I observe the CPU monitor I note that my 8-core processor is only using a single thread.
import datetime
from multiprocessing.dummy import Pool as ThreadPool


def thread_run(q, clients_credit_array, clients_terr_array,
               freq_small_list, freq_large_list, clients, year, admin):
    claim_id = []
    claim_client_id = []
    claim_company_id = []
    claim_year = []
    claim_type = []
    claim_closed = []
    claim_cnt = []
    claim_amount = []

    print(datetime.datetime.utcnow())
    i = 0
    client_cnt = 1000
    loop_incr = 8
    while i < client_cnt:
        ind_rng = range(i, min((i + loop_incr), (client_cnt)), 1)
        call_var = []
        for q in ind_rng:
            call_var.append((q,
                             clients_credit_array,
                             clients_terr_array,
                             freq_small_list,
                             freq_large_list,
                             clients,
                             year,
                             admin))

        pool = ThreadPool(len(call_var))
        results = pool.map(call_claim, call_var)
        pool.close()
        pool.join()

        for result in results:
            if result[0] == []:
                pass
            else:
                r = 0
                if r < len(result[0]):
                    claim_index += 1
                    claim_id.append(claim_index)
                    claim_client_id.append(result[0][r])
                    claim_company_id.append(result[1][r])
                    claim_year.append(result[2][r])
                    claim_type.append(result[3][r])
                    claim_closed.append(result[4][r])
                    claim_cnt.append(result[5][r])
                    claim_amount.append(result[6][r])
                    r += 1
        i += loop_incr

    print(datetime.datetime.utcnow())
The difficulty I am having, however, is that when I modify the code as follows, I get error messages:
from concurrent.futures import ProcessPoolExecutor as PThreadPool
pool = PThreadPool(max_workers=len(call_var))
#pool = ThreadPool(len(call_var))
results = pool.map(call_claim, call_var)
#pool.close()
#pool.join()
I had to remove the pool.close() and pool.join() calls because they generated errors. But once I removed them, my code was not utilizing parallel processors, and it ran much longer and slower than originally. What am I missing?
As was pointed out in the comments, it is common to see Executor used as part of a context manager and without the need for join or close operations. Below is a simplified example to illustrate the concepts.
Example:
import concurrent.futures
import random
import time
import os

values = [1, 2, 3, 4, 5]


def times_two(n):
    time.sleep(random.randrange(1, 5))
    print("pid:", os.getpid())
    return n * 2


def main():
    with concurrent.futures.ProcessPoolExecutor() as executor:
        results = executor.map(times_two, values)
        for one_result in results:
            print(one_result)


if __name__ == "__main__":
    main()
Output:
pid: 396
pid: 8904
pid: 25440
pid: 20592
pid: 14636
2
4
6
8
10
I have a parallel-processing job in Python which reads data from a database, does some manipulation, and runs the Dijkstra algorithm:
t1 = 200101
t2 = 200229

import psutil
from multiprocess import Pool

pool = Pool(psutil.cpu_count(logical=False))


def graph_analysis(i):
    input_date = str(i)
    sql_data = """select trim(cast(p.Barcode as nvarchar(20))) Barcode, cast(s.invoiceid as nvarchar(20)) invoiceid
                  from sales s inner join Product_981115 p on s.productid = p.productid
                  where s.date = """ + input_date + """ and s.qty != 0 and p.sectionid != 1691.199 and s.RegionID = """ + input_region

    data = []
    for chunk in pd.read_sql(sql_data, conn, chunksize=1000000):
        data.append(chunk)
    data = pd.concat(data, ignore_index=True)
    data = data.merge(candid_sale_invoices)
    data = data.merge(candid_barcodes)

    final_edges_df = data.iloc[:, [2, 3, 4]]
    final_edges_tuples = [tuple(x) for x in final_edges_df.values]

    Gm = ig.Graph.TupleList(final_edges_tuples, directed=True, edge_attrs=['weight'])
    longest_paths = pd.DataFrame(Gm.shortest_paths_dijkstra(None, None, weights='weight'))
    longest_paths = longest_paths.swifter.apply(log_transform)
    longest_paths["Date"] = input_date
    longest_paths["RegionID"] = input_region
    return longest_paths


results = pool.map(graph_analysis, range(t1, (t2) + 1))
pool.close()
results = pd.concat(results, ignore_index=True)
I ran this code a couple of days ago and it completed perfectly in parallel, utilizing almost all cores. However, when I run it today, the parallel processes seem to be created, but the cores don't actually work in parallel.
The system has 128 GB of RAM and 32 cores, and nothing has changed in it since the last successful parallel run.
I restarted the system to rule out any transient issue, but the problem still exists.
So what could be the problem?
Thanks in advance.
I am making a PyQt5 real-time application that streams images and their segmentation results on an interface. It has 3 components that form a cycle:
pull out a frame
feed the frame into the segmentation module
print the original frame and the segmentation result on the interface
In particular, (2) is the bottleneck. I tried running (2) in the main thread and compared its speed and CPU consumption against running (2) in the background (using QThread). I strangely found that running it via QThread is ~20 ms slower and more CPU-intensive.
Method 1: Wrapping (1), (2) in a QObject Worker and Run in Main Thread
I don't actually want to run all of this in the main thread, because it locks the window, but I do it here for speed testing. Below is my implementation of the QObject worker (FrameStore is an object that collects key results in the QObject and passes them to the GUI):
class FrameObject(QObject):
    frame_signal = pyqtSignal(FrameStore)

    def __init__(self, parent=None):
        super().__init__()
        self.load_img()
        self.IS_SILENT = True
        self.model_config = ModelMetaConfig()
        self.frame_store = FrameStore()

    def load_img(self):
        self.PATH = os.path.join(os.getcwd(), 'test_cases', 'test.jpg')
        rgb_img = cv2.imread(self.PATH)
        self.rgb_img = rgb_img[:, :, ::-1]

    @pyqtSlot()
    def run(self):
        end_loop = time.time()
        while True:
            crt_t = time.time()
            print('time = {:10.4f} s'.format(time.time()))
            print('unexplain = {:10.4f} s'.format(crt_t - end_loop))

            torch.cuda.synchronize()
            start = time.time()
            seg_out = self.model_config.raw_predict(self.rgb_img, self.IS_SILENT)
            seg_out = self.model_config.process_predict(seg_out, self.IS_SILENT)
            torch.cuda.synchronize()
            end = time.time()
            print('model run = {:10.4f} s'.format(end - start))

            start = time.time()
            # store key data at a snapshot
            self.frame_store.rgb_img = self.rgb_img
            self.frame_store.seg_out = seg_out
            #self.frame_signal.emit(self.frame_store)
            end_loop = time.time()
            self.frame_signal.emit(self.frame_store)
            print('emit = {:10.4f} s'.format(end_loop - start))
The screenshot below shows that CPU usage is ~60% with a frame rate of ~120 ms:
Screenshot: ~60% CPU Usage + ~120 ms Frame Rate
Method 2: Wrapping (1), (2) in a QThread
My implementation of QThread is similar to the QObject above:
class FrameThread(QThread):
    frame_signal = pyqtSignal(FrameStore)

    def __init__(self, parent=None):
        super().__init__()
        self.load_img()
        self.IS_SILENT = True
        self.model_config = ModelMetaConfig()
        self.frame_store = FrameStore()

    def load_img(self):
        self.PATH = os.path.join(os.getcwd(), 'test_cases', 'test.jpg')
        rgb_img = cv2.imread(self.PATH)
        self.rgb_img = rgb_img[:, :, ::-1]

    def run(self):
        end_loop = time.time()
        while True:
            crt_t = time.time()
            print('time = {:10.4f} s'.format(time.time()))
            print('unexplain = {:10.4f} s'.format(crt_t - end_loop))

            torch.cuda.synchronize()
            start = time.time()
            seg_out = self.model_config.raw_predict(self.rgb_img, self.IS_SILENT)
            seg_out = self.model_config.process_predict(seg_out, self.IS_SILENT)
            torch.cuda.synchronize()
            end = time.time()
            print('model run = {:10.4f} s'.format(end - start))

            start = time.time()
            # store key data at a snapshot
            self.frame_store.rgb_img = self.rgb_img
            self.frame_store.rgb2_img = self.rgb_img
            self.frame_store.rgb3_img = self.rgb_img
            self.frame_store.rgb4_img = self.rgb_img
            self.frame_store.seg_out = seg_out
            self.frame_signal.emit(self.frame_store)
            end_loop = time.time()
            print('emit = {:10.4f} s'.format(end_loop - start))
But it turns out the frame rate is ~20-30 ms slower with 100% CPU usage, as shown below:
Screenshot: 100% CPU Usage + ~145 ms Frame Rate
Reproducing My Result
I have uploaded a simplified version of my application to my repo. You can reproduce my result by running $ python main.py. Note that the application requires a GPU and an installation of PyTorch.
To switch between method 1 and method 2, just toggle comment in the following lines in main.py:
def __init__(self):
    super().__init__()
    # set up window
    self.title = 'Simply Reproducing the Slow Speed'
    self.top = 100
    self.left = 100
    self.width = 1280
    self.height = 1280
    self.init_window()
    #self.init_qobject()  # <- uncomment this to run QObject in main thread
    #self.init_thread()   # <- uncomment this to run QThread
A brief outline of key scripts:
model_utils.py: wrapping all segmentation procedures into a class
object_utils.py: a QObject worker class computing (1) and (2)
thread_utils.py: a QThread class computing (1) and (2)
main.py: main interface application
Question
The slowdown and extra CPU usage are quite frustrating. Could anyone suggest an alternative implementation that delegates the work to the background more efficiently?
I am writing a simple Python script that I need to scale to many threads. For simplicity, I have replaced the actual function I need to run with a matrix-matrix multiply. I am having trouble getting my code to scale with the number of processors. Any advice to help me get the correct speedup would be appreciated! My code and results are as follows:
import numpy as np
import time
import math
from multiprocessing.dummy import Pool

res = 4
# we must iterate over all of these values
wavektests = np.linspace(.1, 2.5, res)
omegaratios = np.linspace(.1, 2.5, res)
wavekmat, omegamat = np.meshgrid(wavektests, omegaratios)


def solve_for_omegaratio(ind):
    # obtain the indices for this run
    x_ind = ind % res
    y_ind = math.floor(ind / res)
    # obtain the value for this run
    wavek = wavektests[x_ind]
    omega = omegaratios[y_ind]
    # do some work (I have replaced the real function with this)
    randmat = np.random.rand(4000, 4000)
    nop = np.linalg.matrix_power(randmat, 3)
    # obtain a scalar value
    value = x_ind + y_ind**2.0
    return value


list_ind = range(res**2)

# Serial code execution
t0_proc = time.clock()
t0_wall = time.time()
threads = 0
dispersion = map(solve_for_omegaratio, list_ind)
displist = list(dispersion)
t1_proc = time.clock()
t1_wall = time.time()

print('serial execution')
print('wall clock time = ', t1_wall - t0_wall)
print('processor clock time = ', t1_proc - t0_proc)
print('------------------------------------------------')

# Using pool defaults
t0_proc = time.clock()
t0_wall = time.time()
if __name__ == '__main__':
    pool = Pool()
    dispersion = pool.map(solve_for_omegaratio, list_ind)
    displist = list(dispersion)
t1_proc = time.clock()
t1_wall = time.time()
pool.close
print('num of threads = default')
print('wall clock time = ', t1_wall - t0_wall)
print('processor clock time = ', t1_proc - t0_proc)
print('------------------------------------------------')

# Using 4 threads
t0_proc = time.clock()
t0_wall = time.time()
threads = 4
if __name__ == '__main__':
    pool = Pool(threads)
    dispersion = pool.map(solve_for_omegaratio, list_ind)
    displist = list(dispersion)
t1_proc = time.clock()
t1_wall = time.time()
pool.close
print('num of threads = ' + str(threads))
print('wall clock time = ', t1_wall - t0_wall)
print('processor clock time = ', t1_proc - t0_proc)
print('------------------------------------------------')
Results:
serial execution
wall clock time = 66.1561758518219
processor clock time = 129.16376499999998
------------------------------------------------
num of threads = default
wall clock time = 81.86436200141907
processor clock time = 263.45369
------------------------------------------------
num of threads = 4
wall clock time = 77.63390111923218
processor clock time = 260.66285300000004
------------------------------------------------
Because Python has a GIL (https://wiki.python.org/moin/GlobalInterpreterLock), "Python-native" threads can't truly execute concurrently and thus can't improve the performance of CPU-bound tasks like math. They can be used to parallelize I/O-bound tasks effectively (e.g. API calls that spend almost all their time waiting on network I/O). Using multiprocessing proper, rather than multiprocessing.dummy's thread-backed implementation, creates multiple processes instead of threads, and those can run concurrently (at the cost of significant memory overhead).
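As a hedged sketch of that change (assuming solve_for_omegaratio and list_ind are defined at module level exactly as in the question, so that worker processes can import them):

import time
from multiprocessing import Pool  # process-backed pool, not multiprocessing.dummy

if __name__ == '__main__':
    t0_wall = time.time()
    # four worker processes, each with its own interpreter and its own GIL
    with Pool(processes=4) as pool:
        displist = pool.map(solve_for_omegaratio, list_ind)
    print('wall clock time = ', time.time() - t0_wall)

With a CPU-bound function like the matrix power above, wall-clock time should now drop as workers are added, up to the number of physical cores.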