Multiprocessing doesn't run on all CPU cores - python

I have a parallel processing job in Python which reads data from a database, does some manipulation, and runs the Dijkstra algorithm:
t1 = 200101
t2 = 200229

import psutil
from multiprocess import Pool

pool = Pool(psutil.cpu_count(logical=False))

def graph_analysis(i):
    input_date = str(i)
    sql_data = """select trim(cast(p.Barcode as nvarchar(20))) Barcode, cast(s.invoiceid as nvarchar(20)) invoiceid
                  from sales s inner join Product_981115 p on s.productid = p.productid
                  where s.date = """ + input_date + """ and s.qty != 0 and p.sectionid != 1691.199 and s.RegionID = """ + input_region
    data = []
    for chunk in pd.read_sql(sql_data, conn, chunksize=1000000):
        data.append(chunk)
    data = pd.concat(data, ignore_index=True)
    data = data.merge(candid_sale_invoices)
    data = data.merge(candid_barcodes)
    final_edges_df = data.iloc[:, [2, 3, 4]]
    final_edges_tuples = [tuple(x) for x in final_edges_df.values]
    Gm = ig.Graph.TupleList(final_edges_tuples, directed=True, edge_attrs=['weight'])
    longest_paths = pd.DataFrame(Gm.shortest_paths_dijkstra(None, None, weights='weight'))
    longest_paths = longest_paths.swifter.apply(log_transform)
    longest_paths["Date"] = input_date
    longest_paths["RegionID"] = input_region
    return longest_paths

results = pool.map(graph_analysis, range(t1, t2 + 1))
pool.close()
results = pd.concat(results, ignore_index=True)
I ran this code a couple of days ago and it completed perfectly in parallel, utilizing almost all cores. However, when I run it today, the parallel processes seem to be created but the cores do not actually work in parallel.
The system has 128 GB of RAM and 32 cores, and nothing on it has changed since the last successful parallel run.
I restarted the system to rule out any transient issue, but the problem still exists.
So what could be the problem?
Thanks in advance.
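For reference, a common way to structure this kind of script is to define the worker first and create the pool only inside an if __name__ == '__main__': guard, so that worker processes can import the module without side effects. Below is a minimal sketch of that pattern only, using graph_analysis, t1 and t2 as defined above; it is not a confirmed fix for the problem described.

import psutil
from multiprocess import Pool

if __name__ == '__main__':
    # create the pool only in the main process, after graph_analysis is defined
    with Pool(psutil.cpu_count(logical=False)) as pool:
        results = pool.map(graph_analysis, range(t1, t2 + 1))
    results = pd.concat(results, ignore_index=True)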

Related

Converting from ThreadPool to ProcessPoolExecutor

I have the following code which I would like to convert from using ThreadPool to ProcessPoolExecutor, since it is all CPU-intensive calculation, and when I observe the CPU monitor I note that my 8-core processor is effectively only using a single core.
import datetime
from multiprocessing.dummy import Pool as ThreadPool

def thread_run(q, clients_credit_array, clients_terr_array,
               freq_small_list, freq_large_list, clients, year, admin):
    claim_id = []
    claim_client_id = []
    claim_company_id = []
    claim_year = []
    claim_type = []
    claim_closed = []
    claim_cnt = []
    claim_amount = []

    print(datetime.datetime.utcnow())
    i = 0
    client_cnt = 1000
    loop_incr = 8
    while i < client_cnt:
        ind_rng = range(i, min((i + loop_incr), (client_cnt)), 1)
        call_var = []
        for q in ind_rng:
            call_var.append((q,
                             clients_credit_array,
                             clients_terr_array,
                             freq_small_list,
                             freq_large_list,
                             clients,
                             year,
                             admin))
        pool = ThreadPool(len(call_var))
        results = pool.map(call_claim, call_var)
        pool.close()
        pool.join()
        for result in results:
            if result[0] == []:
                pass
            else:
                r = 0
                if r < len(result[0]):
                    claim_index += 1
                    claim_id.append(claim_index)
                    claim_client_id.append(result[0][r])
                    claim_company_id.append(result[1][r])
                    claim_year.append(result[2][r])
                    claim_type.append(result[3][r])
                    claim_closed.append(result[4][r])
                    claim_cnt.append(result[5][r])
                    claim_amount.append(result[6][r])
                    r += 1
        i += loop_incr
    print(datetime.datetime.utcnow())
The difficulty I am having, however, is that when I modify the code as follows, I get error messages:
from concurrent.futures import ProcessPoolExecutor as PThreadPool
pool = PThreadPool(max_workers=len(call_var))
#pool = ThreadPool(len(call_var))
results = pool.map(call_claim, call_var)
#pool.close()
#pool.join()
I had to remove the pool.close() and pool.join() calls because they generated errors. But once they were removed, my code was not utilizing parallel processors and it ran much longer and slower than originally. What am I missing?
As was pointed out in the comments, it is common to see an Executor used as a context manager, without any need for join or close operations. Below is a simplified example to illustrate the concepts.
Example:
import concurrent.futures
import random
import time
import os

values = [1, 2, 3, 4, 5]

def times_two(n):
    time.sleep(random.randrange(1, 5))
    print("pid:", os.getpid())
    return n * 2

def main():
    with concurrent.futures.ProcessPoolExecutor() as executor:
        results = executor.map(times_two, values)
        for one_result in results:
            print(one_result)

if __name__ == "__main__":
    main()
Output:
pid: 396
pid: 8904
pid: 25440
pid: 20592
pid: 14636
2
4
6
8
10
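Applied to the loop in the question, the substitution might look like the sketch below; call_claim and call_var are the names from the question, assumed unchanged. On Windows the surrounding code must run under an if __name__ == '__main__': guard and call_claim must be defined at module level so it can be pickled. Creating the executor once, outside the while loop, would also avoid repeatedly spawning worker processes.

from concurrent.futures import ProcessPoolExecutor

# ... inside the while loop, replacing the ThreadPool block ...
with ProcessPoolExecutor(max_workers=len(call_var)) as executor:
    results = list(executor.map(call_claim, call_var))
# leaving the "with" block waits for all workers, so no explicit
# close()/join() calls are needed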

Python multiprocessing context switching of CPUs

I have created this simple piece of code to check how multiprocessing reads from a global dictionary object:
import numpy as np
import multiprocessing as mp
import psutil
from itertools import repeat

def computations_x(max_int):
    # random selection
    mask_1 = np.random.randint(low=0, high=max_int, size=1000)
    mask_2 = np.random.randint(low=0, high=max_int, size=1000)
    exponent_1 = np.sqrt(np.pi)
    vector_1 = np.array([read_obj[k]**(exponent_1) for k in mask_1])
    vector_2 = np.array([read_obj[k]**np.pi for k in mask_2])
    result = []
    for j in range(100):
        res_col = []
        for i in range(100):
            c = np.multiply(vector_1, vector_2).sum(axis=0)
            res_col.append(c)
        res_col = np.array(res_col)
        result.append(res_col)
    result = np.array(result)
    return result

global read_obj
total_items = 40000
max_int = 1000
keys = np.arange(0, max_int)
number_processors = psutil.cpu_count(logical=False)
#number_used_processors = 1
number_used_processors = number_processors - 1
number_tasks = number_used_processors
read_obj = {k: np.random.rand(1000) for k in keys}
pool = mp.Pool(processes=number_used_processors)
args = list(repeat(max_int, number_tasks))
results = pool.map(computations_x, args)
pool.close()
pool.join()
However, when looking at CPU performance, I see that the CPUs are being switched by the OS while performing the computations. I am running on Ubuntu 18.04; is this normal behaviour when using Python's multiprocessing module? Here is what I observe in the system monitor while debugging the code (I am using Eclipse 2019 for debugging).
Any help is appreciated, as in my main project I need to share a global read-only object across processes in the same spirit as is done here, and I want to be sure this is not hurting performance badly; I also want to make sure all tasks are executed concurrently within the Pool class. Thanks.
I'd say that is normal behaviour, as the OS has to make sure that other processes are not starved of CPU time.
Here's a nice article on the OS scheduler basics: https://www.ardanlabs.com/blog/2018/08/scheduling-in-go-part1.html
It focuses on Golang, but the first part is pretty general.
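If the migration between cores ever turned out to matter (it usually does not), each worker could be pinned to a fixed core. This is only an illustration using psutil's cpu_affinity (available on Linux and Windows), not something the answer above requires:

import multiprocessing as mp
import psutil

def pinned_worker(core_id):
    # pin the current worker process to one core, then do some CPU-bound work
    psutil.Process().cpu_affinity([core_id])
    return core_id, sum(i * i for i in range(1_000_000))

if __name__ == '__main__':
    cores = list(range(psutil.cpu_count(logical=False)))
    with mp.Pool(len(cores)) as pool:
        print(pool.map(pinned_worker, cores))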

Why can't I free memory in Python, even using del and gc?

A lot of HDF5 files need to be processed; the processing involves computing ndarrays. Every HDF5 file is about 800 MB. I used a loop and a function to do this. However, memory is consumed rapidly.
I used del and gc.collect() to free memory, but it does not help. Also, a large amount of 'modified memory' is produced (I can see it in the Resource Monitor of Windows).
The software: Python 3.7, Jupyter Notebook, Windows 7.
import tensorflow as tf
import numpy as np
import pandas as pd
import os, glob, h5py, gc
from tqdm import tqdm_notebook as tqdm

def DLmodel(dataset, bands2, ratio):
    # a trained deep learning model is applied
    morm_para = pd.read_hdf(r'I:\HY_project\calibration\blackbodyDN\HY1C_oneline\H1C_OCT_0720_FrameNumber0_train_stats_' + bands2 + '.h5', key=bands2)
    norm_dataset = (dataset - morm_para['mean']) / morm_para['std']
    model = tf.keras.models.load_model(r'I:\HY_project\calibration\blackbodyDN\blackbody_model_' + bands2 + '.h5')
    simulated_value = model.predict(norm_dataset)
    mat = np.full((simulated_value.shape[0], 2), 0.0)
    simulated_value.shape = (simulated_value.shape[0])
    mat[:, 0] = simulated_value
    mat[:, 1] = simulated_value / ratio
    return mat, mat.shape[0], mat.shape[1]

def simulation(file, ratio):
    # processing one HDF5 file
    # 4 parameters
    bands = ['DN_412','DN_443','DN_490','DN_520','DN_565','DN_670','DN_750','DN_865']
    bands2 = ['Band412','Band443','Band490','Band520','Band565','Band670','Band750','Band865']
    gain = np.array([0.0129, 0.0118, 0.0107, 0.0115, 0.0101, 0.0077, 0.0047, 0.0026])
    offset = [0.013705, 0.013545, 0.012612, 0.01255, 0.01204, 0.0108225, 0.01032, 0.00589]
    # open file and groups
    h5 = h5py.File(file, 'a')
    try:
        extra = h5.create_group('Extra Data')
    except:
        extra = h5['Extra Data']
    try:
        smr = h5.create_group('scanline mean radiance')
    except:
        smr = h5['scanline mean radiance']
    # processing for every dataset node
    for j, band in enumerate(bands):
        # read dataset
        radiance = (np.mean(h5['Geophysical Data/' + band][:, :], axis=1) * gain[j] + offset[j]) * 1.0
        radiance.shape = (radiance.shape[0], 1)
        try:
            smr.create_dataset('scanline mean radiance_' + band[3:], (radiance.shape[0], radiance.shape[1]), dtype='f', data=radiance)
        except:
            pass
        sza = h5['Scan Line Attributes/Center Solar Zenith Angle'][:, :]
        dataset = pd.DataFrame(np.concatenate((sza, radiance), axis=1), columns=['sza', 'radiance'])
        dataset['sza'] = np.cos(dataset['sza'] * np.pi / 180.0)
        # a trained deep learning model is applied
        mat, row, column = DLmodel(dataset, bands2[j], ratio[j])
        try:
            extra.create_dataset('Ext_' + band[3:], (row, column), dtype='f', data=mat)
        except:
            continue
    h5.close()

if __name__ == '__main__':
    files = glob.glob(r'H:\HY1BL1A\DNsubBlack\H1B_OPER_OCT_L1A*.h5')
    pbar = tqdm(total=len(files), desc='process:')
    ratio = np.array([3.88372, 4.05084, 4.61682243, 3.9826, 4.31683, 5.09090, 5.08510, 7.34615])
    for i, file in enumerate(files):
        simulation(file, ratio)
        gc.collect()
        pbar.update(1)
    pbar.close()
I want to free enough memory after every loop iteration.
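One pattern worth trying (an assumption on my part, not something stated in the question) is to run each file's processing in a short-lived child process: when that process exits, the OS reclaims all of its memory, including whatever TensorFlow or h5py keep cached, regardless of what del or gc.collect() manage to release. A minimal sketch reusing the simulation function above; on Windows this works best when simulation lives in an importable .py module rather than a notebook cell.

import multiprocessing as mp

if __name__ == '__main__':
    files = glob.glob(r'H:\HY1BL1A\DNsubBlack\H1B_OPER_OCT_L1A*.h5')
    ratio = np.array([3.88372, 4.05084, 4.61682243, 3.9826, 4.31683,
                      5.09090, 5.08510, 7.34615])
    for file in files:
        # each file is handled in its own process; join() waits for it to
        # finish, and its memory is returned to the OS when it exits
        p = mp.Process(target=simulation, args=(file, ratio))
        p.start()
        p.join()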

Lack of scaling for python's multiprocessing pool

I am writing a simple Python script that I need to scale to many threads. For simplicity, I have replaced the actual function I need to use with a matrix-matrix multiply. I am having trouble getting my code to scale with the number of processors. Any advice to help me get the correct speedup would be helpful! My code and results are as follows:
import numpy as np
import time
import math
from multiprocessing.dummy import Pool

res = 4
# we must iterate over all of these values
wavektests = np.linspace(.1, 2.5, res)
omegaratios = np.linspace(.1, 2.5, res)
wavekmat, omegamat = np.meshgrid(wavektests, omegaratios)

def solve_for_omegaratio(ind):
    # obtain the indices for this run
    x_ind = ind % res
    y_ind = math.floor(ind / res)
    # obtain the value for this run
    wavek = wavektests[x_ind]
    omega = omegaratios[y_ind]
    # do some work (I have replaced the real function with this)
    randmat = np.random.rand(4000, 4000)
    nop = np.linalg.matrix_power(randmat, 3)
    # obtain a scalar value
    value = x_ind + y_ind**2.0
    return value

list_ind = range(res**2)

# Serial code execution
t0_proc = time.clock()
t0_wall = time.time()
threads = 0
dispersion = map(solve_for_omegaratio, list_ind)
displist = list(dispersion)
t1_proc = time.clock()
t1_wall = time.time()
print('serial execution')
print('wall clock time = ', t1_wall - t0_wall)
print('processor clock time = ', t1_proc - t0_proc)
print('------------------------------------------------')

# Using pool defaults
t0_proc = time.clock()
t0_wall = time.time()
if __name__ == '__main__':
    pool = Pool()
    dispersion = pool.map(solve_for_omegaratio, list_ind)
    displist = list(dispersion)
t1_proc = time.clock()
t1_wall = time.time()
pool.close
print('num of threads = default')
print('wall clock time = ', t1_wall - t0_wall)
print('processor clock time = ', t1_proc - t0_proc)
print('------------------------------------------------')

# Using 4 threads
t0_proc = time.clock()
t0_wall = time.time()
threads = 4
if __name__ == '__main__':
    pool = Pool(threads)
    dispersion = pool.map(solve_for_omegaratio, list_ind)
    displist = list(dispersion)
t1_proc = time.clock()
t1_wall = time.time()
pool.close
print('num of threads = ' + str(threads))
print('wall clock time = ', t1_wall - t0_wall)
print('processor clock time = ', t1_proc - t0_proc)
print('------------------------------------------------')
Results:
serial execution
wall clock time = 66.1561758518219
processor clock time = 129.16376499999998
------------------------------------------------
num of threads = default
wall clock time = 81.86436200141907
processor clock time = 263.45369
------------------------------------------------
num of threads = 4
wall clock time = 77.63390111923218
processor clock time = 260.66285300000004
------------------------------------------------
Because Python has a GIL (https://wiki.python.org/moin/GlobalInterpreterLock), "Python-native" threads can't execute truly concurrently and thus can't improve the performance of CPU-bound tasks like math. They can be used to parallelize IO-bound tasks effectively (e.g. API calls that spend almost all their time waiting on network I/O). Using multiprocessing itself rather than dummy's thread-backed implementation will create multiple processes, not threads, and those can run concurrently (at the cost of significant memory overhead).
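Concretely, that means swapping the thread-backed pool for a process-backed one. A minimal sketch against the question's code follows; the map call itself is unchanged, and on Windows/macOS the pool should be created under the if __name__ == '__main__': guard so the workers can re-import the module cleanly.

from multiprocessing import Pool   # real processes, not multiprocessing.dummy threads

if __name__ == '__main__':
    with Pool() as pool:           # defaults to os.cpu_count() workers
        dispersion = pool.map(solve_for_omegaratio, list_ind)
    displist = list(dispersion)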

Django - how to make a complex math annotation (k Nearest Neighbors)

I have this model:
class Image(models.Model):
    title = models.CharField(max_length=200)
    image = models.ImageField(upload_to='img/')
    signature = models.TextField(null=True)
The signature is a one-dimensional numpy vector encoded as JSON. In order to run my query, I have to decode each object's signature into an nparray, compute a dot product between each object's signature and a given vector, then annotate the result as a float field (named "score") beside each row. Lastly, I have to order from max to min.
I tried this in view.py
def image_sorted(request):
    query_signature = extract_feat(settings.MEDIA_ROOT + "/cache" + "/003_ant_image_0003.jpg")  # a NParray object
    image_list = Image.objects.annotate(score=np.dot(
        JSONVectConverter.json_to_vect(F('signature')), query_signature.T
    ).astype(float)).order_by('score')  # JSONVectConverter is a class of mine
    return render(request, 'images/sorted.html', {'image_sorted': image_list})
Of course it doesn't work. I think the F() operator is out of scope here...
If you're wondering, I'm writing an image retrieval webapp for my university thesis.
Thank you.
EDIT:
I found this question, which is quite the same problem (they use Postgres instead of MySQL).
EDIT 2: I just remembered the solution I finally adopted! First I pull every vector out of the DB and keep it in RAM, then I do some simple computations to find the k-nearest neighbors. Then I retrieve the corresponding images from the DB using their indices (primary keys). So I decouple this task from the Django ORM. Here's the code (from the REST API):
def query_over_db(query_signature, page):
    query_signature = np.array(query_signature)
    t0 = time.time()
    descriptor_matrix = cache.get('descriptor_matrix')
    id_vector = cache.get('id_vector')
    if not descriptor_matrix:
        id_vector = []
        descriptor_matrix = []
        images_dict = Image.objects.all().values('id', 'signature')
        for image in images_dict:
            s = image['signature']
            descriptor = np.array(s)
            descriptor_matrix.append(descriptor)
            id_vector.append(image['id'])
        cache.set('id_vector', id_vector)
        cache.set('descriptor_matrix', descriptor_matrix)
    t1 = time.time()
    print("time to pull out the descriptors : " + str(t1 - t0))
    t1 = time.time()
    #result = np.abs(np.dot(descriptor_matrix, query_signature.T))
    #result = np.sum((descriptor_matrix - query_signature)**2, axis=1)
    result = ne.evaluate('sum((descriptor_matrix - query_signature)**2, axis=1)')
    t2 = time.time()
    print("time to calculate similarity: " + str(t2 - t1))
    perm = np.argsort(result)[(page - 1) * 30:page * 30]
    print(perm.shape)
    print(len(id_vector))
    perm_id = np.array(id_vector)[perm]
    print(len(perm_id))
    print("printing sort")
    print(np.sort(result)[0])
    t4 = time.time()
    print("time to order the result: " + str(t4 - t2))
    qs = Image.objects.defer('signature').filter(id__in=perm_id.tolist())
    qs_new = []
    for i in range(len(perm_id)):
        qs_new.append(qs.get(id=perm_id[i]))
    t3 = time.time()
    print("time to get the results from the DB : " + str(t3 - t2))
    print("total time : " + str(t3 - t0))
    print(result[perm])
    return qs_new
I haven't tried anything quite this complex; however, I've solved a similar issue here:
Combining Django F, Value and a dict to annotate a queryset
I haven't tried this but you could give it a go:
from django.db.models import Case, When, FloatField

query_signature = extract_feat(settings.MEDIA_ROOT + "/cache" + "/003_ant_image_0003.jpg")  # a NParray object
value_dict = {}
for image in Image.objects.all():
    value_dict[image.signature] = np.dot(
        JSONVectConverter.json_to_vect(image.signature),
        query_signature.T
    ).astype(float)

whens = [
    When(signature=k, then=v) for k, v in value_dict.items()
]

qs = Image.objects.all().annotate(
    score=Case(
        *whens,
        default=0,
        output_field=FloatField()
    )
).order_by('score')
Hope it helps
So that's the final working code:
def image_sorted(request):
    query_signature = extract_feat(settings.MEDIA_ROOT + "/cache" + "/001_accordion_image_0001.jpg")  # a NParray object
    #query_signature = extract_feat(settings.MEDIA_ROOT + "/cache" + "/003_ant_image_0003.jpg")  # a NParray object
    value_dict = {}
    for image in Image.objects.all():
        S = image.signature
        value_dict[image.signature] = np.dot(
            JSONVectConverter.json_to_vect(S),
            query_signature.T
        ).astype(float)
    whens = [
        When(signature=k, then=v) for k, v in value_dict.items()
    ]
    qs = Image.objects.all().annotate(
        score=Case(
            *whens,
            default=0,
            output_field=FloatField()
        )
    ).order_by('-score')
    for image in qs:
        print(image.score)
    return render(request, 'images/sorted.html', {'image_sorted': qs})
Thanks to Omar for helping me! Of course I'm still here if there are finer solutions.
