I have a problem with the multiprocessing code below: after running it in a Jupyter notebook, the program freezes and a kernel restart is required. The data variable contains a 60x2001 DataFrame; some of this data is extracted into dict_results, with part of the data used as the dictionary key and the score variable as the value.
import multiprocessing as mp
from tqdm import tqdm  # missing in the original snippet

def search_regression_loop(data, row, dict_results):
    for row2 in range(2000):
        if row != row2:
            score = linear_model_pred_iloc(data.iloc[:, :51], row, row2)
            if (score > 0.30) and (score != 1):
                if (data.iloc[row, 54] is not None) and (data.iloc[row2, 54] is not None):
                    name = str(data.iloc[row, 54]) + ' - ' + str(data.iloc[row2, 54])
                    dict_results[name] = score
                else:
                    name = str(data.iloc[row].name) + ' - ' + str(data.iloc[row2].name)
                    dict_results[name] = score

manager = mp.Manager()
dict_results = manager.dict()
procs = []
for row in tqdm(range(2000)):
    p = mp.Process(target=search_regression_loop, args=(tumor, row, dict_results))
    procs.append(p)
    p.start()
for p in procs:
    p.join()
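For reference: starting 2,000 interpreter processes at once, each inheriting the notebook state, is typically what exhausts a Jupyter kernel. Below is a minimal sketch of the same fan-out with a bounded Pool; it assumes linear_model_pred_iloc and the tumor DataFrame from the question, and a fork start method so workers inherit tumor as a global. It also simplifies the key to the row-name variant.

import multiprocessing as mp

def search_row(row):
    # Worker returns its own partial dict instead of writing into a
    # shared Manager dict, so no proxy traffic is needed.
    results = {}
    for row2 in range(2000):
        if row == row2:
            continue
        score = linear_model_pred_iloc(tumor.iloc[:, :51], row, row2)
        if (score > 0.30) and (score != 1):
            key = str(tumor.iloc[row].name) + ' - ' + str(tumor.iloc[row2].name)
            results[key] = score
    return results

if __name__ == '__main__':
    dict_results = {}
    with mp.Pool(mp.cpu_count()) as pool:
        # imap_unordered merges partial results as workers finish
        for partial in pool.imap_unordered(search_row, range(2000)):
            dict_results.update(partial)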
Related
I have a function main_fun() with 3 arguments (and it returns a pandas DataFrame).
I use multiprocessing on it within a for loop:
def main_fun(ar1, ar2, ar3):
    # Do something
    df = ar1.copy()
    return df

process_list = []
for i in glob.glob(FolderPath + "/*.csv"):
    for j in ['File1', 'File2']:
        if (j == 'File1'): j_tmp = 'File1_XXX'
        else: j_tmp = j
        if j_tmp in i.upper():
            p = mp.Process(target=main_fun, args=(i, j, FolderPath + '/This_Is_Fixed.csv'))
            p.start()
            process_list.append(p)
for process in process_list:
    process.join()
With the above, I can get the job done.
However, if I want to collect each DataFrame result and concat them after multiprocessing, how should I modify this for loop?
Here is the answer, with a simple example:
import multiprocessing as mp
import pandas as pd

def foo(name, name2):
    d = {'col1': f'{name}', 'col2': f'{name2}'}
    df = pd.DataFrame(data=[d])
    return df

arg1 = []
arg2 = []
for i in ['bob1_XX', 'steve_XX', 'andy_XX']:
    for j in ["BOB", "ANDY"]:
        if (j == 'BOB'): j_tmp = 'BOB1'
        else: j_tmp = j
        if j_tmp in i.upper():
            arg1.append(i)
            arg2.append(j)

with mp.Pool(processes=(mp.cpu_count() - 1)) as pool:
    data = pool.starmap(foo, zip(arg1, arg2))

result = pd.concat(data, axis=0)
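Applied to the original loop, the same pattern collects the argument tuples first and then fans out with starmap; the results come back in input order, so the concat is deterministic. A sketch, assuming main_fun and FolderPath from the question:

import glob
import multiprocessing as mp
import pandas as pd

args = []
for i in glob.glob(FolderPath + "/*.csv"):
    for j in ['File1', 'File2']:
        j_tmp = 'File1_XXX' if j == 'File1' else j
        if j_tmp in i.upper():
            args.append((i, j, FolderPath + '/This_Is_Fixed.csv'))

with mp.Pool(processes=(mp.cpu_count() - 1)) as pool:
    frames = pool.starmap(main_fun, args)  # one DataFrame per tuple, in order

combined = pd.concat(frames, axis=0, ignore_index=True)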
SYSTEM
Linux (Manjaro KDE)
Python 3.8.3
PROGRAM:
I have incoming string data on a UDP port. The main loop spools up the processes before using selectors to monitor the UDP port. I want the UDP data, which is constantly updated, to be available to each process.
TRIED:
Multiprocessing Queues with maxsize = 1, which became a headache and quickly broke down.
Multiprocessing Arrays (this is where I'm at now)
I have checked, and the Array at each location I'm looking at has the same memory address (I think). For whatever reason, when I try to access the contents of the Array in the child process, the process hangs.
NOT TRIED
Pipes. I have a feeling this may be the way to go, but I'm already deep in uncharted territory and I've never used them before.
WHAT I WANT
I would like to access the UDP data from the child processes (the camera_view method).
Dummy UDP string
import socket
import random
import datetime
import time

conn = ('127.0.0.1', 6666)

def rand_value(f_val, t_val):
    return round(random.uniform(f_val, t_val), 2)

sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)

while True:
    time.sleep(6)
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    overlay = timestamp
    for i in range(9):
        val = rand_value(i * 10, i * 10 + 10)
        if i == 8:
            val = 'TASK: Im the real Batman'
        overlay = overlay + "," + str(val)
    print(overlay)
    sock.sendto(overlay.encode(), conn)
My Program
import datetime
import selectors
import socket
import time
from multiprocessing import Lock, Process, Queue
from multiprocessing.sharedctypes import Array
from ctypes import c_char_p

import cv2  # missing in the original snippet; needed for Camera

REQUIRED_CAMERAS = 1
CAMERA_CONN = {'name': ['Colour Camera'], 'ip': ['127.0.0.1'], 'port': [9000]}
OVERLAY_CONN = ('0.0.0.0', 6666)
CONTROL_CONN = ('0.0.0.0', 6667)
NUMBER_OF_ITEMS_IN_OVERLAY = 10

class Camera():
    def __init__(self, cam_name, cam_ip, cam_port):
        self.ip = cam_ip
        self.port = cam_port
        self.capture = cv2.VideoCapture(0)
        self.frame_width = int(self.capture.get(3))
        self.frame_height = int(self.capture.get(4))
        self.name = cam_name

def get_overlay(data_packet):
    data = data_packet.decode()
    data = data.split(',')
    field0 = data[0]
    field1 = 'KP: ' + str(round(float(data[1]), 3))
    field2 = 'DCC: ' + str(round(float(data[2]), 2)) + 'm'
    field3 = 'E: ' + str(round(float(data[3]), 2)) + 'm'
    field4 = 'N: ' + str(round(float(data[4]), 2)) + 'm'
    field5 = 'D: ' + str(round(float(data[5]), 2)) + 'm'
    field6 = 'H: ' + str(round(float(data[6]), 2))  # + '°'
    field7 = 'R: ' + str(round(float(data[7]), 2))  # + '°'
    field8 = 'P: ' + str(round(float(data[8]), 2))  # + '°'
    field9 = data[9]
    x = []
    for i in range(NUMBER_OF_ITEMS_IN_OVERLAY):
        x.append(eval('field' + str(i)).encode())
        # if i == 0:
        #     print(x[i])
    return x

def socket_reader(sock, mask, q, REQUIRED_CAMERAS, overlay):
    data_packet, sensor_ip = sock.recvfrom(1024)
    sensor_port = sock.getsockname()[1]
    print(f'SENSOR PORT {sensor_port} and SENSOR_IP {sensor_ip}')
    if sensor_port == OVERLAY_CONN[1]:
        x = get_overlay(data_packet)
        for i in range(len(x)):
            overlay[i] = x[i]
        print(f'Socket Reader {overlay}')

def camera_view(CAMERA_CONN, cam_name, camera, overlay_q, control_q, overlay):
    while True:
        print(f'PROCESS {camera} RUNNING FOR: {cam_name}')
        try:
            print(f'Camera View {overlay}')
            for i in range(len(overlay)):
                print(overlay[i])
        except:
            pass
        time.sleep(1)

def controller(REQUIRED_CAMERAS, CAMERA_CONN, OVERLAY_CONN, CONTROL_CONN):
    if REQUIRED_CAMERAS > len(CAMERA_CONN['name']):
        print(f'REQUIRED_CAMERAS: {REQUIRED_CAMERAS} - more than connections in CAMERA_CONN')
    else:
        # Set up a UDP connection for the overlay string and the control commands
        sock_overlay = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        sock_control = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        sock_overlay.bind(OVERLAY_CONN)
        sock_control.bind(CONTROL_CONN)

        # Set up the selector to watch over the socket
        # and trigger when data is ready for reading
        sel = selectors.DefaultSelector()
        sel.register(fileobj=sock_overlay, events=selectors.EVENT_READ, data=socket_reader)
        sel.register(fileobj=sock_control, events=selectors.EVENT_READ, data=socket_reader)

        # create shared memory
        overlay_q = Queue(maxsize=1)
        control_q = Queue(maxsize=1)
        overlay = Array(c_char_p, range(NUMBER_OF_ITEMS_IN_OVERLAY))
        print(f'Init Overlay {overlay}')

        # Generate the processes; one per camera
        processes = []
        for camera in range(REQUIRED_CAMERAS):
            processes.append(Process(target=camera_view,
                                     args=(CAMERA_CONN, CAMERA_CONN['name'][camera],
                                           camera, overlay_q, control_q, overlay)))
        for process in processes:
            process.daemon = True
            process.start()

        # Spin over the selector
        while True:
            # Only have one connection registered, so to stop
            # the loop spinning up the CPU, I have made it blocking
            # with the timeout = 1 (sec) instead of =0.
            events = sel.select(timeout=None)
            for key, mask in events:
                # the selector callback is the data= from the register above
                callback = key.data
                # the callback gets the sock, mask and the sensor queues
                if key.fileobj == sock_overlay:
                    callback(key.fileobj, mask, overlay_q, REQUIRED_CAMERAS, overlay)
                else:
                    callback(key.fileobj, mask, control_q, REQUIRED_CAMERAS, overlay)

if __name__ == "__main__":
    controller(REQUIRED_CAMERAS, CAMERA_CONN, OVERLAY_CONN, CONTROL_CONN)
EDIT1:
from multiprocessing import Process, Array
from ctypes import c_char_p
import time

def worker(arr):
    count = 0
    while True:
        count += 1
        val = 'val' + str(count)
        arr[0] = val
        print(arr[:])
        time.sleep(2)

def main():
    arr = Array(c_char_p, 1)
    p = Process(target=worker, args=(arr,))
    p.daemon = True
    p.start()
    while True:
        print(arr[:])
        try:
            print(arr[:].decode('utf-8'))
        except:
            pass
        # try:
        #     val = arr[:]
        #     val = val.decode('utf-8')
        #     print(f'main {val}')
        # except:
        #     pass
        time.sleep(1)

if __name__ == "__main__":
    main()

'''
from multiprocessing import Process, Array
from ctypes import c_char_p
import time

def worker(arr):
    count = 0
    while True:
        count += 1
        val = 'val' + str(count)
        arr[0] = bytes(val, 'utf-8')
        print(arr[:])
        time.sleep(2)

def main():
    arr = Array(c_char_p, 1)
    p = Process(target=worker, args=(arr,))
    p.daemon = True
    p.start()
    while True:
        print(arr[:])
        try:
            print(arr[:].decode('utf-8'))
        except:
            pass
        time.sleep(1)

if __name__ == "__main__":
    main()
'''
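For the record, the hang is consistent with how c_char_p behaves in shared memory: the Array slot stores a raw pointer, and an address assigned in one process is meaningless in another, so reads from the other side are undefined. A fixed-size byte buffer sidesteps this; here is a minimal sketch using a plain c_char array (the 64-byte size is an arbitrary choice):

from multiprocessing import Process, Array
import ctypes
import time

def worker(buf):
    count = 0
    while True:
        count += 1
        # Store raw bytes in the shared buffer instead of a pointer
        buf.value = ('val' + str(count)).encode('utf-8')
        time.sleep(2)

def main():
    # An Array of c_char exposes .value / .raw; 64 bytes is an arbitrary cap
    buf = Array(ctypes.c_char, 64)
    p = Process(target=worker, args=(buf,))
    p.daemon = True
    p.start()
    while True:
        print('main:', buf.value.decode('utf-8'))
        time.sleep(1)

if __name__ == "__main__":
    main()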
EDIT2:
Thanks to @RolandSmith, I have persevered with Queues and I think I have a template for how I can move forward. See the code below. If I can't get this to work in the program, I'll be back here.
from multiprocessing import Process, Queue
import time
import datetime

def worker(camera, q):
    val = ''
    while True:
        if q.full() == True:
            val = q.get()
        else:
            val = val
        print(f'WORKER{camera} {val}')
        time.sleep(0.2)

def main():
    cameras = 2
    processes = []
    queues = []
    for camera in range(cameras):
        queues.append(Queue(maxsize=1))
        processes.append(Process(target=worker, args=(camera, queues[camera])))
    for process in processes:
        process.daemon = True
        process.start()
    while True:
        for q in queues:
            if not q.empty():
                try:
                    _ = q.get()
                except:
                    pass
            else:
                q.put(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
        time.sleep(.5)

if __name__ == "__main__":
    main()
In my view, using a Queue is a less error-prone solution than using an Array.
Here is your second example, converted to use a Queue:
from multiprocessing import Process, Queue
import time

def worker(q):
    count = 0
    while True:
        count += 1
        val = 'val' + str(count)
        q.put(val)
        print('worker:', val)
        time.sleep(2)

def main():
    q = Queue()
    p = Process(target=worker, args=(q, ))
    p.daemon = True
    p.start()
    while True:
        if not q.empty():
            print('main:', q.get())
        time.sleep(1)

if __name__ == "__main__":
    main()
This yields:
> python3 test3.py
worker: val1
main: val1
worker: val2
main: val2
worker: val3
main: val3
worker: val4
main: val4
worker: val5
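As a side note on the loop above: instead of polling q.empty() once a second, the consumer can block on get() with a timeout and react as soon as data arrives. A sketch of the same pair with a blocking get (queue.Empty from the stdlib is the exception a multiprocessing queue raises on timeout):

from multiprocessing import Process, Queue
import queue  # stdlib module, only for the Empty exception
import time

def worker(q):
    count = 0
    while True:
        count += 1
        q.put('val' + str(count))
        time.sleep(2)

def main():
    q = Queue()
    p = Process(target=worker, args=(q, ))
    p.daemon = True
    p.start()
    while True:
        try:
            # Block for up to 5 s instead of polling q.empty()
            print('main:', q.get(timeout=5))
        except queue.Empty:
            pass

if __name__ == "__main__":
    main()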
Here is the same example using a Pipe:
from multiprocessing import Process, Pipe
import time

def worker(p):
    count = 0
    while True:
        count += 1
        val = 'val' + str(count)
        p.send(val)
        print('worker:', val)
        time.sleep(2)

def main():
    child, parent = Pipe()
    p = Process(target=worker, args=(child, ))
    p.daemon = True
    p.start()
    while True:
        if parent.poll():
            print('main:', parent.recv())
        time.sleep(1)

if __name__ == "__main__":
    main()
This produces the same result as the previous example.
Additionally, by default a pipe is bidirectional.
So you could also send back data from the workers to the parent.
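A sketch of that two-way traffic, keeping the child/parent naming from the example above (the 'stop' command is made up for illustration):

from multiprocessing import Process, Pipe

def worker(conn):
    # Worker answers requests sent by the parent over the same pipe
    while True:
        msg = conn.recv()
        if msg == 'stop':  # hypothetical shutdown command
            break
        conn.send('echo: ' + msg)

def main():
    child, parent = Pipe()
    p = Process(target=worker, args=(child, ))
    p.start()
    parent.send('hello')
    print('main:', parent.recv())  # -> echo: hello
    parent.send('stop')
    p.join()

if __name__ == "__main__":
    main()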
How can a non-parallelised version of this code run much faster than the parallel version, when I am using a Manager object to share a list across processes? I did this to avoid serialisation, and I don't need to edit the list.
I return an 800,000-row data set from Oracle, convert it to a list, and store it in shared memory using Manager.list().
I am iterating over each column in the query results, in parallel, to obtain some statistics (I know I could do it in SQL).
Main code:
import cx_Oracle
import csv
import os
import glob
import datetime
import multiprocessing as mp
import get_column_stats as gs
import pandas as pd
import pandas.io.sql as psql

def get_data():
    print("Starting Job: " + str(datetime.datetime.now()))
    manager = mp.Manager()

    # Step 1: Init multiprocessing.Pool()
    pool = mp.Pool(mp.cpu_count())
    print("CPU Count: " + str(mp.cpu_count()))

    dsn_tns = cx_Oracle.makedsn('myserver.net', '1521', service_name='PARIELGX')
    con = cx_Oracle.connect(user='fred', password='password123', dsn=dsn_tns)

    stats_results = [["OWNER", "TABLE", "COLUMN_NAME", "RECORD_COUNT", "DISTINCT_VALUES",
                      "MIN_LENGTH", "MAX_LENGTH", "MIN_VAL", "MAX_VAL"]]

    sql = "SELECT * FROM ARIEL.DIM_REGISTRATION_SET"
    cur = con.cursor()
    print("Start Executing SQL: " + str(datetime.datetime.now()))
    cur.execute(sql)
    print("End SQL Execution: " + str(datetime.datetime.now()))

    print("Start SQL Fetch: " + str(datetime.datetime.now()))
    rs = cur.fetchall()
    print("End SQL Fetch: " + str(datetime.datetime.now()))

    print("Start Creation of Shared Memory List: " + str(datetime.datetime.now()))
    lrs = manager.list(list(rs))  # shared memory list
    print("End Creation of Shared Memory List: " + str(datetime.datetime.now()))

    col_names = []
    for field in cur.description:
        col_names.append(field[0])

    # print(col_names)
    # print('-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-')
    # print(rs)
    # print('-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-')
    # print(lrs)

    col_index = 0
    print("Start In-Memory Iteration of Dataset: " + str(datetime.datetime.now()))

    # we go through every field
    for field in cur.description:
        col_names.append(field[0])

    # start at column 0
    col_index = 0

    # iterate through each column, to gather stats from each column using parallelisation
    pool_results = pool.map_async(gs.get_column_stats_rs,
                                  [(lrs, col_name, col_names) for col_name in col_names]).get()
    for i in pool_results:
        stats_results.append(i)

    # Step 3: Don't forget to close
    pool.close()

    print("End In-Memory Iteration of Dataset: " + str(datetime.datetime.now()))
    # end filename for

    cur.close()

    outfile = open('C:\\jupyter\\Experiment\\stats_dim_registration_set.csv', 'w')
    writer = csv.writer(outfile, quoting=csv.QUOTE_ALL, lineterminator='\n')
    writer.writerows(stats_results)
    outfile.close()
    print("Ending Job: " + str(datetime.datetime.now()))

get_data()
Code being called in parallel:
import sys  # missing in the original snippet

def get_column_stats_rs(args):
    # rs is a list recordset of the results
    rs, col_name, col_names = args
    col_index = col_names.index(col_name)

    sys.stdout = open("col_" + col_name + ".out", "a")

    print("Starting Iteration of Column: " + col_name)
    max_length = 0
    min_length = 100000  # arbitrarily large number!!
    max_value = ""
    min_value = "zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"  # arbitrarily "large" string!!
    distinct_value_count = 0
    has_values = False  # does the column have any non-null values
    has_null_values = False
    row_count = 0

    # create a dictionary into which we can add the individual items present in each row of data
    # a dictionary will not let us add the same value more than once, so we can simply count the
    # dictionary values at the end
    distinct_values = {}

    row_index = 0

    # go through every row, for the current column being processed, to gather the stats
    for val in rs:
        row_value = val[col_index]
        row_count += 1

        if row_value is None:
            value_length = 0
        else:
            value_length = len(str(row_value))

        if value_length > max_length:
            max_length = value_length

        if value_length < min_length:
            if value_length > 0:
                min_length = value_length

        if row_value is not None:
            if str(row_value) > max_value:
                max_value = str(row_value)
            if str(row_value) < min_value:
                min_value = str(row_value)

        # capture distinct values
        if row_value is None:
            row_value = "Null"
            has_null_values = True
        else:
            has_values = True
            distinct_values[row_value] = 1

        row_index += 1
    # end row for

    distinct_value_count = len(distinct_values)

    if has_values == False:
        distinct_value_count = None
        min_length = None
        max_length = None
        min_value = None
        max_value = None
    elif has_null_values == True and distinct_value_count > 0:
        distinct_value_count -= 1

    if min_length == 0 and max_length > 0 and has_values == True:
        min_length = max_length

    print("Ending Iteration of Column: " + col_name)

    return ["ARIEL", "DIM_REGISTRATION_SET", col_name, row_count, distinct_value_count,
            min_length, max_length, strip_crlf(str(min_value)), strip_crlf(str(max_value))]
Helper function:
def strip_crlf(value):
    return value.replace('\n', ' ').replace('\r', '')
I am using a Manager.list() object to share state across the processes:
lrs = manager.list(list(rs)) # shared memory list
and I am passing the list to the map_async() method:
pool_results = pool.map_async(gs.get_column_stats_rs, [(lrs, col_name, col_names) for col_name in col_names]).get()
Manager overhead adds to your runtime. Also, you're not actually using shared memory here; you're only using a multiprocessing manager, which is known to be slower than shared memory or a single-threaded implementation. If you don't need synchronisation in your code, meaning that you're not modifying the shared data, skip the manager and use shared memory objects directly.
https://docs.python.org/3.7/library/multiprocessing.html
Server process managers are more flexible than using shared memory
objects because they can be made to support arbitrary object types.
Also, a single manager can be shared by processes on different
computers over a network. They are, however, slower than using shared
memory.
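One way to skip the manager on a fork-based platform is to hand the recordset to each worker once via the Pool initializer, so every worker reads a plain process-local list instead of proxying each row through the manager. A minimal sketch under that assumption (column_stats is a hypothetical stand-in for get_column_stats_rs, minus the list argument):

import multiprocessing as mp

rs_global = None  # populated once per worker by the initializer

def init_worker(rs):
    # On fork this is effectively free; on spawn, rs is pickled once per
    # worker rather than once per proxied access.
    global rs_global
    rs_global = rs

def column_stats(args):
    col_name, col_names = args
    # hypothetical: same body as get_column_stats_rs, reading rs_global
    return [col_name, len(rs_global)]

if __name__ == '__main__':
    rs = [("a", 1), ("b", 2)]  # stand-in for cur.fetchall()
    col_names = ["NAME", "ID"]
    with mp.Pool(mp.cpu_count(), initializer=init_worker, initargs=(rs,)) as pool:
        results = pool.map(column_stats, [(c, col_names) for c in col_names])
    print(results)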
My code looks like the following. It seems to be "hanging" during the proc.join() loop. If I create the DataFrame df with 10 records, the whole process completes quickly, but starting at 10000 (as shown), the program seems to just hang. I am using htop to look at the CPU core usage, and I do see all of them spike to 100%, but long after they go back down to 0% the program still doesn't continue. Any ideas on what I'm doing wrong?
import pandas as pd
import numpy as np
import multiprocessing
from multiprocessing import Process, Queue

def do_something(df, partition, q):
    for index in partition:
        q.put([v for v in df.iloc[index]])

def start_parallel_processing(df, partitions):
    q = Queue()
    procs = []
    results = []

    for partition in partitions:
        proc = Process(target=do_something, args=(df, partition, q))
        proc.start()
        procs.extend([proc])

    for i in range(len(partitions)):
        results.append(q.get(True))

    for proc in procs:
        proc.join()

    return results

num_cpus = multiprocessing.cpu_count()
df = pd.DataFrame([(x, x+1) for x in range(10000)], columns=['x', 'y'])
partitions = np.array_split(df.index, num_cpus)
results = start_parallel_processing(df, partitions)
len(results)
It appears a plain multiprocessing.Queue doesn't behave as you want here; it wasn't made for this kind of sharing between multiple processes. Instead you must use Manager.Queue().
I have added some prints to understand your code flow.
You can still polish your code to use Pool() instead of managing num_cpus processes yourself; a sketch of that follows the code and output below.
import pandas as pd
import numpy as np
import multiprocessing
import pprint
from multiprocessing import Process, Queue, Manager

def do_something(df, partition, q):
    # print("do_something " + str(len(partition)) + " times")
    for index in partition:
        # print(index)
        for v in df.iloc[index]:
            # print("sending v to queue: " + str(len(df.iloc[index])))
            q.put(v, False)
    print("task_done(), qsize is " + str(q.qsize()))

def start_parallel_processing(df, partitions):
    m = Manager()
    q = m.Queue()
    procs = []
    results = []
    print("START: launching " + str(len(partitions)) + " process(es)")
    index = 0
    for partition in partitions:
        print("launching " + str(len(partitions)) + " process")
        proc = Process(target=do_something, args=(df, partition, q))
        procs.extend([proc])
        proc.start()
        index += 1
        print("launched " + str(index) + "/" + str(len(partitions)) + " process(es)")

    while True:
        try:
            results.append(q.get(block=False))
        except:
            print("QUEUE END")
            break

    print(pprint.pformat(results))

    process_count = 0
    for proc in procs:
        process_count += 1
        print("joining " + str(process_count) + "/" + str(len(procs)) + " process(es)")
        proc.join()

    return results

num_cpus = multiprocessing.cpu_count()
df = pd.DataFrame([(x, x+1) for x in range(10000)], columns=['x', 'y'])
partitions = np.array_split(df.index, num_cpus)
results = start_parallel_processing(df, partitions)
print("len(results) is: " + str(len(results)))
I am trying to optimize this code; as of right now it runs 340 requests in 10 minutes, and I am trying to get to 1800 requests in 30 minutes, since per the Amazon API I can run a request every second. Can I use multithreading with this code to increase the number of runs?
However, I am reading the full data in the main function; should I split it now, and how can I figure out how much each thread should take?
def newhmac():
    return hmac.new(AWS_SECRET_ACCESS_KEY, digestmod=sha256)

def getSignedUrl(params):
    hmac = newhmac()
    action = 'GET'
    server = "webservices.amazon.com"
    path = "/onca/xml"
    params['Version'] = '2013-08-01'
    params['AWSAccessKeyId'] = AWS_ACCESS_KEY_ID
    params['Service'] = 'AWSECommerceService'
    params['Timestamp'] = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
    key_values = [(urllib.quote(k), urllib.quote(v)) for k, v in params.items()]
    key_values.sort()
    paramstring = '&'.join(['%s=%s' % (k, v) for k, v in key_values])
    urlstring = "http://" + server + path + "?" + \
        ('&'.join(['%s=%s' % (k, v) for k, v in key_values]))
    hmac.update(action + "\n" + server + "\n" + path + "\n" + paramstring)
    urlstring = urlstring + "&Signature=" + \
        urllib.quote(base64.encodestring(hmac.digest()).strip())
    return urlstring

def readData():
    data = []
    with open("ASIN.csv") as f:
        reader = csv.reader(f)
        for row in reader:
            data.append(row[0])
    return data

def writeData(data):
    with open("data.csv", "a") as f:
        writer = csv.writer(f)
        writer.writerows(data)

def main():
    data = readData()
    filtData = []
    i = 0
    count = 0
    while (i < len(data) - 10):
        if (count % 4 == 0):
            time.sleep(1)
        asins = ','.join([data[x] for x in range(i, i + 10)])
        params = {'ResponseGroup': 'OfferFull,Offers',
                  'AssociateTag': '4chin-20',
                  'Operation': 'ItemLookup',
                  'IdType': 'ASIN',
                  'ItemId': asins}
        url = getSignedUrl(params)
        resp = requests.get(url)
        responseSoup = BeautifulSoup(resp.text)
        quantity = ['' if product.amount is None else product.amount.text for product in responseSoup.findAll("offersummary")]
        price = ['' if product.lowestnewprice is None else product.lowestnewprice.formattedprice.text for product in responseSoup.findAll("offersummary")]
        prime = ['' if product.iseligibleforprime is None else product.iseligibleforprime.text for product in responseSoup("offer")]
        for zz in zip(asins.split(","), price, quantity, prime):
            print zz
            filtData.append(zz)
        print i, len(filtData)
        i += 10
        count += 1
    writeData(filtData)
    threading.Timer(1.0, main).start()
If you are using Python 3.2+ you can use the concurrent.futures library to make it easy to launch tasks in multiple threads. For example, here I am simulating 10 URL-parsing jobs in parallel, each of which takes 1 second; run synchronously they would have taken 10 seconds, but with a thread pool of 10 workers they take about 1 second:
import time
from concurrent.futures import ThreadPoolExecutor

def parse_url(url):
    time.sleep(1)
    print(url)
    return "done."

st = time.time()
with ThreadPoolExecutor(max_workers=10) as executor:
    for i in range(10):
        future = executor.submit(parse_url, "http://google.com/%s" % i)
print("total time: %s" % (time.time() - st))
Output:
http://google.com/0
http://google.com/1
http://google.com/2
http://google.com/3
http://google.com/4
http://google.com/5
http://google.com/6
http://google.com/7
http://google.com/8
http://google.com/9
total time: 1.0066466331481934
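On the splitting question: you generally don't have to decide how much each thread takes; submit one task per batch of 10 ASINs and let the pool schedule them. A hedged sketch in Python 3 syntax, assuming getSignedUrl and readData from the question; note that a thread pool alone does not enforce the one-request-per-second limit, so some throttling is still needed (the per-task sleep below is only a crude stand-in):

import time
from concurrent.futures import ThreadPoolExecutor
import requests

def fetch_batch(asins):
    # asins: a list of up to 10 IDs; getSignedUrl is assumed from the question
    url = getSignedUrl({'ResponseGroup': 'OfferFull,Offers',
                        'AssociateTag': '4chin-20',
                        'Operation': 'ItemLookup',
                        'IdType': 'ASIN',
                        'ItemId': ','.join(asins)})
    time.sleep(1)  # crude per-task pause toward the 1 req/s limit
    return requests.get(url).text

data = readData()  # assumed from the question
batches = [data[i:i + 10] for i in range(0, len(data), 10)]

with ThreadPoolExecutor(max_workers=4) as executor:
    # map returns responses in batch order, ready for parsing
    results = list(executor.map(fetch_batch, batches))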