Python Multiprocessing Jits - python

I was trying to execute a multiprocessing program like this:
recQ = eventQ = Queue()

def _getRecords():
    files = os.listdir(LOGDIR)
    for file in files:
        fileHeader = open(os.path.join(LOGDIR, file))
        reader = logf.LogfileReader(fileHeader)
        data = defaultdict(lambda: defaultdict(int))
        for record in fileHeader.readlines():
            recQ.put(record)
    recQ.put('Done')

def _makeEvents():
    while(True):
        rec = recQ.get()
        if not rec == 'Done':
            e = event._getOject(rec)
            eventQ.put(e)
        else: break
    eventQ.put('Done')

def _updateSQL():
    while(True):
        data = eventQ.get()
        if not data == 'Done':
            event = eventQ.get()
            sql._updateEvent(event)
        else: break

def _trigger():
    processes = []
    processes.append(Process(target = _getRecords(), args = ()))
    processes.append(Process(target = _makeEvents(), args = ()))
    processes.append(Process(target = _updateSQL(), args = ()))
    for process in processes:
        process.start()

if __name__ == '__main__':
    _trigger()
Now the problem is that when I start all the processes from the for loop, they get executed one after the other. I've tried a similar example on some simpler data and it works fine. Can anyone tell me what's wrong with my code?
Thanks in advance :)
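A likely culprit, the same one called out in the "python for loop in parallel" answer at the bottom of this page, is that Process(target = _getRecords(), args = ()) calls _getRecords() immediately in the parent process and hands its return value to Process, so each stage runs to completion before the next Process is even created. A minimal sketch of a corrected _trigger (function objects passed, not called); note that on platforms without fork() the module-level queues would also need to be created in the parent and passed as arguments, as the Queue.get() answer further down explains:

def _trigger():
    processes = []
    # Pass the function objects themselves; calling them here would run
    # each stage sequentially in the parent process.
    processes.append(Process(target=_getRecords))
    processes.append(Process(target=_makeEvents))
    processes.append(Process(target=_updateSQL))
    for process in processes:
        process.start()
    for process in processes:
        process.join()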

Related

python multiprocess queue with infinity loop

I create 3 processes and want the function write1 to write the values 'A', 'B', 'C' to queue1, the function read1 to read values from queue1 and put them into queue2, and, at the same time, the function read2 to read values from queue2. But the values B and C can't be read from queue2 in time before the process finishes.
from multiprocessing import Process, Queue, Manager, Pool, Lock
import os, time, random

# write data to queue 1
def write1(q1, lock):
    lock.acquire()
    for value in ['A', 'B', 'C']:
        print('Put %s to queue111...%s' % (value, str(os.getpid())))
        q1.put(value)
        time.sleep(1)
    lock.release()

# read data from queue 1 and write it to queue 2
def read1(q1, q2, lock):
    lock.acquire()
    while True:
        time.sleep(1)
        value = q1.get()
        # if value is None: break
        print('Get %s from queue111.%s' % (value, str(os.getpid())))
        q2.put(value)
        print('Put %s to queue222...%s' % (value, str(os.getpid())))
    lock.release()

def read2(q2, lock):
    lock.acquire()
    while True:
        # if not q2.empty() or not q1.empty():
        time.sleep(2)
        value = q2.get(True)
        print('Get %s from queue222.%s' % (value, os.getpid()))
    lock.release()

if __name__ == '__main__':
    manager = Manager()
    # The parent process creates the Queues and passes them to the child processes:
    q1 = manager.Queue()
    q2 = manager.Queue()
    lock1 = manager.Lock()
    lock2 = manager.Lock()
    lock3 = manager.Lock()
    start = time.time()
    p = Pool()
    # pw = p.apply_async(write1, args=(q1, lock1,))
    pw = Process(target=write1, args=(q1, lock1,))
    # time.sleep(0.5)
    # pr = p.apply_async(read1, args=(q1, q2, lock2,))
    # pr2 = p.apply_async(read2, args=(q2, lock3))
    pr = Process(target=read1, args=(q1, q2, lock2,))
    pr2 = Process(target=read2, args=(q2, lock3,))
    pw.start()
    pr.start()
    pr2.start()
    # p.close()
    # p.join()
    pw.join()
    pr.terminate()
    pr2.terminate()
    end = time.time()
    # print
    print('finished!!')
    print(end - start)
the output is:
Put A to queue111...77678
Put B to queue111...77678
Get A from queue111.77680
Put A to queue222...77680
Put C to queue111...77678
Get A from queue222.77681
Get B from queue111.77680
Put B to queue222...77680
Get C from queue111.77680
Put C to queue222...77680
finished!!
3.025238275527954
You can’t use terminate to control a system like this: it races with completing the actual work. Instead, make your loops not be infinite, probably by using a sentinel value in each Queue (as in one commented-out line).
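A minimal sketch of that sentinel approach, keeping the question's names but dropping the locks (each lock is held around an entire loop, which serializes the pipeline) and replacing terminate() with join():

import os, time
from multiprocessing import Process, Manager

def write1(q1):
    for value in ['A', 'B', 'C']:
        print('Put %s to queue111...%s' % (value, os.getpid()))
        q1.put(value)
        time.sleep(1)
    q1.put(None)              # sentinel: no more data

def read1(q1, q2):
    while True:
        value = q1.get()
        if value is None:     # as in the commented-out line: stop on the sentinel
            q2.put(None)      # and pass it downstream
            break
        print('Get %s from queue111.%s' % (value, os.getpid()))
        q2.put(value)

def read2(q2):
    while True:
        value = q2.get()
        if value is None:
            break
        print('Get %s from queue222.%s' % (value, os.getpid()))

if __name__ == '__main__':
    manager = Manager()
    q1 = manager.Queue()
    q2 = manager.Queue()
    start = time.time()
    pw = Process(target=write1, args=(q1,))
    pr = Process(target=read1, args=(q1, q2))
    pr2 = Process(target=read2, args=(q2,))
    for p in (pw, pr, pr2):
        p.start()
    for p in (pw, pr, pr2):
        p.join()              # wait for real completion instead of terminate()
    print('finished!!')
    print(time.time() - start)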

What is the most efficient way of sending a sequence of different data types with send_multipart() in ZMQ?

I am trying to use ZeroMQ for multiprocessing. I want to stream files from a tar file so I used the streamer.
Below is an example of what I want to do.
import time
import zmq
from zmq.devices.basedevice import ProcessDevice
from multiprocessing import Process

def server(frontend_port, number_of_workers):
    context = zmq.Context()
    socket = context.socket(zmq.PUSH)
    socket.connect("tcp://127.0.0.1:%d" % frontend_port)
    for i in range(0, 10):
        socket.send_json('#%s' % i)
    for i in range(number_of_workers):
        socket.send_json('STOP')
    return True

def worker(work_num, backend_port):
    context = zmq.Context()
    socket = context.socket(zmq.PULL)
    socket.connect("tcp://127.0.0.1:%d" % backend_port)
    while True:
        message = socket.recv_json()
        if message == 'STOP':
            break
        print("Worker #%s got message! %s" % (work_num, message))
        time.sleep(1)

def main():
    frontend_port = 7559
    backend_port = 7560
    number_of_workers = 2
    streamerdevice = ProcessDevice(zmq.STREAMER, zmq.PULL, zmq.PUSH)
    streamerdevice.bind_in("tcp://127.0.0.1:%d" % frontend_port)
    streamerdevice.bind_out("tcp://127.0.0.1:%d" % backend_port)
    streamerdevice.setsockopt_in(zmq.IDENTITY, b'PULL')
    streamerdevice.setsockopt_out(zmq.IDENTITY, b'PUSH')
    streamerdevice.start()
    processes = []
    for work_num in range(number_of_workers):
        w = Process(target=worker, args=(work_num, backend_port))
        processes.append(w)
        w.start()
    time.sleep(1)
    s = Process(target=server, args=(frontend_port, number_of_workers))
    s.start()
    # server(frontend_port)
    s.join()
    for w in processes:
        w.join()

if __name__ == '__main__':
    main()
This code works properly. But I want to use send_multipart() to send a tuple or a list that includes items with different types, like [string, numpy_array, integer], and json can't handle numpy arrays. I am avoiding pickle because I need this to be as fast as possible. I also tried converting the array to bytes, but it didn't work (maybe I was doing it wrong, I'm not sure).
I would appreciate a working snippet of code.
Ideally, I want to do something like this:
socket.send_multipart([string, numpy_array, integer])
So I want to know what is the most efficient way of doing it.
I am using Python 3.6
msgpack and msgpack_numpy are the best option I could find.
Try this:
import time
import zmq
from zmq.devices.basedevice import ProcessDevice
from multiprocessing import Process
import numpy as np
import msgpack
import msgpack_numpy as m

def server(frontend_port, number_of_workers):
    context = zmq.Context()
    socket = context.socket(zmq.PUSH)
    socket.connect("tcp://127.0.0.1:%d" % frontend_port)
    for i in range(0, 10):
        arr = np.array([[[i, i], [i, i]], [[i, i], [i, i]]])
        file_name = 'image file name or any other string'
        number = 10  # just an instance of an integer
        msg = msgpack.packb((arr, number, file_name), default=m.encode, use_bin_type=True)
        socket.send(msg, copy=False)
        time.sleep(1)
    for i in range(number_of_workers):
        msg = msgpack.packb((b'STOP', b'STOP'), default=m.encode, use_bin_type=True)
        socket.send(msg, copy=False)
    return True

def worker(work_num, backend_port):
    context = zmq.Context()
    socket = context.socket(zmq.PULL)
    socket.connect("tcp://127.0.0.1:%d" % backend_port)
    while True:
        task = socket.recv()
        task = msgpack.unpackb(task, object_hook=m.decode, use_list=False, max_bin_len=50000000, raw=False)
        if task[1] == b'STOP':
            break
        (arr, number, file_name) = task
        print("Worker ", work_num, 'got message!', file_name)
    return True

def main():
    m.patch()
    frontend_port = 3559
    backend_port = 3560
    number_of_workers = 2
    streamerdevice = ProcessDevice(zmq.STREAMER, zmq.PULL, zmq.PUSH)
    streamerdevice.bind_in("tcp://127.0.0.1:%d" % frontend_port)
    streamerdevice.bind_out("tcp://127.0.0.1:%d" % backend_port)
    streamerdevice.setsockopt_in(zmq.IDENTITY, b'PULL')
    streamerdevice.setsockopt_out(zmq.IDENTITY, b'PUSH')
    streamerdevice.start()
    processes = []
    for work_num in range(number_of_workers):
        w = Process(target=worker, args=(work_num, backend_port))
        processes.append(w)
        w.start()
    time.sleep(1)
    s = Process(target=server, args=(frontend_port, number_of_workers))
    s.start()
    s.join()
    for w in processes:
        w.join()

if __name__ == '__main__':
    main()
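If you specifically want separate frames via send_multipart() rather than a single msgpack blob, one alternative (not part of the answer above, just a sketch with illustrative helper names) is to send the array's raw buffer as one frame plus a small JSON header describing its dtype and shape:

import json
import numpy as np
# assumes the same PUSH/PULL sockets as in the example above

def send_array_multipart(socket, text, arr, integer):
    # Frame 0: the string, frame 1: a small JSON header, frame 2: the raw array bytes.
    header = json.dumps({'dtype': str(arr.dtype),
                         'shape': arr.shape,
                         'integer': integer}).encode()
    socket.send_multipart([text.encode(), header,
                           np.ascontiguousarray(arr)], copy=False)

def recv_array_multipart(socket):
    text, header, buf = socket.recv_multipart()
    meta = json.loads(header.decode())
    # frombuffer returns a read-only view; call .copy() if you need to modify it.
    arr = np.frombuffer(buf, dtype=meta['dtype']).reshape(meta['shape'])
    return text.decode(), arr, meta['integer']

Whether this beats msgpack_numpy depends on your array sizes; for large arrays the zero-copy frame usually helps, but measuring both on your own data is the only reliable way to know.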

Multiprocessing show multiple progress bars

For my program, I have a file that writes random integers to a .CSV file.
from __future__ import absolute_import, division, print_function
from numpy.random import randint as randrange
import os, argparse, time
from tqdm import tqdm

def write_to_csv(filename, *args, newline = True):
    write_string = ''
    for arg in args:
        if type(arg) == list:
            for i in arg:
                write_string += str(i) + ','
        else:
            write_string += str(arg) + ','
    if newline:
        write_string = write_string.rstrip(',') + '\n'
    else:
        write_string = write_string.rstrip(',')
    with open(filename+'.csv', 'a') as file:
        file.write(write_string)

def move_dir(dirname, parent = False):
    if not parent:
        dirname = str(dirname)
        exists = os.path.isfile(dirname)
        try:
            os.mkdir(dirname)
            os.chdir(dirname)
        except FileExistsError:
            os.chdir(dirname)
    else:
        os.chdir("..")

def calculate_probability(odds, exitmode = False, low_cpu = 0):
    try:
        file_count = 0
        move_dir('Probability')
        move_dir(str(odds))
        d = {}
        writelist = []
        percentlist = []
        for i in tqdm(range(odds)):
            d[str(i)] = 0
            writelist.append(f'Times {i}')
            percentlist.append(f'Percent {i}')
        while True:
            if os.path.isfile(str(file_count)+'.csv'):
                file_count += 1
            else:
                break
        filename = str(file_count)
        write_to_csv(filename, 'Number', 'Value')
        rep = 500 * odds
        if rep > 10000:
            rep = 10000
        for i in tqdm(range(rep)):
            ran = randrange(odds)
            ran = int(ran)
            d[str(ran)] += 1
            if i == 999:
                write_to_csv(filename, i, ran+1, newline = False)
            else:
                write_to_csv(filename, i, ran+1)
            if low_cpu:
                time.sleep(0.01*float(low_cpu))
        writelist2 = []
        percentlist2 = []
        for i in tqdm(range(odds)):
            val = d[str(i)]
            writelist2.append(val)
            percentlist2.append(round(((val/rep)*100), 2))
        if os.path.isfile('runs.csv'):
            write_to_csv('runs', file_count, writelist2, percentlist2)
        else:
            write_to_csv('runs', 'Run #', writelist, percentlist)
            write_to_csv('runs', file_count, writelist2, percentlist2)
        if exitmode:
            exit()
    except(KeyboardInterrupt, SystemExit):
        if exitmode:
            os.remove(str(file_count)+'.csv')
            exit()
        else:
            try:
                os.system('cls')
                print('User/program interrupted, launching shutdown mode...')
                os.remove(str(file_count)+'.csv')
                print('Finalizing current trial...')
                os.chdir("..")
                os.chdir("..")
            except FileNotFoundError:
                exit()
            calculate_probability(odds, exitmode = True)
I also have a repetition system to do this multiple times.
def run_tests(times, odds, low_cpu = 0, shutdown = False):
    for i in tqdm(range(times)):
        calculate_probability(odds, low_cpu = low_cpu)
        os.chdir("..")
        os.chdir("..")
    if shutdown:
        os.system('shutdown /S /F /T 0 /hybrid')
However, if I were to run, say, 30 trials, it would take forever. So I decided to use the multiprocessing module to speed up the process. Because each run needs to write to the same file at the end, I had to collect the data and write it after the processes ended.
def calculate_probability(odds, low_cpu = 0):
    try:
        file_count = 0
        move_dir('Probability')
        move_dir(str(odds))
        d = {}
        writelist = []
        percentlist = []
        for i in tqdm(range(odds)):
            d[str(i)] = 0
            writelist.append(f'Times {i}')
            percentlist.append(f'Percent {i}')
        while True:
            if os.path.isfile(str(file_count)+'.csv'):
                file_count += 1
            else:
                break
        filename = str(file_count)
        write_to_csv(filename, 'Number', 'Value')
        rep = 500 * odds
        if rep > 10000:
            rep = 10000
        for i in range(rep):
            ran = randrange(odds)
            ran = int(ran)
            d[str(ran)] += 1
            if i == 999:
                write_to_csv(filename, i, ran+1, newline = False)
            else:
                write_to_csv(filename, i, ran+1)
            if low_cpu:
                time.sleep(0.01*float(low_cpu))
        writelist2 = []
        percentlist2 = []
        for i in range(odds):
            val = d[str(i)]
            writelist2.append(val)
            percentlist2.append(round(((val/rep)*100), 2))
        return (writelist, percentlist, writelist2, percentlist2)
    except(KeyboardInterrupt, SystemExit):
        try:
            os.remove(str(file_count)+'.csv')
        finally:
            exit()

def worker(odds, returndict, num, low_cpu = 0):
    returndict[f'write{num}'] = calculate_probability(odds, low_cpu = low_cpu)
    os.chdir("..")
    os.chdir("..")
    os.system('cls')

def run_tests(times, odds, low_cpu = 0, shutdown = False):
    print('Starting...')
    manager = Manager()
    return_dict = manager.dict()
    job_list = []
    for i in range(times):
        p = Process(target=worker, args=(odds, return_dict, i), kwargs = {'low_cpu' : low_cpu})
        job_list.append(p)
        p.start()
    try:
        for proc in job_list:
            proc.join()
    except KeyboardInterrupt:
        print('User quit program...')
        time.sleep(5)
        for proc in job_list:
            proc.join()
        exit()
    else:
        move_dir('Probability')
        move_dir(str(odds))
        if not os.path.isfile('runs.csv'):
            write_to_csv('runs', return_dict.values()[0][0], return_dict.values()[0][1])
        for value in return_dict.values():
            write_to_csv('runs', value[2], value[3])
        print('Done!')
    finally:
        if shutdown:
            os.system('shutdown /S /F /T 0 /hybrid')
However, when I run this new code there is only one progress bar, and each process overwrites it, so the bar flashes with random numbers, making it useless. I want a stack of bars, one for each process, each updating without interrupting the others. The bars do not need to be ordered; I just need an idea of how fast each process is doing its task.
STDOUT is just a stream, and all of your processes are attached to the same one, so there's no direct way to tell it to print the output from different processes on different lines.
Probably the simplest way to achieve this would be to have a separate process that is responsible for aggregating the status of all the other processes and reporting the results. You can use a multiprocessing.Queue to pass data from the worker processes to the status process, and the status process can print the status to stdout. If you want a stack of progress bars, you'll have to get a little creative with the formatting (essentially update all the progress bars at the same time and print them in the same order so they appear to stack up).
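A minimal sketch of that aggregation idea (names and bar formatting are illustrative; each worker pushes (id, done, total) ticks onto a shared queue and a single reporter process redraws all bars in a fixed order):

import time
from multiprocessing import Process, Queue

def worker(worker_id, total, progress_q):
    # Stand-in for the real work: report a tick after each unit.
    for i in range(total):
        time.sleep(0.01)
        progress_q.put((worker_id, i + 1, total))
    progress_q.put((worker_id, None, total))   # signal: this worker is finished

def status(num_workers, progress_q):
    done = [0] * num_workers
    finished = 0
    while finished < num_workers:
        worker_id, count, total = progress_q.get()
        if count is None:
            finished += 1
            continue
        done[worker_id] = count
        # Redraw every bar in the same order so they appear to stack up.
        # A real version would throttle redraws or rewrite lines in place.
        lines = []
        for w, c in enumerate(done):
            filled = int(20 * c / total)
            lines.append('worker %d [%-20s] %3d/%d' % (w, '#' * filled, c, total))
        print('\n'.join(lines) + '\n')

if __name__ == '__main__':
    progress_q = Queue()
    workers = [Process(target=worker, args=(i, 100, progress_q)) for i in range(3)]
    reporter = Process(target=status, args=(len(workers), progress_q))
    reporter.start()
    for w in workers:
        w.start()
    for w in workers:
        w.join()
    reporter.join()

tqdm can also draw several bars itself via its position argument if each process writes to the terminal directly, but the queue-plus-reporter layout above matches the answer's suggestion.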

Multiprocessing Queue.get() hangs

I'm trying to implement basic multiprocessing and I've run into an issue. The python script is attached below.
import time, sys, random, threading
from multiprocessing import Process
from Queue import Queue
from FrequencyAnalysis import FrequencyStore, AnalyzeFrequency

append_queue = Queue(10)
database = FrequencyStore()

def add_to_append_queue(_list):
    append_queue.put(_list)

def process_append_queue():
    while True:
        item = append_queue.get()
        database.append(item)
        print("Appended to database in %.4f seconds" % database.append_time)
        append_queue.task_done()
    return

def main():
    database.load_db()
    print("Database loaded in %.4f seconds" % database.load_time)
    append_queue_process = Process(target=process_append_queue)
    append_queue_process.daemon = True
    append_queue_process.start()
    #t = threading.Thread(target=process_append_queue)
    #t.daemon = True
    #t.start()
    while True:
        path = raw_input("file: ")
        if path == "exit":
            break
        a = AnalyzeFrequency(path)
        a.analyze()
        print("Analyzed file in %.4f seconds" % a._time)
        add_to_append_queue(a.get_results())
    append_queue.join()
    #append_queue_process.join()
    database.save_db()
    print("Database saved in %.4f seconds" % database.save_time)
    sys.exit(0)

if __name__=="__main__":
    main()
The AnalyzeFrequency analyzes the frequencies of words in a file and get_results() returns a sorted list of said words and frequencies. The list is very large, perhaps 10000 items.
This list is then passed to the add_to_append_queue method, which adds it to a queue. process_append_queue takes the items one by one and adds the frequencies to a "database". This operation takes a bit longer than the actual analysis in main(), so I am trying to use a separate process for this method. When I try to do this with the threading module, everything works perfectly fine, with no errors. When I try to use Process, the script hangs at item = append_queue.get().
Could someone please explain what is happening here, and perhaps direct me toward a fix?
All answers appreciated!
UPDATE
The pickle error was my fault, it was just a typo. Now I am using the Queue class within multiprocessing but the append_queue.get() method still hangs.
NEW CODE
import time, sys, random
from multiprocessing import Process, Queue
from FrequencyAnalysis import FrequencyStore, AnalyzeFrequency

append_queue = Queue()
database = FrequencyStore()

def add_to_append_queue(_list):
    append_queue.put(_list)

def process_append_queue():
    while True:
        database.append(append_queue.get())
        print("Appended to database in %.4f seconds" % database.append_time)
    return

def main():
    database.load_db()
    print("Database loaded in %.4f seconds" % database.load_time)
    append_queue_process = Process(target=process_append_queue)
    append_queue_process.daemon = True
    append_queue_process.start()
    #t = threading.Thread(target=process_append_queue)
    #t.daemon = True
    #t.start()
    while True:
        path = raw_input("file: ")
        if path == "exit":
            break
        a = AnalyzeFrequency(path)
        a.analyze()
        print("Analyzed file in %.4f seconds" % a._time)
        add_to_append_queue(a.get_results())
    #append_queue.join()
    #append_queue_process.join()
    print str(append_queue.qsize())
    database.save_db()
    print("Database saved in %.4f seconds" % database.save_time)
    sys.exit(0)

if __name__=="__main__":
    main()
UPDATE 2
This is the database code:
class FrequencyStore:
    def __init__(self):
        self.sorter = Sorter()
        self.db = {}
        self.load_time = -1
        self.save_time = -1
        self.append_time = -1
        self.sort_time = -1

    def load_db(self):
        start_time = time.time()
        try:
            file = open("results.txt", 'r')
        except:
            raise IOError
        self.db = {}
        for line in file:
            word, count = line.strip("\n").split("=")
            self.db[word] = int(count)
        file.close()
        self.load_time = time.time() - start_time

    def save_db(self):
        start_time = time.time()
        _db = []
        for key in self.db:
            _db.append([key, self.db[key]])
        _db = self.sort(_db)
        try:
            file = open("results.txt", 'w')
        except:
            raise IOError
        file.truncate(0)
        for x in _db:
            file.write(x[0] + "=" + str(x[1]) + "\n")
        file.close()
        self.save_time = time.time() - start_time

    def create_sorted_db(self):
        _temp_db = []
        for key in self.db:
            _temp_db.append([key, self.db[key]])
        _temp_db = self.sort(_temp_db)
        _temp_db.reverse()
        return _temp_db

    def get_db(self):
        return self.db

    def sort(self, _list):
        start_time = time.time()
        _list = self.sorter.mergesort(_list)
        _list.reverse()
        self.sort_time = time.time() - start_time
        return _list

    def append(self, _list):
        start_time = time.time()
        for x in _list:
            if x[0] not in self.db:
                self.db[x[0]] = x[1]
            else:
                self.db[x[0]] += x[1]
        self.append_time = time.time() - start_time
Comments suggest you're trying to run this on Windows. As I said in a comment,
If you're running this on Windows, it can't work - Windows doesn't
have fork(), so each process gets its own Queue and they have nothing
to do with each other. The entire module is imported "from scratch" by
each process on Windows. You'll need to create the Queue in main(),
and pass it as an argument to the worker function.
Here's fleshing out what you need to do to make it portable, although I removed all the database stuff because it's irrelevant to the problems you've described so far. I also removed the daemon fiddling, because that's usually just a lazy way to avoid shutting down things cleanly, and often as not will come back to bite you later:
def process_append_queue(append_queue):
    while True:
        x = append_queue.get()
        if x is None:
            break
        print("processed %d" % x)
    print("worker done")

def main():
    import multiprocessing as mp
    append_queue = mp.Queue(10)
    append_queue_process = mp.Process(target=process_append_queue, args=(append_queue,))
    append_queue_process.start()
    for i in range(100):
        append_queue.put(i)
    append_queue.put(None)  # tell worker we're done
    append_queue_process.join()

if __name__=="__main__":
    main()
The output is the "obvious" stuff:
processed 0
processed 1
processed 2
processed 3
processed 4
...
processed 96
processed 97
processed 98
processed 99
worker done
Note: because Windows doesn't (can't) fork(), it's impossible for worker processes to inherit any Python object on Windows. Each process runs the entire program from its start. That's why your original program couldn't work: each process created its own Queue, wholly unrelated to the Queue in the other process. In the approach shown above, only the main process creates a Queue, and the main process passes it (as an argument) to the worker process.
queue.Queue is thread-safe, but doesn't work across processes. This is quite easy to fix, though. Instead of:
from multiprocessing import Process
from Queue import Queue
You want:
from multiprocessing import Process, Queue

python for loop in parallel

I am trying to read data from an input file and, for each line, perform a task in a while loop. The problem is that when I create the first process, its loop executes and never returns control to the outer for loop. The bottom line is that there is no parallelism. What am I doing wrong?
Here is the relevant code:
from multiprocessing import Process

def work_line(list1Line, jobId):
    while True:
        print list1Line
        tenant = list1Line[0]
        module = list1Line[1]
        endTime = int(time.time())
        startTime = endTime - startTimeDelta
        generate(jobId, startTime, endTime, tenantServiceAddress, tenant, module)
        print ("tenant {} will sleep for {} seconds").format(tenant, sleepBetweenLoops)
        time.sleep(sleepBetweenLoops)

def openFiles():
    file = open(CLOUD_INPUT_FILE, 'r')
    lines = file.readlines()
    file.close()
    linesLen = len(lines)
    processes = []
    for linesIndex in range(0, linesLen):
        jobId = GenerateRandomID()
        line = lines[linesIndex]
        list1Line = line.split()
        p = Process(target=work_line(list1Line, jobId))
        p.start()
        processes.append(p)
    print processes
    for p in processes:
        p.join()

if __name__ == '__main__':
    CLOUD_INPUT_FILE = r'C:\CF\input_file.txt'
    tenantServiceAddress = 'address.address'
    startTimeDelta = 300
    sleepBetweenLoops = 1800
    print multiprocessing.cpu_count()
    openFiles()
You are actually calling the function instead of passing it. Change it to
p = Process(target=work_line, args=(list1Line,jobId))
