So I'm using processes and a queue to search through data and find the rows that have the same entry in a different column. I decided to use multiprocessing to try to make it scale for large data. The file has 1000 lines and 10 points of data per line. When I read in only 80 lines of the data, the program stalls. With 70 lines it works fine, and at a decent speed too.
My question is: what am I doing wrong, or are there limitations to this approach that I haven't identified? The code isn't perfect by any means and is probably bad in itself. The code is as follows:
from multiprocessing import Process, Queue
import random

def openFile(file_name, k, division):
    i = 0
    dataSet = []
    with open(file_name) as f:
        for line in f:
            stripLine = line.strip('\n')
            splitLine = stripLine.split(division)
            dataSet += [splitLine]
            i += 1
            if(i == k):
                break
    return(dataSet)

def setCombination(q, data1, data2):
    newData = []
    for i in range(0, len(data1)):
        for j in range(0, len(data2)):
            if(data1[i][1] == data2[j][3]):
                newData += data2[j]
    q.put(newData)

if __name__ == '__main__':
    # Takes in the file, the length of the data to read in, and how the data is divided.
    data = openFile('testing.txt', 80, ' ')
    for i in range(len(data)):
        for j in range(len(data[i])):
            try:
                data[i][j] = float(data[i][j])
            except ValueError:
                pass
    #print(data)
    k = len(data)//10
    q = Queue()
    processes = [Process(target=setCombination, args=(q, data[k*x: k + k*x], data))
                 for x in range(10)]
    for p in processes:
        p.start()
    # Exit the completed processes
    for p in processes:
        p.join()
    saleSet = [q.get() for p in processes]
    print('\n', saleSet)
The data file testing.txt
It appears that something about what your code does is causing a deadlock. While experimenting, I noticed that 3 out of the 10 tasks would never terminate, but, to be honest, I don't really know the reason(s) why. (The multiprocessing documentation does warn that joining a process before all the items it has put on a Queue have been consumed can deadlock, which matches the behaviour here.)
The good news is it's easy to fix by just removing or disabling the
# Exit the completed processes
for p in processes:
    p.join()
loop you have in your code.
Here's a complete version of your code with (mostly) just that modification in it:
from multiprocessing import Process, Queue

def openFile(file_name, k, division):
    i = 0
    dataSet = []
    with open(file_name) as f:
        for line in f:
            stripLine = line.strip('\n')
            splitLine = stripLine.split(division)
            dataSet += [splitLine]
            i += 1
            if i == k:
                break
    return dataSet

def setCombination(q, data1, data2):
    newData = []
    for i in range(len(data1)):
        for j in range(len(data2)):
            if data1[i][1] == data2[j][3]:
                newData += data2[j]
    q.put(newData)

if __name__ == '__main__':
    # Takes in the file, the length of the data to read in, and how the data is divided.
    data = openFile('testing.txt', 80, ' ')
    for i in range(len(data)):
        for j in range(len(data[i])):
            try:
                data[i][j] = float(data[i][j])
            except ValueError:
                pass
    k = len(data) // 10
    q = Queue()
    processes = [Process(target=setCombination, args=(q, data[k*x: k*x+k], data))
                 for x in range(10)]
    for p in processes:
        p.start()

    # NO LONGER USED (HANGS)
    # # Exit the completed processes
    # for p in processes:
    #     p.join()

    # note: this works since by default, get() will block until it can retrieve something
    saleSet = [q.get() for _ in processes]  # a queue item should be added by each Process
    print('\n', saleSet)
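If you do want the explicit joins back, the safe ordering is to drain the queue first and only join afterwards. Here is a generic, self-contained sketch of that pattern (the worker and the squaring are placeholders, not the question's logic):

from multiprocessing import Process, Queue

def worker(q, n):
    # Each worker puts exactly one result; the parent must drain before joining.
    q.put(n * n)

if __name__ == '__main__':
    q = Queue()
    procs = [Process(target=worker, args=(q, x)) for x in range(10)]
    for p in procs:
        p.start()
    results = [q.get() for _ in procs]  # drain first...
    for p in procs:
        p.join()                        # ...then joining cannot deadlock
    print(results)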
Related
I have a multidimensional array which needs to be processed by an imported function. (I am using Jupyter Notebook, so I exported the function to an ipynb and imported it again.) The function takes a 1-dimensional array as its argument.
#Function
def calculatespi(datagrid, q):
    date_time = datagrid['time'][:]
    gridvalue = datagrid.values
    if np.isnan(np.sum(gridvalue)) != True:
        df_precip = pd.DataFrame({"Date": date_time, "precip": gridvalue})
        spi_prc = spi.SPI()
        spi3_grid = spi_prc.calculate(df_precip, 'Date', 'precip', freq = 'M', scale = 3, fit_type ="lmom", dist_type="gam")
        spi3 = spi3_grid['precip_scale_3_calculated_index'].values
    else:
        spi3 = np.empty((489))
        spi3[:] = np.nan
    q.put(spi3)
#Main Notebook
if __name__ == "__main__":
    spipi = []
    processes = []
    for x in range(3):
        for y in range(3):
            q = multiprocessing.Queue()
            p = multiprocessing.Process(target=calculatespi, args=(prcoba[:, x, y], q))
            p.start()
            processes.append(p)
            spipi.append(q.get())
    for process in processes:
        process.join()
After hundreds of attempts, I can finally retrieve the results, but it takes longer than running it without multiprocessing. What should I do?
Using concurrent.futures.ProcessPoolExecutor makes things much easier.
First, in calculatespi, replace q.put(spi3) with return spi3 and remove the q parameter. Then the "main" code can be written as
#Main Notebook
if __name__ == "__main__":
    from concurrent.futures import ProcessPoolExecutor

    args = []
    for x in range(3):
        for y in range(3):
            args.append(prcoba[:, x, y])

    with ProcessPoolExecutor() as executor:
        spipi = list(executor.map(calculatespi, args))
The executor takes care of everything else.
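As a self-contained illustration of the same pattern, here is a sketch with a stand-in function and random data (calculate_one and the random grid are placeholders, not the question's calculatespi or prcoba):

from concurrent.futures import ProcessPoolExecutor

import numpy as np

def calculate_one(column):
    # Stand-in for the real per-gridcell computation: just return the mean.
    return float(np.nanmean(column))

if __name__ == "__main__":
    grid = np.random.rand(489, 3, 3)          # stand-in for prcoba
    args = [grid[:, x, y] for x in range(3) for y in range(3)]
    with ProcessPoolExecutor() as executor:   # one task per grid cell
        results = list(executor.map(calculate_one, args))
    print(results)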
I have the following function:
def match_keywords(reviews_match, nlu_match, keywords_match):
    for j in range(df_NLU_Reviews.shape[0]):
        if((j % 1000) == 0):
            print(j)
        keywords = df_NLU_Reviews.Keywords.iloc[j]
        for i in range(len(sentences)):
            try:
                counter = 0
                for keyword in keywords:
                    if(keyword in sentences[i]):
                        counter += 1
                if((len(keywords)) == counter):
                    reviews_match.append(sentences[i])
                    nlu_match.append(df_NLU_Reviews.NLU_Review.iloc[j])
                    keywords_match.append(df_NLU_Reviews.Keywords.iloc[j])
                    sentences.remove(sentences[i])
                    break
            except Exception as e:
                print(i)
                print(j)
                raise e
    df_match = pd.DataFrame()
    df_match['Reviews'] = reviews_match
    df_match['NLU'] = nlu_match
    df_match['Keywords'] = keywords_match
    df_match.to_pickle("Match_Reviews.pkl")
    return df_match
This function takes 3 empty lists as arguments that will be filled during the execution of the function.
I want to parallelize it using multiprocessing.Pool, but I can't figure out how to do it.
I have tried this:
reviews_match = []
nlu_match = []
keywords_match = []
match_list = [reviews_match, nlu_match, keywords_match]

if __name__ == '__main__':
    with Pool(processes = 12) as pool:
        results = pool.map(match_keywords, zip(reviews_match, nlu_match, keywords_match))
        print(results)
this:
reviews_match = []
nlu_match = []
keywords_match = []
match_list = [reviews_match, nlu_match, keywords_match]

if __name__ == '__main__':
    with Pool(processes = 12) as pool:
        results = pool.map(match_keywords, zip(match_list))
        print(results)
and this too:
reviews_match = []
nlu_match = []
keywords_match = []
match_list = [reviews_match, nlu_match, keywords_match]

if __name__ == '__main__':
    with Pool(processes = 12) as pool:
        results = pool.starmap(match_keywords, zip(reviews_match, nlu_match, keywords_match))
        print(results)
But none of these works; the methods either throw errors or output empty lists. If I run the function without parallelization like this:
match_keywords(reviews_match, nlu_match, keywords_match)
It works just fine. Could someone please show me the right way of doing this and explain to me why this is not working?
Thank you very much in advance
Your last variant looks correct and will not return an empty list; check it again.
But I don't think you can parallelise it this way, because it is not equivalent to running
match_keywords(reviews_match, nlu_match, keywords_match)
in many worker processes; it is equivalent to running:
match_keywords(reviews_match[0], nlu_match[0], keywords_match[0])
match_keywords(reviews_match[1], nlu_match[1], keywords_match[1])
match_keywords(reviews_match[2], nlu_match[2], keywords_match[2])
match_keywords(reviews_match[3], nlu_match[3], keywords_match[3])
...
many times.
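If the goal is to spread the work over processes, one possible sketch is shown below. It is not equivalent to the original function (it does not remove matched sentences from the shared list as it goes, and it assumes df_NLU_Reviews and sentences exist as module-level globals inherited by forked workers, as on Linux/macOS); it only shows the general pattern of having each worker return its matches for a chunk of rows instead of appending to shared lists:

from multiprocessing import Pool

import numpy as np
import pandas as pd

def match_chunk(row_indices):
    # Worker: returns matches for its chunk instead of appending to shared lists.
    reviews, nlu, kw = [], [], []
    for j in row_indices:
        keywords = df_NLU_Reviews.Keywords.iloc[j]
        for sentence in sentences:
            if all(k in sentence for k in keywords):
                reviews.append(sentence)
                nlu.append(df_NLU_Reviews.NLU_Review.iloc[j])
                kw.append(keywords)
                break
    return reviews, nlu, kw

if __name__ == '__main__':
    chunks = np.array_split(range(df_NLU_Reviews.shape[0]), 12)
    with Pool(processes=12) as pool:
        parts = pool.map(match_chunk, chunks)
    # Combine the per-chunk results in the parent process.
    reviews_match = [r for part in parts for r in part[0]]
    nlu_match = [n for part in parts for n in part[1]]
    keywords_match = [k for part in parts for k in part[2]]
    df_match = pd.DataFrame({'Reviews': reviews_match,
                             'NLU': nlu_match,
                             'Keywords': keywords_match})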
Hardware: Raspberry Pi 4B (1GB) & Macbook Pro
OS: Raspbian & OSX
Python Version: 3.7.3
I'm having an issue with multiprocessing.Queue() skipping the first item that is placed in the queue. After some testing, I figured out that I can keep this from happening if I add additional code (time.sleep(.0001), print(''), anything except a comment) between the subsequent q.put() calls. Without a delay between adding items to the queue, q.get() will always skip the first item and start on the second; when a delay is added, it will always get the first item in the queue. Maybe someone can explain to me what is happening and how to resolve this issue in a better way. Thanks in advance.
Here is a sample bit of code that shows the problem that I'm having*(see note).
import multiprocessing
import time

set_size = 3

def process_queueing():
    entry = 1
    data_list = []
    for i in range(1, 100):
        data_list.append(i)
        if i % set_size == 0:
            data = [data_list, set_size, entry]
            q.put(data)
            #time.sleep(.001) #Uncomment to fix problem
            entry = entry + 1
            data_list.clear()

def process_data():
    while True:
        data = q.get()
        for i in data[0]:
            print('Entry: ' + str(data[2]) + ' Data: ' + str(i))

q = multiprocessing.Queue()
process = multiprocessing.Process(target=process_data, daemon=True)
process.start()
process_queueing()
*Note: In this example the code actually shows the data in the queue as being incomplete and incorrect (Entry: 1 Data: 4, Entry: 1 Data: 5, Entry: 1 Data: 6 as the full output, instead of Entry: 1 Data: 1, Entry: 1 Data: 2, Entry: 1 Data: 3 and so on), and when run on my MacBook Pro (Python 3.7.3, OSX 10.14.5) it doesn't output anything. Again, adding the additional code as a delay fixes all the problems.
import multiprocessing
import time

set_size = 3

def process_queueing():
    entry = 1
    data_list = []
    for i in range(1, 100):
        data_list.append(i)
        if i % set_size == 0:
            data = [list(data_list), set_size, entry]
            q.put(data)
            # time.sleep(.001) #Uncomment to fix problem
            # print(data)
            entry = entry + 1
            data_list.clear()

def process_data():
    while True:
        data = q.get()
        for i in data[0]:
            print('Entry: ' + str(data[2]) + ' Data: ' + str(i))

q = multiprocessing.Queue()
process = multiprocessing.Process(target=process_data, daemon=True)
process.start()
process_queueing()
OUTPUT
Entry: 1 Data: 1
Entry: 1 Data: 2
Entry: 1 Data: 3
Entry: 2 Data: 4
Entry: 2 Data: 5
Entry: 2 Data: 6
Entry: 3 Data: 7
Entry: 3 Data: 8
Entry: 3 Data: 9
Entry: 4 Data: 10
I think I got it working by changing data_list to list(data_list). I think what happens is that you are overwriting data_list rather than making a new list every time. I really think you should use something like Locks for this, to avoid race conditions like the ones you are facing.
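Here is a minimal, self-contained sketch of that suspected race (it may or may not reproduce on any given run, since it depends on timing): Queue.put() returns before the object is pickled by a background feeder thread, so mutating the same list right after put() can change what the consumer receives.

import multiprocessing

def consumer(q):
    # May print [] or a partially cleared list if the producer mutates
    # the list before the feeder thread has pickled it.
    print(q.get())

if __name__ == '__main__':
    q = multiprocessing.Queue()
    p = multiprocessing.Process(target=consumer, args=(q,))
    p.start()
    data_list = [1, 2, 3]
    q.put(data_list)        # enqueues a reference; pickling happens later
    data_list.clear()       # races with the feeder thread
    p.join()
    # Safe alternative: q.put(list(data_list)) hands the queue its own copy.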
Changing data_list.clear() to data_list = [] seems to have solved the problem. I also opted to put the output into a queue, because process_data() is running in a separate process and won't print to my main process's stdout when running in the IDLE shell on Windows (there are other solutions for that).
import multiprocessing
import time

set_size = 3

def process_queueing(q):
    entry = 1
    data_list = []
    for i in range(1, 100):
        data_list.append(i)
        if i % set_size == 0:
            data = [data_list, set_size, entry]
            q.put(data)
            #time.sleep(.001) #Uncomment to fix problem
            entry = entry + 1
##            data_list.clear()
            data_list = []
    return('Done')

def process_data(q, r):
    while True:
        data = q.get()
        for i in data[0]:
            r.put('Entry: ' + str(data[2]) + ' Data: ' + str(i))

if __name__ == '__main__':
    q = multiprocessing.Queue()
    r = multiprocessing.Queue()
    process = multiprocessing.Process(target=process_data,
                                      args=(q, r),
                                      daemon=True)
    process.start()
    print(process_queueing(q))
    print('foo')
    print(r.empty())
    #wait for process_data to put stuff on the queue
    while r.empty():
        pass
    while not r.empty():
        data = r.get()
        #hopefully print takes enough time for more things to get put on the queue
        print(data)
I don't believe the additional queue affects the outcome, although it does introduce a wait for the data to get pickled before it is put on the queue. Running py -m tmp from the PowerShell command prompt works fine without the additional queue.
tmp.py
import multiprocessing
from queue import Empty
import time

set_size = 3

def process_queueing(q):
    entry = 1
    data_list = []
    for i in range(1, 100):
        data_list.append(i)
        if i % set_size == 0:
            data = [data_list, set_size, entry]
            q.put(data)
            #time.sleep(.001) #Uncomment to fix problem
            entry = entry + 1
##            data_list.clear()
            data_list = []
    q.put('Done')
    return('Done')

def process_data(q, r):
    while True:
        try:
            data = q.get(timeout=1)
            if data == 'Done':
                print('donedone')
                break
            for i in data[0]:
##                r.put('Entry: ' + str(data[2]) + ' Data: ' + str(i))
                print('foo Entry: ' + str(data[2]) + ' Data: ' + str(i))
        except Empty:
            break

if __name__ == '__main__':
    q = multiprocessing.Queue()
    r = multiprocessing.Queue()
    process = multiprocessing.Process(target=process_data,
                                      args=(q, r),
                                      daemon=True)
    process.start()
    print(process_queueing(q))
    while process.is_alive():
        pass
I am trying to use this question for my file processing:
Python multiprocessing safely writing to a file
This is my modification of the code:
def listener(q):
    '''listens for messages on the q, writes to file. '''
    while 1:
        reads = q.get()
        if reads == 'kill':
            #f.write('killed')
            break
        for read in reads:
            out_bam.write(read)
        out_bam.flush()
    out_bam.close()

def fetch_reads(line, q):
    parts = line[:-1].split('\t')
    print(parts)
    start, end = int(parts[1])-1, int(parts[2])-1
    in_bam = pysam.AlignmentFile(args.bam, mode='rb')
    fetched = in_bam.fetch(parts[0], start, end)
    reads = [read for read in fetched if (read.cigarstring and read.pos >= start and read.pos < end and 'S' not in read.cigarstring)]
    in_bam.close()
    q.put(reads)
    return reads

#must use Manager queue here, or will not work
manager = mp.Manager()
q = manager.Queue()

if not args.threads:
    threads = 1
else:
    threads = int(args.threads)

pool = mp.Pool(threads+1)

#put listener to work first
watcher = pool.apply_async(listener, (q,))

with open(args.bed, 'r') as bed:
    jobs = []
    cnt = 0
    for line in bed:
        # Fire off the read fetchings
        job = pool.apply_async(fetch_reads, (line, q))
        jobs.append(job)
        cnt += 1
        if cnt > 10000:
            break

# collect results from the workers through the pool result queue
for job in jobs:
    job.get()
    print('get')

#now we are done, kill the listener
q.put('kill')
pool.close()
The difference is that I am opening and closing the file in the function, since otherwise I get unusual errors from bgzip.
At first, print(parts) and print('get') are printed alternately (more or less), then there are fewer and fewer prints of 'get'. Ultimately the code hangs and nothing is printed (all the parts are printed, but 'get' simply doesn't print anymore). The output file remains zero bytes.
Can anyone lend a hand? Cheers!
I may be approaching this all wrong, but still, this is where I'm at. I have very large log files I'm trying to search, up to 30 GB in some cases. I'm writing a script to pull info and have been playing with multiprocessing to speed it up a bit. Right now I'm testing running two functions at the same time to search from the top and the bottom to get results, which seems to work. I'm wondering if it's possible to stop one function once the other finds a result, such as if the top function finds a result they both stop. This way I can build it out as needed.
#!/usr/bin/env python
from file_read_backwards import FileReadBackwards
from multiprocessing import Process
import sys

z = "log.log"
rocket = 0

def top():
    target = "test"
    with open(z) as src:
        found = None
        for line in src:
            if len(line) == 0: break #happens at end of file, then stop loop
            if target in line:
                found = line
                break
        print(found)

def bottom():
    target = "text"
    with FileReadBackwards(z) as src:
        found = None
        for line in src:
            if len(line) == 0: break #happens at end of file, then stop loop
            if target in line:
                found = line
                break
        print(found)

if __name__ == '__main__':
    p1 = Process(target = top)
    p1.start()
    p2 = Process(target = bottom)
    p2.start()
Here's a proof-of-concept of the approach I mentioned in the comments:
import os
import random
import sys
from multiprocessing import Process, Value

def search(proc_no, file_name, seek_to, max_size, find, flag):
    stop_at = seek_to + max_size

    with open(file_name) as f:
        if seek_to:
            f.seek(seek_to - 1)
            prev_char = f.read(1)
            if prev_char != '\n':
                # Landed in the middle of a line. Skip back one (or
                # maybe more) lines so this line isn't excluded. Start
                # by seeking back 256 bytes, then 512 if necessary, etc.
                exponent = 8
                pos = seek_to
                while pos >= seek_to:
                    pos = f.seek(max(0, pos - (2 ** exponent)))
                    f.readline()
                    pos = f.tell()
                    exponent += 1

        while True:
            if flag.value:
                break
            line = f.readline()
            if not line:
                break  # EOF
            data = line.strip()
            if data == find:
                flag.value = proc_no
                print(data)
                break
            if f.tell() > stop_at:
                break

if __name__ == '__main__':
    # list.txt contains lines with the numbers 1 to 1000001
    file_name = 'list.txt'
    info = os.stat(file_name)
    file_size = info.st_size

    if len(sys.argv) == 1:
        # Pick a random value from list.txt
        num_lines = 1000001
        choices = list(range(1, num_lines + 1))
        choices.append('XXX')
        find = str(random.choice(choices))
    else:
        find = sys.argv[1]

    num_procs = 4
    chunk_size, remainder = divmod(file_size, num_procs)
    max_size = chunk_size + remainder
    flag = Value('i', 0)
    procs = []

    print(f'Using {num_procs} processes to look for {find} in {file_name}')

    for i in range(num_procs):
        seek_to = i * chunk_size
        proc = Process(target=search, args=(i + 1, file_name, seek_to, max_size, find, flag))
        procs.append(proc)

    for proc in procs:
        proc.start()

    for proc in procs:
        proc.join()

    if flag.value:
        print(find, 'found by proc', flag.value)
    else:
        print(find, 'not found')
After reading various posts[1] about reading files with multiprocessing and multithreading, it seems that neither is a great approach due to potential disk thrashing and serialized reads. So here's a different, simpler approach that is way faster (at least for the file with a million lines I was trying it out on):
import mmap
import sys

def search_file(file_name, text, encoding='utf-8'):
    text = text.encode(encoding)
    with open(file_name) as f:
        # access=mmap.ACCESS_READ is the portable way to request a read-only mapping.
        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as m:
            index = m.find(text)
            if index > -1:
                # Found a match; now find beginning of line that
                # contains match so we can grab the whole line.
                while index > 0:
                    index -= 1
                    if m[index] == 10:
                        index += 1
                        break
                else:
                    index = 0
                m.seek(index)
                line = m.readline()
                return line.decode(encoding)

if __name__ == '__main__':
    file_name, search_string = sys.argv[1:]
    line = search_file(file_name, search_string)
    sys.stdout.write(line if line is not None else f'Not found in {file_name}: {search_string}\n')
I'm curious how this would perform with a 30GB log file.
[1] Including this one
Simple example using a multiprocessing.Pool and callback function.
Terminates remaining pool processes once a result has returned.
You could add an arbitrary number of processes to search from different offsets in the file using this approach.
import math
import time
from multiprocessing import Pool
from random import random

def search(pid, wait):
    """Sleep for wait seconds, return PID
    """
    time.sleep(wait)
    return pid

def done(result):
    """Do something with result and stop other processes
    """
    print("Process: %d done." % result)
    pool.terminate()
    print("Terminate Pool")

pool = Pool(2)
pool.apply_async(search, (1, math.ceil(random() * 3)), callback=done)
pool.apply_async(search, (2, math.ceil(random() * 3)), callback=done)

# do other stuff ...

# Wait for result
pool.close()
pool.join()  # block our main thread
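A sketch of how that callback approach might be wired to the actual file search (the file name, target, chunk handling, and the search_chunk body are placeholders I am assuming, not code from the question):

import os
from multiprocessing import Pool

def search_chunk(file_name, start, size, target):
    # Scan one byte range of the file for the target; return the matching line or None.
    target = target.encode()
    with open(file_name, 'rb') as f:
        f.seek(start)
        if start:
            f.readline()                 # skip a possibly partial first line
        while f.tell() <= start + size:
            line = f.readline()
            if not line:
                break                    # EOF
            if target in line:
                return line.decode(errors='replace')
    return None

def done(result):
    # Runs in the parent process; stop the remaining workers on the first hit.
    if result is not None:
        print("found:", result.strip())
        pool.terminate()

if __name__ == '__main__':
    file_name, target, num_procs = "log.log", "test", 4
    file_size = os.path.getsize(file_name)
    chunk, remainder = divmod(file_size, num_procs)
    pool = Pool(num_procs)
    for i in range(num_procs):
        pool.apply_async(search_chunk,
                         (file_name, i * chunk, chunk + remainder, target),
                         callback=done)
    pool.close()
    pool.join()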
This is essentially the same as Blurp's answer, but I shortened it and made it a bit more general by using a shared flag, so the change made in bottom is actually visible to top across processes. As you can see, top on its own would be an infinite loop, but bottom stops it almost immediately.
from multiprocessing import Process, Value

def top(val_not_found):
    i = 0
    while val_not_found.value:   # loops until the shared flag is cleared
        i += 1

def bottom(val_not_found):
    val_not_found.value = False  # stops top almost immediately

if __name__ == '__main__':
    val_not_found = Value('b', True)  # flag shared between both processes
    p1 = Process(target=top, args=(val_not_found,))
    p2 = Process(target=bottom, args=(val_not_found,))
    p1.start()
    p2.start()
    p1.join()
    p2.join()