Inside a worker thread I am generating a data frame . Trying to put this into the queue passed to the worker thread is failing. In fact trying to put any values into the queue is failing.
The part of the code that is failing inside the worker thread task1() is given below:
df = pd.DataFrame([[1,2,3,4],[3,4,5,6]])
qmdlvalues.put(df)
mdltiming = time.time() - start
qmdlparams.put(paramval)
qtiming.put(mdltiming)
Complete code
import threading
import queue
from sklearn.manifold import TSNE
import os
import time
def write_tsne_op(opdata,fname,header):
with open(fname, 'w') as outfile:
outfile.write(header)
for data_slice in opdata:
np.savetxt(outfile, data_slice,delimiter=",")
def task1(qmdlvalues,qmdlparams,qtiming,paramval):
start = time.time()
#tmpmdl1 = TSNE(perplexity=100,early_exaggeration=1, n_components=2,random_state=0,verbose=1)
#qmdlvalues.put(tmpmdl1.fit_transform(dense_mx))
df = pd.DataFrame([[1,2,3,4],[3,4,5,6]])
qmdlvalues.put(df)
mdltiming = time.time() - start
qmdlparams.put(paramval)
qtiming.put(mdltiming)
print(df)
print(str(mdltiming))
print(paramval)
def task2(qmdlvalues,qmdlparams,qtiming,paramval):
start = time.time()
#tmpmdl2 = TSNE(perplexity=100,early_exaggeration=10, n_components=2,random_state=0,verbose=1)
#qmdlvalues.put(tmpmdl2.fit_transform(dense_mx2))
qmdlvalues.put(pd.DataFrame([[1,2,3,4],[3,4,5,6]]))
qmdlparams.put(paramval)
mdltiming = time.time() - start
qtiming.put(mdltiming)
if __name__ == "__main__":
dense_mx2 = dense_mx
dense_mx3 = dense_mx
qmdlvl = queue.Queue()
qmdlch = queue.Queue()
qtme = queue.Queue()
mdlvalues = pd.DataFrame()
t1 = threading.Thread(target=task1,args=(qmdlvl,qmdlch,qtme,"#perplex: 100 early exag: 1 timing:$_plex100_exag1.csv"), name='t1')
t2 = threading.Thread(target=task2,args=(qmdlvl,qmdlch,qtme,"#perplex: 100 early exag: 10 timing:$_plex100_exag10.cv"), name='t2')
# starting threads
t1.start()
t2.start()
while True:
if qmdlvl.empty():
print("Queue closed. Exiting thread.")
break
try:
item = qmdlvl.get(timeout=.5)
except:
continue
print("Got item:", item)
# wait until all threads finish
t1.join()
t2.join()
Below is the actual output I am getting from the code in the main
while True:
if qmdlvl.empty():
print("Queue closed. Exiting thread.")
break
try:
item = qmdlvl.get(timeout=.5)
except:
continue
print("Got item:", item)
ID of process running main program: 6456
Main thread name: MainThread
Queue closed. Exiting thread.
I want to able to put the data frame into a queue inside the worker thread and access the same data frame in the main thread.
There are parameter mis-matches in my earlier code those have been corrected an a full working code presented below.
I stored the output of t-SNE directly into the queue and retrieved the same in the main thread. The next progression would be convert this to thread pool and sub-classing.
import threading
import queue
from sklearn.manifold import TSNE
import os
import time
def write_tsne_op(opdata,fname,header):
with open(fname, 'w') as outfile:
outfile.write(header)
for data_slice in opdata:
np.savetxt(outfile, data_slice,delimiter=",")
def task1(ip_matrix,qmdlvalues,qmdlparam,plex,exag,qmdltime,qmdlhrfn,hderfname):
string=""
start=0
end=0
mdltiming=0
start = time.time()
tmpmdl1 = TSNE(perplexity=plex,early_exaggeration=exag, n_components=2,random_state=0,verbose=1)
qmdlvalues.put(tmpmdl1.fit_transform(ip_matrix))
string = str(plex)+ "$" + str(exag)
qmdlparam.put(string)
qmdlhrfn.put(hderfname)
end = time.time()
mdltimig = end - start
print(str(mdltiming)+"time")
qmdltime.put(mdltiming)
def task2(ip_matrix,qmdlvalues,qmdlparam,plex,exag,qmdltime,qmdlhrfn,hderfname):
string=""
start=0
end=0
mdltiming=0
start = time.time()
tmpmdl2 = TSNE(perplexity=plex,early_exaggeration=exag, n_components=2,random_state=0,verbose=1)
qmdlvalues.put(tmpmdl2.fit_transform(ip_matrix))
string = str(plex)+ "$" + str(exag)
qmdlparam.put(string)
qmdlhrfn.put(hderfname)
end = time.time()
mdltimig = end - start
qmdltime.put(mdltiming)
def task3(ip_matrix,qmdlvalues,qmdlparam,plex,exag,qmdltime,qmdlhrfn,hderfname):
string=""
start=0
end=0
mdltiming=0
start = time.time()
tmpmdl3 = TSNE(perplexity=plex,early_exaggeration=exag, n_components=2,random_state=0,verbose=1)
qmdlvalues.put(tmpmdl3.fit_transform(ip_matrix))
string = str(plex)+ "$" + str(exag)
qmdlparam.put(string)
qmdlhrfn.put(hderfname)
end = time.time()
mdltimig = end - start
qmdltime.put(mdltiming)
def task4(ip_matrix,qmdlvalues,qmdlparam,plex,exag,qmdltime,qmdlhrfn,hderfname):
string=""
start=0
end=0
mdltiming=0
start = time.time()
tmpmdl4 = TSNE(perplexity=plex,early_exaggeration=exag, n_components=2,random_state=0,verbose=1)
qmdlvalues.put(tmpmdl4.fit_transform(ip_matrix))
string = str(plex)+ "$" + str(exag)
qmdlparam.put(string)
qmdlhrfn.put(hderfname)
end = time.time()
mdltimig = end - start
qmdltime.put(mdltiming)
if __name__ == "__main__":
# print ID of current process
print("ID of process running main program: {}".format(os.getpid()))
# print name of main thread
print("Main thread name: {}".format(threading.main_thread().name))
dense_mx2 = dense_mx
dense_mx3 = dense_mx
dense_mx4 = dense_mx
qmdlvl = queue.Queue()
qmdlch = queue.Queue()
qmdltme = queue.Queue()
qmdlhdrfname = queue.Queue()
perplex = 200
# creating threads
exag=10
t1 = threading.Thread(target=task1,args=(dense_mx,qmdlvl,qmdlch,perplex,exag,qmdltme,qmdlhdrfname,"#perplex: 200 early exag: 10 timing:$_plex200_exag10.csv"), name='t1')
exag=30
t2 = threading.Thread(target=task2,args=(dense_mx2,qmdlvl,qmdlch,perplex,exag,qmdltme,qmdlhdrfname,"#perplex: 200 early exag: 30 timing:$_plex200_exag30.cv"), name='t2')
exag=50
t3 = threading.Thread(target=task3,args=(dense_mx3,qmdlvl,qmdlch,perplex,exag,qmdltme,qmdlhdrfname,"#perplex: 200 early exag: 50 timing:$_plex200_exag50.csv"), name='t3')
exag=100
t4 = threading.Thread(target=task4,args=(dense_mx4,qmdlvl,qmdlch,perplex,exag,qmdltme,qmdlhdrfname,"#perplex: 200 early exag: 100 timing:$_plex200_exag100.cv"), name='t4')
# starting threads
t1.start()
t2.start()
t3.start()
t4.start()
# wait until all threads finish
t1.join()
t2.join()
t3.join()
t4.join()
while True:
if qmdlvl.empty():
print("Queue closed. Exiting thread.")
break
try:
item1 = qmdlvl.get(timeout=.5)
item2 = qmdlch.get(timeout=.5)
item3 = qmdltme.get(timeout=.5)
header,fname = qmdlhdrfname.get(timeout=.5).split('$')
except:
continue
write_tsne_op(item1,fname,header)
Related
Given the following program running my_function in a subprocess using run_process_timeout_wrapper leads to a timeout (over 160s), while running it "normally" takes less than a second.
from multiprocessing import Process, Queue
import time
import numpy as np
import xgboost
def run_process_timeout_wrapper(function, args, timeout):
def foo(n, out_q):
res = function(*n)
out_q.put(res) # to get result back from thread target
result_q = Queue()
p = Process(target=foo, args=(args, result_q))
p.start()
try:
x = result_q.get(timeout=timeout)
except Empty as e:
p.terminate()
raise multiprocessing.TimeoutError("Timed out after waiting for {}s".format(timeout))
p.terminate()
return x
def my_function(fun):
print("Started")
t1 = time.time()
pol = xgboost.XGBRegressor()
pol.fit(np.random.rand(5,1500), np.random.rand(50,1))
print("Took ", time.time() - t1)
pol.predict(np.random.rand(2,1500))
return 5
if __name__ == '__main__':
t1 = time.time()
pol = xgboost.XGBRegressor()
pol.fit(np.random.rand(50,150000), np.random.rand(50,1))
print("Took ", time.time() - t1)
my_function(None)
t1 = time.time()
res = run_process_timeout_wrapper(my_function, (None,),160)
print("Res ", res, " Time ", time.time() - t1)
I am running this on Linux. Since it has come up, I have also added a print in the beginning of my_function showing that this function is at least reached.
Gathered from this issue I found that forking a multi-threaded application is problematic. One possible solution is to add
if __name__ == "__main__":
mp.set_start_method('spawn')
However, this may lead to other issues.
Trying to have 2 methods running at once. One is a timer method and the other writes data to a CSV. I am trying to use Treading to run them both at once, but the thread starts before it is called.
Code;
with open("C:\\ProgramData\\Example.txt", "r", encoding="utf8") as file:
array = for line in file.readlines()]))
fieldnames = 'Col1','Col2','Col3'
with open("C:\\ProgramData\\example.csv", 'w', newline='', encoding="utf8") as csvfile:
writer = csv.writer(csvfile)
writer.writerow(fieldnames)
writer.writerows(array)
csvfile.close()
def timer():
import time
import sys
time_start = time.time()
seconds = 0
minutes = 0
while True:
try:
sys.stdout.write("\r{minutes} Minutes {seconds} Seconds".format(minutes=minutes, seconds=seconds))
sys.stdout.flush()
time.sleep(1)
seconds = int(time.time() - time_start) - minutes * 60
if seconds >= 60:
minutes += 1
seconds = 0
except KeyboardInterrupt as e:
break
if __name__=="__main__":
print("Not running")
t1 = threading.Thread(target=timer())
print("clearly running")
t2 = threading.Thread(target=regx())
t1.setName('t1')
t2.setName('t2')
t1.start()
t2.start()
t1.join()
t2.join()
# pool =Pool(processes=2)
# pool.map(timer(),regx())
The output from the console;
Not running
2 Minutes 32 Seconds
Process finished with exit code -1
Can anyone help me fix this?
Thanks
Don't use () unless you want to run the method immediately. If you want to reference the method itself (like to pass it to Thread), leave off the ().
Try this code:
if __name__=="__main__":
print("Not running")
t1 = threading.Thread(target=timer)
print("clearly running")
t2 = threading.Thread(target=regx)
t1.setName('t1')
t2.setName('t2')
t1.start()
t2.start()
t1.join()
t2.join()
So Im trying to code a really simple Internet Download Manager Spoof with Python 2.7
It is supposed to query a files HTTP header, get the byte range and spread the download among a no.of threads(I hard-coded 2 for simplicity) according to the byte range and later join the file parts together again.
The problem is my console log tells me that only 1 thread is started.
[EDIT] The problem has been solved. Find the working code below.
Here is my source:
from __future__ import print_function
import threading
import urllib
import urllib2
import time
threads = []
# url to open
url = "http://www.sample-videos.com/video/mp4/720/big_buck_bunny_720p_1mb.mp4"
u = urllib.urlopen(url)
# define file
file_name = "test.mp4"
f = open(file_name, 'wb')
# open url and get header info
def get_file_size(url):
stream_size = u.info()['Content-Length']
end = stream_size
return end
start = 0
#get stream size
end = get_file_size(url)
# specify block size
block_sz = 512
#algo to divide work among 2 threads
def calculate_no_of_bytes_for_thread1():
full_stream_size = end
first_thread = {'start':0, 'end':(int(full_stream_size)/2)}
print(first_thread)
return first_thread
#algo to divide work among 2 threads
def calculate_no_of_bytes_for_thread2():
full_stream_size = end
second_thread= {'start':int(full_stream_size)/2,'end': int(full_stream_size)}
print(second_thread)
return second_thread
# download function
def download_thread(url ,id,start,end):
current_size = int(float(start)/1024)
total_size = int(float(end)/1024)
print ("Start at_"+str(current_size) + "Ends at_" + str(total_size))
# specify request range and init stream
req = urllib2.Request(url)
req.headers['Range'] = 'bytes=%s-%s' % (start, end)
data = urllib2.urlopen(req)
while True:
buffer = u.read(block_sz)
if not buffer:
break
start += len(buffer)
f.write(buffer)
thread_id = id
#percentage = (current_size * 100 / total_size)
status = str(thread_id) + "_" + str(current_size) + "_" +str(total_size)
print (status)
#starts 2 threads
def start_threads():
for i in range(2):
#if first loop, start thread 1
if(i==1):
start = calculate_no_of_bytes_for_thread1().get('start')
end = calculate_no_of_bytes_for_thread1().get('end')
print("Thread 1 started")
t = threading.Thread(target=download_thread, args=(url,i,start,end))
t.start()
threads.append( t)
#if second loop, start thread 1
if(i==2):
start = calculate_no_of_bytes_for_thread2().get('start')
end = calculate_no_of_bytes_for_thread2().get('end')
print("Thread 2 started")
t = threading.Thread(target=download_thread, args=(url,i,start,end))
t.start()
threads.append( t)
# Join threads back (order doesn't matter, you just want them all)
for i in threads:
i.join()
#start benchmarking
start_time = time.clock()
start_threads()
print ("Finito!")
end_time = time.clock()
benchmark = str(end_time - start_time)
print ("Download took_" +benchmark)
f.close()
And the output:
{'start': 0, 'end': 527868}
{'start': 0, 'end': 527868}
Thread 1 started
Start at_0Ends at_515
1_0_515
1_0_515
Finito!
Download took_6.97844422658
Working code:
from __future__ import print_function
import threading
import urllib
import urllib2
import time
threads = []
parts = {}
# url to open
url = "http://www.sample-videos.com/audio/mp3/india-national-anthem.mp3"
u = urllib.urlopen(url)
# define file
file_name = "test.mp3"
f = open(file_name, 'wb')
# open url and get header info
def get_file_size(url):
stream_size = u.info()['Content-Length']
file_size = stream_size
return file_size
start = 0
#get stream size
end = get_file_size(url)
# specify block size
block_sz = 512
#algo to divide work among 2 threads
def calculate_no_of_bytes_for_thread1():
full_stream_size = end
first_thread = {'start':0, 'end':(int(full_stream_size)/2)}
print(first_thread)
return first_thread
#algo to divide work among 2 threads
def calculate_no_of_bytes_for_thread2():
full_stream_size = end
second_thread= {'start':int(full_stream_size)/2,'end': int(full_stream_size)}
print(second_thread)
return second_thread
# download function
def download_thread(url ,id,start,end):
current_size = int(float(start)/1024)
total_size = int(float(end)/1024)
print ("Start at_"+str(current_size) + "Ends at_" + str(total_size))
# specify request range and init stream
req = urllib2.Request(url)
req.headers['Range'] = 'bytes=%s-%s' % (start, end)
while True:
buffer = u.read(block_sz)
if not buffer:
break
start += len(buffer)
f.write(buffer)
thread_id = id
status = "Thread ID_" +str(thread_id) + "Downloaded_" + str(int(start/1024)) + "Total_" +str(total_size)
print (status)
#starts 2 threads
def start_threads():
for i in range(2):
#if first loop, start thread 1
if(i==0):
start = calculate_no_of_bytes_for_thread1().get('start')
end = calculate_no_of_bytes_for_thread1().get('end')
print("Thread 1 started")
t = threading.Thread(target=download_thread, args=(url,i,start,end))
t.start()
threads.append( t)
#if second loop, start thread 2
if(i==1):
start = calculate_no_of_bytes_for_thread2().get('start')
end = calculate_no_of_bytes_for_thread2().get('end')
print("Thread 2 started")
t = threading.Thread(target=download_thread, args=(url,i,start,end))
t.start()
threads.append( t)
# Join threads back (order doesn't matter, you just want them all)
for i in threads:
i.join()
# Sort parts and you're done
# result = ''
# for i in range(2):
# result += parts[i*block_sz]
#start benchmarking
start_time = time.clock()
start_threads()
print ("Finito!")
end_time = time.clock()
benchmark = str(end_time - start_time)
print ("Download took_" +benchmark)
f.close()
You have:
for i in range(2):
if(i==1):
...
if(i==2):
...
But range(2) iterates over [0,1] not [1,2].
Save some trouble and just remove those 3 lines. The code to start the two threads can just run serially.
I am a little bit confused testing the multiprocessing module.
Let's simulate a digital timer. The code would look like:
start=datetime.now()
while True:
now=datetime.now()
delta=now-start
s = delta.seconds + delta.microseconds/1E6
print s
time.sleep(1)
Which returns correctly:
8e-06
1.001072
2.00221
3.003353
4.004416
...
Now I want to read the clock from my virtual external digital clock device using a pipe:
def ask_timer(conn):
start=datetime.now()
while True:
now=datetime.now()
delta=now-start
s = delta.seconds + delta.microseconds/1E6
conn.send(s)
parent_conn, child_conn = Pipe()
p = Process(target=ask_timer, args=(child_conn,))
p.start()
while True:
print parent_conn.recv()
time.sleep(1)
It returns:
2.9e-05
6.7e-05
7.7e-05
8.3e-05
8.9e-05
9.4e-05
0.0001
...
Here the timer doesn't seem to run permanently in the background..The implementation of "Queue" looks like:
def ask_timer(q):
while True:
now=datetime.now()
delta=now-start
s = delta.seconds + delta.microseconds/1E6
q.put(s)
#conn.close()
q = Queue()
p = Process(target=ask_timer, args=(q,))
p.start()
while True:
print q.get()
time.sleep(1)
which does the same like pipe. Is this just my misconception of multiprocessing of python? How could I ask a value realtime from a running parallel-thread?
Everything is working correctly. The child process is executing ask_timer() function completely independently from you main process. You don't have any time.sleep() in this function, so it just prints or puts in the queue deltas in the infinite loop with interval of like 10ms.
Once a second your main process asks child process for data and get's it. Data is one of those small intervals.
The problem there is that you're putting much more data into pipe/queue, than taking from it. So you're getting old data, when you ask. To test that you can print queue size in the loop (won't work on OS X):
def ask_timer(q):
start = datetime.now()
while True:
now = datetime.now()
delta = now - start
s = delta.seconds + delta.microseconds / 1E6
q.put(s)
q = Queue()
p = Process(target=ask_timer, args=(q,))
p.start()
while True:
print q.get()
print q.qsize()
time.sleep(1)
The queue size will grow really fast.
Apparently you can use shared memory to read current value from the child process.
from multiprocessing import Process, Value
from datetime import datetime
import time
from ctypes import c_double
def ask_timer(v):
start = datetime.now()
while True:
now = datetime.now()
delta = now - start
s = delta.seconds + delta.microseconds / 1E6
v.value = s
val = Value(c_double, 0.0)
p = Process(target=ask_timer, args=(val,))
p.start()
while True:
print(val.value)
time.sleep(1)
I am using the multiprocessing module in python to spawn new processes, one for each year between 2000 to 2012. This was running successfully until last week. Now, the code runs fine without throwing any errors and seems to spawn new processes, but does not start them simultaneously. The CPU I am running this on uses ubuntu and has plenty of memory with 24 processors.
The processes seem to run sequentially instead of parallel. There have been no code changes in the past 3 months, so I am suspecting its an environment issue but am clueless about where to start debugging. Any suggestions?
Is it possible for some default setting of the kernel to prevent simultaneous execution of code? Some setting of python?
Code:
class ForEachPerson(multiprocessing.Process):
"""This class contains the funcs for the main processing."""
def __init__(self, year_queue, result_queue, dict_of_files, all, today):
multiprocessing.Process.__init__(self)
self.work_queue = year_queue
self.result_queue = result_queue
self.kill_received = False
self.dict = dict_of_files
self.all = all
self.today = today
def run(self):
while not self.kill_received:
try:
year = self.work_queue.get_nowait()
year_start_date = year[0]
year_end_date = year[1]
split = year_end_date.year
except Queue.Empty:
self.result_queue.close()
return
if self.all:
try:
null_pids = self.dict["null_pids"]
except KeyError:
null_pids = []
#For each employee calculate the data and write to file.
today = self.today
hie = hie_util.Build()
hie_op = open("output.csv", "wb")
hierarchy_op.write("....\n")
/* do function */
............
hierarchy_op.close()
timestr = ("%s End writing for %s"
% (str(datetime.datetime.now()), str(year)))
self.result_queue.put(timestr)
def Manage(years, dict_of_files, num_processes, all, today):
"""Responsible for creating & assigning tasks to worker processes."""
#load up year queue
year_queue = multiprocessing.Queue()
for year in years:
year_queue.put(year)
if num_processes > len(years):
num_processes = len(years)
# queue to pass to workers to store the results
result_queue = multiprocessing.Queue()
# spawn workers
workers = []
for i in range(num_processes):
worker = ForEachPerson(year_queue, result_queue, dict_of_files, all, today)
logging.info("Worker spawned for processor " + str(i + 1))
worker.start()
workers.append(worker)
# collect results off the queue
logging.info("results being collected")
results = []
while len(results) < len(years):
try:
result = result_queue.get()
logging.info(str(result[0]))
results.append(result[1])
except Queue.Empty:
pass
count = 0
for worker in workers:
logging.info("Terminating worker: " + str(count))
worker.terminate()
count += 1
return results
def RunHie():
"""Main control flow for building."""
logging.info("Start ")
sql_instance = hie_sql.SQLExportImport()
sql_instance.RunEtl()
# gather list of dates
date_full_list = DailyDates()
dict_of_files = ReadFiles()
# calculate hierarchy - run
num_processes = multiprocessing.cpu_count() - 1
results = Manage(date_full_list, dict_of_files, num_processes, 0, today[1])
logging.info("End")