I am writing a python script that will parse through a file quickly by sending lines to different processes to handle. At the end, I want the parent to receive the results from each child process and then be able to manipulate that. Here is the code:
#!/usr/bin/env python
import os
import re
from datetime import datetime
from multiprocessing import Process, JoinableQueue
class LineConsumer(Process):
def __init__(self, queue):
self.queue = queue
self.lines = 0
super(LineConsumer, self).__init__( )
def run(self):
print "My PID is %d" % self.pid
while True:
line = self.queue.get( )
print self.lines
if ':' in line:
self.lines += 1
self.queue.task_done( )
class Parser(object):
def __init__(self, filename, processes=4):
self.filename = filename
self.processes = processes
def parse(self):
queue = JoinableQueue(100)
consumers = [ ]
parents = [ ]
for i in range(0, self.processes):
lc = LineConsumer(queue)
lc.start( )
consumers.append(lc)
starttime = datetime.now( )
problem = False
numlines = 0
with open(self.filename, 'r') as data:
for line in data:
numlines += 1
def checkAlive(p):
if not p.is_alive( ):
return False
return True
alive = map(checkAlive, consumers)
if False in alive:
problem = True
print "A process died!!!"
break
queue.put(line)
if not problem:
queue.join( )
for p in consumers:
print p.lines( )
p.terminate( )
p.join( )
endtime = datetime.now( )
timedelta = endtime - starttime
lps = numlines / timedelta.total_seconds( )
print "Processed packets at %f lps" % lps
if __name__ == "__main__":
import sys
if len(sys.argv) != 2:
print "Supply a file to read"
sys.exit(1)
parser = Parser(sys.argv[1])
parser.parse( )
Here are the results:
My PID is 11578
My PID is 11579
My PID is 11580
My PID is 11581
0
1
0
2
1
3
2
1
...
555
627
564
556
628
0
0
0
0
Processed packets at 27189.771341 lps
As you can see, each child can save its line count, but when I try to access the count from the parent, I keep getting 0. How can I send the line count to the parent?
You can pass values back through an results queue.
In LineConsumer:
def __init__(self, queue, result_queue):
self.result_queue = result_queue
# ...
def terminate(self):
self.results_queue.put(self.lines)
super(LineConsumer, self).terminate()
In Parser:
queue = JoinableQueue(100)
result_queue = Queue()
# ...
lc = LineConsumer(queue, result_queue)
# ...
for p in consumers:
p.terminate()
p.join()
while True:
try:
print results.queue.get(False)
except Queue.Empty: # need to import Queue
break
I guess the issue is that you are trying to access object copy of which was actually changed in another process (child). You need to use explicit inter-process communication to send response back to parent. I.e. something like that JoinableQueue you use to pass information to children.
btw, I can't see how you send your data to consumers.
Related
So Im trying to code a really simple Internet Download Manager Spoof with Python 2.7
It is supposed to query a files HTTP header, get the byte range and spread the download among a no.of threads(I hard-coded 2 for simplicity) according to the byte range and later join the file parts together again.
The problem is my console log tells me that only 1 thread is started.
[EDIT] The problem has been solved. Find the working code below.
Here is my source:
from __future__ import print_function
import threading
import urllib
import urllib2
import time
threads = []
# url to open
url = "http://www.sample-videos.com/video/mp4/720/big_buck_bunny_720p_1mb.mp4"
u = urllib.urlopen(url)
# define file
file_name = "test.mp4"
f = open(file_name, 'wb')
# open url and get header info
def get_file_size(url):
stream_size = u.info()['Content-Length']
end = stream_size
return end
start = 0
#get stream size
end = get_file_size(url)
# specify block size
block_sz = 512
#algo to divide work among 2 threads
def calculate_no_of_bytes_for_thread1():
full_stream_size = end
first_thread = {'start':0, 'end':(int(full_stream_size)/2)}
print(first_thread)
return first_thread
#algo to divide work among 2 threads
def calculate_no_of_bytes_for_thread2():
full_stream_size = end
second_thread= {'start':int(full_stream_size)/2,'end': int(full_stream_size)}
print(second_thread)
return second_thread
# download function
def download_thread(url ,id,start,end):
current_size = int(float(start)/1024)
total_size = int(float(end)/1024)
print ("Start at_"+str(current_size) + "Ends at_" + str(total_size))
# specify request range and init stream
req = urllib2.Request(url)
req.headers['Range'] = 'bytes=%s-%s' % (start, end)
data = urllib2.urlopen(req)
while True:
buffer = u.read(block_sz)
if not buffer:
break
start += len(buffer)
f.write(buffer)
thread_id = id
#percentage = (current_size * 100 / total_size)
status = str(thread_id) + "_" + str(current_size) + "_" +str(total_size)
print (status)
#starts 2 threads
def start_threads():
for i in range(2):
#if first loop, start thread 1
if(i==1):
start = calculate_no_of_bytes_for_thread1().get('start')
end = calculate_no_of_bytes_for_thread1().get('end')
print("Thread 1 started")
t = threading.Thread(target=download_thread, args=(url,i,start,end))
t.start()
threads.append( t)
#if second loop, start thread 1
if(i==2):
start = calculate_no_of_bytes_for_thread2().get('start')
end = calculate_no_of_bytes_for_thread2().get('end')
print("Thread 2 started")
t = threading.Thread(target=download_thread, args=(url,i,start,end))
t.start()
threads.append( t)
# Join threads back (order doesn't matter, you just want them all)
for i in threads:
i.join()
#start benchmarking
start_time = time.clock()
start_threads()
print ("Finito!")
end_time = time.clock()
benchmark = str(end_time - start_time)
print ("Download took_" +benchmark)
f.close()
And the output:
{'start': 0, 'end': 527868}
{'start': 0, 'end': 527868}
Thread 1 started
Start at_0Ends at_515
1_0_515
1_0_515
Finito!
Download took_6.97844422658
Working code:
from __future__ import print_function
import threading
import urllib
import urllib2
import time
threads = []
parts = {}
# url to open
url = "http://www.sample-videos.com/audio/mp3/india-national-anthem.mp3"
u = urllib.urlopen(url)
# define file
file_name = "test.mp3"
f = open(file_name, 'wb')
# open url and get header info
def get_file_size(url):
stream_size = u.info()['Content-Length']
file_size = stream_size
return file_size
start = 0
#get stream size
end = get_file_size(url)
# specify block size
block_sz = 512
#algo to divide work among 2 threads
def calculate_no_of_bytes_for_thread1():
full_stream_size = end
first_thread = {'start':0, 'end':(int(full_stream_size)/2)}
print(first_thread)
return first_thread
#algo to divide work among 2 threads
def calculate_no_of_bytes_for_thread2():
full_stream_size = end
second_thread= {'start':int(full_stream_size)/2,'end': int(full_stream_size)}
print(second_thread)
return second_thread
# download function
def download_thread(url ,id,start,end):
current_size = int(float(start)/1024)
total_size = int(float(end)/1024)
print ("Start at_"+str(current_size) + "Ends at_" + str(total_size))
# specify request range and init stream
req = urllib2.Request(url)
req.headers['Range'] = 'bytes=%s-%s' % (start, end)
while True:
buffer = u.read(block_sz)
if not buffer:
break
start += len(buffer)
f.write(buffer)
thread_id = id
status = "Thread ID_" +str(thread_id) + "Downloaded_" + str(int(start/1024)) + "Total_" +str(total_size)
print (status)
#starts 2 threads
def start_threads():
for i in range(2):
#if first loop, start thread 1
if(i==0):
start = calculate_no_of_bytes_for_thread1().get('start')
end = calculate_no_of_bytes_for_thread1().get('end')
print("Thread 1 started")
t = threading.Thread(target=download_thread, args=(url,i,start,end))
t.start()
threads.append( t)
#if second loop, start thread 2
if(i==1):
start = calculate_no_of_bytes_for_thread2().get('start')
end = calculate_no_of_bytes_for_thread2().get('end')
print("Thread 2 started")
t = threading.Thread(target=download_thread, args=(url,i,start,end))
t.start()
threads.append( t)
# Join threads back (order doesn't matter, you just want them all)
for i in threads:
i.join()
# Sort parts and you're done
# result = ''
# for i in range(2):
# result += parts[i*block_sz]
#start benchmarking
start_time = time.clock()
start_threads()
print ("Finito!")
end_time = time.clock()
benchmark = str(end_time - start_time)
print ("Download took_" +benchmark)
f.close()
You have:
for i in range(2):
if(i==1):
...
if(i==2):
...
But range(2) iterates over [0,1] not [1,2].
Save some trouble and just remove those 3 lines. The code to start the two threads can just run serially.
I'm trying to run a lengthy command within Python that outputs to both stdout and stderr. I'd like to poll the subprocess and write the output to separate files.
I tried the following, based on this answer Non-blocking read on a subprocess.PIPE in python
import subprocess
from Queue import Queue, Empty
from threading import Thread
def send_cmd(cmd, shell=False):
"""
Send cmd to the shell
"""
if not isinstance(cmd, list): cmd = shlex.split(cmd)
params = {'args' : cmd,
'stdout' : subprocess.PIPE,
'stderr' : subprocess.PIPE,
'shell' : shell}
proc = subprocess.Popen(**params)
return proc
def monitor_command(process, stdout_log=os.devnull, stderr_log=os.devnull):
"""
Monitor the process that is running, and log it if desired
"""
def enqueue_output(out, queue):
for line in iter(out.readline, b''):
queue.put(line)
def setup_process(log_name, proc):
FID = open(log_name, 'w')
queue = Queue()
thread = Thread(target=enqueue_output, args=(proc, queue))
thread.daemon = True # Thread dies with program
thread.start()
return (queue, FID)
def check_queues(queue_list, errors):
for queue, FID in queue_list:
try:
line = queue.get_nowait()
if 'error' in line.lower() or 'failed' in line.lower():
errors.append(line)
except Empty:
pass
else:
FID.write(line)
errors = []
queue_list = []
for log, proc in [(stdout_log, process.stdout), (stderr_log, process.stderr)]:
queue_list.append(setup_process(log, proc)
while process.poll() is None:
check_queues(queue_list, errors)
while not queue_list[0][0].empty() or queue_list[1][0].empty():
check_queues(queue_list, errors)
for queue, FID in queue_list:
FID.close()
return errors
process = send_cmd('long_program.exe')
errors = monitor_command(process, stdout_log='stdout.log', stderr_log='stderr.log')
But it the output file for stdout is empty, and the output file for stderr is only a few lines long, whereas both should be quite large.
What am I missing?
I did that once.. here is some old code I wrote
class Process_Communicator():
def join(self):
self.te.join()
self.to.join()
self.running = False
self.aggregator.join()
self.ti.join()
def enqueue_in(self):
while self.running and self.p.stdin is not None:
while not self.stdin_queue.empty():
s = self.stdin_queue.get()
self.p.stdin.write(str(s) + '\n\r')
pass
def enqueue_output(self):
if not self.p.stdout or self.p.stdout.closed:
return
out = self.p.stdout
for line in iter(out.readline, b''):
self.qo.put(line)
# out.flush()
def enqueue_err(self):
if not self.p.stderr or self.p.stderr.closed:
return
err = self.p.stderr
for line in iter(err.readline, b''):
self.qe.put(line)
def aggregate(self):
while (self.running):
self.update()
self.update()
def update(self):
line = ""
try:
while self.qe.not_empty:
line = self.qe.get_nowait() # or q.get(timeout=.1)
self.unbblocked_err += line
except Queue.Empty:
pass
line = ""
try:
while self.qo.not_empty:
line = self.qo.get_nowait() # or q.get(timeout=.1)
self.unbblocked_out += line
except Queue.Empty:
pass
while not self.stdin_queue.empty():
s = self.stdin_queue.get()
self.p.stdin.write(str(s))
def get_stdout(self, clear=True):
ret = self.unbblocked_out
if clear:
self.unbblocked_out = ""
return ret
def has_stdout(self):
ret = self.get_stdout(False)
if ret == '':
return None
else:
return ret
def get_stderr(self, clear=True):
ret = self.unbblocked_out
if clear:
self.unbblocked_out = ""
return ret
def has_stderr(self):
ret = self.get_stdout(False)
if ret == '':
return None
else:
return ret
def __init__(self, subp):
'''This is a simple class that collects and aggregates the
output from a subprocess so that you can more reliably use
the class without having to block for subprocess.communicate.'''
self.p = subp
self.unbblocked_out = ""
self.unbblocked_err = ""
self.running = True
self.qo = Queue.Queue()
self.to = threading.Thread(name="out_read",
target=self.enqueue_output,
args=())
self.to.daemon = True # thread dies with the program
self.to.start()
self.qe = Queue.Queue()
self.te = threading.Thread(name="err_read",
target=self.enqueue_err,
args=())
self.te.daemon = True # thread dies with the program
self.te.start()
self.stdin_queue = Queue.Queue()
self.aggregator = threading.Thread(name="aggregate",
target=self.aggregate,
args=())
self.aggregator.daemon = True # thread dies with the program
self.aggregator.start()
pass
You may not need the whole example, but feel free to cut copy and paste what you need. It's also important to show how I did the threading.
The code looks more complicated than the task requires. I don't see why do you need to call process.poll() or queue.get_nowait() here. To deliver subprocess' stdout/stderr to several sinks; you could start with teed_call() that accepts arbitrary file-like objects: you could pass logfiles and special file-like objects that accumulates errors in theirs .write() methods.
To fix your code with minimal changes; you should call .join() on the reader threads (even if process.poll() is not None i.e., the subprocess exited; there could be some pending output. Joining reader's threads ensures that all output is read).
I am new to python.
I am trying out Hbase thrift client using thrift. I got some code on net, which I just modify to work with latest version of thrift but when I run the code , it just exit, no threads are started.
Here is the code.
import json, traceback, sys, datetime, time, logging, threading, random
import logging.handlers
import thrift
sys.path.append('gen-py')
from thrift.transport.TSocket import TSocket
from thrift.transport.TTransport import TBufferedTransport
from thrift.protocol import TBinaryProtocol
from hbase import THBaseService
gWritenItems = 0
gStartT = 0
gEndT = 0
recordsPerBatch = 300 #reports per client per day
columns = 3
#config
concurrent = 10
records = 60000#6000000 #6 million
bytesPerRecord = 1024
mylock = threading.RLock()
class writeThread(threading.Thread):
def __init__(self, threadname, RecordsThreadwillwrite):
threading.Thread.__init__(self, name = threadname)
bytesPerColumn = int(bytesPerRecord/columns) - 11 #suppose 3 columns
self.columnvalue = "value_" + "x"*bytesPerColumn + "_endv"
self.tbwBatch = int (RecordsThreadwillwrite / recordsPerBatch)
self.transport = TBufferedTransport(TSocket('pnq-adongrevm1', 5151), 40960)
self.transport.open()
protocol = TBinaryProtocol.TBinaryProtocol(self.transport)
self.client = THBaseService.Client(protocol)
self.table = "example"
def run(self):
print "+%s start" % (self.getName())
global gEndT
global gWritenItems
threadWritenItem = 0
for loopidx in xrange(0, self.tbwBatch):
self.write_hbase() #write
threadWritenItem += recordsPerBatch
mylock.acquire()
gEndT = time.time()
gWritenItems += threadWritenItem
print "%s done, %s seconds past, %d reocrds saved" % (self.getName(), gEndT-gStartT, gWritenItems)
mylock.release()
self.transport.close()
def write_hbase(self): #write 50 rowkyes, and 3 column families in each rowkey
print self.getName(), "Start write"
batchmutations = []
for i in xrange(0, recordsPerBatch): # write to db, 300 items together
mutations = []
rowkey = "RK_%s_%s" % (random.random(), time.time())
for ii in xrange(0, columns):
mutations.append(THBaseService.TPut(row=rowkey, columnValues=[TColumnValue(family="f1", qualifier="%s"%ii, value=self.columnvalue)]))
self.client.putMultiple(self.table,mutations)
itemsPerThread = int(records / concurrent)
for threadid in xrange(0, concurrent):
gStartT = time.time()
t = writeThread("Thread_%s" % threadid, itemsPerThread)
t.start();
print "%d thread created, each thread will write %d records" % (concurrent, itemsPerThread)
I just get a message 10 thread created, each thread will write 6000 records
Yep, this is because you are not waiting for threads to finish their job, so the main thread just exits. Try this:
itemsPerThread = int(records / concurrent)
threads = []
for threadid in xrange(0, concurrent):
gStartT = time.time()
t = writeThread("Thread_%s" % threadid, itemsPerThread)
t.start();
threads.append(t)
# wait until all finish the job
for t in threads:
t.join()
EDIT Ha, I don't think I'm right here, because you didn't mark your threads as daemons. It should work even without joining. But have a look at this code:
class CustomThread(threading.Thread):
def run(self):
print "test"
for x in xrange(0, 10):
t = CustomThread()
t.start()
It will always reach print "test" line no matter what. So in your code it should always reach print "+%s start" % (self.getName()) no matter what. Are you sure it doesn't work? :)
If it doesn't, then there are only two possibilities:
There is a blocking operation and/or exception in your __init__ method. But then it would not reach final print;
concurrent variable is 0 for some reason (which is not consistent with the final print).
The purpose of my program is to download files with threads. I define the unit, and using len/unit threads, the len is the length of the file which is going to be downloaded.
Using my program, the file can be downloaded, but the threads are not stopping. I can't find the reason why.
This is my code...
#! /usr/bin/python
import urllib2
import threading
import os
from time import ctime
class MyThread(threading.Thread):
def __init__(self,func,args,name=''):
threading.Thread.__init__(self);
self.func = func;
self.args = args;
self.name = name;
def run(self):
apply(self.func,self.args);
url = 'http://ubuntuone.com/1SHQeCAQWgIjUP2945hkZF';
request = urllib2.Request(url);
response = urllib2.urlopen(request);
meta = response.info();
response.close();
unit = 1000000;
flen = int(meta.getheaders('Content-Length')[0]);
print flen;
if flen%unit == 0:
bs = flen/unit;
else :
bs = flen/unit+1;
blocks = range(bs);
cnt = {};
for i in blocks:
cnt[i]=i;
def getStr(i):
try:
print 'Thread %d start.'%(i,);
fout = open('a.zip','wb');
fout.seek(i*unit,0);
if (i+1)*unit > flen:
request.add_header('Range','bytes=%d-%d'%(i*unit,flen-1));
else :
request.add_header('Range','bytes=%d-%d'%(i*unit,(i+1)*unit-1));
#opener = urllib2.build_opener();
#buf = opener.open(request).read();
resp = urllib2.urlopen(request);
buf = resp.read();
fout.write(buf);
except BaseException:
print 'Error';
finally :
#opener.close();
fout.flush();
fout.close();
del cnt[i];
# filelen = os.path.getsize('a.zip');
print 'Thread %d ended.'%(i),
print cnt;
# print 'progress : %4.2f'%(filelen*100.0/flen,),'%';
def main():
print 'download at:',ctime();
threads = [];
for i in blocks:
t = MyThread(getStr,(blocks[i],),getStr.__name__);
threads.append(t);
for i in blocks:
threads[i].start();
for i in blocks:
# print 'this is the %d thread;'%(i,);
threads[i].join();
#print 'size:',os.path.getsize('a.zip');
print 'download done at:',ctime();
if __name__=='__main__':
main();
Could someone please help me understand why the threads aren't stopping.
I can't really address your code example because it is quite messy and hard to follow, but a potential reason you are seeing the threads not end is that a request will stall out and never finish. urllib2 allows you to specify timeouts for how long you will allow the request to take.
What I would recommend for your own code is that you split your work up into a queue, start a fixed number of thread (instead of a variable number), and let the worker threads pick up work until it is done. Make the http requests have a timeout. If the timeout expires, try again or put the work back into the queue.
Here is a generic example of how to use a queue, a fixed number of workers and a sync primitive between them:
import threading
import time
from Queue import Queue
def worker(queue, results, lock):
local_results = []
while True:
val = queue.get()
if val is None:
break
# pretend to do work
time.sleep(.1)
local_results.append(val)
with lock:
results.extend(local_results)
print threading.current_thread().name, "Done!"
num_workers = 4
threads = []
queue = Queue()
lock = threading.Lock()
results = []
for i in xrange(100):
queue.put(i)
for _ in xrange(num_workers):
# Use None as a sentinel to signal the threads to end
queue.put(None)
t = threading.Thread(target=worker, args=(queue,results,lock))
t.start()
threads.append(t)
for t in threads:
t.join()
print sorted(results)
print "All done"
I am using the multiprocessing module in python to spawn new processes, one for each year between 2000 to 2012. This was running successfully until last week. Now, the code runs fine without throwing any errors and seems to spawn new processes, but does not start them simultaneously. The CPU I am running this on uses ubuntu and has plenty of memory with 24 processors.
The processes seem to run sequentially instead of parallel. There have been no code changes in the past 3 months, so I am suspecting its an environment issue but am clueless about where to start debugging. Any suggestions?
Is it possible for some default setting of the kernel to prevent simultaneous execution of code? Some setting of python?
Code:
class ForEachPerson(multiprocessing.Process):
"""This class contains the funcs for the main processing."""
def __init__(self, year_queue, result_queue, dict_of_files, all, today):
multiprocessing.Process.__init__(self)
self.work_queue = year_queue
self.result_queue = result_queue
self.kill_received = False
self.dict = dict_of_files
self.all = all
self.today = today
def run(self):
while not self.kill_received:
try:
year = self.work_queue.get_nowait()
year_start_date = year[0]
year_end_date = year[1]
split = year_end_date.year
except Queue.Empty:
self.result_queue.close()
return
if self.all:
try:
null_pids = self.dict["null_pids"]
except KeyError:
null_pids = []
#For each employee calculate the data and write to file.
today = self.today
hie = hie_util.Build()
hie_op = open("output.csv", "wb")
hierarchy_op.write("....\n")
/* do function */
............
hierarchy_op.close()
timestr = ("%s End writing for %s"
% (str(datetime.datetime.now()), str(year)))
self.result_queue.put(timestr)
def Manage(years, dict_of_files, num_processes, all, today):
"""Responsible for creating & assigning tasks to worker processes."""
#load up year queue
year_queue = multiprocessing.Queue()
for year in years:
year_queue.put(year)
if num_processes > len(years):
num_processes = len(years)
# queue to pass to workers to store the results
result_queue = multiprocessing.Queue()
# spawn workers
workers = []
for i in range(num_processes):
worker = ForEachPerson(year_queue, result_queue, dict_of_files, all, today)
logging.info("Worker spawned for processor " + str(i + 1))
worker.start()
workers.append(worker)
# collect results off the queue
logging.info("results being collected")
results = []
while len(results) < len(years):
try:
result = result_queue.get()
logging.info(str(result[0]))
results.append(result[1])
except Queue.Empty:
pass
count = 0
for worker in workers:
logging.info("Terminating worker: " + str(count))
worker.terminate()
count += 1
return results
def RunHie():
"""Main control flow for building."""
logging.info("Start ")
sql_instance = hie_sql.SQLExportImport()
sql_instance.RunEtl()
# gather list of dates
date_full_list = DailyDates()
dict_of_files = ReadFiles()
# calculate hierarchy - run
num_processes = multiprocessing.cpu_count() - 1
results = Manage(date_full_list, dict_of_files, num_processes, 0, today[1])
logging.info("End")