Python Thread/Queue issue

Python Thread/Queue issue - python

I'm creating a threaded python script that has a collection of files that is put into a queue and then an unknown amount of threads (default is 3) to start downloading. When each of the threads complete it updates the stdout with the queue status and a percentage. All the files are being downloaded but the status information is wrong on the 3rd thread and I'm not sure why. I've been considering creating a work_completed queue to use for the calculation but don't think I should have to/that it would matter. Could someone point me in the right direction here?
download_queue = queue.Queue()
class Downloader(threading.Thread):
def __init__(self,work_queue):
super().__init__()
self.current_job = 0
self.work_queue = work_queue
self.queue_size = work_queue.qsize()
def run(self):
while self.work_queue.qsize() > 0:
url = self.work_queue.get(True)
system_call = "wget -nc -q {0} -O {1}".format(url,local_file)
os.system(system_call)
self.current_job = int(self.queue_size) - int(self.work_queue.qsize())
self.percent = (self.current_job / self.queue_size) * 100
sys.stdout.flush()
status = "\rDownloading " + url.split('/')[-1] + " [status: " + str(self.current_job) + "/" + str(self.queue_size) + ", " + str(round(self.percent,2)) + "%]"
finally:
self.work_queue.task_done()
def main:
if download_queue.qsize() > 0:
if options.active_downloads:
active_downloads = options.active_downloads
else:
active_downloads = 3
for x in range(active_downloads):
downloader = Downloader(download_queue)
downloader.start()
download_queue.join()

You can't check the queue size in one statement, and then .get() from the queue in the next. In the meantime the whole world may have changed. The .get() method call is the single atomic operation you need to call. If it raises Empty or blocks, the queue is empty.
Your threads can overwrite each other's output. I would have another thread with an input queue whos only job is to print the items in the queue to stdout. It can also count off the number of completed items and produce status information.
I also tend not to subclass Thread, but instead just supply a plain Thread instance with a target= parameter and .start() the thread.
based on your response, try this:
download_queue = queue.Queue()
class Downloader(threading.Thread):
def __init__(self,work_queue, original_size):
super().__init__()
self.current_job = 0
self.work_queue = work_queue
self.queue_size = original_size
def run(self):
while True:
try:
url = self.work_queue.get(False)
system_call = "wget -nc -q {0} -O {1}".format(url,local_file)
os.system(system_call)
# the following code is questionable. By the time we get here,
# many other items may have been taken off the queue.
self.current_job = int(self.queue_size) - int(self.work_queue.qsize())
self.percent = (self.current_job / self.queue_size) * 100
sys.stdout.flush()
status = ("\rDownloading " + url.split('/')[-1] +
" [status: " + str(self.current_job) +
"/" + str(self.queue_size) + ", " +
str(round(self.percent,2)) + "%]" )
except queue.Empty:
pass
finally:
self.work_queue.task_done()
def main:
if download_queue.qsize() > 0:
original_size = download_queue.qsize()
if options.active_downloads:
active_downloads = options.active_downloads
else:
active_downloads = 3
for x in range(active_downloads):
downloader = Downloader(download_queue, original_size)
downloader.start()
download_queue.join()

If you'd like to use the multiprocessing module, it includes a very nice parallel imap_unordered, which would reduce your problem to the very elegant:
import multiprocessing, sys
class ParallelDownload:
def __init__(self, urls, processcount=3):
self.total_items = len(urls)
self.pool = multiprocessing.Pool(processcount)
for n, status in enumerate(self.pool.imap_unordered(self.download, urls)):
stats = (n, self.total_items, n/self.total_items)
sys.stdout.write(status + " [%d/%d = %0.2f %%]\n"%stats)
def download(self, url):
system_call = "wget -nc -q {0} -O {1}".format(url, local_file)
os.system(system_call)
status = "\rDownloaded " + url.split('/')[-1]
return status

Related

Multiprocessing Queue.get() hangs

I'm trying to implement basic multiprocessing and I've run into an issue. The python script is attached below.
import time, sys, random, threading
from multiprocessing import Process
from Queue import Queue
from FrequencyAnalysis import FrequencyStore, AnalyzeFrequency
append_queue = Queue(10)
database = FrequencyStore()
def add_to_append_queue(_list):
append_queue.put(_list)
def process_append_queue():
while True:
item = append_queue.get()
database.append(item)
print("Appended to database in %.4f seconds" % database.append_time)
append_queue.task_done()
return
def main():
database.load_db()
print("Database loaded in %.4f seconds" % database.load_time)
append_queue_process = Process(target=process_append_queue)
append_queue_process.daemon = True
append_queue_process.start()
#t = threading.Thread(target=process_append_queue)
#t.daemon = True
#t.start()
while True:
path = raw_input("file: ")
if path == "exit":
break
a = AnalyzeFrequency(path)
a.analyze()
print("Analyzed file in %.4f seconds" % a._time)
add_to_append_queue(a.get_results())
append_queue.join()
#append_queue_process.join()
database.save_db()
print("Database saved in %.4f seconds" % database.save_time)
sys.exit(0)
if __name__=="__main__":
main()
The AnalyzeFrequency analyzes the frequencies of words in a file and get_results() returns a sorted list of said words and frequencies. The list is very large, perhaps 10000 items.
This list is then passed to the add_to_append_queue method which adds it to a queue. The process_append_queue takes the items one by one and adds the frequencies to a "database". This operation takes a bit longer than the actual analysis in main() so I am trying to use a seperate process for this method. When I try and do this with the threading module, everything works perfectly fine, no errors. When I try and use Process, the script hangs at item = append_queue.get().
Could someone please explain what is happening here, and perhaps direct me toward a fix?
All answers appreciated!
UPDATE
The pickle error was my fault, it was just a typo. Now I am using the Queue class within multiprocessing but the append_queue.get() method still hangs.
NEW CODE
import time, sys, random
from multiprocessing import Process, Queue
from FrequencyAnalysis import FrequencyStore, AnalyzeFrequency
append_queue = Queue()
database = FrequencyStore()
def add_to_append_queue(_list):
append_queue.put(_list)
def process_append_queue():
while True:
database.append(append_queue.get())
print("Appended to database in %.4f seconds" % database.append_time)
return
def main():
database.load_db()
print("Database loaded in %.4f seconds" % database.load_time)
append_queue_process = Process(target=process_append_queue)
append_queue_process.daemon = True
append_queue_process.start()
#t = threading.Thread(target=process_append_queue)
#t.daemon = True
#t.start()
while True:
path = raw_input("file: ")
if path == "exit":
break
a = AnalyzeFrequency(path)
a.analyze()
print("Analyzed file in %.4f seconds" % a._time)
add_to_append_queue(a.get_results())
#append_queue.join()
#append_queue_process.join()
print str(append_queue.qsize())
database.save_db()
print("Database saved in %.4f seconds" % database.save_time)
sys.exit(0)
if __name__=="__main__":
main()
UPDATE 2
This is the database code:
class FrequencyStore:
def __init__(self):
self.sorter = Sorter()
self.db = {}
self.load_time = -1
self.save_time = -1
self.append_time = -1
self.sort_time = -1
def load_db(self):
start_time = time.time()
try:
file = open("results.txt", 'r')
except:
raise IOError
self.db = {}
for line in file:
word, count = line.strip("\n").split("=")
self.db[word] = int(count)
file.close()
self.load_time = time.time() - start_time
def save_db(self):
start_time = time.time()
_db = []
for key in self.db:
_db.append([key, self.db[key]])
_db = self.sort(_db)
try:
file = open("results.txt", 'w')
except:
raise IOError
file.truncate(0)
for x in _db:
file.write(x[0] + "=" + str(x[1]) + "\n")
file.close()
self.save_time = time.time() - start_time
def create_sorted_db(self):
_temp_db = []
for key in self.db:
_temp_db.append([key, self.db[key]])
_temp_db = self.sort(_temp_db)
_temp_db.reverse()
return _temp_db
def get_db(self):
return self.db
def sort(self, _list):
start_time = time.time()
_list = self.sorter.mergesort(_list)
_list.reverse()
self.sort_time = time.time() - start_time
return _list
def append(self, _list):
start_time = time.time()
for x in _list:
if x[0] not in self.db:
self.db[x[0]] = x[1]
else:
self.db[x[0]] += x[1]
self.append_time = time.time() - start_time

Comments suggest you're trying to run this on Windows. As I said in a comment,
If you're running this on Windows, it can't work - Windows doesn't
have fork(), so each process gets its own Queue and they have nothing
to do with each other. The entire module is imported "from scratch" by
each process on Windows. You'll need to create the Queue in main(),
and pass it as an argument to the worker function.
Here's fleshing out what you need to do to make it portable, although I removed all the database stuff because it's irrelevant to the problems you've described so far. I also removed the daemon fiddling, because that's usually just a lazy way to avoid shutting down things cleanly, and often as not will come back to bite you later:
def process_append_queue(append_queue):
while True:
x = append_queue.get()
if x is None:
break
print("processed %d" % x)
print("worker done")
def main():
import multiprocessing as mp
append_queue = mp.Queue(10)
append_queue_process = mp.Process(target=process_append_queue, args=(append_queue,))
append_queue_process.start()
for i in range(100):
append_queue.put(i)
append_queue.put(None) # tell worker we're done
append_queue_process.join()
if __name__=="__main__":
main()
The output is the "obvious" stuff:
processed 0
processed 1
processed 2
processed 3
processed 4
...
processed 96
processed 97
processed 98
processed 99
worker done
Note: because Windows doesn't (can't) fork(), it's impossible for worker processes to inherit any Python object on Windows. Each process runs the entire program from its start. That's why your original program couldn't work: each process created its own Queue, wholly unrelated to the Queue in the other process. In the approach shown above, only the main process creates a Queue, and the main process passes it (as an argument) to the worker process.

queue.Queue is thread-safe, but doesn't work across processes. This is quite easy to fix, though. Instead of:
from multiprocessing import Process
from Queue import Queue
You want:
from multiprocessing import Process, Queue

Only 1 Thread started in for loop

So Im trying to code a really simple Internet Download Manager Spoof with Python 2.7
It is supposed to query a files HTTP header, get the byte range and spread the download among a no.of threads(I hard-coded 2 for simplicity) according to the byte range and later join the file parts together again.
The problem is my console log tells me that only 1 thread is started.
[EDIT] The problem has been solved. Find the working code below.
Here is my source:
from __future__ import print_function
import threading
import urllib
import urllib2
import time
threads = []
# url to open
url = "http://www.sample-videos.com/video/mp4/720/big_buck_bunny_720p_1mb.mp4"
u = urllib.urlopen(url)
# define file
file_name = "test.mp4"
f = open(file_name, 'wb')
# open url and get header info
def get_file_size(url):
stream_size = u.info()['Content-Length']
end = stream_size
return end
start = 0
#get stream size
end = get_file_size(url)
# specify block size
block_sz = 512
#algo to divide work among 2 threads
def calculate_no_of_bytes_for_thread1():
full_stream_size = end
first_thread = {'start':0, 'end':(int(full_stream_size)/2)}
print(first_thread)
return first_thread
#algo to divide work among 2 threads
def calculate_no_of_bytes_for_thread2():
full_stream_size = end
second_thread= {'start':int(full_stream_size)/2,'end': int(full_stream_size)}
print(second_thread)
return second_thread
# download function
def download_thread(url ,id,start,end):
current_size = int(float(start)/1024)
total_size = int(float(end)/1024)
print ("Start at_"+str(current_size) + "Ends at_" + str(total_size))
# specify request range and init stream
req = urllib2.Request(url)
req.headers['Range'] = 'bytes=%s-%s' % (start, end)
data = urllib2.urlopen(req)
while True:
buffer = u.read(block_sz)
if not buffer:
break
start += len(buffer)
f.write(buffer)
thread_id = id
#percentage = (current_size * 100 / total_size)
status = str(thread_id) + "_" + str(current_size) + "_" +str(total_size)
print (status)
#starts 2 threads
def start_threads():
for i in range(2):
#if first loop, start thread 1
if(i==1):
start = calculate_no_of_bytes_for_thread1().get('start')
end = calculate_no_of_bytes_for_thread1().get('end')
print("Thread 1 started")
t = threading.Thread(target=download_thread, args=(url,i,start,end))
t.start()
threads.append( t)
#if second loop, start thread 1
if(i==2):
start = calculate_no_of_bytes_for_thread2().get('start')
end = calculate_no_of_bytes_for_thread2().get('end')
print("Thread 2 started")
t = threading.Thread(target=download_thread, args=(url,i,start,end))
t.start()
threads.append( t)
# Join threads back (order doesn't matter, you just want them all)
for i in threads:
i.join()
#start benchmarking
start_time = time.clock()
start_threads()
print ("Finito!")
end_time = time.clock()
benchmark = str(end_time - start_time)
print ("Download took_" +benchmark)
f.close()
And the output:
{'start': 0, 'end': 527868}
{'start': 0, 'end': 527868}
Thread 1 started
Start at_0Ends at_515
1_0_515
1_0_515
Finito!
Download took_6.97844422658
Working code:
from __future__ import print_function
import threading
import urllib
import urllib2
import time
threads = []
parts = {}
# url to open
url = "http://www.sample-videos.com/audio/mp3/india-national-anthem.mp3"
u = urllib.urlopen(url)
# define file
file_name = "test.mp3"
f = open(file_name, 'wb')
# open url and get header info
def get_file_size(url):
stream_size = u.info()['Content-Length']
file_size = stream_size
return file_size
start = 0
#get stream size
end = get_file_size(url)
# specify block size
block_sz = 512
#algo to divide work among 2 threads
def calculate_no_of_bytes_for_thread1():
full_stream_size = end
first_thread = {'start':0, 'end':(int(full_stream_size)/2)}
print(first_thread)
return first_thread
#algo to divide work among 2 threads
def calculate_no_of_bytes_for_thread2():
full_stream_size = end
second_thread= {'start':int(full_stream_size)/2,'end': int(full_stream_size)}
print(second_thread)
return second_thread
# download function
def download_thread(url ,id,start,end):
current_size = int(float(start)/1024)
total_size = int(float(end)/1024)
print ("Start at_"+str(current_size) + "Ends at_" + str(total_size))
# specify request range and init stream
req = urllib2.Request(url)
req.headers['Range'] = 'bytes=%s-%s' % (start, end)
while True:
buffer = u.read(block_sz)
if not buffer:
break
start += len(buffer)
f.write(buffer)
thread_id = id
status = "Thread ID_" +str(thread_id) + "Downloaded_" + str(int(start/1024)) + "Total_" +str(total_size)
print (status)
#starts 2 threads
def start_threads():
for i in range(2):
#if first loop, start thread 1
if(i==0):
start = calculate_no_of_bytes_for_thread1().get('start')
end = calculate_no_of_bytes_for_thread1().get('end')
print("Thread 1 started")
t = threading.Thread(target=download_thread, args=(url,i,start,end))
t.start()
threads.append( t)
#if second loop, start thread 2
if(i==1):
start = calculate_no_of_bytes_for_thread2().get('start')
end = calculate_no_of_bytes_for_thread2().get('end')
print("Thread 2 started")
t = threading.Thread(target=download_thread, args=(url,i,start,end))
t.start()
threads.append( t)
# Join threads back (order doesn't matter, you just want them all)
for i in threads:
i.join()
# Sort parts and you're done
# result = ''
# for i in range(2):
# result += parts[i*block_sz]
#start benchmarking
start_time = time.clock()
start_threads()
print ("Finito!")
end_time = time.clock()
benchmark = str(end_time - start_time)
print ("Download took_" +benchmark)
f.close()

You have:
for i in range(2):
if(i==1):
...
if(i==2):
...
But range(2) iterates over [0,1] not [1,2].
Save some trouble and just remove those 3 lines. The code to start the two threads can just run serially.

Python Multi Threading Variables getting overwritten and mixed up

It tries to make two connections per thread now, still fails.
I think I solved the shared access thing because it uses self.x instead of local variables?
I'm not sure what the problem is :/, you don't happen to be a freelancer?
#!/usr/bin/python
from xml.etree.ElementTree import fromstring
from socks import socksocket, PROXY_TYPE_SOCKS5
from socket import socket, AF_INET, SOCK_STREAM
from linecache import getline
from threading import Thread, current_thread, Lock, activeCount
from os.path import isfile, getmtime
from urllib import urlopen
from time import time, sleep
from sys import exit
from json import loads
from random import randint, randrange, choice
from urlparse import parse_qs
from pprint import pprint
class myThread (Thread):
def __init__(self, threadID, name):
Thread.__init__(self)
self.threadID = threadID
self.name = name
def run(self):
self.user = parse_qs(getline('./_files/ids.txt', randint(1, idLen)).strip("\n"))
self.proxy = getline('./_files/proxies.txt', randint(1, proxyLen)).strip("\n").split(":")
self.user2 = parse_qs(getline('./_files/ids.txt', randint(1, idLen)).strip("\n"))
self.proxy2 = getline('./_files/proxies.txt', randint(1, proxyLen)).strip("\n").split(":")
try:
self.socket = socksocket(AF_INET, SOCK_STREAM)
self.socket.settimeout(5)
self.socket.setproxy(PROXY_TYPE_SOCKS5, self.proxy[0], int(self.proxy[1]))
self.socket2 = socksocket(AF_INET, SOCK_STREAM)
self.socket2.settimeout(5)
self.socket2.setproxy(PROXY_TYPE_SOCKS5, self.proxy2[0], int(self.proxy2[1]))
self.socket.connect((chatConnection[0], int(chatConnection[1])))
self.socket2.connect((chatConnection[0], int(chatConnection[1])))
send(self.socket, "<y r=\"%s\" v=\"0\" u=\"%s\" />\0" % (room, self.user["UserId"][0]))
send(self.socket2, "<y r=\"%s\" v=\"0\" u=\"%s\" />\0" % (room, self.user2["UserId"][0]))
self.data = read(self.socket)
self.data2 = read(self.socket2)
if self.data == "" or not self.data: return
if self.data2 == "" or not self.data2: return
self.xml = fromstring(self.data.strip(chr(0))).attrib
self.xml2 = fromstring(self.data2.strip(chr(0))).attrib
self.bSock = socket(AF_INET, SOCK_STREAM)
self.bSock.settimeout(5)
self.bSock2 = socket(AF_INET, SOCK_STREAM)
self.bSock2.settimeout(5)
self.bSock.connect(("127.0.0.1", 1337))
send(self.bSock, "<bot p=\"%s\" yi=\"%s\" au=\"%s\" />\0" % (self.xml["p"], self.xml["i"], self.xml["au"]))
self.data = read(self.bSock)
send(self.bSock, "<bot p=\"%s\" yi=\"%s\" au=\"%s\" />\0" % (self.xml2["p"], self.xml2["i"], self.xml2["au"]))
self.data2 = read(self.bSock)
self.data = self.data.replace("_lol", "")
self.l5 = self.data[self.data.find('l5="') + 4:]
self.l5 = self.l5[:self.l5.find('"')]
self.ya = self.data[self.data.find('c="') + 3:]
self.ya = self.ya[:self.ya.find('"')]
self.data2 = self.data2.replace("_lol", "")
self.l52 = self.data2[self.data2.find('l5="') + 4:]
self.l52 = self.l52[:self.l52.find('"')]
self.ya2 = self.data2[self.data2.find('c="') + 3:]
self.ya2 = self.ya2[:self.ya2.find('"')]
print self.ya2 + " : " + self.l52
self.bSock.close()
self.yaSock = socksocket(AF_INET, SOCK_STREAM)
self.yaSock.settimeout(5)
self.yaSock.setproxy(PROXY_TYPE_SOCKS5, self.proxy[0], int(self.proxy[1]))
self.yaSock.connect((chatConnection[0], int(chatConnection[1])))
self.yaSock2 = socksocket(AF_INET, SOCK_STREAM)
self.yaSock2.settimeout(5)
self.yaSock2.setproxy(PROXY_TYPE_SOCKS5, self.proxy2[0], int(self.proxy2[1]))
self.yaSock2.connect((chatConnection[0], int(chatConnection[1])))
send(self.yaSock, "<ya r=\"%s\" u=\"%s\" c=\"%s\" k=\"%s\" />\0" % (room, self.user["UserId"][0], self.ya, self.xml["k"]))
print read(self.yaSock)
self.yaSock.close()
send(self.yaSock2, "<ya r=\"%s\" u=\"%s\" c=\"%s\" k=\"%s\" />\0" % (room, self.user2["UserId"][0], self.ya2, self.xml2["k"]))
print read(self.yaSock2)
self.yaSock2.close()
self.j2 = "<j2 Y=\"2\" l5=\"" + self.l5 + "\" l4=\"1200\" l3=\"844\" l2=\"0\" cb=\"0\" q=\"1\" y=\"" + self.xml["i"] + "\" k=\"" + self.user["k1"][0] + "\" k3=\"0\" p=\"0\" c=\"" + room + "\" f=\"2\" u=\"" + self.user["UserId"][0] + "\" d0=\"0\" n=\"Zuhnny\" a=\"1\" h=\"xat sux\" v=\"0\" />\0"
self.j22 = "<j2 Y=\"2\" l5=\"" + self.l52 + "\" l4=\"1200\" l3=\"844\" l2=\"0\" cb=\"0\" q=\"1\" y=\"" + self.xml2["i"] + "\" k=\"" + self.user2["k1"][0] + "\" k3=\"0\" p=\"0\" c=\"" + room + "\" f=\"2\" u=\"" + self.user2["UserId"][0] + "\" d0=\"0\" n=\"Zuhnny\" a=\"1\" h=\"xat sux\" v=\"0\" />\0"
send(self.socket, self.j2)
send(self.socket2, self.j22)
while True:
print self.socket.recv(6096)
print self.socket2.recv(6096)
sleep(1)
send(self.socket, "<m t=\" F U C K X A T %s\" u=\"%s\" />\0" % (randint(0,5000), self.user["UserId"][0]))
send(self.socket2, "<m t=\" F U C K X A T %s\" u=\"%s\" />\0" % (randint(0,5000), self.user2["UserId"][0]))
except IOError, err: pass
except Exception, error: pass
def read(socket):
data = socket.recv(1024)
return data
def send(socket, data):
socket.sendall(data)
def getChatConnection(room):
print '\ntest\n'
if not isfile('./_files/ips.txt') or time() - getmtime('./_files/ips.txt') > 86400:
fh = open('./_files/ips.txt', 'w')
fh.write(urlopen('http://xat.com/web_gear/chat/ip2.htm?' + str(time())).read())
fh.close()
try:
fh = open('./_files/ips.txt', 'r')
iprules = loads(fh.read())
Fx = iprules[iprules["order"][0][0]]
xAddr = Fx[1][randint(0, len(Fx[1]) - 1)].split(':')
if len(xAddr) == 1: xAddr.append(10000)
if len(xAddr) == 2: xAddr.append(39)
xPort = xAddr[1] + randint(0, xAddr[2] - 1)
return (xAddr[0], 9999 + int(room) if int(room) < 8 else 10007 + (int(room) % 32))
except Exception, e:
print e
file = open("./_files/proxies.txt")
proxyLen = len(map(lambda(x): x.split(':'), file))
file2 = open("./_files/ids.txt")
idLen = len(map(lambda(x): x.split('\n'), file2))
threadLock = Lock()
threads = []
room = raw_input("Room ID to raid: ")
chatConnection = getChatConnection(room)
for x in range(1000):
threads.append(myThread(x, "Thread-" + str(x)).start())
# Wait for all threads to complete
for t in threads:
t.join()
print "Exiting Main Thread"

I have a guess at your problem. I don't think it actually is race conditions at all. I haven't read all of your code carefully, but I don't see any global or otherwise shared variables being mutated. But I do see a different problem.
You aren't buffering up your reads; you're just expecting that each bSock.recv(1024) is going to receive exactly one message. That isn't how TCP works; you may receive half of a message, or two messages, or the second half of the previous message and the first half of the next.
If you don't stress your computer or the network very hard, and your messages are all pretty small, it may (depending on the platform) work 99.9% of the time, meaning you don't notice any problem. But as soon as you stress things, it'll start to fail more often.
And you've got 400 threads, and from your old-style code (e.g., except Type, value) it looks like you may be on a system old enough that it's stuck on Python 2.5, which means you may be stressing the system very hard.
You need to fix this by receiving in a loop until you have one or more complete messages, then handling those messages, then returning to the loop, instead of handling each recv as if it were guaranteed to be exactly one complete message.
Fortunately, you're dealing with IRC, which (assuming you're not doing any DCC, etc.) has exactly one command per line, and Python has a nice wrapper around sockets that makes them look like line-buffered files. So you can do this:
bfile = bsock.makefile()
for line in bfile:
Now you know that line is guaranteed to be a complete line, even if it had to do three reads, and buffer up most of the third read until your next time through the loop.
You're doing the same thing in at least three places, so obviously you need to fix them all. Also, you need to make sure to close the socket and the file appropriately. And you need to detect when the other sides closes the socket. (The recv, or the next line, will return an empty string.)

Another possibility:
There is at least one thing all of the threads are sharing: that bsock socket. And they all do this 5 seconds after launch:
bSock.sendall("<bot p=\"%s\" au=\"%s\" yi=\"%s\" />\0" % (xml["p"], xml["au"], xml["i"]))
data = bSock.recv(1024)
What's to stop thread #42 from doing its sendall, then thread #23 doing its sendall, then thread #42 from doing its recv and getting the data intended for thread #42?
This is what's called a "critical section" or "atomic block": a chunk of code that only one thread can run at a time or everyone will get confused. The usual way around it is to share a Lock, and have each thread acquire the Lock before running this code. If thread #42 already has the lock, and thread #23 tries to acquire it, it will be blocked until thread #42 releases the lock, so there's no chance of them conflicting. So:
bSockLock = threading.Lock()
# ...
for x in range(400):
Thread(target = __init__, args=[chatConnection, bSock, bSockLock]).start()
# ...
def __init__(chatConnection, bSock):
# ...
for x in range(3):
start(chatConnection, proxies[x][0], proxies[x][1], [ids[x]["UserId"][0], ids[x]["k1"][0], ids[x]["k2"][0]], room, bSock, bSockLock)
# ...
def start(chatConnection, proxyIP, proxyPort, user, room, bSock, bSockLock):
# ...
with bSockLock:
bSock.sendall("<bot p=\"%s\" au=\"%s\" yi=\"%s\" />\0" % (xml["p"], xml["au"], xml["i"]))
data = bSock.recv(1024)

multiprocessing module not spawning new processes

I am using the multiprocessing module in python to spawn new processes, one for each year between 2000 to 2012. This was running successfully until last week. Now, the code runs fine without throwing any errors and seems to spawn new processes, but does not start them simultaneously. The CPU I am running this on uses ubuntu and has plenty of memory with 24 processors.
The processes seem to run sequentially instead of parallel. There have been no code changes in the past 3 months, so I am suspecting its an environment issue but am clueless about where to start debugging. Any suggestions?
Is it possible for some default setting of the kernel to prevent simultaneous execution of code? Some setting of python?
Code:
class ForEachPerson(multiprocessing.Process):
"""This class contains the funcs for the main processing."""
def __init__(self, year_queue, result_queue, dict_of_files, all, today):
multiprocessing.Process.__init__(self)
self.work_queue = year_queue
self.result_queue = result_queue
self.kill_received = False
self.dict = dict_of_files
self.all = all
self.today = today
def run(self):
while not self.kill_received:
try:
year = self.work_queue.get_nowait()
year_start_date = year[0]
year_end_date = year[1]
split = year_end_date.year
except Queue.Empty:
self.result_queue.close()
return
if self.all:
try:
null_pids = self.dict["null_pids"]
except KeyError:
null_pids = []
#For each employee calculate the data and write to file.
today = self.today
hie = hie_util.Build()
hie_op = open("output.csv", "wb")
hierarchy_op.write("....\n")
/* do function */
............
hierarchy_op.close()
timestr = ("%s End writing for %s"
% (str(datetime.datetime.now()), str(year)))
self.result_queue.put(timestr)
def Manage(years, dict_of_files, num_processes, all, today):
"""Responsible for creating & assigning tasks to worker processes."""
#load up year queue
year_queue = multiprocessing.Queue()
for year in years:
year_queue.put(year)
if num_processes > len(years):
num_processes = len(years)
# queue to pass to workers to store the results
result_queue = multiprocessing.Queue()
# spawn workers
workers = []
for i in range(num_processes):
worker = ForEachPerson(year_queue, result_queue, dict_of_files, all, today)
logging.info("Worker spawned for processor " + str(i + 1))
worker.start()
workers.append(worker)
# collect results off the queue
logging.info("results being collected")
results = []
while len(results) < len(years):
try:
result = result_queue.get()
logging.info(str(result[0]))
results.append(result[1])
except Queue.Empty:
pass
count = 0
for worker in workers:
logging.info("Terminating worker: " + str(count))
worker.terminate()
count += 1
return results
def RunHie():
"""Main control flow for building."""
logging.info("Start ")
sql_instance = hie_sql.SQLExportImport()
sql_instance.RunEtl()
# gather list of dates
date_full_list = DailyDates()
dict_of_files = ReadFiles()
# calculate hierarchy - run
num_processes = multiprocessing.cpu_count() - 1
results = Manage(date_full_list, dict_of_files, num_processes, 0, today[1])
logging.info("End")

Sending and receiving async over multiprocessing.Pipe() in Python

I'm having some issues getting the Pipe.send to work in this code. What I would ultimately like to do is send and receive messages to and from the foreign process while its running in a fork. This is eventually going to be integrated into a pexpect loop for talking to interpreter processes.
from multiprocessing import Process, Pipe
from pexpect import spawn
class CockProc(Process):
def start(self):
self.process = spawn('coqtop', ['-emacs-U'])
def run(self, conn):
while True:
if not conn.poll():
cmd = conn.recv()
self.process.send(cmd)
self.process.expect('\<\/prompt\>')
result = self.process.before + self.process.after + " "
conn.send(result)
q, p = Pipe()
proc = CockProc()
proc.start()
proc.run(p)
res = q.recv()
command = raw_input(res + " ")
q.send(command)
res = q.recv()
parent_conn.send('OHHAI')
p.join()
`

This works, but might need some more work. Not sure how many of these i can create and loop over.
from multiprocessing import Process, Pipe
from pexpect import spawn
class CockProc(Process):
def start(self):
self.process = spawn('coqtop', ['-emacs-U'])
def run(self, conn):
if conn.poll():
cmd = conn.recv()
self.process.send(cmd + "\n")
print "sent comm"
self.process.expect('\<\/prompt\>')
result = self.process.before + self.process.after + " "
conn.send(result)
here, there = Pipe(duplex=True)
proc = CockProc()
proc.start()
proc.run(there)
while True:
if here.poll():
res = here.recv()
command = raw_input(res + " ")
here.send(command)
proc.run(there)

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.