Sending and receiving async over multiprocessing.Pipe() in Python

I'm having some issues getting Pipe.send to work in this code. What I would ultimately like to do is send and receive messages to and from the foreign process while it runs in a forked child. This is eventually going to be integrated into a pexpect loop for talking to interpreter processes.
from multiprocessing import Process, Pipe
from pexpect import spawn

class CockProc(Process):

    def start(self):
        self.process = spawn('coqtop', ['-emacs-U'])

    def run(self, conn):
        while True:
            if not conn.poll():
                cmd = conn.recv()
                self.process.send(cmd)
                self.process.expect('\<\/prompt\>')
                result = self.process.before + self.process.after + " "
                conn.send(result)

q, p = Pipe()
proc = CockProc()
proc.start()
proc.run(p)

res = q.recv()
command = raw_input(res + " ")
q.send(command)
res = q.recv()
parent_conn.send('OHHAI')
p.join()

This works, but it might need some more work. I'm not sure how many of these I can create and loop over.
from multiprocessing import Process, Pipe
from pexpect import spawn

class CockProc(Process):

    def start(self):
        self.process = spawn('coqtop', ['-emacs-U'])

    def run(self, conn):
        if conn.poll():
            cmd = conn.recv()
            self.process.send(cmd + "\n")
            print "sent comm"
        self.process.expect('\<\/prompt\>')
        result = self.process.before + self.process.after + " "
        conn.send(result)

here, there = Pipe(duplex=True)
proc = CockProc()
proc.start()
proc.run(there)

while True:
    if here.poll():
        res = here.recv()
        command = raw_input(res + " ")
        here.send(command)
        proc.run(there)
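For reference, a common way to structure this is to hand the child end of the pipe to the Process constructor and override run(), so that the loop actually executes in the child after start() rather than being called from the parent. The following is only a minimal sketch of that pattern; the trivial echo worker stands in for the coqtop/pexpect logic:

from multiprocessing import Process, Pipe

class Worker(Process):
    def __init__(self, conn):
        super(Worker, self).__init__()
        self.conn = conn                      # child end of the pipe

    def run(self):
        # run() executes in the child process once start() is called
        while True:
            cmd = self.conn.recv()            # block until the parent sends something
            if cmd == 'quit':
                break
            self.conn.send('echo: ' + cmd)    # stand-in for the pexpect send/expect round trip

if __name__ == '__main__':
    parent_end, child_end = Pipe()
    proc = Worker(child_end)
    proc.start()
    parent_end.send('hello')
    print(parent_end.recv())                  # 'echo: hello'
    parent_end.send('quit')
    proc.join()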

Related

python multiprocessing doesn't work (doesn't go back to parent process) using Queue

I have a question about Queue in Python multiprocessing.
I tried to perform a multiprocessing operation on a DataFrame using Queue and Process.
However, only the sub-processes run; the parent never continues afterwards.
In other words, only the completion messages of the sub-processes are printed, and the operations that follow the joins are never executed.
The code I ran is below.
import pandas as pd
from multiprocessing import Process, Queue
import os
import numpy as np
from pandasql import sqldf
import time

count_of_core = 8

def add_rownum_col(queue, df_of_split):
    sub_process_id = os.getpid()
    print("!!! {} subprocess started !!!".format(sub_process_id), "\n")
    df_of_split_with_rownum_col = sqldf("SELECT *, ROW_NUMBER() OVER() AS ROWNUM_ FROM df_of_split")
    print("* making df_of_split_with_rownum_col finished", "\n")
    print(df_of_split_with_rownum_col, "\n")
    queue.put(df_of_split_with_rownum_col)
    print("!!! {} subprocess finished !!!".format(sub_process_id), "\n")
    return

if __name__ == '__main__':
    parent_process_id = os.getpid()
    print("* parent_process_id :", parent_process_id, "\n")
    df_of_example = pd.read_csv('../data/example_for_multiprocessing.csv', engine='python', encoding='utf-8')
    print("* df_of_example :", "\n")
    print(df_of_example, "\n")
    print("* length of df_of_example :", len(df_of_example), "\n")
    df_of_split = np.array_split(df_of_example, count_of_core)
    print("count of batch :", len(df_of_split), "\n")
    print("!!! multiprocessing started !!!", "\n")
    df_of_result = pd.DataFrame()
    queue = Queue()
    start_time_of_multiprocessing = time.time()
    p_0 = Process(target=add_rownum_col, args=(queue, df_of_split[0]))
    p_0.start()
    p_1 = Process(target=add_rownum_col, args=(queue, df_of_split[1]))
    p_1.start()
    p_2 = Process(target=add_rownum_col, args=(queue, df_of_split[2]))
    p_2.start()
    p_3 = Process(target=add_rownum_col, args=(queue, df_of_split[3]))
    p_3.start()
    p_4 = Process(target=add_rownum_col, args=(queue, df_of_split[4]))
    p_4.start()
    p_5 = Process(target=add_rownum_col, args=(queue, df_of_split[5]))
    p_5.start()
    p_6 = Process(target=add_rownum_col, args=(queue, df_of_split[6]))
    p_6.start()
    p_7 = Process(target=add_rownum_col, args=(queue, df_of_split[7]))
    p_7.start()
    p_0.join()
    p_1.join()
    p_2.join()
    p_3.join()
    p_4.join()
    p_5.join()
    p_6.join()
    p_7.join()
    print("!!! multiprocessing finished !!!")
    print("!!! back to parent process({}) !!!".format(parent_process_id))
    print("* time for multiprocessing :", time.time() - start_time_of_multiprocessing, "\n")
    print("!!! concatenating all batches in queue started !!!", "\n")
    queue.put('exit')
    while True:
        df_batch = queue.get()
        if df_batch == 'exit':
            break
        else:
            df_of_result = pd.concat([df_of_result, df_batch])
    print("!!! concatenating all batches in queue finished !!!", "\n")
    print("* df_of_result :", "\n")
    print(df_of_result, "\n")
    print("* length of df_of_result :", len(df_of_result), "\n")
It seems that the script cannot return to the parent process after the sub-processes have finished their work.
I would like a detailed answer as to why this problem occurs.
(I'm using Python 3.7.)
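For reference, the multiprocessing documentation ("Joining processes that use queues") warns that a child which has put data on a Queue will not terminate until that data has been flushed and consumed, so joining the children before draining the queue can deadlock the parent. A minimal sketch of the usual ordering, reusing queue, df_of_split and add_rownum_col from the code above:

import pandas as pd
from multiprocessing import Process, Queue

# assumes add_rownum_col, queue and df_of_split from the question
processes = [Process(target=add_rownum_col, args=(queue, chunk)) for chunk in df_of_split]
for p in processes:
    p.start()

# drain the queue first: one result is expected per child
results = [queue.get() for _ in processes]

# joining is safe now that every child's output has been consumed
for p in processes:
    p.join()

df_of_result = pd.concat(results)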

Python Repeated IP-Scan with threads

After repeated IP scans the program crashes. The threads are not killed and I don't know how to do that. Any ideas?
import time
import datetime
import subprocess, os, threading
from queue import Queue

ipbase = "192.168.101.{0}"
startadd = 20
stoppadd = 100

def ipscan():  # looking for available IP addresses
    lock = threading.Lock()
    _start = time.time()

    def check(n):
        with open(os.devnull, "wb") as limbo:
            ip = ipbase.format(n)
            result = subprocess.Popen(["ping", "-n", "2", "-w", "300", ip],
                                      stdout=limbo, stderr=limbo).wait(timeout=10)
            with lock:
                if not result:
                    print(ip, "active")
                else:
                    pass

    def threader():
        while True:
            worker = q.get()
            check(worker)
            q.task_done()

    print("Scan IP...")
    print("Address scan from " + ipbase + str(startadd) + " until " + ipbase + str(stoppadd))
    q = Queue()
    for _ in range(startadd, stoppadd):
        t = threading.Thread(target=threader)
        t.daemon = True
        t.start()
    for worker in range(startadd, stoppadd):
        q.put(worker)
    q.join()

if __name__ == "__main__":
    starttime = datetime.datetime.now()
    print(starttime)
    print("first check of available ip adresses..")
    ipscan()  # looking for available IP addresses
    cyclebegin = datetime.datetime.now()
    acttime = datetime.datetime.now()
    sampletime = 3
    while (1):
        if ((acttime - cyclebegin) > datetime.timedelta(seconds=sampletime)):
            dtime = acttime - cyclebegin
            print("delta-seconds: ", dtime.seconds)
            cyclebegin = datetime.datetime.now()
            ipscan()
        acttime = datetime.datetime.now()
After ipscan has finished with q.join(), the tasks should be done but the threads are not killed, as I understand it. With each repeated call of ipscan the thread limit is eventually exceeded. What do I have to modify to prevent this?
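One way to keep the thread count bounded is to let the workers exit instead of leaving them blocked on q.get() forever: use a small fixed pool and push one sentinel per worker when the scan is done. A minimal sketch; it reuses check, startadd and stoppadd from the question, and num_workers is an arbitrary choice:

import threading
from queue import Queue

def ipscan():
    # check(n), startadd and stoppadd are the ones from the question
    q = Queue()
    num_workers = 8                    # small fixed pool instead of one thread per address

    def threader():
        while True:
            item = q.get()
            if item is None:           # sentinel: this worker should exit
                q.task_done()
                break
            check(item)
            q.task_done()

    threads = [threading.Thread(target=threader) for _ in range(num_workers)]
    for t in threads:
        t.start()
    for addr in range(startadd, stoppadd):
        q.put(addr)
    for _ in threads:
        q.put(None)                    # one sentinel per worker
    q.join()
    for t in threads:
        t.join()                       # every worker has really exited before ipscan returns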

Subprocess only allows one input

I'm using subprocess to communicate with an interactive command line application, but after I send the first command to the application, all other input seems not to be communicated to the subprocess. Can anyone show me where my mistake is?
Here's the code:
from subprocess import Popen, PIPE, STDOUT
from threading import Thread
from queue import Queue, Empty
import time

class Prolog(object):
    def __init__(self):
        """
        Opens a subprocess running swi-prolog and reads all the header stuff that it writes
        """
        self.prolog = Popen(r"C:\Program Files\swipl\bin\swipl.exe", stdin=PIPE, stdout=PIPE, stderr=STDOUT, bufsize=1)

        def enqueue_output(out, queue):
            for line in iter(out.readline, b''):
                queue.put(line)
            out.close()

        # This thread runs in the background as long as the program is running; it enqueues all the output from Prolog
        self.q = Queue()
        t = Thread(target=enqueue_output, args=(self.prolog.stdout, self.q))
        t.daemon = True  # thread dies with the program
        t.start()

        out = True
        while out:
            out = self.get_line()

    def get_line(self):
        """
        read line without blocking
        :return: the next line in the output, else False if no more output
        """
        try:
            line = self.q.get(timeout=.1)  # or q.get(timeout=.1)
        except Empty:
            return False
        else:  # got line
            return line

    def send_query(self, query):
        """
        Sends a query to the Prolog shell
        :param query: string containing the query to be sent to the prolog shell
        :return: None
        """
        query = query + "\n"
        query = bytes(query, encoding="utf-8")
        self.prolog.stdin.write(query)
        self.prolog.stdin.flush()

    def get_output(self):
        output = self.get_line()
        if not output:
            return False
        else:
            return output[:-2]

    def query(self, query):
        output = []
        self.send_query(query)
        temp = self.get_output()
        print(temp)
        while not temp:
            time.sleep(.1)
            temp = self.get_output()
        output.append(temp)
        while not temp == b'true.' and not temp == b'false.':
            self.send_query(";")
            temp = self.get_output()
            print(temp)
            while not temp:
                time.sleep(.1)
                temp = self.get_output()
            output.append(temp)
        print(output)

if __name__ == "__main__":
    p = Prolog()
    p.query('[\"GCD.pl\"].')
    p.get_output()
    p.query("permut([a, b, c], X).")
The problem comes during the second call to p.query. The command doesn't seem to reach the shell at all, so there is never any output and the program just gets stuck in the "while not temp" loop in the query method.
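One hypothesis worth ruling out (not a confirmed diagnosis): swipl may no longer be running by the time the second query is written, in which case the write simply goes nowhere. A small diagnostic sketch using only the standard Popen API, assuming the Prolog class from the question:

# diagnostic only: check whether swipl is still alive after the first query
p = Prolog()
p.query('["GCD.pl"].')

rc = p.prolog.poll()                  # None means the child process is still running
if rc is not None:
    print("swipl exited with return code", rc)
else:
    p.query("permut([a, b, c], X).")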

subprocess.Popen in Threads

I have a number of files (over 4000) that I want to simultaneously load into PostgreSQL. I have separated them into 4 different file lists and I want a thread to iterate through each list loading the data.
The problem I have is that if I use os.system to call the loading program, this prevents the other threads from running simultaneously. If I use subprocess.Popen then they run simultaneously, but the threads believe they have finished executing and move on to the next part of my script.
Am I doing this the right way? Or is there a better way to call subprocesses from within a thread?
def thread1Load(self, thread1fileList):
    connectionstring = settings.connectionstring
    postgreshost = settings.postgreshost
    postgresdatabase = settings.postgresdatabase
    postgresport = settings.postgresport
    postgresusername = settings.postgresusername
    postgrespassword = settings.postgrespassword
    tablename = None
    encoding = None
    connection = psycopg2.connect(connectionstring)
    for filename in thread1fileList:
        load_cmd = #load command
        run = subprocess.Popen(load_cmd, shell=True)
    print "finished loading thread 1"

def thread2Load(self, thread2fileList):
    connectionstring = settings.connectionstring
    postgreshost = settings.postgreshost
    postgresdatabase = settings.postgresdatabase
    postgresport = settings.postgresport
    postgresusername = settings.postgresusername
    postgrespassword = settings.postgrespassword
    tablename = None
    connection = psycopg2.connect(connectionstring)
    for filename in thread2fileList:
        load_cmd = #load command
        run = subprocess.Popen(load_cmd, shell=True)
    print "finished loading thread 2"

def thread3Load(self, thread3fileList):
    connectionstring = settings.connectionstring
    postgreshost = settings.postgreshost
    postgresdatabase = settings.postgresdatabase
    postgresport = settings.postgresport
    postgresusername = settings.postgresusername
    postgrespassword = settings.postgrespassword
    tablename = None
    connection = psycopg2.connect(connectionstring)
    for shapefilename in thread3fileList:
        load_cmd = #load command
        run = subprocess.Popen(load_cmd, shell=True)
    print "finished loading thread 3"

def thread4Load(self, thread4fileList):
    connectionstring = settings.connectionstring
    postgreshost = settings.postgreshost
    postgresdatabase = settings.postgresdatabase
    postgresport = settings.postgresport
    postgresusername = settings.postgresusername
    postgrespassword = settings.postgrespassword
    tablename = None
    connection = psycopg2.connect(connectionstring)
    for filename in thread4fileList:
        load_cmd = #load command
        run = subprocess.Popen(load_cmd, shell=True)
    print "finished loading thread 4"

def finishUp(self):
    print 'finishing up'

def main():
    load = Loader()
    thread1 = threading.Thread(target=(load.thread1Load), args=(thread1fileList, ))
    thread2 = threading.Thread(target=(load.thread2Load), args=(thread2fileList, ))
    thread3 = threading.Thread(target=(load.thread3Load), args=(thread3fileList, ))
    thread4 = threading.Thread(target=(load.thread4Load), args=(thread4fileList, ))
    threads = [thread1, thread2, thread3, thread4]
    for thread in threads:
        thread.start()
        thread.join()
    load.finishUp(connectionstring)

if __name__ == '__main__':
    main()
Don't repeat yourself. One threadLoad method suffices. That way, if you need to modify something in the method you do not need to make the same modification in 4 different places.
Use run.communicate() to block until the subprocess is done.
This starts one thread, then blocks until that thread finishes, then
starts another thread, etc:
for thread in threads:
    thread.start()
    thread.join()
Instead, start all the threads first, then join all the threads:
for thread in threads:
    thread.start()

for thread in threads:
    thread.join()
import subprocess
import threading

class Loader(object):
    def threadLoad(self, threadfileList):
        connectionstring = settings.connectionstring
        ...
        connection = psycopg2.connect(connectionstring)
        for filename in threadfileList:
            load_cmd = # load command
            run = subprocess.Popen(load_cmd, shell=True)
            # block until subprocess is done
            run.communicate()
        name = threading.current_thread().name
        print "finished loading {n}".format(n=name)

    def finishUp(self):
        print 'finishing up'

def main():
    load = Loader()
    threads = [threading.Thread(target=load.threadLoad, args=(fileList, ))
               for fileList in (thread1fileList, thread2fileList,
                                thread3fileList, thread4fileList)]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
    load.finishUp(connectionstring)

if __name__ == '__main__':
    main()

Python Thread/Queue issue

I'm creating a threaded Python script that puts a collection of files into a queue and then starts an unknown number of threads (default is 3) to download them. When each thread completes, it updates stdout with the queue status and a percentage. All the files are being downloaded, but the status information is wrong on the 3rd thread and I'm not sure why. I've been considering creating a work_completed queue to use for the calculation, but I don't think I should have to, or that it would matter. Could someone point me in the right direction here?
download_queue = queue.Queue()

class Downloader(threading.Thread):
    def __init__(self, work_queue):
        super().__init__()
        self.current_job = 0
        self.work_queue = work_queue
        self.queue_size = work_queue.qsize()

    def run(self):
        while self.work_queue.qsize() > 0:
            try:
                url = self.work_queue.get(True)
                system_call = "wget -nc -q {0} -O {1}".format(url, local_file)
                os.system(system_call)
                self.current_job = int(self.queue_size) - int(self.work_queue.qsize())
                self.percent = (self.current_job / self.queue_size) * 100
                sys.stdout.flush()
                status = "\rDownloading " + url.split('/')[-1] + " [status: " + str(self.current_job) + "/" + str(self.queue_size) + ", " + str(round(self.percent, 2)) + "%]"
            finally:
                self.work_queue.task_done()

def main():
    if download_queue.qsize() > 0:
        if options.active_downloads:
            active_downloads = options.active_downloads
        else:
            active_downloads = 3
        for x in range(active_downloads):
            downloader = Downloader(download_queue)
            downloader.start()
        download_queue.join()
You can't check the queue size in one statement, and then .get() from the queue in the next. In the meantime the whole world may have changed. The .get() method call is the single atomic operation you need to call. If it raises Empty or blocks, the queue is empty.
Your threads can overwrite each other's output. I would have another thread with an input queue whose only job is to print the items in the queue to stdout. It can also count off the number of completed items and produce status information.
I also tend not to subclass Thread, but instead just supply a plain Thread instance with a target= parameter and .start() the thread.
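As a rough illustration of those two points (a single printer thread consuming a status queue, and plain Thread objects created with target= instead of subclasses), here is a minimal sketch; the URLs and the status format are only placeholders, and the actual download is elided:

import queue
import threading

def worker(work_queue, status_queue):
    while True:
        try:
            url = work_queue.get(False)       # non-blocking; Empty means the work is done
        except queue.Empty:
            break
        # ... download url here ...
        status_queue.put("finished " + url)
        work_queue.task_done()

def printer(status_queue, total):
    # the only thread that writes to stdout, so lines never interleave
    for done in range(1, total + 1):
        msg = status_queue.get()
        print("{0} [{1}/{2}]".format(msg, done, total))
        status_queue.task_done()

work_queue = queue.Queue()
status_queue = queue.Queue()
urls = ["http://example.com/a", "http://example.com/b"]  # placeholder URLs
for u in urls:
    work_queue.put(u)

threading.Thread(target=printer, args=(status_queue, len(urls)), daemon=True).start()
workers = [threading.Thread(target=worker, args=(work_queue, status_queue)) for _ in range(3)]
for t in workers:
    t.start()
for t in workers:
    t.join()
status_queue.join()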
based on your response, try this:
download_queue = queue.Queue()

class Downloader(threading.Thread):
    def __init__(self, work_queue, original_size):
        super().__init__()
        self.current_job = 0
        self.work_queue = work_queue
        self.queue_size = original_size

    def run(self):
        while True:
            try:
                url = self.work_queue.get(False)
            except queue.Empty:
                break  # queue is drained; let the thread exit
            try:
                system_call = "wget -nc -q {0} -O {1}".format(url, local_file)
                os.system(system_call)
                # the following code is questionable. By the time we get here,
                # many other items may have been taken off the queue.
                self.current_job = int(self.queue_size) - int(self.work_queue.qsize())
                self.percent = (self.current_job / self.queue_size) * 100
                sys.stdout.flush()
                status = ("\rDownloading " + url.split('/')[-1] +
                          " [status: " + str(self.current_job) +
                          "/" + str(self.queue_size) + ", " +
                          str(round(self.percent, 2)) + "%]")
            finally:
                self.work_queue.task_done()

def main():
    if download_queue.qsize() > 0:
        original_size = download_queue.qsize()
        if options.active_downloads:
            active_downloads = options.active_downloads
        else:
            active_downloads = 3
        for x in range(active_downloads):
            downloader = Downloader(download_queue, original_size)
            downloader.start()
        download_queue.join()
If you'd like to use the multiprocessing module, it includes a very nice parallel imap_unordered, which would reduce your problem to the very elegant:
import multiprocessing, os, sys

class ParallelDownload:
    def __init__(self, urls, processcount=3):
        self.total_items = len(urls)
        self.pool = multiprocessing.Pool(processcount)
        for n, status in enumerate(self.pool.imap_unordered(self.download, urls)):
            stats = (n, self.total_items, n / self.total_items)
            sys.stdout.write(status + " [%d/%d = %0.2f %%]\n" % stats)

    def download(self, url):
        system_call = "wget -nc -q {0} -O {1}".format(url, local_file)
        os.system(system_call)
        status = "\rDownloaded " + url.split('/')[-1]
        return status
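Because imap_unordered yields each result as soon as it is ready, regardless of submission order, the enumerate index n doubles as the count of finished downloads. A hedged usage sketch (the URLs are placeholders, and local_file would still need to be defined somewhere, as in the original):

if __name__ == '__main__':
    urls = ["http://example.com/a.tar.gz",      # placeholder URLs
            "http://example.com/b.tar.gz"]
    ParallelDownload(urls, processcount=3)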
