Unable to use distributed's LocalCluster in a subprocess in Python 3 - python

I get an error when using distributed's LocalCluster in a subprocess with Python 3 (Python 2 works fine). I have the following minimal example (I am using Python 3.6, distributed 1.23.3, tornado 5.1.1):
import multiprocessing

from distributed import LocalCluster
from distributed import Client


def call_client(cluster_address):
    with Client(cluster_address):
        pass


def main():
    cluster = LocalCluster(n_workers=2)
    print(cluster.workers)

    process = multiprocessing.Process(
        target=call_client, args=(cluster.scheduler.address, )
    )
    process.start()
    process.join()


if __name__ == "__main__":
    main()
When executing the file I get the following error message:
user@9b97e84a3c58:/workspace$ python test.py
[<Nanny: tcp://127.0.0.1:35779, threads: 2>, <Nanny: tcp://127.0.0.1:40211, threads: 2>]
Process Process-3:
Traceback (most recent call last):
File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
self.run()
File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "test.py", line 10, in call_client
with Client(cluster_address):
File "/home/user/venv/lib/python3.6/site-packages/distributed/client.py", line 610, in __init__
self.start(timeout=timeout)
File "/home/user/venv/lib/python3.6/site-packages/distributed/client.py", line 733, in start
sync(self.loop, self._start, **kwargs)
File "/home/user/venv/lib/python3.6/site-packages/distributed/utils.py", line 277, in sync
six.reraise(*error[0])
File "/home/user/venv/lib/python3.6/site-packages/six.py", line 693, in reraise
raise value
File "/home/user/venv/lib/python3.6/site-packages/distributed/utils.py", line 262, in f
result[0] = yield future
File "/home/user/venv/lib/python3.6/site-packages/tornado/gen.py", line 1133, in run
value = future.result()
File "/home/user/venv/lib/python3.6/site-packages/tornado/gen.py", line 1141, in run
yielded = self.gen.throw(*exc_info)
File "/home/user/venv/lib/python3.6/site-packages/distributed/client.py", line 821, in _start
yield self._ensure_connected(timeout=timeout)
File "/home/user/venv/lib/python3.6/site-packages/tornado/gen.py", line 1133, in run
value = future.result()
File "/home/user/venv/lib/python3.6/site-packages/tornado/gen.py", line 1141, in run
yielded = self.gen.throw(*exc_info)
File "/home/user/venv/lib/python3.6/site-packages/distributed/client.py", line 862, in _ensure_connected
self._update_scheduler_info())
File "/home/user/venv/lib/python3.6/site-packages/tornado/gen.py", line 1133, in run
value = future.result()
tornado.util.TimeoutError: Timeout

Using spawn seems to work. I suspect that there is some state that does not fork nicely.
process = multiprocessing.get_context('spawn').Process(...)
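Applied to the minimal example above, a sketch of the full change would look like this (the spawn context starts the child from a fresh interpreter instead of forking the parent's event-loop state; everything else stays the same):

import multiprocessing

from distributed import LocalCluster
from distributed import Client


def call_client(cluster_address):
    with Client(cluster_address):
        pass


def main():
    cluster = LocalCluster(n_workers=2)

    # Use a spawn context so the child starts from a fresh interpreter
    # instead of inheriting the parent's already running event loop state.
    ctx = multiprocessing.get_context('spawn')
    process = ctx.Process(
        target=call_client, args=(cluster.scheduler.address, )
    )
    process.start()
    process.join()


if __name__ == "__main__":
    main()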

Since my original problem is starting the subprocess within a Flask app, I can't use 'spawn' as suggested by MRocklin in the other answer. My working solution right now is that I don't call cluster = LocalCluster(n_workers=2) in the main process but instead also start it in a subprocess:
import sys
import multiprocessing
import signal
from functools import partial

from distributed import LocalCluster
from distributed import Client


def _stop_cluster(cluster, *args):
    cluster.close()
    sys.exit(0)


def _start_local_cluster(q, n_workers):
    cluster = LocalCluster(n_workers=n_workers)
    q.put(cluster.scheduler.address)
    # shut down cluster when process is terminated
    signal.signal(signal.SIGTERM, partial(_stop_cluster, cluster))
    # run forever
    signal.pause()


def call_client(cluster_address):
    with Client(cluster_address):
        print("I am working")


def main():
    q = multiprocessing.Queue()
    p_dask = multiprocessing.Process(target=_start_local_cluster, args=(q, 2))
    p_dask.start()
    cluster_address = q.get()

    process = multiprocessing.Process(
        target=call_client, args=(cluster_address, )
    )
    process.start()
    process.join()

    p_dask.terminate()


if __name__ == "__main__":
    main()

Related

Runtime Error on Cloud Run when using Dash App and Papermill executions on background tasks

I am facing an issue in Cloud Run.
Below I am only showing the scheduling part (background tasks), but I also have some code for a Dash application.
I would like to have a Dash application and a scheduler that can execute Jupyter notebooks in the same Cloud Run application. I am deploying using a Dockerfile. I have tried a Cloud Run revision with 4 CPUs and 4 GiB of memory.
Everything works locally (Dash app and scheduling), but it seems that the papermill executions are not working in Cloud Run. The notebook simply sends a mail for now. Any advice is welcome! Thank you.
Here is my code:
import os
import threading
import time
from datetime import datetime

import papermill as pm
import schedule


def run_continuously(interval=1):
    cease_continuous_run = threading.Event()

    class ScheduleThread(threading.Thread):
        @classmethod
        def run(cls):
            while not cease_continuous_run.is_set():
                schedule.run_pending()
                time.sleep(interval)

    continuous_thread = ScheduleThread()
    continuous_thread.start()
    return cease_continuous_run


def runPapermill1():
    print('running 1')
    sec = str(datetime.now().minute)
    pm.execute_notebook(r'testPapermill.ipynb', r"OutputNotebook/output-" + sec + ".ipynb",
                        kernel_name='python3', start_timeout=120)


if __name__ == '__main__':
    schedule.every(50).seconds.do(runPapermill1)
    # Start the background thread
    stop_run_continuously = run_continuously()

    server_port = os.environ.get('PORT', '8080')
    app.run_server(debug=True, port=server_port, host='0.0.0.0')

    # Stop the background thread
    print('Stopping background Task')
    stop_run_continuously.set()
Here is an error in Cloud Run:
2022-03-21T15:18:54.442802Z Traceback (most recent call last):
File "/usr/local/lib/python3.8/threading.py", line 932, in _bootstrap_inner
self.run()
File "/app/app.py", line 90, in run
schedule.run_pending()
File "/usr/local/lib/python3.8/site-packages/schedule/__init__.py", line 780, in run_pending
default_scheduler.run_pending()
File "/usr/local/lib/python3.8/site-packages/schedule/__init__.py", line 100, in run_pending
self._run_job(job)
File "/usr/local/lib/python3.8/site-packages/schedule/__init__.py", line 172, in _run_job
ret = job.run()
File "/usr/local/lib/python3.8/site-packages/schedule/__init__.py", line 661, in run
ret = self.job_func()
File "/app/app.py", line 118, in runPapermill1
pm.execute_notebook(r'testPapermill.ipynb', r"OutputNotebook/output-" + sec + ".ipynb", kernel_name='python3',
File "/usr/local/lib/python3.8/site-packages/papermill/execute.py", line 107, in execute_notebook
nb = papermill_engines.execute_notebook_with_engine(
File "/usr/local/lib/python3.8/site-packages/papermill/engines.py", line 49, in execute_notebook_with_engine
return self.get_engine(engine_name).execute_notebook(nb, kernel_name, **kwargs)
File "/usr/local/lib/python3.8/site-packages/papermill/engines.py", line 359, in execute_notebook
cls.execute_managed_notebook(nb_man, kernel_name, log_output=log_output, **kwargs)
File "/usr/local/lib/python3.8/site-packages/papermill/engines.py", line 418, in execute_managed_notebook
return PapermillNotebookClient(nb_man, **final_kwargs).execute()
File "/usr/local/lib/python3.8/site-packages/papermill/clientwrap.py", line 43, in execute
with self.setup_kernel(**kwargs):
File "/usr/local/lib/python3.8/contextlib.py", line 113, in __enter__
return next(self.gen)
File "/usr/local/lib/python3.8/site-packages/nbclient/client.py", line 562, in setup_kernel
self.start_new_kernel_client()
File "/usr/local/lib/python3.8/site-packages/nbclient/util.py", line 84, in wrapped
return just_run(coro(*args, **kwargs))
File "/usr/local/lib/python3.8/site-packages/nbclient/util.py", line 62, in just_run
return loop.run_until_complete(coro)
File "/usr/local/lib/python3.8/asyncio/base_events.py", line 616, in run_until_complete
return future.result()
File "/usr/local/lib/python3.8/site-packages/nbclient/client.py", line 532, in async_start_new_kernel_client
await ensure_async(self.kc.wait_for_ready(timeout=self.startup_timeout))
File "/usr/local/lib/python3.8/site-packages/nbclient/util.py", line 96, in ensure_async
result = await obj
File "/usr/local/lib/python3.8/site-packages/jupyter_client/client.py", line 184, in _async_wait_for_ready
raise RuntimeError("Kernel didn't respond in %d seconds" % timeout)
RuntimeError: Kernel didn't respond in 120 seconds
Sometimes it starts running and then just stops, as shown in the example screenshot.
Solution found:
I just needed to enable "CPU is always allocated" on the Cloud Run service. This allows the Dash app and the background tasks (papermill executions) to run at the same time. I also set the minimum number of instances to 1.

Redis queue Retry does not work with the interval argument

I am trying to use the rq Retry functionality by following the rq documentation, but it does not work when using the interval argument.
Python version: 3.8.0
rq version: 1.10.0
The somewhere.py:
def my_func():
    print('Start...')
    asdsa  # Here a NameError is raised
A script that enqueues my_func with retry functionality:
from redis import Redis
from rq import Retry, Queue

from somewhere import my_func

r = Redis("localhost",
          6379,
          socket_connect_timeout=1,
          decode_responses=True,
          )
q = Queue(connection=r)
q.enqueue(my_func, retry=Retry(max=3, interval=10))
I was expecting to see the worker run my_func 3 times with 10-second intervals in between, but it actually runs it only once. The worker output:
17:35:19 Worker rq:worker:1801215fdd1040b2aee962cccceff587: started, version 1.10.1
17:35:19 Subscribing to channel rq:pubsub:1801215fdd1040b2aee962cccceff587
17:35:19 *** Listening on default...
17:35:22 default: somewhere.my_func() (dc051976-598a-4863-8d15-6813c61d1377)
1
17:35:22 Traceback (most recent call last):
File "/home/user/Documents/Projects/Aquacrop/aquacrop/aquacrop-api/env/lib/python3.8/site-packages/rq/worker.py", line 1061, in perform_job
rv = job.perform()
File "/home/user/Documents/Projects/Aquacrop/aquacrop/aquacrop-api/env/lib/python3.8/site-packages/rq/job.py", line 821, in perform
self._result = self._execute()
File "/home/user/Documents/Projects/Aquacrop/aquacrop/aquacrop-api/env/lib/python3.8/site-packages/rq/job.py", line 844, in _execute
result = self.func(*self.args, **self.kwargs)
File "./somewhere.py", line 3, in my_func
somewhere
NameError: name 'somewhere' is not defined
Traceback (most recent call last):
File "/home/user/Documents/Projects/Aquacrop/aquacrop/aquacrop-api/env/lib/python3.8/site-packages/rq/worker.py", line 1061, in perform_job
rv = job.perform()
File "/home/user/Documents/Projects/Aquacrop/aquacrop/aquacrop-api/env/lib/python3.8/site-packages/rq/job.py", line 821, in perform
self._result = self._execute()
File "/home/user/Documents/Projects/Aquacrop/aquacrop/aquacrop-api/env/lib/python3.8/site-packages/rq/job.py", line 844, in _execute
result = self.func(*self.args, **self.kwargs)
File "./somewhere.py", line 3, in my_func
somewhere
NameError: name 'somewhere' is not defined
If I do not use the interval argument, the worker retries the function 3 times as expected.
What am I doing wrong?
As stated here and here, one has to run the worker with the --with-scheduler flag, like:
rq worker --url redis://localhost:6379 --with-scheduler
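For reference, a minimal sketch of the same setup driven from Python rather than the CLI, assuming a local Redis on the default port and the somewhere module from the question; work(with_scheduler=True) is the programmatic counterpart of the --with-scheduler flag:

from redis import Redis
from rq import Queue, Retry, Worker

from somewhere import my_func

r = Redis("localhost", 6379)
q = Queue(connection=r)

# Failed runs are re-enqueued by the scheduler after the 10-second interval.
q.enqueue(my_func, retry=Retry(max=3, interval=10))

# Start a worker that also runs the scheduler loop, so the delayed
# retries actually get picked up.
worker = Worker([q], connection=r)
worker.work(with_scheduler=True)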

python multiprocessing manager cannot load list from distributed node

After several tests, I found that this problem is caused by the nesting of manager.list(manager.list(...)). But I really need it to be 2-dimensional. Any suggestion would be appreciated!
I'm trying to build a server and multiple clients across multiple nodes.
One node acts as the server, which creates the manager.list() for the clients to use.
The other nodes act as clients, which attach to the server to get the list and work with it.
The firewall is disabled, and when the server and client run on a single node, everything works fine.
I get an error like this:
Traceback (most recent call last):
File "main.py", line 352, in <module>
train(args)
File "main.py", line 296, in train
args, proc_manager, device)
File "main.py", line 267, in make_gossip_buffer
mng,sync_freq=args.sync_freq, num_nodes=args.num_nodes)
File "/home/think/gala-master-distprocess-changing_to_multinodes/gala/gpu_gossip_buffer.py", line 49, in __init__
r_events = read_events[rank]
File "<string>", line 2, in __getitem__
File "/home/think/anaconda3/envs/AC/lib/python3.7/multiprocessing/managers.py", line 819, in _callmethod
kind, result = conn.recv()
File "/home/think/anaconda3/envs/AC/lib/python3.7/multiprocessing/connection.py", line 251, in recv
return _ForkingPickler.loads(buf.getbuffer())
File "/home/think/anaconda3/envs/AC/lib/python3.7/multiprocessing/managers.py", line 943, in RebuildProxy
return func(token, serializer, incref=incref, **kwds)
File "/home/think/anaconda3/envs/AC/lib/python3.7/multiprocessing/managers.py", line 793, in __init__
self._incref()
File "/home/think/anaconda3/envs/AC/lib/python3.7/multiprocessing/managers.py", line 847, in _incref
conn = self._Client(self._token.address, authkey=self._authkey)
File "/home/think/anaconda3/envs/AC/lib/python3.7/multiprocessing/connection.py", line 492, in Client
c = SocketClient(address)
File "/home/think/anaconda3/envs/AC/lib/python3.7/multiprocessing/connection.py", line 620, in SocketClient
s.connect(address)
FileNotFoundError: [Errno 2] No such file or directory
The server runs on a single node. The server code is shown below:
import torch.multiprocessing as mp
from multiprocessing.managers import ListProxy, BarrierProxy, AcquirerProxy, EventProxy
from gala.arguments import get_args

mp.current_process().authkey = b'abc'


def server(manager, host, port, key, args):
    read_events = manager.list([manager.list([manager.Event() for _ in range(num_learners)])
                                for _ in range(num_learners)])
    manager.register('get_read_events', callable=lambda: read_events, proxytype=ListProxy)
    print('start service at', host)
    s = manager.get_server()
    s.serve_forever()


if __name__ == '__main__':
    mp.set_start_method('spawn')
    args = get_args()
    manager = mp.Manager()
    server(manager, '10.107.13.120', 5000, b'abc', args)
The clients run on other nodes, which connect to the server over Ethernet. The client IP is 10.107.13.80. The client code is shown below:
import torch.multiprocessing as mp

mp.current_process().authkey = b'abc'


def make_gossip_buffer(mng):
    read_events = mng.get_read_events()
    gossip_buffer = GossipBuffer(parameters)


def train(args):
    proc_manager = mp.Manager()
    proc_manager.register('get_read_events')
    proc_manager.__init__(address=('10.107.13.120', 5000), authkey=b'abc')
    proc_manager.connect()
    make_gossip_buffer(proc_manager)


if __name__ == "__main__":
    mp.set_start_method('spawn')
    train(args)
Any help would be appreciated!
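For comparison, here is a stripped-down sketch of the same server/client pattern with a flat (one-level) shared list, using multiprocessing.managers.BaseManager directly instead of an already started mp.Manager(). The address, port, and authkey are the ones from the post; the rest is illustrative and not the poster's actual code:

from multiprocessing.managers import BaseManager, ListProxy


def run_server():
    # Runs on the server node (10.107.13.120).
    shared = []  # flat list held by the server process
    mgr = BaseManager(address=('10.107.13.120', 5000), authkey=b'abc')
    mgr.register('get_shared', callable=lambda: shared, proxytype=ListProxy)
    mgr.get_server().serve_forever()


def run_client():
    # Runs on a client node, e.g. 10.107.13.80.
    mgr = BaseManager(address=('10.107.13.120', 5000), authkey=b'abc')
    mgr.register('get_shared', proxytype=ListProxy)
    mgr.connect()
    shared = mgr.get_shared()  # ListProxy backed by the server-side list
    shared.append('hello from client')
    print(len(shared), shared[0])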

Python 3.x multiprocess TypeError: can't pickle _thread.lock objects

I am testing Python multiprocessing. I use pymongo to manage the queue; my code and the error are below. I can't solve the issue and don't know the root cause, please help me, thank you very much. I know the multithreaded version works, and everything else works too; I had a test line saying:
process_crawler(seed_url, scrape_callback=scrape_callback, cache=cache, max_threads=max_threads, timeout=10)
import time
import threading
from mongo_queue import MongoQueue
from downloader import Downloader
import multiprocessing

SLEEP_TIME = 1


def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None, user_agent='wswp',
                     proxies=None, num_retries=1, max_threads=10, timeout=60):
    """Crawl a website in multiple threads"""
    # url queue to be crawled
    crawl_queue = MongoQueue()
    crawl_queue.clear()
    crawl_queue.push(seed_url)
    downloader = Downloader(delay=delay, user_agent=user_agent, proxies=proxies,
                            num_retries=num_retries, cache=cache, timeout=timeout)

    def process_queue():
        while True:
            try:
                url = crawl_queue.pop()
            except KeyError:
                # crawl queue is empty
                break
            else:
                html = downloader(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print('Error in callback for %s, %s' % (url, e))
                    else:
                        for link in links:
                            crawl_queue.push(link)

    threads = []
    while threads or crawl_queue:
        # the crawl is still active
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek():
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)


def process_crawler(args, **kwargs):
    num_cpus = multiprocessing.cpu_count()
    print('Starting Multiprocessing.... CPU Number is ', num_cpus)
    processes = []
    for i in range(num_cpus):
        p = multiprocessing.Process(target=threaded_crawler, args=[args], kwargs=kwargs)
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
Traceback (most recent call last):
Starting Multiprocessing.... CPU Number is 8
File "C:/Users/Michael Qian/Desktop/Python/MyScraper/process_test.py", line 15, in <module>
test(1)
File "C:/Users/Michael Qian/Desktop/Python/MyScraper/process_test.py", line 10, in test
process_crawler(scrape_callback.seed_url, scrape_callback=scrape_callback, cache=cache, max_threads=max_threads, timeout=10)
File "C:\Users\Michael Qian\Desktop\Python\MyScraper\process_crawler.py", line 58, in process_crawler
p.start()
File "C:\Program Files\Python35\lib\multiprocessing\process.py", line 105, in start
self._popen = self._Popen(self)
File "C:\Program Files\Python35\lib\multiprocessing\context.py", line 212, in _Popen
return _default_context.get_context().Process._Popen(process_obj)
File "C:\Program Files\Python35\lib\multiprocessing\context.py", line 313, in _Popen
return Popen(process_obj)
File "C:\Program Files\Python35\lib\multiprocessing\popen_spawn_win32.py", line 66, in __init__
reduction.dump(process_obj, to_child)
File "C:\Program Files\Python35\lib\multiprocessing\reduction.py", line 59, in dump
ForkingPickler(file, protocol).dump(obj)
TypeError: can't pickle _thread.lock objects
Traceback (most recent call last):
File "<string>", line 1, in <module>
File "C:\Program Files\Python35\lib\multiprocessing\spawn.py", line 106, in spawn_main
exitcode = _main(fd)
File "C:\Program Files\Python35\lib\multiprocessing\spawn.py", line 116, in _main
self = pickle.load(from_parent)
EOFError: Ran out of input
I've just tried multiprocessing and ran into the very same problem. The problem was caused by sharing the MongoClient object between the processes.
Have a look at the FAQ: Using PyMongo with Multiprocessing
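A minimal sketch of what that FAQ boils down to for this crawler, assuming a local mongod on the default port; the database and collection names are made up for illustration, and the crawl loop itself is elided:

import multiprocessing

from pymongo import MongoClient


def crawl_worker(seed_url):
    # The client is created inside the child process, after it has started,
    # so no MongoClient (and none of its internal locks) is ever pickled.
    client = MongoClient('localhost', 27017)
    queue = client.crawler.queue  # hypothetical database/collection
    queue.insert_one({'url': seed_url, 'status': 'outstanding'})
    # ... pop URLs from the collection and crawl them, as in threaded_crawler ...


def process_crawler(seed_url):
    processes = []
    for _ in range(multiprocessing.cpu_count()):
        # Only the picklable seed_url string crosses the process boundary.
        p = multiprocessing.Process(target=crawl_worker, args=(seed_url,))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()


if __name__ == '__main__':
    process_crawler('http://example.com')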

How to test concurrency using py.test

I want to test the thread safety of a function using py.test.
My efforts:
from multiprocessing import Process


def function_to_be_tested(arg1, arg2):
    ...  # some functionality


class Test:
    def setup(self):
        ...

    def teardown(self):
        ...

    def test_conc(self):
        p1 = Process(target=function_to_be_tested, args=(arg1, arg2))
        p2 = Process(target=function_to_be_tested, args=(arg1, arg3))
        p1.start()
        p2.start()
        p1.join()
        p2.join()
Executing the above file with the py.test command shows the following error:
ExceptionPexpect: isalive() encountered condition where "terminated" is 0, but there was no child process. Did someone else call waitpid() on our process?
Can you help me decode this error, and also give some guidance on how to do this?
Thanks
Here is the actual code I am trying, and the stack trace:
import pytest
from multiprocessing import Process
from pexpect import pxssh


def func(cls, b):
    cls.s.sendline("bteq")
    cls.s.prompt()
    print b
    # some operations inside the bteq session


class Test:
    @classmethod
    def setup_class(cls):
        cls.s = pxssh.pxssh()
        cls.s.login("IP", 'Username', 'Pwd')

    @classmethod
    def teardown_class(cls):
        cls.s.logout()
        print "teardown"

    def test_1(cls):
        p1 = Process(target=func, args=(cls, 13,))
        p2 = Process(target=func, args=(cls, 46,))
        p1.start()
        p2.start()
        p1.join()
        p2.join()
stack trace:
dipakw#dipakw-Inspiron-3558:~$ py.test -v -s test.py
============================= test session starts ==============================
platform linux2 -- Python 2.7.6, pytest-3.0.6, py-1.4.32, pluggy-0.4.0 -- /usr/bin/python
cachedir: .cache
rootdir: /home/dipakw, inifile:
plugins: xdist-1.15.0
collected 1 items
test.py::Test::test_1 Process Process-1:
Traceback (most recent call last):
File "/usr/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
self.run()
File "/usr/lib/python2.7/multiprocessing/process.py", line 114, in run
self._target(*self._args, **self._kwargs)
File "/home/dipakw/test.py", line 7, in func
cls.s.prompt()
File "/usr/lib/python2.7/dist-packages/pexpect/pxssh.py", line 352, in prompt
Process Process-2:
Traceback (most recent call last):
File "/usr/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
self.run()
File "/usr/lib/python2.7/multiprocessing/process.py", line 114, in run
self._target(*self._args, **self._kwargs)
File "/home/dipakw/test.py", line 7, in func
cls.s.prompt()
File "/usr/lib/python2.7/dist-packages/pexpect/pxssh.py", line 352, in prompt
i = self.expect([self.PROMPT, TIMEOUT], timeout=timeout)
i = self.expect([self.PROMPT, TIMEOUT], timeout=timeout)
File "/usr/lib/python2.7/dist-packages/pexpect/__init__.py", line 1418, in expect
File "/usr/lib/python2.7/dist-packages/pexpect/__init__.py", line 1418, in expect
timeout, searchwindowsize)
timeout, searchwindowsize)
File "/usr/lib/python2.7/dist-packages/pexpect/__init__.py", line 1433, in expect_list
File "/usr/lib/python2.7/dist-packages/pexpect/__init__.py", line 1433, in expect_list
timeout, searchwindowsize)
File "/usr/lib/python2.7/dist-packages/pexpect/__init__.py", line 1502, in expect_loop
timeout, searchwindowsize)
c = self.read_nonblocking(self.maxread, timeout)
File "/usr/lib/python2.7/dist-packages/pexpect/__init__.py", line 1502, in expect_loop
File "/usr/lib/python2.7/dist-packages/pexpect/__init__.py", line 886, in read_nonblocking
if not self.isalive():
c = self.read_nonblocking(self.maxread, timeout)
File "/usr/lib/python2.7/dist-packages/pexpect/__init__.py", line 1220, in isalive
'on our process?')
File "/usr/lib/python2.7/dist-packages/pexpect/__init__.py", line 886, in read_nonblocking
if not self.isalive():
File "/usr/lib/python2.7/dist-packages/pexpect/__init__.py", line 1220, in isalive
ExceptionPexpect: isalive() encountered condition where "terminated" is 0, but there was no child process. Did someone else call waitpid() on our process?
'on our process?')
ExceptionPexpect: isalive() encountered condition where "terminated" is 0, but there was no child process. Did someone else call waitpid() on our process?
Try something along the lines of the below for testing concurrency / threads:
import threading
from functools import partial

import pytest


@pytest.mark.django_db
def test_is_concurrency_safe():
    # setup anything for the test...

    # recreate multiple threads calling the same function at the same time
    _call_concurrently(
        partial(my_function, args),
        partial(my_function, args),
    )

    # Test that the race condition didn't create duplicates, etc.


def _call_concurrently(*callables):
    threads = [threading.Thread(target=callable_) for callable_ in callables]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
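As a self-contained illustration of how that helper can surface a race, here is a sketch with a deliberately unsafe counter; the counter and increment function are made up for the example and are not part of the question:

import threading
from functools import partial

counter = {'value': 0}


def unsafe_increment(times):
    for _ in range(times):
        current = counter['value']      # read
        counter['value'] = current + 1  # write: the read-modify-write is not atomic


def _call_concurrently(*callables):
    threads = [threading.Thread(target=callable_) for callable_ in callables]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()


def test_unsafe_increment_loses_updates():
    _call_concurrently(
        partial(unsafe_increment, 100000),
        partial(unsafe_increment, 100000),
    )
    # A thread-safe implementation would always reach exactly 200000; the
    # unsafe one can come up short when the two threads interleave, which is
    # exactly what this assertion is meant to catch.
    assert counter['value'] == 200000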
