Code is working differently in VS Code and Jupyter Notebook with multiprocessing - python

I have code which runs differently in VS Code and in a Jupyter Notebook.
from multiprocessing.pool import ThreadPool
from datetime import datetime
import functools
import time

def my_func():
    for i in range(4):
        time.sleep(5)
        print(datetime.now().time())
I run the function as
def smap(f):
    return f()

f_inc = functools.partial(my_func)
pool = ThreadPool(3)
res = pool.map(smap, [f_inc])
Expected output (VS Code):
02:08:10.288722
02:13:10.288722
02:18:10.288722
02:23:10.288722
But the output I actually get is:
02:08:10.288
02:08:10.298
02:08:10.299
02:08:10.2998

Related

Multiprocessing using GetPass

I am running my multiprocessing script while using the getpass function in Python (import getpass), and I keep getting an error message. I am running this code in a .py file from the command prompt terminal on Windows 10.
[screenshot of the error message]
The following is my code:
import time
import multiprocessing
from multiprocessing import Pool
from multiprocessing import freeze_support
import getpass
import jaydebeapi
import pandas as pd
import numpy as np
from multiprocessing import Process, freeze_support, set_start_method

class multiprocess:
    def __init__(self):
        pass

    def test(self, Batch_test):
        pw_2 = getpass.getpass(prompt="Password", stream=False)
        PML = jaydebeapi.connect('com.ibm.db2.jcc.DB2Driver', 'jdbc:db2://he3qlxvtdbs957.fhlmc.com:50001/PMLFDB2',
                                 ['f408195', pw_2], 'C:/JDBC/db2jcc.jar')
        PML = PML.cursor()
        Batch_query = "select id_hstrcl_data_bch_load_frst_evnt as btch_strt, id_hstrcl_data_bch_load_last_evnt as btch_end from UDBADM.hstrcl_data_bch_load WHERE ID_HSTRCL_DATA_BCH_LOAD BETWEEN 1 and 2"
        PML.execute(Batch_query)
        Batch_records = PML.fetchall()
        Batch_records = pd.DataFrame(Batch_records)
        for ind in Batch_test:
            print(ind)
            first_evnt = Batch_records.iloc[ind, 0]
            last_evnt = Batch_records.iloc[ind, 1]
            PML_loan_Query = "select CAST(b.id_lpa_alt_loan AS INT) AS id_lpa_alt_loan from udbadm.pml_lst_cmpltd_trans_mtch a join udbadm.lpa_altv_loan_idtn b on a.id_evnt = b.id_evnt where b.cd_lpa_alt_loan_idtn = 'HewlettPackardGeneratedTransaction' and a.id_evnt between ? and ?"
            PML.execute(PML_loan_Query, (first_evnt, last_evnt))
            loan_records = PML.fetchall()
        return loan_records

    def run(self):
        processes = []
        for i in range(2):
            p = multiprocessing.Process(target=self.test, args=(i,))
            processes.append(p)
        for p in processes:
            p.start()

if __name__ == '__main__':
    a = multiprocess()
    a.run()
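No answer is recorded here, but one common cause (an assumption on my part, since the error text is only a screenshot) is that getpass cannot prompt inside a spawned child process on Windows: the child has no interactive stdin. A minimal sketch of the usual workaround, collecting the password once in the parent and passing it to each worker:

import getpass
import multiprocessing

def test(batch_index, pw_2):
    # The worker receives the already-collected password rather than
    # prompting; the DB2 queries from the question would go here.
    print('worker', batch_index, 'received a password of length', len(pw_2))

if __name__ == '__main__':
    # Prompt exactly once, in the parent process, which owns the console.
    pw = getpass.getpass(prompt='Password')
    processes = []
    for i in range(2):
        p = multiprocessing.Process(target=test, args=(i, pw))
        processes.append(p)
        p.start()
    for p in processes:
        p.join()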

multiprocessing with map_async and progress bar

Is it possible to have a progress bar with map_async from multiprocessing?
A toy example:
from multiprocessing import Pool
import tqdm

def f(x):
    print(x)
    return x*x

n_job = 4
with Pool(processes=n_job) as pool:
    results = pool.map_async(f, range(10)).get()
print(results)
I want something like this:

data = []
with Pool(processes=10) as pool:
    for d in tqdm.tqdm(pool.imap(f, range(10)), total=10):
        data.append(d)
There are a couple of ways of achieving what you want that I can think of:

1. Use apply_async with a callback argument to update the progress bar as each result becomes available.
2. Use imap and update the progress bar as you iterate the results.

There is a slight problem with imap: the results must be returned in task-submission order, which is of course what you want. But that order does not necessarily reflect the order in which the submitted tasks complete, so the progress bar does not necessarily get updated as frequently as it otherwise might. Still, I will show that solution first, since it is the simplest and probably adequate:
from multiprocessing import Pool
import tqdm

def f(x):
    import time
    time.sleep(1)  # for demo purposes
    return x*x

# Required by Windows:
if __name__ == '__main__':
    pool_size = 4
    results = []
    with Pool(processes=pool_size) as pool:
        with tqdm.tqdm(total=10) as pbar:
            for result in pool.imap(f, range(10)):
                results.append(result)
                pbar.update()
    print(results)
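If smooth progress updates matter more than receiving results in submission order, Pool.imap_unordered yields results in completion order instead; a minimal sketch of that variant:

from multiprocessing import Pool
import tqdm

def f(x):
    import time
    time.sleep(1)  # for demo purposes
    return x*x

# Required by Windows:
if __name__ == '__main__':
    results = []
    with Pool(processes=4) as pool:
        with tqdm.tqdm(total=10) as pbar:
            # imap_unordered delivers each result as soon as its task
            # finishes, so the bar advances at the true completion rate.
            for result in pool.imap_unordered(f, range(10)):
                results.append(result)
                pbar.update()
    print(results)  # note: order is no longer guaranteed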
The solution that uses apply_async:
from multiprocessing import Pool
import tqdm

def f(x):
    import time
    time.sleep(1)  # for demo purposes
    return x*x

# Required by Windows:
if __name__ == '__main__':
    def my_callback(_):
        # We don't care about the actual result.
        # Just update the progress bar:
        pbar.update()

    pool_size = 4
    with Pool(processes=pool_size) as pool:
        with tqdm.tqdm(total=10) as pbar:
            async_results = [pool.apply_async(f, args=(x,), callback=my_callback) for x in range(10)]
            results = [async_result.get() for async_result in async_results]
    print(results)
I think this is it (though note that map_async(...).get() blocks until every result is ready, so the bar only fills as the already-completed list is consumed):
from multiprocessing import Pool
import tqdm

def f(x):
    return x*x

n_job = 4
data = []
with Pool(processes=10) as pool:
    for d in tqdm.tqdm(pool.map_async(f, range(10)).get(), total=10):
        data.append(d)
print(data)

Why does my Python multiprocessing result not append on callback?

I can't seem to figure out why my results are not appending while using the multiprocessing package.
I've looked at many similar questions but can't seem to figure out what I'm doing wrong. This is my first attempt at multiprocessing (as you might be able to tell), so I don't quite understand all the jargon in the documentation, which might be part of the problem.
Running this in PyCharm prints an empty list instead of the desired list of row sums.
import numpy as np
from multiprocessing import Pool
import timeit

data = np.random.randint(0, 100, size=(5, 1000))

def add_these(numbers_to_add):
    added = np.sum(numbers_to_add)
    return added

results = []
tic = timeit.default_timer()  # start timer
pool = Pool(3)
if __name__ == '__main__':
    for row in data:
        pool.apply_async(add_these, row, callback=results.append)
toc = timeit.default_timer()  # end timer
print(toc - tic)
print(results)
EDIT: Closing and joining the pool, then printing results within the if __name__ == '__main__' block, results in the following error being raised repeatedly until I manually stop execution:
RuntimeError:
    An attempt has been made to start a new process before the
    current process has finished its bootstrapping phase.

    This probably means that you are not using fork to start your
    child processes and you have forgotten to use the proper idiom
    in the main module:

        if __name__ == '__main__':
            freeze_support()
            ...

    The "freeze_support()" line can be omitted if the program
    is not going to be frozen to produce an executable.
Code to reproduce error:
import numpy as np
from multiprocessing import Pool, freeze_support
import timeit

data = np.random.randint(0, 100, size=(5, 1000))

def add_these(numbers_to_add):
    added = np.sum(numbers_to_add)
    return added

results = []
tic = timeit.default_timer()  # start timer
pool = Pool(3)
if __name__ == '__main__':
    for row in data:
        pool.apply_async(add_these, (row,), callback=results.append)
    pool.close()
    pool.join()
    print(results)
toc = timeit.default_timer()  # end timer
print(toc - tic)
I think this would be a more correct way:
import numpy as np
from multiprocessing import Pool, TimeoutError

data = np.random.randint(0, 100, size=(5, 1000))

def add_these(numbers_to_add):
    added = np.sum(numbers_to_add)
    return added

if __name__ == '__main__':
    with Pool(processes=3) as pool:
        for row in data:
            result = pool.apply_async(add_these, (row,))
            try:
                print(result.get(timeout=1))
            except TimeoutError:
                print("Multiprocessing Timeout")

redirect sys.stdout to specific Jupyter Notebook cell

Jupyter==4.1.0, Python==2.7.10, IPython==4.2.0
I'm writing a SQL UI for my Jupyter Notebooks and would like to incorporate multithreading so that I can run a query in one cell and continue to work in other cells while the query is running.
The problem I'm having is that if I execute a query in one cell, the output will be displayed in the last-executed cell's output prompt instead of in the output prompt of the cell that executed the query.
I scoured the interwebs and discovered this clever trick, but I think it's outdated and/or no longer works in my version of Jupyter. When I run it, I only get output for whatever cell was last executed. So if I run both, I only get the last-executed output, instead of the output printing to separate cells simultaneously.
So I have my context manager which sets the parent_header:
import sys
import threading
from contextlib import contextmanager

# we need a lock so that other threads don't snatch control
# while we have set a temporary parent
stdout_lock = threading.Lock()

@contextmanager
def set_stdout_parent(parent):
    """a context manager for setting a particular parent for sys.stdout
    the parent determines the destination cell of the output
    """
    save_parent = sys.stdout.parent_header
    with stdout_lock:
        sys.stdout.parent_header = parent
        try:
            yield
        finally:
            # the flush is important, because that's when the parent_header actually has its effect
            sys.stdout.flush()
            sys.stdout.parent_header = save_parent
I essentially want to be able to get the parent_header of a cell In[1] and redirect the output of cell In[2] to the output of In[1].
Example:
Get parent_header of In[1]:
In[1]: t = sys.stdout.parent_header
Then the following code will run, but the output should print to Out[1] (currently, I get no output when I run this code):
In [2]: with set_stdout_parent(t):
            print 'FOO'
Which should produce:
In [1]: t = sys.stdout.parent_header
Out[1]: 'FOO'
The documentation for ipywidgets.Output has a section about interacting with output widgets from background threads. With the Output.append_stdout method there is no need for locking. The final cell in the lock-based answer below can then be replaced with:
def t1_main():
    for i in range(10):
        output1.append_stdout(f'thread1 {i}\n')
        time.sleep(0.5)

def t2_main():
    for i in range(10):
        output2.append_stdout(f'thread2 {i}\n')
        time.sleep(0.5)

output1.clear_output()
output2.clear_output()
t1 = Thread(target=t1_main)
t2 = Thread(target=t2_main)
t1.start()
t2.start()
t1.join()
t2.join()
You can use a combination of ipywidgets.Output and locking:
Code in jupyter cells:
# In[1]:
from threading import Thread, Lock
import time
from ipywidgets import Output

# In[2]:
output1 = Output()
output1

# In[3]:
output2 = Output()
output2

# In[4]:
print_lock = Lock()

def t1_main():
    for i in range(10):
        with print_lock, output1:
            print('thread1', i)
        time.sleep(0.5)

def t2_main():
    for i in range(10):
        with print_lock, output2:
            print('thread2', i)
        time.sleep(0.5)

output1.clear_output()
output2.clear_output()
t1 = Thread(target=t1_main)
t2 = Thread(target=t2_main)
t1.start()
t2.start()
t1.join()
t2.join()

convert linux python multiprocessing to windows

I would like to use this Linux Python script on Windows. How do I rewrite it? The part that needs rewriting is the multiprocessing part.
from __future__ import print_function
from collections import Counter
import glob
import multiprocessing
import os
import re
import sys
import time

import pandas as pd

def create_data(filepath):
    ...
    return values

filepaths = glob.glob('*/*.txt')
num_tasks = len(filepaths)
p = multiprocessing.Pool()
results = p.imap(create_data, filepaths)
while True:
    completed = results._index
    print("\r--- Completed {:,} out of {:,}".format(completed, num_tasks), end='')
    sys.stdout.flush()
    time.sleep(1)
    if completed == num_tasks:
        break
p.close()
p.join()
df_full = pd.DataFrame(list(results))
print()
Thanks for your help.
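No answer is recorded here, but the standard change, as the "Required by Windows" comments earlier on this page also suggest, is to move everything that creates processes under an if __name__ == '__main__': guard, because Windows starts children by re-importing the module rather than forking. A minimal sketch, keeping the question's structure (with a stub body standing in for the elided create_data):

from __future__ import print_function
import glob
import multiprocessing
import sys
import time

import pandas as pd

def create_data(filepath):
    # stub standing in for the question's elided body
    values = {'filepath': filepath}
    return values

if __name__ == '__main__':
    # On Windows, process creation must live under this guard so that
    # re-imported child processes do not spawn pools of their own.
    filepaths = glob.glob('*/*.txt')
    num_tasks = len(filepaths)
    p = multiprocessing.Pool()
    results = p.imap(create_data, filepaths)
    while True:
        completed = results._index  # same undocumented internal the question polls
        print("\r--- Completed {:,} out of {:,}".format(completed, num_tasks), end='')
        sys.stdout.flush()
        time.sleep(1)
        if completed == num_tasks:
            break
    p.close()
    p.join()
    df_full = pd.DataFrame(list(results))
    print()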
