I am trying to copy some files to an OCI bucket (Oracle Cloud Infrastructure).
The first 5 files are copied successfully, but then the script hangs; the worker processes in Task Manager die, leaving only the main one.
from array import array
from pathlib import Path
import oci
import datetime
from multiprocessing import Process
import threading
import logging
from oci.object_storage import UploadManager
from oci.object_storage.models import CreateBucketDetails
from oci.object_storage.transfer.constants import MEBIBYTE

logging.basicConfig(filename=r'############', filemode='w', format='%(asctime)s - %(message)s', level=logging.INFO)

# Number of max processes allowed at a time
concurrency = 5
sema = threading.BoundedSemaphore(concurrency)

# The root directory path, replace with your path
p = Path(r"#####")

# The compartment OCID
compartment_id = "#######"

# The bucket name where we will upload
bucket_name = "######"

config = oci.config.from_file()
object_storage_client = oci.object_storage.ObjectStorageClient(config)
part_size = 2 * MEBIBYTE

today = datetime.date.today()
today = str(today)

def upload_to_object_storage(path: str, name: str, namespace):
    # upload_manager = UploadManager(object_storage_client, allow_parallel_uploads=False)
    with open(path, "rb") as in_file:
        logging.info("Starting upload {}".format(name))
        object_storage_client.put_object(namespace, bucket_name, name, in_file)
        # upload_manager.upload_file(namespace, bucket_name, name, in_file.name, part_size=part_size)
        logging.info("Finished uploading {}".format(name))
        sema.release()
    return

def createUploadProcess(object: Path, object_storage_client, namespace, proc_list):
    name = object.relative_to(p).as_posix()
    sema.acquire()
    process = Process(target=upload_to_object_storage, args=(object.as_posix(), name, namespace))
    proc_list.append(process)
    process.start()

def processDirectoryObjects(object: Path, object_storage_client, namespace, proc_list):
    if object.is_file():
        createUploadProcess(object, object_storage_client, namespace, proc_list)

def processDirectory(path: Path, object_storage_client, namespace, proc_list):
    if path.exists():
        logging.info("in directory ---- " + path.relative_to(p).as_posix())
        for objects in path.iterdir():
            if objects.is_dir():
                processDirectory(objects, object_storage_client, namespace, proc_list)
            else:
                if today in objects.name:
                    processDirectoryObjects(objects, object_storage_client, namespace, proc_list)

if __name__ == '__main__':
    config = config
    object_storage_client = object_storage_client
    sema = sema
    namespace = object_storage_client.get_namespace().data
    proc_list: array = []
    if p.exists() and p.is_dir():
        processDirectory(p, object_storage_client, namespace, proc_list)
    for job in proc_list:
        job.join()
I have approximately 50 files to copy, but it uploads 5 and then hangs. Each of the 5 processes exits with the following error:
Process Process-1:
Traceback (most recent call last):
File "C:\Users\#######\AppData\Local\Programs\Python\Python36\lib\multiprocessing\process.py", line 258, in _bootstrap
self.run()
File "C:\Users\#######\AppData\Local\Programs\Python\Python36\lib\multiprocessing\process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "C:\Users\#######\Documents\copia_bkp_oci2.py", line 49, in upload_to_object_storage
sema.release()
File "C:\Users\#######\AppData\Local\Programs\Python\Python36\lib\threading.py", line 482, in release
raise ValueError("Semaphore released too many times")
ValueError: Semaphore released too many times
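The traceback explains the hang: on Windows, multiprocessing spawns each worker by re-importing the script, so every child process builds its own fresh copy of the module-level threading.BoundedSemaphore. acquire() runs in the parent and release() in the child, on two different objects; the child's release() overflows its untouched semaphore and raises, while the parent's semaphore is never released and blocks forever once all 5 slots are taken. A minimal sketch of one way around this, assuming the goal is just to cap concurrent uploads, replaces it with a multiprocessing.BoundedSemaphore passed to the worker explicitly (it reuses p, today, bucket_name, and object_storage_client from the script above):

from multiprocessing import Process, BoundedSemaphore

def upload_with_semaphore(path, name, namespace, sema):
    # bucket_name and object_storage_client come from the module level, as above
    try:
        with open(path, "rb") as in_file:
            object_storage_client.put_object(namespace, bucket_name, name, in_file)
    finally:
        sema.release()  # pairs with the parent's acquire(), even if the upload fails

if __name__ == '__main__':
    sema = BoundedSemaphore(5)  # process-aware, unlike threading.BoundedSemaphore
    namespace = object_storage_client.get_namespace().data
    proc_list = []
    for f in (x for x in p.rglob("*") if x.is_file() and today in x.name):
        sema.acquire()  # blocks while 5 uploads are already in flight
        proc = Process(target=upload_with_semaphore,
                       args=(f.as_posix(), f.relative_to(p).as_posix(), namespace, sema))
        proc_list.append(proc)
        proc.start()
    for job in proc_list:
        job.join()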
I have an iterator that retrieves a varying number of lines from a very large (>20 GB) file, depending on some features. The iterator works fine, but I can only use 1 thread to process the result. I would like to feed the value from each iteration to multiple threads/processes.
I'm using a text file with 9 lines to mimic my data; here is my code. I've been struggling with how to create the feedback so that when one process finishes, it goes and retrieves the next iteration:
from multiprocessing import Process, Manager
import time

# Iterator
class read_file(object):
    def __init__(self, filePath):
        self.file = open(filePath, 'r')

    def __iter__(self):
        return self

    def __next__(self):
        line = self.file.readline()
        if line:
            return line
        else:
            raise StopIteration

# worker for one process
def print_worker(a, n, stat):
    print(a)
    stat[n] = True  # Set the finished status to True
    return None

# main
def main():
    file_path = 'tst_mp.txt'  # the txt file with 9 lines
    n_worker = 2
    file_handle = read_file(file_path)
    workers = []
    # Create a shared list to store the finished status of each worker
    manager = Manager()
    status = manager.list([False] * 2)  # one status flag per worker
    # Initiate the workers
    for i in range(n_worker):
        workers.append(Process(target=print_worker, args=(file_handle.__next__(), i, status,)))
    for worker in workers:
        worker.start()
    block = file_handle.__next__()  # The next block (line)
    while block:  # continue while there is still a block left
        print(status)
        time.sleep(1)  # poll every second
        for i in range(2):
            if status[i]:  # Worker i finished
                workers[i].join()
                # workers[i].close()
                workers[i] = Process(target=print_worker, args=(block, i, status,))
                status[i] = False  # Set worker i as busy (False)
                workers[i].start()  # Start worker i
                try:  # try to get the next item from the iterator
                    block = file_handle.__next__()
                except StopIteration:
                    block = False

if __name__ == '__main__':
    main()
The code is clumsy, but it did print out the sequence; however, it also threw an error when I ran the code twice:
1
2
3
4
5
6
7
8
9
Process Process-10:
Traceback (most recent call last):
File "/home/zewei/mambaforge/lib/python3.9/multiprocessing/managers.py", line 802, in _callmethod
conn = self._tls.connection
AttributeError: 'ForkAwareLocal' object has no attribute 'connection'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/zewei/mambaforge/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/home/zewei/mambaforge/lib/python3.9/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/home/zewei/share/paf_depth/test_multiprocess.py", line 31, in print_worker
stat[n] = True # Set the finished status as True
File "<string>", line 2, in __setitem__
File "/home/zewei/mambaforge/lib/python3.9/multiprocessing/managers.py", line 806, in _callmethod
self._connect()
File "/home/zewei/mambaforge/lib/python3.9/multiprocessing/managers.py", line 794, in _connect
dispatch(conn, None, 'accept_connection', (name,))
File "/home/zewei/mambaforge/lib/python3.9/multiprocessing/managers.py", line 90, in dispatch
kind, result = c.recv()
File "/home/zewei/mambaforge/lib/python3.9/multiprocessing/connection.py", line 255, in recv
buf = self._recv_bytes()
File "/home/zewei/mambaforge/lib/python3.9/multiprocessing/connection.py", line 419, in _recv_bytes
buf = self._recv(4)
File "/home/zewei/mambaforge/lib/python3.9/multiprocessing/connection.py", line 384, in _recv
chunk = read(handle, remaining)
ConnectionResetError: [Errno 104] Connection reset by peer
This is where I'm stuck. I was wondering if there is a fix, or a more elegant way to do this?
Thanks!
Here's a better way to do what you are doing, using a Pool:
from multiprocessing import Pool
import time
.
.
.
.
# worker for one process
def print_worker(a):
    print(a)
    return None

def main():
    file_path = r''  # the txt file with 9 lines
    n_worker = 2
    file_handle = read_file(file_path)
    results = []
    with Pool(n_worker) as pool:
        for result in pool.imap(print_worker, file_handle):
            results.append(result)
    print(results)

if __name__ == '__main__':
    main()
Here, the imap function lazily iterates over the iterator, so that the whole file won't be read into memory. Pool handles spreading the tasks across the number of processes you started (using n_worker) automatically so that you don't have to manage it yourself.
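If the order of results doesn't matter, pool.imap_unordered is a drop-in variant that yields each result as soon as any worker finishes; the optional chunksize argument batches work items to cut down inter-process traffic. A sketch, reusing the same read_file iterator:

with Pool(n_worker) as pool:
    # chunksize=4 sends work in batches of 4 lines per worker round-trip
    for result in pool.imap_unordered(print_worker, file_handle, chunksize=4):
        results.append(result)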
From David Beazley's paper (https://www.dabeaz.com/usenix2009/concurrent/Concurrent.pdf, p. 62), I am trying to run this code:
# channel.py
import pickle

class Channel(object):
    def __init__(self, out_f, in_f):
        self.out_f = out_f
        self.in_f = in_f

    def send(self, item):
        pickle.dump(item, self.out_f)
        self.out_f.flush()

    def recv(self):
        return pickle.load(self.in_f)
# child.py
import channel
import sys

ch = channel.Channel(sys.stdout, sys.stdin)
while True:
    item = ch.recv()
    ch.send(("child", item))
# parent.py
import channel
import subprocess

p = subprocess.Popen(['python', 'child.py'],
                     stdin=subprocess.PIPE,
                     stdout=subprocess.PIPE)
ch = channel.Channel(p.stdin, p.stdout)

# Using the child worker
ch.send("Hello World")
Hello World
When I try this code, I get this error:
Traceback (most recent call last):
File "child.py", line 6, in <module>
item = ch.recv()
File "C:\Users\bluesky\backtest\channel.py", line 16, in recv
return pickle.load(self.in_f)
TypeError: a bytes-like object is required, not 'str'
What I am trying to do is send my data to the child and get the processed data back in the parent process. Is there any way to fix this?
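The traceback points at the likely cause: the paper predates Python 3, where sys.stdin and sys.stdout are text streams, while pickle reads and writes bytes. A minimal sketch of the Python 3 adaptation, assuming nothing else prints to stdout in the child, is to hand the Channel the underlying binary buffers (the parent's p.stdin/p.stdout from Popen are already binary):

# child.py (Python 3): pickle needs binary streams, so use the .buffer attribute
import channel
import sys

ch = channel.Channel(sys.stdout.buffer, sys.stdin.buffer)
while True:
    item = ch.recv()
    ch.send(("child", item))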
I would like to be able to start a testmodule that is in my CANoe script; it works fine if I start everything without Python.
Now, my Python code can load CANoe, open the desired cfg, and start the simulation.
I tried some code from examples in the documentation and here, but it doesn't work...
# --------------------------------------------------------------------------
# Standard library imports
import os
import sys
import subprocess
# import win32com.client
import time
import threading
from win32com.client import *
from win32com.client.connect import *

my_xml_ts = r"C:\_my_path\boaz_test2.xml"

# Vector CANoe class
class CANoe:
    def __init__(self):
        self.application = None
        # check if there is any instance of the CANoe process
        # output = subprocess.check_output('tasklist', shell=True)
        # if a CANoe process is still running, kill it
        # if "CANoe32.exe" in str(output):
        #     os.system("taskkill /im CANoe32.exe /f 2>nul >nul")
        # re-dispatch object for CANoe Application
        self.application = win32com.client.DispatchEx("CANoe.Application")
        self.ver = self.application.Version
        print('Loaded CANoe version ',
              self.ver.major, '.',
              self.ver.minor, '.',
              self.ver.Build, '...')
        self.Measurement = self.application.Measurement.Running
        print(self.Measurement)

    def open_simulation(self, cfgname):
        # open CANoe simulation
        if self.application is not None:
            # check that the file exists and is a *.cfg file
            if os.path.isfile(cfgname) and (os.path.splitext(cfgname)[1] == ".cfg"):
                self.application.Open(cfgname)
            else:
                raise RuntimeError("Can't find CANoe cfg file")
        else:
            raise RuntimeError("CANoe Application is missing, unable to open simulation")

    def set_testmodule(self):
        print("set_testmodule start")
        test_env = self.application.Configuration.TestSetup.TestEnvironments.Item("Test Environment")
        print("set_testmodule 1")
        test_env = win32com.client.CastTo(test_env, "ITestEnvironment2")
        print("set_testmodule 2")
        self.test_module = test_env.TestModules.Item("XML_Boaz_2")
        print("set_testmodule end", self.test_module)

    def test_module_run(self):
        print("test_module_run start")
        self.test_module.Start()
        print("test_module_run end")
        # The {.Sequence} property returns a collection of <TestCases>, <TestGroup>,
        # or the more generic <TestSequenceItem>
        seq = self.test_module.Sequence
        for i in range(1, seq.Count + 1):
            # Cast from <ITestSequenceItem> to <ITestCase> to access the {.Verdict}
            # and {.Enabled} properties
            tc = win32com.client.CastTo(seq.Item(i), "ITestCase")
            if tc.Verdict != 1:  # Verdict 1 is pass
                tc.Enabled = True
                print(f"Enabling Test Case {tc.Ident} with verdict {tc.Verdict}")
            else:
                tc.Enabled = False
                print(f"Disabling Test Case {tc.Ident} since it has already passed")

    def close_simulation(self):
        # close CANoe simulation
        if self.application is not None:
            self.stop_Measurement()
            self.application.Quit()
            # make sure CANoe closed properly, otherwise force a taskkill
            output = subprocess.check_output('tasklist', shell=True)
            if "CANoe32.exe" in str(output):
                os.system("taskkill /im CANoe32.exe /f 2>nul >nul")
        self.application = None

    def start_Measurement(self):
        retry = 0
        retry_counter = 5
        # try to start the measurement within the retry limit
        while not self.application.Measurement.Running and (retry < retry_counter):
            self.application.Measurement.Start()
            time.sleep(1)
            retry += 1
        if retry == retry_counter:
            raise RuntimeWarning("CANoe start measurement failed, please check the connection!")
and running it with:
import Python_CANoe
import time
import random

def try2():
    X = random.randrange(50)
    Y = random.randrange(100)
    print("start")
    CANoe = Python_CANoe.CANoe()
    CANoe.open_simulation(canoe_config_file)
    CANoe.set_testmodule()  # Doesn't work
    print("CANoe script already Open")
    CANoe.start_Measurement()
    time.sleep(5)
    CANoe.test_module_run()  # Doesn't work

if __name__ == "__main__":
    try2()
and this is the error I am getting:
set_testmodule start
Traceback (most recent call last):
File "C:/Users/m.m/Documents/Python/CANoe_Python_Script/7_5_2021_3_for_stackoverflow/CANoe_Boaz_Test.py", line 133, in <module>
try2()
File "C:/Users/m.m/Documents/Python/CANoe_Python_Script/7_5_2021_3_for_stackoverflow/CANoe_Boaz_Test.py", line 77, in try2
CANoe.set_testmodule() # Doesn't work
File "C:\Users\m.m\Documents\Python\CANoe_Python_Script\7_5_2021_3_for_stackoverflow\Python_CANoe.py", line 52, in set_testmodule
test_env = self.application.Configuration.TestSetup.TestEnvironments.Item("Test Environment")
File "C:\Users\BOAZ~1.BIL\LOCALS~1\Temp\gen_py\3.7\7F31DEB0-5BCC-11D3-8562-00105A3E017Bx0x1x58.py", line 15873, in Item
ret = self._oleobj_.InvokeTypes(0, LCID, 2, (9, 0), ((12, 1),),index
pywintypes.com_error: (-2147352567, 'Exception occurred.', (0, 'TestEnvironments::Item', 'Invalid index: Item not found!', 'C:\\Program Files\\Vector CANoe 11.0\\Help01\\CANoeCANalyzer.chm', 4281, -2147467259), None)
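The COM error text ('Invalid index: Item not found!') says that no test environment named "Test Environment" exists in the loaded configuration. A quick diagnostic sketch, assuming the TestEnvironments collection exposes the same 1-based Count/Item pattern as the Sequence collection used in test_module_run, and that each entry has a Name property (both assumptions, not verified against the CANoe 11.0 COM documentation), is to print the names that actually exist:

def list_test_environments(self):
    # Hypothetical helper: print the names CANoe actually knows about,
    # so the exact string can be passed to TestEnvironments.Item(...)
    envs = self.application.Configuration.TestSetup.TestEnvironments
    for i in range(1, envs.Count + 1):  # COM collections are 1-based
        print(envs.Item(i).Name)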
I am trying to crawl abstracts from PubMed and filter them using regex via Python. To speed things up, I wanted to use Python's multiprocessing Pool.
My code looks like the following:
import multiprocessing as mp
from functools import partial
from typing import List, Tuple

def beautify_abstract(abstract: str, regex: str):
    import re
    result: str = ""
    last_start = 0
    matches = re.finditer(regex, abstract, re.MULTILINE)
    for matchNum, match in enumerate(matches, start=1):
        result += abstract[last_start:match.start()]
        result += "<b>"
        result += abstract[match.start():match.end()]
        result += "</b>"
        last_start = match.end()
    result += abstract[last_start:]
    return result

def get_doi(pim: str, regex: str):
    from Bio import Entrez
    from Bio.Entrez import efetch
    import re
    from metapub.convert import pmid2doi
    Entrez.email = "Your.Name.Here#example.org"
    print(f"Processing {pim}")
    abstract_handle = efetch(db="pubmed", id=pim, retmode='text', rettype='all')
    abstract = abstract_handle.read()
    abstract_handle.close()
    if re.search(regex, abstract, re.MULTILINE) is not None:
        docsum_handle = efetch(db="pubmed", id=pim, retmode='text', rettype='docsum')
        docsum = docsum_handle.read()
        try:
            doi = pmid2doi(pim)
        except:
            doi = "UNKNOWN"
        return f"{doi}"
    return ""

def get_pim_with_regex_list(keywords: List[str]) -> List[str]:
    from Bio import Entrez
    Entrez.email = "Your.Name.Here#example.org"
    searchterm = " ".join(keywords)
    pims = []
    handle = Entrez.esearch(db="pubmed", retstart=0, retmax=0, term=searchterm, idtype="acc")
    record = Entrez.read(handle)
    handle.close()
    count = int(record['Count'])
    if count > 100000:
        retmax = 100000
    else:
        retmax = count
    retstart = 0
    while retstart < count:
        handle = Entrez.esearch(db="pubmed", retstart=retstart, retmax=retmax, term=searchterm, idtype="acc")
        record = Entrez.read(handle)
        handle.close()
        for pim in record['IdList']:
            pims.append(pim)
        retstart += retmax
    return pims

if __name__ == '__main__':
    keywords = ["keyword1", "keyword2"]
    pim_list = get_pim_with_regex_list(keywords)
    regex = "keyword1 keyword2"
    worker_fn = partial(get_doi, regex=regex)
    pool = mp.Pool(mp.cpu_count())
    entries = pool.map(worker_fn, pim_list)
    pool.close()
    pool.join()
When I run the given code, I get the following error:
Traceback (most recent call last):
File "/usr/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/lib/python3.9/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.9/multiprocessing/pool.py", line 114, in worker
task = get()
File "/usr/lib/python3.9/multiprocessing/queues.py", line 368, in get
return _ForkingPickler.loads(res)
TypeError: __new__() missing 2 required positional arguments: 'tag' and 'attributes'
Process ForkPoolWorker-4:
Traceback (most recent call last):
File "/usr/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/lib/python3.9/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.9/multiprocessing/pool.py", line 114, in worker
task = get()
File "/usr/lib/python3.9/multiprocessing/queues.py", line 368, in get
return _ForkingPickler.loads(res)
TypeError: __new__() missing 2 required positional arguments: 'tag' and 'attributes'
I did some digging into multiprocessing with Python and found out that only Python native types are supported as parameters (enforced by the ForkingPickler).
Assuming that str is a native type, the code should work... Currently, I am completely lost and have no idea what the problem may be.
As suggested, I uploaded a minimal (sequential) working example here.
Is there any way to fix this problem or at least diagnose the real issue here?
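One plausible culprit, offered as an educated guess: Entrez.read does not return plain built-in strings; the entries of record['IdList'] are Bio.Entrez.Parser.StringElement objects, a str subclass whose constructor takes extra tag and attributes arguments, which would match the unpickling failure in the worker's traceback. Casting each ID to a plain str inside get_pim_with_regex_list, before it ever reaches the Pool, would sidestep that:

for pim in record['IdList']:
    pims.append(str(pim))  # plain str pickles cleanly across processes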
I seem to be having issues with a Python script that uses multiprocessing. Essentially, it takes a list of ID codes and starts processes that use Selenium with PhantomJS as the driver to navigate to a URL containing each ID code, extract data to individual CSV files, and then compile another CSV file once all processes finish. Everything runs great, except that sometimes one of the processes returns an exception that says:
Traceback (most recent call last):
File "/usr/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
self.run()
File "/usr/lib/python2.7/multiprocessing/process.py", line 114, in run
self._target(*self._args, **self._kwargs)
File "modtest.py", line 11, in worker
do_work(item)
File "/home/mdrouin/Dropbox/Work/Dev/Python/WynInvScrape/items.py", line 14, in do_work
driver = webdriver.PhantomJS()
File "/usr/lib/python2.7/site-packages/selenium/webdriver/phantomjs/webdriver.py", line 50, in __init__
self.service.start()
File "/usr/lib/python2.7/site-packages/selenium/webdriver/phantomjs/service.py", line 72, in start
raise WebDriverException("Can not connect to GhostDriver")
I've tried working in ways to restart the process if an exception is raised, but what appears to happen, regardless, is that once the processes finish, the program hangs and doesn't move on, or do anything for that matter. I would essentially like to retry the ID number being searched for if the process crashes, and continue on when all processes are finished. Here is an extremely slimmed-down version of the code:
from selenium import webdriver
from time import sleep
from bs4 import BeautifulSoup as bs
import multiprocessing
import datetime, time, csv, glob

num_procs = 8

def do_work(rsrt):
    driver = webdriver.PhantomJS()
    try:
        driver.get('http://www.example.com/get.php?resort=' + rsrt)
        soup = bs(driver.page_source, 'html.parser')  # assumed: omitted from the slimmed-down code
        rows = []
        for row in soup.find_all('tr'):
            if row.find('input', {'name': 'booksubmit'}):
                wyncheckin = row.find('td', {'class': 'searchAvailDate'}).string
                wynnights = row.find('td', {'class': 'searchAvailNights'}).string
                wynroom = row.find('td', {'class': 'searchAvailUnitType'}).string
                rows.append([wynresort, wyncheckin, wynroom])
        driver.quit()
        with open('/home/mdrouin/Dropbox/Work/Dev/Python/WynInvScrape/availability/' + rsrt + '.csv', 'wb') as f:
            writer = csv.writer(f)
            writer.writerows(row for row in rows if row)
        print 'Process ' + rsrt + ' End: ' + str(time.strftime('%c'))
    except:
        driver.quit()

def worker():
    for item in iter(q.get, None):
        do_work(item)
        q.task_done()
    q.task_done()  # account for the None sentinel consumed by iter()

q = multiprocessing.JoinableQueue()
procs = []
for i in range(num_procs):
    procs.append(multiprocessing.Process(target=worker))
    procs[-1].daemon = True
    procs[-1].start()

source = ['0017', '0113', '0020', '0013', '0038', '1028', '0115', '0105', '0041', '0037', '0043', '2026', '0165', '0164',
          '0033', '0126', '0116', '0103', '9135', '0185', '0206', '0053', '0062', '1020', '0019', '0042', '2028', '0213',
          '0211', '0163', '0073', '2020', '0214', '2140', '0084', '0193', '0095', '0064', '0196', '0028', '0068', '0074']

for item in source:
    q.put(item)
q.join()

for p in procs:
    q.put(None)
q.join()

for p in procs:
    p.join()

print "Finished"
print 'Writing core output: ' + str(time.strftime('%c'))
with open('availability.csv', 'wb') as outfile:
    for csvfile in glob.glob('/home/mdrouin/Dropbox/Work/Dev/Python/WynInvScrape/availability/*.csv'):
        for line in open(csvfile, 'r'):
            outfile.write(line)
print 'Process End: ' + str(time.strftime('%c'))
One of the ways to tackle this sort of problem is a recursive call, something along the lines of:
def do_work(rsrt):
    if failed:
        return do_work(rsrt)
Of course this will retry until it resolves, so you might want to pass a counter and, if it exceeds a certain value, just return False; a sketch of that follows.
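A minimal sketch of that idea, fitted to the do_work above (it assumes the failure mode is Selenium's WebDriverException from the "Can not connect to GhostDriver" error, and MAX_RETRIES is an arbitrary cap):

from selenium.common.exceptions import WebDriverException

MAX_RETRIES = 3  # arbitrary cap; tune as needed

def do_work(rsrt, attempt=0):
    try:
        driver = webdriver.PhantomJS()
    except WebDriverException:
        if attempt >= MAX_RETRIES:
            return False  # give up on this ID after a few tries
        return do_work(rsrt, attempt + 1)  # retry the same ID
    try:
        # ... the same scraping and CSV writing as before ...
        return True
    finally:
        driver.quit()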