I am trying to see if Python can run two objects at once, i.e. simultaneously. I have researched multithreading and it seems like it can handle this. All the posts I am seeing pass functions as the target, but not an actual instantiation of an object.
file1.py:
import os
from threading import Thread


class Move_Images:
    def __init__(self, rps_img_archive_fp, destination_fp, images_count, set_capturebatchno):
        self.rps_img_archive_fp = rps_img_archive_fp
        self.destination_fp = destination_fp
        self.images_count = images_count
        self.set_capturebatchno = set_capturebatchno

    def get_fp_elem(self):
        count = 0
        capturebatchno_l = []
        img_fn_l = []
        final_dest_fp_l = []
        input_img_fp_l = []
        for path, directories, files in os.walk(self.rps_img_archive_fp):
            for file in files:
                count += 1
                # if count <= self.images_count:
                img_fp = os.path.join(path, file)
                capturebatchno = img_fp.split("\\")[7]
                get_cbn = [cbn for cbn in self.set_capturebatchno if cbn in capturebatchno]
                if get_cbn:
                    capturebatchno_l.append(get_cbn[0])
                    filename = img_fp.split("\\")[8]
                    img_fn_l.append(filename)
                    input_img_fp_l.append(img_fp)
        res = []
        for i in capturebatchno_l:
            if i not in res:
                res.append(i)
        for cbn, fn in zip(capturebatchno_l, img_fn_l):
            dest_fp = os.path.join(str(self.destination_fp) + "\\" + str(cbn) + "\\" + str(fn))
            final_dest_fp_l.append(dest_fp)
        return res, self.destination_fp, img_fn_l, self.rps_img_archive_fp, input_img_fp_l, final_dest_fp_l


move_images = Move_Images(rps_img_archive_fp=r'c:\\test\test\input',
                          destination_fp=r'c:\\test\test\output', images_count=100,
                          set_capturebatchno=['00004002', '00004005'])
res, destination_fp, img_fn_l, rps_img_archive_fp, input_img_fp_l, final_dest_fp_l = \
    move_images.get_fp_elem()

move_images_2 = Move_Images(rps_img_archive_fp=r'c:\\test\test\input',
                            destination_fp=r'c:\\test\test\output', images_count=100,
                            set_capturebatchno=['000040010', '000040012'])

if __name__ == '__main__':
    Thread(target=move_images).start()
    Thread(target=move_images_2).start()
You can, for example, make a class (Move_Images) that inherits from Thread itself and give it a run method. Look into the accepted answer on this post; maybe this is what you want.
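A minimal sketch of that approach (the class name and the body of run here are illustrative, not taken from the question; run just walks the archive folder as a placeholder for the real work):

import os
from threading import Thread


class MoveImagesThread(Thread):
    def __init__(self, rps_img_archive_fp, destination_fp, images_count, set_capturebatchno):
        super().__init__()
        self.rps_img_archive_fp = rps_img_archive_fp
        self.destination_fp = destination_fp
        self.images_count = images_count
        self.set_capturebatchno = set_capturebatchno

    def run(self):
        # the work each thread should do goes here; as a placeholder,
        # walk the archive folder and print the files that would be handled
        for path, directories, files in os.walk(self.rps_img_archive_fp):
            for file in files:
                print(os.path.join(path, file))


if __name__ == '__main__':
    t1 = MoveImagesThread(r'c:\test\input', r'c:\test\output', 100, ['00004002', '00004005'])
    t2 = MoveImagesThread(r'c:\test\input', r'c:\test\output', 100, ['000040010', '000040012'])
    t1.start()
    t2.start()
    t1.join()
    t2.join()

Alternatively, Thread(target=...) works as long as you pass something callable, e.g. Thread(target=move_images.get_fp_elem); passing the instance itself will raise a TypeError inside the thread, because a plain Move_Images object is not callable.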
I am having trouble getting my class pickled. This is an example of my function that creates a spike class with certain attributes:
import mat73
import pandas as pd
from scipy.io import loadmat  # assuming scipy.io for the non-v7.3 .mat files


def load_pt(ptname, data_directory):
    """
    input: ptname, a string containing the name of the patient you want to load. example: 'ABC123'
    output: object: spike ---- contains: List of 1000 random spikes: spike.select and their subsequent:
    values: spike.values, chlabels: spike.chlabels, fs: spike.fs, soz channels: spike.soz
    """
    val = mat73.loadmat(data_directory + '/values/values_{}.mat'.format(ptname))
    val2 = val['values_all']
    select_spikes = loadmat(data_directory + '/randi/randi_{}.mat'.format(ptname))
    select_spikes = select_spikes['select_spikes']
    ch_labels = loadmat(data_directory + '/chlabels/chlabels_{}.mat'.format(ptname))
    ch_labels = ch_labels['ch_labels_all']
    fs_all = loadmat(data_directory + '/fs/fs_{}.mat'.format(ptname))
    fs_all = fs_all['fs_all']
    SOZ_chlabels = pd.read_csv(data_directory + '/pt_data/SOZ_channels.csv')
    pt_all = pd.read_csv(data_directory + '/pt_data/ptname_all.csv')
    pt_name = ("'{}'".format(ptname))
    whichpt = pt_all.index[pt_all['ptname'] == pt_name].tolist()
    clean_SOZ_chlabels = prep_clean_soz(SOZ_chlabels)

    global spike

    class spike:
        values = val2
        select = select_spikes
        chlabels = ch_labels
        fs = fs_all
        soz = clean_SOZ_chlabels[whichpt[0]]

    return spike
I understand that the code needs some kind of global definition for the class. I'm fairly new to coding and would love some pointers.
The error I get is:
AttributeError: Can't pickle local object 'load_pt.<locals>.spike'
Things I've tried:
add a global spike before initializing my class object in my load_pt function (did not work)
get error: Can't get attribute 'spike' on <module 'ied_functions' from 'directory_name'
You need to instantiate the class Spike before you can return it.
To do this the class does not need to be global. After defining the class, instantiate it by writing spike = Spike(). After that, you can return the instance you just created with return spike.
You now pass load_pt to pickle, not load_pt.spike. load_pt is the function that returns an instance of the class Spike; load_pt.spike refers to a local variable inside load_pt that was never created as an attribute of the function.
See the entire code here:
import pickle


def load_pt(ptname, data_directory):
    # input: ptname, a string containing the name of the patient you want to load. example: 'ABC123'
    # output: object: spike ---- contains: List of 1000 random spikes: spike.select and their subsequent:
    # values: spike.values, chlabels: spike.chlabels, fs: spike.fs, soz channels: spike.soz
    val = mat73.loadmat(data_directory + '/values/values_{}.mat'.format(ptname))
    val2 = val['values_all']
    select_spikes = loadmat(data_directory + '/randi/randi_{}.mat'.format(ptname))
    select_spikes = select_spikes['select_spikes']
    ch_labels = loadmat(data_directory + '/chlabels/chlabels_{}.mat'.format(ptname))
    ch_labels = ch_labels['ch_labels_all']
    fs_all = loadmat(data_directory + '/fs/fs_{}.mat'.format(ptname))
    fs_all = fs_all['fs_all']
    SOZ_chlabels = pd.read_csv(data_directory + '/pt_data/SOZ_channels.csv')
    pt_all = pd.read_csv(data_directory + '/pt_data/ptname_all.csv')
    pt_name = ("'{}'".format(ptname))
    whichpt = pt_all.index[pt_all['ptname'] == pt_name].tolist()
    clean_SOZ_chlabels = prep_clean_soz(SOZ_chlabels)

    class Spike:
        values = val2
        select = select_spikes
        chlabels = ch_labels
        fs = fs_all
        soz = clean_SOZ_chlabels[whichpt[0]]

    spike = Spike()
    return spike


pickle.dump(load_pt, open("spike.pkl", "wb"))
Hope this helps! :)
Edit:
Also, I would follow @MichaelButscher's advice:
It is unusual to define a specialized class in a function. Probably you only need an object of the class holding the data as instance attributes you are currently storing as class attributes. In this case you can define a generic class at global level and create an object in the function.
That might look something like this:
import pickle


def load_pt(ptname, data_directory):
    # input: ptname, a string containing the name of the patient you want to load. example: 'ABC123'
    # output: object: spike ---- contains: List of 1000 random spikes: spike.select and their subsequent:
    # values: spike.values, chlabels: spike.chlabels, fs: spike.fs, soz channels: spike.soz
    val = mat73.loadmat(data_directory + '/values/values_{}.mat'.format(ptname))
    val2 = val['values_all']
    select_spikes = loadmat(data_directory + '/randi/randi_{}.mat'.format(ptname))
    select_spikes = select_spikes['select_spikes']
    ch_labels = loadmat(data_directory + '/chlabels/chlabels_{}.mat'.format(ptname))
    ch_labels = ch_labels['ch_labels_all']
    fs_all = loadmat(data_directory + '/fs/fs_{}.mat'.format(ptname))
    fs_all = fs_all['fs_all']
    SOZ_chlabels = pd.read_csv(data_directory + '/pt_data/SOZ_channels.csv')
    pt_all = pd.read_csv(data_directory + '/pt_data/ptname_all.csv')
    pt_name = ("'{}'".format(ptname))
    whichpt = pt_all.index[pt_all['ptname'] == pt_name].tolist()
    clean_SOZ_chlabels = prep_clean_soz(SOZ_chlabels)
    spike = Spike(val2, select_spikes, ch_labels, fs_all, clean_SOZ_chlabels[whichpt[0]])
    return spike


class Spike():
    def __init__(self, values, select, chlabels, fs, soz):
        self.values = values
        self.select = select
        self.chlabels = chlabels
        self.fs = fs
        self.soz = soz


spike = load_pt('pt_name', 'data_directory')
pickle.dump(spike, open("spike.pkl", "wb"))
print(pickle.load(open("spike.pkl", "rb")))
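For background, the original error appears because pickle stores classes by their import path (module plus qualified name), and a class created inside a function body has no importable path. A minimal reproduction, with purely illustrative names:

import pickle


def load_thing():
    class Thing:          # created inside the function body
        x = 1
    return Thing          # returns the class object itself, like the original load_pt


class ModuleLevelThing:   # defined at module level, so pickle can find it by name
    x = 1


try:
    pickle.dumps(load_thing())
except (AttributeError, pickle.PicklingError) as err:
    print(err)            # e.g. Can't pickle local object 'load_thing.<locals>.Thing'

print(len(pickle.dumps(ModuleLevelThing())))  # works, the class is importable

This is also why the module-level Spike class in the Edit above is the more robust fix.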
I want to build an inverted index using multiprocessing to speed up the work. My idea is to split the files into groups, have each process build its own inverted index, and then merge all of these indexes into one. But I don't know how to return them to the main process that will merge them.
import multiprocessing as mp
from pathlib import Path
import re
import time


class InvertedIndex:
    def __init__(self):
        self.index = dict()

    def createIndex(self, path='data', threads_num=4):
        pathList = list(Path(path).glob('**/*.txt'))
        fileNum = len(pathList)
        oneProcessNum = fileNum / threads_num
        processes = []
        for i in range(threads_num):
            startIndex = int(i * oneProcessNum)
            endIndex = int((i + 1) * oneProcessNum)
            currLi = pathList[startIndex:endIndex]
            p = mp.Process(target=self.oneProcessTask, args=(currLi,))
            processes.append(p)
        [x.start() for x in processes]
        [x.join() for x in processes]

    @staticmethod
    def oneProcessTask(listOfDoc):
        # print(f'Start: {list[0]}, end: {list[-1]}')  # temp
        tempDict = dict()
        for name in listOfDoc:
            with open(name) as f:
                text = f.read()
                li = re.findall(r'\b\w+\b', text)
                for w in li:
                    if tempDict.get(w) is None:
                        tempDict[w] = set()
                    tempDict[w].add(str(name))

    def getListOfDoc(self, keyWord):
        return self.index[keyWord]


if __name__ == '__main__':
    ii = InvertedIndex()
    start_time = time.time()
    ii.createIndex()
    print("--- %s seconds ---" % (time.time() - start_time))
I used a multiprocessing.Manager to write everything into one dictionary, but that solution was too slow. So I went back to the idea of each process building its own inverted index and then merging them. But I don't know how to return all the indexes to one process.
Take a look at concurrent.futures (standard library) with either ThreadPoolExecutor or ProcessPoolExecutor. FYI: I wrote about that here and did not test the snippet below, but this is more or less the gist of what I use all the time.
from concurrent.futures import ThreadPoolExecutor, as_completed


def foo(stuff: int) -> dict:
    return {}


things_to_analyze = [1, 2, 3]
threads = []
results = []

with ThreadPoolExecutor() as executor:
    for thing in things_to_analyze:
        threads.append(executor.submit(foo, thing))
    for job in as_completed(threads):
        results.append(job.result())
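Since building an index is CPU-bound, the same pattern with ProcessPoolExecutor is probably the better fit here. A hedged sketch, where build_partial_index is a stand-in for the per-group work and not taken from the original post:

from concurrent.futures import ProcessPoolExecutor, as_completed


def build_partial_index(paths: list) -> dict:
    # placeholder for the per-group work; returns {word: set_of_file_names}
    return {}


if __name__ == '__main__':
    groups = [['a.txt'], ['b.txt']]  # illustrative file groups
    futures = []
    partial_indexes = []
    with ProcessPoolExecutor() as executor:
        for group in groups:
            futures.append(executor.submit(build_partial_index, group))
        for job in as_completed(futures):
            partial_indexes.append(job.result())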
I found a solution. I used pool.starmap to return a list of indexes.
My code:
class InvertedIndex:
    def __init__(self):
        self.smallIndexes = None
        self.index = dict()

    def createIndex(self, path='data', threads_num=4):
        pathList = list(Path(path).glob('**/*.txt'))  # recursively walk all text files and collect them in a list
        fileNum = len(pathList)
        oneProcessNum = fileNum / threads_num  # how many files one process should handle
        processes_args = []
        for i in range(threads_num):
            startIndex = int(i * oneProcessNum)
            endIndex = int((i + 1) * oneProcessNum)
            processes_args.append((path, startIndex, endIndex))
        pool = mp.Pool(threads_num)
        self.smallIndexes = pool.starmap(self.oneProcessTask, processes_args)
        self.mergeIndex()

    @staticmethod
    def oneProcessTask(path, startIndex, endIndex):
        pathList = list(Path(path).glob('**/*.txt'))
        listOfDoc = pathList[startIndex:endIndex]
        tempDict = dict()
        for name in listOfDoc:
            with open(name) as f:
                text = f.read()
                li = re.findall(r'\b\w+\b', text)
                for w in li:
                    if tempDict.get(w) is None:
                        tempDict[w] = set()
                    tempDict[w].add(str(name))
        return tempDict
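The mergeIndex method is not shown above; a minimal sketch of what it might look like (my guess, assuming each small index maps a word to a set of file names), to be placed inside InvertedIndex:

    def mergeIndex(self):
        # fold every per-process dictionary into the single final index
        for smallIndex in self.smallIndexes:
            for word, files in smallIndex.items():
                if word not in self.index:
                    self.index[word] = set()
                self.index[word] |= files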
Execution time decreased from 200 seconds (when I used shared memory and Manager().dict()) to 0.8 seconds (when I used pool.starmap).
I've been working for around a week to learn SimPy for a discrete-event simulation I have to run. I've done my best, but I'm just not experienced enough to figure it out quickly. I am dying. Please help.
The system in question goes like this:
order arrives -> resource_1 (there are 2) performs take_order -> order broken into items -> resource_2 (there are 10) performs process_item
My code runs and performs the simulation, but I'm having a lot of trouble getting the queues on the resources to work. Queues do not build up on either resource when I run it, and I cannot find the reason why. I try resource.get_queue and get empty lists. There should absolutely be queues, as the orders arrive faster than they can be processed.
I think it has something to do with the logic for requesting resources, but I can't figure it out. Here's how I've structured the code:
import simpy
import random
import numpy as np

total_items = []
total_a = []
total_b = []
total_c = []
order_Q = []
item_Q = []
skipped_visits = []
order_time_dict = {}
order_time_dict2 = {}
total_order_time_dict = {}
var = []


class System:
    def __init__(self, env, num_resource_1, num_resource_2):
        self.env = env
        self.resource_1 = simpy.Resource(env, num_resource_1)
        self.resource_2 = simpy.Resource(env, num_resource_2)

    def take_order(self, order):
        self.time_to_order = random.triangular(30/60, 60/60, 120/60)
        arrive = self.env.now
        yield self.env.timeout(self.time_to_order)

    def process_item(self, item):
        total_process_time = 0
        current = env.now
        order_num = item[1][0]
        for i in range(1, item[1][1]):
            if 'a' in item[0]:
                total_process_time += random.triangular(.05, 7/60, 1/6)  # bagging time only
                # here edit order time w x
            if 'b' in item[0]:
                total_process_time += random.triangular(.05, .3333, .75)
            if 'c' in item[0]:
                total_process_time += random.triangular(.05, 7/60, 1/6)
        # the following is handling time: getting to station, waiting on car to arrive at window after finished, handing to cust
        total_process_time += random.triangular(.05, 10/60, 15/60)
        item_finish_time = current + total_process_time
        if order_num in order_time_dict2.keys():
            start = order_time_dict2[order_num][0]
            if order_time_dict2[order_num][1] < item_finish_time:
                order_time_dict2[order_num] = (start, item_finish_time)
        else:
            order_time_dict2[order_num] = (current, item_finish_time)
        yield self.env.timeout(total_process_time)


class Order:
    def __init__(self, order_dict, order_num):
        self.order_dict = order_dict
        self.order_num = order_num
        self.order_stripped = {}
        for x, y in list(self.order_dict.items()):
            if x != 'total':
                if y != 0:
                    self.order_stripped[x] = (order_num, y)  # this gives dictionary format {item: (order number, number items)} but only including items in order
        self.order_list = list(self.order_stripped.items())


def generate_order(num_orders):
    print('running generate_order')
    a_demand = .1914 ** 3
    a_stdev = 43.684104
    b_demand = .1153
    b_stdev = 28.507782
    c_demand = .0664
    c_stdev = 15.5562624349
    num_a = abs(round(np.random.normal(a_demand)))
    num_b = abs(round(np.random.normal(b_demand)))
    num_c = abs(round(np.random.normal(c_demand)))
    total = num_orders
    total_a.append(num_a)
    total_b.append(num_b)
    total_c.append(num_c)
    total_num_items = num_a + num_b + num_c
    total_items.append(total_num_items)
    order_dict = {'num_a': num_a, 'num_b': num_b, 'num_c': num_c, 'total': total}
    return order_dict


def order_process(order_instance, system):
    enter_system_at = system.env.now
    print("order " + str(order_instance.order_num) + " arrives at " + str(enter_system_at))
    if len(system.resource_1.get_queue) > 1:
        print("WORKING HERE ******************")
    if len(system.resource_1.get_queue) <= 25:
        with system.resource_1.request() as req:
            order_Q.append(order_instance)
            yield req
            yield env.process(system.take_order(order_instance))
            order_Q.pop()
        enter_workstation_at = system.env.now
        print("order num " + str(order_instance.order_num) + " enters workstation at " + str(enter_workstation_at))
        for item in order_instance.order_list:
            item_Q.append(item)
            with system.resource_2.request() as req:
                yield req
                yield env.process(system.process_item(item))
            if len(system.resource_2.get_queue) > 1:
                var.append(1)
            item_Q.pop()
        leave_workstation_at = system.env.now
        print("Order num " + str(order_instance.order_num) + " leaves at " + str(leave_workstation_at))
        order_time_dict[order_instance.order_num] = leave_workstation_at - enter_workstation_at
        total_order_time_dict[order_instance.order_num] = leave_workstation_at - enter_system_at
    else:
        skipped_visits.append(1)


def setup(env):
    system = System(env, 2, 15)
    order_num = 0
    while True:
        next_order = random.expovariate(3.5)  # where 20 is order arrival mean (lambda)
        yield env.timeout(next_order)
        order_num += 1
        env.process(order_process(Order(generate_order(order_num), order_num), system))


env = simpy.Environment()
env.process(setup(env))
env.run(until=15*60)
print("1: \n", order_time_dict)
I think you are looking at the wrong queue.
The API for getting the queued requests of a resource is just the attribute queue, so try using
len(system.resource_1.queue)
get_queue and put_queue come from the base class and are used to derive new resource classes.
But wait, they are not what any reasonable person would assume, and I find this confusing too. The docs say:
Requesting a resource is modeled as "putting a process' token into the resource", which means that when you call request() the process is put into the put_queue, not the get_queue. And with Resource, release always succeeds immediately, so its queue (which is the get_queue) is always empty.
I think queue is just an alias for the put_queue, but queue is much less confusing.
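To see the attribute in action, here is a small self-contained sketch (not taken from the question's model) where three users compete for a single-slot resource and a monitor process prints the queue length:

import simpy


def user(env, res):
    with res.request() as req:
        yield req             # wait in res.queue until a slot frees up
        yield env.timeout(5)  # hold the resource for 5 time units


def monitor(env, res):
    while True:
        print(f"t={env.now}: in service={res.count}, waiting={len(res.queue)}")
        yield env.timeout(1)


env = simpy.Environment()
res = simpy.Resource(env, capacity=1)
for _ in range(3):
    env.process(user(env, res))
env.process(monitor(env, res))
env.run(until=12)

While requests are pending, len(res.queue) is non-zero, which is the behavior the question was expecting from get_queue.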
I have an instance with a 16-core processor and I have a while loop like the one below,
import os

import numpy as np
from tqdm import tqdm

count = 200000
num = 0
pbar = tqdm(total=count)
lst = []
while num <= count:
    random_folder = os.path.join(path, np.random.choice(os.listdir(path)))
    file_path = os.path.join(path, np.random.choice(os.listdir(random_folder)))
    if not os.path.isdir(file_path):
        lst.append(file_path)
        pbar.update(1)
        num += 1
When I tried to run this code on a server, the estimated time was really long:
0%| | 138/200000 [02:14<51:25:11, 1.08it/s]
I have tried using numpy to get the random choice, but it's still slow. Is there any way I can take advantage of my multi-core CPU and speed up this while loop? It's just collecting random files from subfolders. I really appreciate any help. Thanks.
Update:
path = "/home/user12/pdf_files"
def get_random_file(num_of_files):
count = 0
random_files = []
while count < num_of_files:
random_folder = os.path.join(path, random.choice(os.listdir(path)))
file_path = os.path.join(path, random.choice(os.listdir(random_folder)))
if not os.path.isdir(file_path):
resumes_list.append(file_path)
count += 1
return random_files
with Pool(16) as p:
random_files = p.map(get_random_file, (1000/16,))
You can use multiprocessing and use all the cores at the same time.
See https://docs.python.org/3.8/library/multiprocessing.html
Something like this:
from multiprocessing import Pool


def get_random_file(num_of_files):
    # your logic goes here
    count = 0
    random_files = []
    while count < num_of_files:
        count += 1
        pass  # get a random file and append it to 'random_files'
    return random_files


if __name__ == '__main__':
    with Pool(16) as p:
        num_of_files = [200000 // 16 for i in range(16)]
        random_files = p.map(get_random_file, num_of_files)
        # random_files is a list of lists - you need to merge them into one list
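A possible way to do the merge mentioned in the last comment, e.g. with itertools.chain (a small sketch, not part of the original answer):

from itertools import chain

# random_files comes back as one list per worker; flatten it into a single list
all_files = list(chain.from_iterable(random_files))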
I am using the multiprocessing module in Python to make a function run in parallel.
The function's name is:
Parallel_Solution_Combination_Method(subset, i)
The subset parameter is a list element which is made up of a tuple of chromosomes.
chromosome is a class defined by me within the same script.
I am running on Lubuntu, a Linux-based OS. The code that I'm using to try to run the function in parallel is:
pool = mp.Pool(processes=2)
results = [pool.apply_async(Parallel_Solution_Combination_Method,
                            args=(subsets[i], i,)
                            )
           for i in range(len(subsets))
           ]
However, the problem I'm encountering is that whenever I specify more than one process, the results are not as expected. Let's say I'm passing a list of subsets of size 10 and I'm using:
processes=2
then the first two outputs are exactly the same values, outputs 3 and 4 the same, and so on, whereas if I specify:
processes = 1
which essentially is a sequential run, then the outcome is correct as expected (the same as a normal for loop without multiprocessing).
I don't know why my results are getting mixed up even though I'm explicitly sending a different tuple from the set, specified by the index i of the for loop:
args=(subsets[i],i,)
I am running on hardware with two cores, so I was hoping I could run two instances of the function in parallel, but the outcome is that it produces duplicate results. I can't figure out what I'm doing wrong.
Please help! Thank you.
def Parallel_Solution_Combination_Method(subset, counter):
    print 'entered parallel sol comb'
    child_chromosome = chromosome()
    combination_model_offset = 300
    attempts = 0
    while True:
        template1 = subset[0].record_template
        template2 = subset[1].record_template
        template_child = template1
        template_gap1 = find_allIndices(template1, '-')
        template_gap2 = find_allIndices(template2, '-')
        if(len(template_gap1) != 0 and len(template_gap2) != 0):
            template_gap_difference = find_different_indicies(template_gap1, template_gap2)
            if(len(template_gap_difference) != 0):
                template_slice_point = random.choice(template_gap_difference)
                if(template_gap2[template_slice_point - 1] < template_gap1[template_slice_point]):
                    #swap template1 template2 values as well as their respective gap indices
                    #so that in crossover the gaps would not collide with each other.
                    temp_template = template1
                    temp_gap = template_gap1
                    template1 = template2
                    template2 = temp_template
                    template_gap1 = template_gap2
                    template_gap2 = temp_gap
                #the crossing over takes the first part of the child sequence to be up until
                #the crossing point without including it. this way it ensures that the resulting
                #child sequence is different from both of the parents by at least one point.
                child_template_gap = template_gap1[:template_slice_point] + template_gap2[template_slice_point:]
                child_gap_part1 = child_template_gap[:template_slice_point]
                child_gap_part2 = child_template_gap[template_slice_point:]
                if template_slice_point == 0:
                    template_child = template2
                else:
                    template_child = template1[:template_gap1[template_slice_point]]
                    template_residues_part1 = str(template_child).translate(None, '-')
                    template_residues_part2 = str(template2).translate(None, '-')
                    template_residues_part2 = template_residues_part2[len(template_residues_part1):]
                    for i in range(template_gap1[template_slice_point - 1], len(template1)):
                        if i in child_gap_part2:
                            template_child = template_child + '-'
                        else:
                            template_child = template_child + template_residues_part2[0:1]
                            template_residues_part2 = template_residues_part2[1:]

        target1 = subset[0].record_target
        target2 = subset[1].record_target
        target_child = target1
        target_gap1 = find_allIndices(target1, '-')
        target_gap2 = find_allIndices(target2, '-')
        if(len(target_gap1) != 0 and len(target_gap2) != 0):
            target_gap_difference = find_different_indicies(target_gap1, target_gap2)
            if(len(target_gap_difference) != 0):
                target_slice_point = random.choice(target_gap_difference)
                if(target_gap2[target_slice_point - 1] < target_gap1[target_slice_point]):
                    #swap template1 template2 values as well as their respective gap indices
                    #so that in crossover the gaps would not collide with each other.
                    temp_target = target1
                    temp_gap = target_gap1
                    target1 = target2
                    target2 = temp_target
                    target_gap1 = target_gap2
                    target_gap2 = temp_gap
                #the crossing over takes the first part of the child sequence to be up until
                #the crossing point without including it. this way it ensures that the resulting
                #child sequence is different from both of the parents by at least one point.
                child_target_gap = target_gap1[:target_slice_point] + target_gap2[target_slice_point:]
                child_gap_part1 = child_target_gap[:target_slice_point]
                child_gap_part2 = child_target_gap[target_slice_point:]
                if target_slice_point == 0:
                    target_child = target2
                else:
                    target_child = target1[:target_gap1[target_slice_point]]
                    target_residues_part1 = str(target_child).translate(None, '-')
                    target_residues_part2 = str(target2).translate(None, '-')
                    target_residues_part2 = target_residues_part2[len(target_residues_part1):]
                    for i in range(target_gap1[target_slice_point - 1], len(target1)):
                        if i in child_gap_part2:
                            target_child = target_child + '-'
                        else:
                            target_child = target_child + target_residues_part2[0:1]
                            target_residues_part2 = target_residues_part2[1:]

        if not [False for y in Reference_Set if y.record_template == template_child and y.record_target == target_child] or attempts <= 100:
            break
        attempts += 1

    child_chromosome.record_template = template_child
    #print template_child
    child_chromosome.record_target = target_child
    #print target_child
    generate_PIR(template_header, template_description, child_chromosome.record_template, target_header, target_description, child_chromosome.record_target)
    output_values = start_model(template_id, target_id, 'PIR_input.ali', combination_model_offset + counter)
    child_chromosome.molpdf_score = output_values['molpdf']
    #print output_values['molpdf']
    mdl = complete_pdb(env, '1BBH.B99990' + str(combination_model_offset + counter) + '.pdb')
    child_chromosome.normalized_dope_score = mdl.assess_normalized_dope()
    #print mdl.assess_normalized_dope()
    return child_chromosome
This is the code for Parallel_Solution_Combination_Method; also, in case it becomes handy, I'm including the chromosome class that I defined:
class chromosome():
    """basic solution representation that holds alignments and their evaluations"""
    def __init__(self):
        self.record_template = ''
        self.record_target = ''
        self.molpdf_score = 0.0
        self.ga341_score = 0.0
        self.dope_score = 0.0
        self.normalized_dope_score = 0.0
        self.flag_value = 0
        self.distance_value = 0

    def add_molpdf(self, molpdf):
        self.molpdf_score = molpdf

    def add_ga341(self, ga341):
        self.ga341_score = ga341

    def add_dope(self, dope):
        self.dope_score = dope

    def add_normalized_dope(self, normalized_dope):
        self.normalized_dope_score = normalized_dope

    def add_records(self, records):
        self.seq_records = records
        for rec in self.seq_records:
            if rec.id == template_id:
                self.record_template = rec.seq
            elif rec.id == target_id:
                self.record_target = rec.seq

    def set_flag(self, flag):
        self.flag_value = flag

    def add_distance(self, distance):
        self.distance_value = distance
Please note that all of this is within the same Python script.