I have built a job shop scheduling algorithm using the ortools optimization library for python.
The problem is that when I extend it to a flexible jobshop model with setup times it doesn't work, and I think the issue is in the arcs that I create. If anybody can explain the circuit constraint in more detail, that would help me. By the way, it works when I use a single machine.
Code:
from __future__ import print_function
import collections
from google.protobuf import text_format
# Import Python wrapper for or-tools CP-SAT solver.
from ortools.sat.python import cp_model
# Intermediate solution printer
class SolutionPrinter(cp_model.CpSolverSolutionCallback):
"""Print intermediate solutions."""
def __init__(self):
cp_model.CpSolverSolutionCallback.__init__(self)
self.__solution_count = 0
def on_solution_callback(self):
"""Called after each new solution found."""
print('Solution %i, time = %f s, objective = %i' %
(self.__solution_count, self.WallTime(), self.ObjectiveValue()))
self.__solution_count += 1
def MinimalJobshopSat():
"""Minimal jobshop problem."""
# Create the model.
model = cp_model.CpModel()
jobs_data = [[(0, 2546), (1, 2000), (2, 1400)],
[(0, 1289), (1, 2546), (2, 2546)],
[(0, 2839), (1, 1576), (2, 1200)]
]
setup_times = [
[
[3559, 1638, 2000],
[1442, 3010, 1641],
[1728, 3583, 3243]],
[
[3559, 1638, 2000],
[1442, 3010, 1641],
[1728, 3583, 3243]],
[
[3559, 1638, 2000],
[1442, 3010, 1641],
[1728, 3583, 3243]]
]
all_jobs = range(len(jobs_data))
machines_count = 1 + max(task[0] for job in jobs_data for task in job)
all_machines = range(machines_count)
for machine in all_machines:
for job_id in all_jobs:
min_incoming_setup = min(
setup_times[machine][j][job_id] for j in all_jobs)
if min_incoming_setup == 0:
continue
print('job %i at machine %i has a min incoming setup of %i' %
(job_id, machine, min_incoming_setup))
# We can transfer some setup times to the duration of the job.
jobs_data[job_id][machine] = (machine, jobs_data[job_id][machine][1] + min_incoming_setup)
# Decrease corresponding incoming setup times.
for j in all_jobs:
setup_times[machine][j][job_id] -= min_incoming_setup
# Computes horizon dynamically as the sum of all durations.
horizon = sum(task[1] for job in jobs_data for task in job)
for times in setup_times:
for time in times:
horizon += max(time)
# Named tuple to store information about created variables.
task_type = collections.namedtuple('task_type', 'start end interval')
# Named tuple to manipulate solution information.
assigned_task_type = collections.namedtuple('assigned_task_type',
'start job index duration')
# Creates job intervals and add to the corresponding machine lists.
all_tasks = {}
machine_to_intervals = collections.defaultdict(list)
starts = collections.defaultdict(list)
ends = collections.defaultdict(list)
for job_id, job in enumerate(jobs_data):
for task_id, task in enumerate(job):
machine = task[0]
duration = task[1]
suffix = '_%i_%i' % (job_id, task_id)
start_var = model.NewIntVar(0, horizon, 'start' + suffix)
end_var = model.NewIntVar(0, horizon, 'end' + suffix)
interval_var = model.NewIntervalVar(start_var, duration, end_var,
'interval' + suffix)
all_tasks[job_id, task_id] = task_type(
start=start_var, end=end_var, interval=interval_var)
machine_to_intervals[machine].append(interval_var)
starts[machine].append(start_var)
ends[machine].append(end_var)
# Create and add disjunctive constraints.
for machine in all_machines:
model.AddNoOverlap(machine_to_intervals[machine])
#----------------------------------------------------------------------------
# Transition times using a circuit constraint.
list_arcs = []
for machine in all_machines:
arcs = []
for i in all_jobs:
# Initial arc from the dummy node (0) to a task.
start_lit = model.NewBoolVar('')
arcs.append([0, i + 1, start_lit])
# If this task is the first, set to minimum starting time.
min_start_time = min(0,setup_times[machine][0][i])
model.Add(starts[machine][i] == min_start_time).OnlyEnforceIf(start_lit)
# Final arc from a task to the dummy node.
arcs.append([i + 1, 0, model.NewBoolVar('')])
for j in all_jobs:
if i == j:
continue
lit = model.NewBoolVar('%i_%i follows %i_%i' % (j, machine, i, machine))
arcs.append([i + 1, j + 1, lit])
# We add the reified precedence to link the literal with the times of the
# two tasks.
# If release_dates[j] == 0, we can strengthen this precedence into an
# equality as we are minimizing the makespan.
model.Add(starts[machine][j] >=
ends[machine][i] + setup_times[machine][i][j]).OnlyEnforceIf(lit)
list_arcs.append(arcs)
model.AddCircuit(arcs)
#----------------------------------------------------------------------------
# Precedences inside a job.
for job_id, job in enumerate(jobs_data):
for task_id in range(len(job) - 1):
model.Add(all_tasks[job_id, task_id +
1].start >= all_tasks[job_id, task_id].end)
# Makespan objective.
obj_var = model.NewIntVar(0, horizon, 'makespan')
model.AddMaxEquality(obj_var, [
all_tasks[job_id, len(job) - 1].end
for job_id, job in enumerate(jobs_data)
])
model.Minimize(obj_var)
# Solve model.
solver = cp_model.CpSolver()
solver.parameters.max_time_in_seconds = 60
solution_printer = SolutionPrinter()
status = solver.SolveWithSolutionCallback(model, solution_printer)
print(solver.ResponseStats())
if status == cp_model.OPTIMAL or status == cp_model.FEASIBLE:
# Create one list of assigned tasks per machine.
assigned_jobs = collections.defaultdict(list)
for job_id, job in enumerate(jobs_data):
for task_id, task in enumerate(job):
machine = task[0]
assigned_jobs[machine].append(
assigned_task_type(
start=solver.Value(all_tasks[job_id, task_id].start),
job=job_id,
index=task_id,
duration=task[1]))
# Create per machine output lines.
output = ''
for machine in all_machines:
# Sort by starting time.
assigned_jobs[machine].sort()
sol_line_tasks = 'Machine ' + str(machine) + ': '
sol_line = ' '
for assigned_task in assigned_jobs[machine]:
name = 'job_%i_%i' % (assigned_task.job, assigned_task.index)
# Add spaces to output to align columns.
sol_line_tasks += '%-10s' % name
start = assigned_task.start
duration = assigned_task.duration
sol_tmp = '[%i,%i]' % (start, start + duration)
# Add spaces to output to align columns.
sol_line += '%-10s' % sol_tmp
sol_line += '\n'
sol_line_tasks += '\n'
output += sol_line_tasks
output += sol_line
# Finally print the solution found.
print('Optimal Schedule Length: %i' % solver.ObjectiveValue())
print(output)
if __name__ == '__main__':
MinimalJobshopSat()
If a task is optional, you need to add a self-looping arc on the node that corresponds to this task.
So, assuming task_i has a Boolean presence literal lit_i, you need to add
arcs.append([i + 1, i + 1, lit_i.Not()])
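To make the role of those arcs concrete, here is a minimal, self-contained sketch (all names and data invented, not taken from the code above) of AddCircuit sequencing three tasks on one machine, with a presence literal and a self-loop per task so the circuit can skip absent tasks:
from ortools.sat.python import cp_model

model = cp_model.CpModel()
num_tasks = 3
# One presence literal per task; node 0 is the dummy start/end node and
# node i + 1 stands for task i, matching the indexing used above.
presence = [model.NewBoolVar('presence_%i' % t) for t in range(num_tasks)]
arcs = []
for i in range(num_tasks):
    # Arc from the dummy node to task i (task i is sequenced first).
    arcs.append([0, i + 1, model.NewBoolVar('first_%i' % i)])
    # Arc from task i back to the dummy node (task i is sequenced last).
    arcs.append([i + 1, 0, model.NewBoolVar('last_%i' % i)])
    # Self-loop: when task i is absent, the circuit crosses its node here.
    arcs.append([i + 1, i + 1, presence[i].Not()])
    for j in range(num_tasks):
        if i != j:
            # Literal meaning "task j directly follows task i"; in the real
            # model the setup-time precedence is attached to it with OnlyEnforceIf.
            arcs.append([i + 1, j + 1, model.NewBoolVar('%i_then_%i' % (i, j))])

model.AddCircuit(arcs)
# Force two tasks to be present and leave the third free, just to exercise it.
model.Add(presence[0] == 1)
model.Add(presence[1] == 1)

solver = cp_model.CpSolver()
print(solver.StatusName(solver.Solve(model)))
In a full model, the same presence literal would also be passed to NewOptionalIntervalVar so that the NoOverlap constraint ignores absent tasks.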
I've been working for around a week to learn SimPy for a discrete simulation I have to run. I've done my best, but I'm just not experienced enough to figure it out quickly. I am dying. Please help.
The system in question goes like this:
order arrives -> resource_1 (there are 2) performs take_order -> order broken into items -> resource_2 (there are 10) performs process_item
My code runs and performs the simulation, but I'm having a lot of trouble getting the queues on the resources to function. As in, queues do not build up on either resource when I run it, and I cannot find the reason why. I try resource.get_queue and get empty lists. There should absolutely be queues, as the orders arrive faster than they can be processed.
I think it has something to do with the logic for requesting resources, but I can't figure it out. Here's how I've structured the code:
import simpy
import random
import numpy as np
total_items = []
total_a = []
total_b = []
total_c = []
order_Q = []
item_Q = []
skipped_visits = []
order_time_dict = {}
order_time_dict2 = {}
total_order_time_dict = {}
var = []
class System:
def __init__(self,env,num_resource_1,num_resource_2):
self.env = env
self.resource_1 = simpy.Resource(env,num_resource_1)
self.resource_2 = simpy.Resource(env,num_resource_2)
def take_order(self, order):
self.time_to_order = random.triangular(30/60,60/60,120/60)
arrive = self.env.now
yield self.env.timeout(self.time_to_order)
def process_item(self,item):
total_process_time = 0
current = env.now
order_num = item[1][0]
for i in range(1,item[1][1]):
if 'a' in item[0]:
total_process_time += random.triangular(.05,7/60,1/6) #bagging time only
#here edit order time w x
if 'b' in item[0]:
total_process_time += random.triangular(.05,.3333,.75)
if 'c' in item[0]:
total_process_time += random.triangular(.05,7/60,1/6)
#the following is handling time: getting to station, waiting on car to arrive at window after finished, handing to cust
total_process_time += random.triangular(.05, 10/60, 15/60)
item_finish_time = current + total_process_time
if order_num in order_time_dict2.keys():
start = order_time_dict2[order_num][0]
if order_time_dict2[order_num][1] < item_finish_time:
order_time_dict2[order_num] = (start, item_finish_time)
else:
order_time_dict2[order_num] = (current, item_finish_time)
yield self.env.timeout(total_process_time)
class Order:
def __init__(self, order_dict,order_num):
self.order_dict = order_dict
self.order_num = order_num
self.order_stripped = {}
for x,y in list(self.order_dict.items()):
if x != 'total':
if y != 0:
self.order_stripped[x] = (order_num,y) #this gives dictionary format {item: (order number, number items) } but only including items in order
self.order_list = list(self.order_stripped.items())
def generate_order(num_orders):
print('running generate_order')
a_demand = .1914 ** 3
a_stdev = 43.684104
b_demand = .1153
b_stdev = 28.507782
c_demand = .0664
c_stdev = 15.5562624349
num_a = abs(round(np.random.normal(a_demand)))
num_b = abs(round(np.random.normal(b_demand)))
num_c = abs(round(np.random.normal(c_demand)))
total = num_orders
total_a.append(num_a)
total_b.append(num_b)
total_c.append(num_c)
total_num_items = num_a + num_b + num_c
total_items.append(total_num_items)
order_dict = {'num_a':num_a, 'num_b':num_b,'num_c':num_c, 'total': total}
return order_dict
def order_process(order_instance,system):
enter_system_at = system.env.now
print("order " + str(order_instance.order_num) + " arrives at " + str(enter_system_at))
if len(system.resource_1.get_queue) > 1:
print("WORKING HERE ******************")
if len(system.resource_1.get_queue) <= 25:
with system.resource_1.request() as req:
order_Q.append(order_instance)
yield req
yield env.process(system.take_order(order_instance))
order_Q.pop()
enter_workstation_at = system.env.now
print("order num " + str(order_instance.order_num) + " enters workstation at " + str(enter_workstation_at))
for item in order_instance.order_list:
item_Q.append(item)
with system.resource_2.request() as req:
yield req
yield env.process(system.process_item(item))
if len(system.resource_2.get_queue) >1:
var.append(1)
item_Q.pop()
leave_workstation_at = system.env.now
print("Order num " + str(order_instance.order_num) + " leaves at " + str(leave_workstation_at))
order_time_dict[order_instance.order_num] = leave_workstation_at-enter_workstation_at
total_order_time_dict[order_instance.order_num]=leave_workstation_at-enter_system_at
else:
skipped_visits.append(1)
def setup(env):
system = System(env,2,15)
order_num = 0
while True:
next_order = random.expovariate(3.5) #where 20 is order arrival mean (lambda)
yield env.timeout(next_order)
order_num+=1
env.process(order_process(Order(generate_order(order_num),order_num),system))
env = simpy.Environment()
env.process(setup(env))
env.run(until=15*60)
print("1: \n", order_time_dict)
I think you are looking at the wrong queue.
The API for getting the queued requests of a resource is just the queue attribute, so try using
len(system.resource_1.queue)
get_queue and put_queue come from the base class and are used to derive new resource classes.
But they are not what any reasonable person would assume, and I find this confusing too. The docs say:
"Requesting a resource is modeled as putting a process' token into the resource", which means that when you call request() the process is put into the put_queue, not the get_queue. And with a Resource, a release always succeeds immediately, so its queue (which is the get_queue) is always empty.
I think queue is just an alias for the put_queue, but queue is much less confusing.
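As a quick illustration (a toy model, not the poster's code), the length of resource.queue is the number of requests still waiting, so it grows as soon as arrivals outpace capacity:
import simpy

def customer(env, counter, name):
    with counter.request() as req:
        # queue holds the requests that have not been granted yet.
        print(env.now, name, 'sees a queue of length', len(counter.queue))
        yield req
        yield env.timeout(5)  # service time

env = simpy.Environment()
counter = simpy.Resource(env, capacity=1)
for n in range(3):
    env.process(customer(env, counter, 'customer_%d' % n))
env.run()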
While exploring actor-critic methods, I have been trying to speed up my program using multiprocessing. The code runs fine until the point where I start using processes: it starts, but then never stops after one episode (it keeps running, showing looping behavior). I have been searching for possible errors, and as far as I can tell the fault is due to each process calling its own sub-process (?). That said, I really want to find out how to fix the multiprocessing part so that my program will run, so I would appreciate any help.
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import gym
import time
import random
import multiprocessing as mp
from keras.layers import Reshape, BatchNormalization
from keras.layers.embeddings import Embedding
from collections import deque
# Configuration parameters for the whole setup
gamma = 1 # Discount factor for past rewards
max_steps_per_episode = 2000
env = gym.make("Taxi-v3").env # Create the environment
eps = 1e-6
num_inputs = 1
num_actions = 6
num_hidden = 64
simulated_epsilon = 0
# Actor Policy Network
inputs_1 = layers.Input(shape=(num_inputs,))
embed = layers.Embedding(500, 10, input_length=num_inputs)(inputs_1)
reshape = layers.Reshape((10 * num_inputs, ))(embed)
common = layers.Dense(num_hidden * 2, activation="relu")(reshape)
common = layers.Dense(num_hidden, activation="relu")(common)
action = layers.Dense(num_actions, activation="softmax")(common)
model_1 = keras.Model(inputs=inputs_1, outputs=action)
# Critic Reward Network
inputs_2 = layers.Input(shape=(num_inputs,))
embed_2 = layers.Embedding(500, 10, input_length=num_inputs)(inputs_2)
reshape_2 = layers.Reshape((10, ))(embed_2)
common_2 = layers.Dense(num_hidden * 2, activation="relu")(reshape_2)
common_2 = layers.Dense(num_hidden, activation="relu")(common_2)
critic = layers.Dense(1)(common_2)
model_2 = keras.Model(inputs=inputs_2, outputs=critic)
# Optimizer and Loss Function
optimizer = keras.optimizers.Adam(learning_rate=5e-4)
huber_loss = keras.losses.Huber()
action_probs_history = []
critic_value_history = []
rewards_history = []
running_reward = 0
episode_count = 0
under_20 = 0
model_2.compile(optimizer = keras.optimizers.Adam(learning_rate=5e-4), loss=huber_loss)
def worker(number, env, actor, critic):
optimizer = keras.optimizers.Adam(learning_rate=5e-4)
huber_loss = keras.losses.Huber()
action_probs_history = []
critic_value_history = []
rewards_history = []
running_reward = 0
episode_count = 0
under_20 = 0
while True: # Run until solved
state = env.reset()
episode_reward = 0
penalties = 0
drop = 0
print("Episode {} begins ({})".format(episode_count, number))
env.render()
start = time.time()
time_solve = 0
with tf.GradientTape() as tape_1, tf.GradientTape() as tape_2:
#with tf.GradientTape() as tape:
#while True:
for _ in range(1, max_steps_per_episode + 1):
#env.render() # Adding this line would show the attempts
# of the agent in a pop up window.
state = tf.convert_to_tensor(state)
state = tf.expand_dims(state, 0)
# Predict action probabilities and estimated future rewards
# from environment state
action_probs = actor(state)
critic_value = critic(state)
critic_value_history.append((state, critic_value[0, 0]))
# Choose action
action = np.random.choice(num_actions, p=np.squeeze(action_probs))
action_probs_history.append(tf.math.log(action_probs[0, action])) # action_probs stores log of probs of action
#if timestep == 1:
# print("{}: {}".format(state, action_probs))
# print("{}: {}".format(state, action))
# Apply the sampled action in our environment
state, reward, done, _ = env.step(action)
rewards_history.append(reward)
episode_reward += reward
time_solve += 1
if reward == -10:
penalties += 1
elif reward == 20:
drop += 1
if done:
break
# Update running reward to check condition for solving
running_reward = (running_reward * (episode_count) + episode_reward) / (episode_count + 1)
# Calculate expected value from rewards
# - At each timestep what was the total reward received after that timestep
# - Rewards in the past are discounted by multiplying them with gamma
# - These are the labels for our critic
returns = deque(maxlen=3500)
discounted_sum = 0
for r in rewards_history[::-1]:
discounted_sum = r + gamma * discounted_sum
returns.appendleft(discounted_sum)
# Normalize
#returns = np.array(returns)
returns = (returns - np.mean(returns)) / (np.std(returns) + eps)
#returns = returns.tolist()
# Calculating loss values to update our network
history = zip(action_probs_history, critic_value_history, returns)
loss_value_actor = 0
loss_value_critic = 0
for log_prob, value, ret in history:
diff = ret - value[1]
loss_value_actor += -log_prob * diff
loss_value_critic += huber_loss(tf.expand_dims(value[1], 0), tf.expand_dims(ret, 0))
# Backpropagation
#loss_value_actor /= time_solve
#loss_value_critic /= time_solve
if episode_count % 2 == 1:
grads_1 = tape_1.gradient(loss_value_actor, model_1.trainable_variables)
optimizer.apply_gradients(zip(grads_1, model_1.trainable_variables))
grads_2 = tape_2.gradient(loss_value_critic, model_2.trainable_variables)
optimizer.apply_gradients(zip(grads_2, model_2.trainable_variables))
# Clear the loss and reward history
action_probs_history.clear()
critic_value_history.clear()
rewards_history.clear()
# Copy params
actor.set_weights(model_1.get_weights())
critic.set_weights(model_2.get_weights())
# Log details
end = time.time()
episode_count += 1
if episode_count % 1 == 0:
env.render()
template = "average reward: {:.2f}"
print(template.format(running_reward, episode_count))
print("episode reward: {}".format(episode_reward))
print("Steps taken: {}".format(time_solve))
print("Penalties incurred: {}".format(penalties))
print("Passengers dropped off: {}".format(drop))
print("Time taken: {}".format(end - start))
print()
if running_reward > -50: # Condition to consider the task solved
under_20 += 1
if under_20 > 5:
print("Solved at episode {} !".format(episode_count))
break
num_processes = 5
if __name__ == "__main__":
mp.freeze_support()
envs = []
processes = []
actors = []
critics = []
for i in range(num_processes):
envs.append(gym.make("Taxi-v3").env)
for i in range(num_processes):
t = mp.Process(target=worker, args=(i, envs[i], model_1, model_2))
t.start()
time.sleep(0.5)
processes.append(t)
for process in processes:
process.join()
for process in processes:
process.terminate()
I am conducting a data science project to analyse large volumes of cancer genome data. My computer is relatively underpowered, with a slow CPU and little RAM, so running through all the samples takes far too long.
I have tried removing excess code, replacing for loops with list comprehensions, and using multiprocessing to split up my tasks so they run faster.
import re
import xlrd
import os
import time
from multiprocessing import Pool
import collections
import pandas as pd
if os.path.exists("C:\\Users\\js769\\genomemutations\\Input\\ChromosomesVersion") == True:
print("chromosomes in folder")
else:
os.makedirs("C:\\Users\\js769\\genomemutations\\Input\\ChromosomesVersion")
print(
"Chromosome Folder Created, Please transfer current version of chromosome number base data to new file."
)
if os.path.exists("C:\\Users\\js769\\genomemutations\\Input\\MutationSamples") == True:
print("Add sample data to run.")
else:
os.makedirs("C:\\Users\\js769\\genomemutations\\Input\\MutationSamples")
print("Mutation Sample Folder Created, please add mutation sample data to folder.")
if os.path.exists("C:\\Users\\js769\\genomemutations\\output") == True:
print("3")
else:
os.makedirs("C:\\Users\\js769\\genomemutations\\output")
# Require editing of this so it works both on a mac or windows system. Currently this version suited to mac because of higher processing power.
# Require ability to check to see if error occurs
def Main(Yeram):
import os
import glob
import errno
import shutil
import xlrd
import pandas as pd
import time
import re
import numpy as np
FragmentSize = 10000000 # This is fragment size which is adjustable.
# Code not needed
Position1 = Yeram.vectx
Position2 = Yeram.vecty
samplelist = Yeram.samplelist
dictA = Yeram.dictA
FragmentSize = Yeram.FragmentSize
chromosomesizes = Yeram.chromosomesizes
def chromosomex_mutation_data(
chromosomenumber, mutationlist
): # It selects the correct chromosome mutation point data, then it selects the data before the -. Mutation data in form(12-20)
chromosomexlist = ["0-1"]
for mutationposition in mutationlist:
if mutationposition[0:2] == str(chromosomenumber):
chromosomexlist.append(mutationposition[3:])
elif mutationposition[0:2] == (str(chromosomenumber) + ":"):
chromosomexlist.append(mutationposition[2:])
else:
continue
Puremutationdatapoints = [int(mutationposition.split("-")[0]) for mutationposition in chromosomexlist]
return Puremutationdatapoints
def Dictionary_Of_Fragment_mutation(FragmentSize, MutationData, ChromosomeNumber): #
chromosomes = {} # Dictionary
chromosomesize = chromosomesizes[ChromosomeNumber - 1]
# Opening up specific chromosome data and calculating amount of bases present in chromosome
Number_of_fragments = int(chromosomesize / FragmentSize)
for mutation in MutationData:
for i in range(0, (Number_of_fragments), 1):
a = (
"Chromosome"
+ str(ChromosomeNumber)
+ "Fragment"
+ str(i)
+ ",Basepairs "
+ str(i * FragmentSize + 1)
+ "-"
+ str(i * FragmentSize + FragmentSize)
)
if mutation in range(i * FragmentSize + 1, i * FragmentSize + FragmentSize + 1):
if chromosomes.get(a) == None:
chromosomes.update({a: 1})
else:
b = (chromosomes.get(a)) + 1
chromosomes.update({a: b})
else:
if chromosomes.get(a) == None:
chromosomes.update({a: 0})
else:
continue
return chromosomes # adds
# This adds mutations or no mutation to each fragment for chromosome,makes dicitonaries
def DictionaryRead(FragmentSize, Dict, ChromosomeNumber):
chromosomesize = chromosomesizes[ChromosomeNumber - 1]
Number_of_fragments = int(chromosomesize / FragmentSize)
chromosomefragmentlist = []
for i in range(0, (Number_of_fragments), 1):
a = (
"Chromosome"
+ str(ChromosomeNumber)
+ "Fragment"
+ str(i)
+ ",Basepairs "
+ str(i * FragmentSize + 1)
+ "-"
+ str(i * FragmentSize + FragmentSize)
)
chromosomefragmentlist.append(str(Dict.get((a))))
return chromosomefragmentlist
# This uses dictionary to create list
def forwardpackage2(FragmentSize, PureMutationData):
C = [] # list of data in numerical order 0 = no mutation
for i in range(1, 23, 1):
A = chromosomex_mutation_data(i, PureMutationData) # Purifies Data
B = Dictionary_Of_Fragment_mutation(FragmentSize, A, i) # Constructs Dictionary
C += DictionaryRead(
FragmentSize, B, i
) # Uses constructed Dictionary amd generates list of numbers, each number being a fragment in numerical order.
return C
def Mutationpointdata(Position1, Position2, dictA, FragmentSize): # Require dictA
vectx = Position1
vecty = Position2
Samplesandmutationpoints = []
for i in range(vectx, vecty):
print(samplelist[i])
new = [k for k, v in dictA.items() if int(v) == samplelist[i]]
mutationlist = [excelsheet.cell_value(i, 23) for i in new]
mutationlist.sort()
Samplesandmutationpoints.append(forwardpackage2(FragmentSize, mutationlist))
return Samplesandmutationpoints
# Opening sample data from excel table
return Mutationpointdata(Position1, Position2, dictA, FragmentSize) # yeram to james samples
def ChromosomeSequenceData(ChromosomeNumber): # Formats the chromosome file into readable information
with open(
r"C:\Users\js769\genomemutations\Input\ChromosomesVersion\chr" + str(ChromosomeNumber) + ".fa"
) as text_file:
text_data = text_file.read()
listA = re.sub("\n", "", text_data)
# list2=[z for z in text_data if z!= "\n"]
if ChromosomeNumber < 10:
ChromosomeSequenceData = listA[5:]
else:
ChromosomeSequenceData = listA[6:]
return ChromosomeSequenceData
def basepercentage_single(
i, FragmentSize, ChromosomeSequenceData
): # Creates a list of base percentage known for certain type of chromosome.
sentence = ChromosomeSequenceData[(i * FragmentSize + 1) : (i * FragmentSize + FragmentSize)]
a = sentence.count("N") + sentence.count("n")
c = str(((FragmentSize - a) / FragmentSize) * 100) + "%"
return c
def basepercentage_multiple(
FragmentSize, ChromosomeSequenceData
): # Creates a a list of base percentages known which correspond with the dna fragments for every chromosome.
fragmentamount = int(len(ChromosomeSequenceData) / FragmentSize)
list = [
basepercentage_single(i, FragmentSize, ChromosomeSequenceData) for i in range(0, (fragmentamount), 1)
]
return list
def FragmentEncodedPercentage(
FragmentSize
): # Packages a list of base percentages known which correspond with the dna fragments for every chromosome.
Initial_list = [basepercentage_multiple(FragmentSize, ChromosomeSequenceData(i)) for i in range(1, 23, 1)]
List_of_fragment_encoded_percentages = [item for sublist in Initial_list for item in sublist]
return List_of_fragment_encoded_percentages
def chromosomefragmentlist(
FragmentSize, ChromosomeNumber
): # Creares a list of fragment sizes for a specific chromosome.
chromosomesize = chromosomesizes[ChromosomeNumber - 1]
Number_of_fragments = int(chromosomesize / FragmentSize)
chromosomefragmentlist = []
for i in range(0, (Number_of_fragments), 1):
a = (
"Chromosome"
+ str(ChromosomeNumber)
+ "Fragment"
+ str(i)
+ ",Basepairs "
+ str(i * FragmentSize + 1)
+ "-"
+ str(i * FragmentSize + FragmentSize)
)
chromosomefragmentlist.append(str(((a))))
return chromosomefragmentlist
def GenomeFragmentGenerator(
FragmentSize
): # Creates the genome fragments for all chromosomes and adds them all to a list.
list = [chromosomefragmentlist(FragmentSize, i) for i in range(1, 23, 1)]
A = [item for sublist in list for item in sublist]
return A
def excelcreation(
mutationdata, samplelist, alpha, bravo, FragmentSize, A, B
): # Program runs sample alpha to bravo and then constructs excel table
data = {"GenomeFragments": A, "Encoded Base Percentage": B}
for i in range(alpha, bravo):
data.update({str(samplelist[i]): mutationdata[i]})
df = pd.DataFrame(data, index=A)
export_csv = df.to_csv(
r"C:/Users/js769/genomemutations/output/chromosomeAll.csv", index=None, header=True
)
start_time = time.time()
# Code determine base fragment size
FragmentSize = 1000000
chromosomesizes = [] # This calculates the base pair sizes for each chromosome.
for i in range(1, 23):
with open(r"C:\Users\js769\genomemutations\Input\ChromosomesVersion\chr" + str(i) + ".fa") as text_file:
text_data = text_file.read()
list = re.sub("\n", "", text_data)
if i < 10:
chromosomesizes.append(len(list[5:]))
else:
chromosomesizes.append(len(list[6:]))
wb = xlrd.open_workbook("C:/Users/js769/genomemutations/input/MutationSamples/Complete Sample For lungs.xlsx")
excelsheet = wb.sheet_by_index(0)
excelsheet.cell_value(0, 0)
sampleswithduplicates = [excelsheet.cell_value(i, 5) for i in range(1, excelsheet.nrows)]
samplelist = []
for sample in sampleswithduplicates:
if sample not in samplelist:
samplelist.append(int(sample)) # Constructs list of sample , each sample only comes up once
dictA = {}
counter = 1 # Creates a dictionary where it counts the
for sample in sampleswithduplicates:
dictA.update({counter: int(sample)})
counter = counter + 1
A = GenomeFragmentGenerator(FragmentSize)
B = FragmentEncodedPercentage(FragmentSize)
value = collections.namedtuple(
"value", ["vectx", "vecty", "samplelist", "dictA", "FragmentSize", "chromosomesizes"]
)
SampleValues = (
value(
vectx=0,
vecty=2,
samplelist=samplelist,
dictA=dictA,
FragmentSize=FragmentSize,
chromosomesizes=chromosomesizes,
),
value(
vectx=2,
vecty=4,
samplelist=samplelist,
dictA=dictA,
FragmentSize=FragmentSize,
chromosomesizes=chromosomesizes,
),
value(
vectx=4,
vecty=6,
samplelist=samplelist,
dictA=dictA,
FragmentSize=FragmentSize,
chromosomesizes=chromosomesizes,
),
value(
vectx=6,
vecty=8,
samplelist=samplelist,
dictA=dictA,
FragmentSize=FragmentSize,
chromosomesizes=chromosomesizes,
),
value(
vectx=8,
vecty=10,
samplelist=samplelist,
dictA=dictA,
FragmentSize=FragmentSize,
chromosomesizes=chromosomesizes,
),
value(
vectx=10,
vecty=12,
samplelist=samplelist,
dictA=dictA,
FragmentSize=FragmentSize,
chromosomesizes=chromosomesizes,
),
value(
vectx=12,
vecty=14,
samplelist=samplelist,
dictA=dictA,
FragmentSize=FragmentSize,
chromosomesizes=chromosomesizes,
),
value(
vectx=14,
vecty=16,
samplelist=samplelist,
dictA=dictA,
FragmentSize=FragmentSize,
chromosomesizes=chromosomesizes,
),
)
print("starting multiprocessing")
if __name__ == "__main__":
with Pool(4) as p:
result = p.map(Main, SampleValues)
Allmutationdata = []
for i in result:
for b in i:
Allmutationdata.append(b)
excelcreation(Allmutationdata, samplelist, 0, 16, FragmentSize, A, B)
print("My program took " + str(time.time() - start_time) + " to run")
So the program runs; that isn't the issue. The issue is how long it takes to run. Can anyone spot where my code may be at fault?
The article How to make your pandas loop run 72,000x faster really resonated with me, and I think it will help you.
It provides clear instructions on how to vectorize your for loops to speed them up drastically.
Methods to speed up a for loop:
Utilize pandas iterrows()
~321 times faster
Example
for index, row in dataframe.iterrows():
print(index, row)
Pandas Vectorization
~9280 times faster
Example
df.loc[((col1 == val1) & (col2 == val2)), column_name] = conditional_result
Numpy Vectorization
~72,000 times faster
Example
df.loc[((col1.values == val1) & (col2.values == val2)), column_name] = conditional_result
By adding .values we receive a numpy array.
Credit for the timing results goes to this article
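For a self-contained comparison (column names and the condition are made up purely for illustration), the same conditional update can be written in all three styles:
import numpy as np
import pandas as pd

df = pd.DataFrame({'col1': np.random.randint(0, 2, 10000),
                   'col2': np.random.randint(0, 2, 10000),
                   'result': 0})

# 1. iterrows: a Python-level loop over the rows (slowest).
for index, row in df.iterrows():
    if row['col1'] == 1 and row['col2'] == 1:
        df.at[index, 'result'] = 1

# 2. Pandas vectorization: one boolean mask over whole Series.
df.loc[(df['col1'] == 1) & (df['col2'] == 1), 'result'] = 1

# 3. NumPy vectorization: the same mask computed on the raw arrays via .values.
df.loc[(df['col1'].values == 1) & (df['col2'].values == 1), 'result'] = 1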
I have some code (this is not the full file):
chunk_list = []
def makeFakeTransactions(store_num, num_transactions):
global chunk_list
startTime = datetime.now()
data_load_datetime = startTime.isoformat()
data_load_name = "Faked Data v2.2"
data_load_path = "data was faked"
index_list = []
number_of_stores = store_num + 10
number_of_terminals = 13
for month in range(1, 13):
number_of_days = 30
extra_day_months = [1, 3, 5, 7, 8, 10, 12]
if month == 2:
number_of_days = 28
elif month in extra_day_months:
number_of_days = 31
for day in range(1, number_of_days + 1):
for store in range(store_num, number_of_stores):
operator_id = "0001"
operator_counter = 1
if store < 11:
store_number = "0000" + str(store)
else:
store_number = "000" + str(store)
for terminal in range(1, number_of_terminals + 1):
if terminal < 10:
terminal_id = str(terminal) + "000"
else:
terminal_id = str(terminal) + "00"
transaction_type = "RetailTransaction"
transaction_type_code = "Transaction"
transaction_date = date(2015, month, day)
transaction_date_str = transaction_date.isoformat()
transaction_time = time(random.randint(0, 23), random.randint(0, 59))
transaction_datetime = datetime.combine(transaction_date, transaction_time)
transaction_datetime_str = transaction_datetime.isoformat()
max_transactions = num_transactions
for transaction_number in range (0, max_transactions):
inactive_time = random.randint(80, 200)
item_count = random.randint(1, 15)
sequence_number = terminal_id + str(transaction_number)
transaction_datetime = transaction_datetime + timedelta(0, ring_time + special_time + inactive_time)
transaction_summary = {}
transaction_summary["transaction_type"] = transaction_type
transaction_summary["transaction_type_code"] = transaction_type_code
transaction_summary["store_number"] = store_number
transaction_summary["sequence_number"] = sequence_number
transaction_summary["data_load_path"] = data_load_path
index_list.append(transaction_summary.copy())
operator_counter += 10
operator_id = '{0:04d}'.format(operator_counter)
chunk_list.append(index_list)
if __name__ == '__main__':
store_num = 1
process_number = 6
num_transactions = 10
p = multiprocessing.Pool(process_number)
results = [p.apply(makeFakeTransactions, args = (store_num, num_transactions,)) for store_num in xrange(1, 30, 10)]
results = [p.apply(elasticIndexing, args = (index_list,)) for index_list in chunk_list]
I have a global variable chunk_list that gets appended to at the end of my makeFakeTransactions function and basically it's a list of lists. However, when I do a test print of chunk_list after the 3 processes for makeFakeTransactions, the chunk_list shows up empty, even though it should've been appended to 3 times. Am I doing something wrong regarding global list variables in multiprocessing? Is there a better way to do this?
Edit: makeFakeTransactions appends a dictionary copy to index_list and once all the dictionaries are appended to index_list, it appends index_list to the global variable chunk_list.
First, your code isn't actually running in parallel. According to the docs, p.apply blocks until the call is complete, so you are running your tasks sequentially on the process pool. You need to use p.apply_async (or p.map_async) to kick off the tasks without waiting for each one to complete.
Second, as was said in a comment, global state isn't shared between processes. You can use shared memory, but in this case it is much simpler to just transfer the result back from the worker process. Since you don't use chunk_list for anything other than collecting the result, you can just send the result back after computation and collect them on the calling process. This is easy using multiprocessing.Pool, you just return the result from your worker function:
return index_list
This will make p.apply() return index_list. p.apply_async() will return an AsyncResult that will return index_list with AsyncResult.get(). Since you're already using list comprehension, the modifications are small:
p = multiprocessing.Pool(process_number)
async_results = [p.apply_async(makeFakeTransactions, args = (store_num, num_transactions,)) for store_num in xrange(1, 30, 10)]
results = [ar.get() for ar in async_results]
You can simplify it down to one step by using p.map, which effectively does what those previous two lines do. Note that p.map blocks until all results are available.
p = multiprocessing.Pool(process_number)
results = p.map(functools.partial(makeFakeTransactions, num_transactions=num_transactions), xrange(1, 30, 10))
p.map expects a single-argument callable that it can pickle and send to the workers, so a plain lambda will not work here; functools.partial (with num_transactions bound as a keyword, after import functools) does the wrapping instead.
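For reference, a tiny self-contained sketch of that pattern (the worker function and numbers here are made up):
import functools
import multiprocessing

def make_batch(store_num, num_transactions):
    # Stand-in for makeFakeTransactions: return something per store.
    return [(store_num, t) for t in range(num_transactions)]

if __name__ == '__main__':
    worker = functools.partial(make_batch, num_transactions=3)
    pool = multiprocessing.Pool(2)
    print(pool.map(worker, range(1, 30, 10)))  # store_num takes 1, 11, 21
    pool.close()
    pool.join()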
There is documentation on how to produce a graph after running a canvas job in Celery.
However I'd like to generate a graph before I run the job.
Say I created a simple chain:
c = chain(add.s(1, 2), mul.s(4))
How can I generate a graph of the chain?
Thanks,
Miki
I had the exact same desire: generate the graph before running the job. So I worked a bit on it :)
It appears that celery does not allow it. The reason (at least as I understood it when trying to do this) is that each node in the graph has to have a unique name. Once the canvas is executed, this unique name is the celery task_id, but before execution there is nothing that allows such a distinction.
So the solution is to generate this graph yourself and, of course, identify each node uniquely (a simple counter can do the work).
This is the job of this function:
# -*- coding: utf-8 -*-
from celery.canvas import chain, group, Signature
def analyze_canvas(canvas):
return _analyze_canvas(canvas)['dependencies']
def _analyze_canvas(canvas, previous=[], i=0):
dependencies = []
if isinstance(canvas, chain):
for t in canvas.tasks:
if not (isinstance(t, group) or isinstance(t, chain)):
n = str(t) + " - (" + str(i) + ")"
i += 1
dependencies.append((n, previous))
previous = [n]
else:
analysis = _analyze_canvas(t, previous, i)
dependencies.extend(analysis['dependencies'])
previous = analysis['previous']
elif isinstance(canvas, group):
new_previous = []
for t in canvas.tasks:
if not (isinstance(t, group) or isinstance(t, chain)):
n = str(t) + " - (" + str(i) + ")"
i += 1
dependencies.append((n, previous))
new_previous.append(n)
else:
analysis = _analyze_canvas(t, previous, i)
dependencies.extend(analysis['dependencies'])
new_previous = analysis['previous']
previous = new_previous
elif isinstance(canvas, Signature):
n = str(canvas) + " - (" + str(i) + ")"
i += 1
dependencies.append((n, previous))
previous = [n]
return {"dependencies": dependencies,
"previous": previous}
It generates the dependency graph of your canvas. The idea is just to iterate over the tasks of the canvas and identify groups/chains/Signatures to generate the right dependencies.
From this point you can use some more celery utils to generate the dot file. Here is a small usage example:
from celery_util import analyze_canvas
from celery.datastructures import DependencyGraph
from celery import Celery, group
app = Celery()
@app.task
def t1():
pass
@app.task
def t2():
pass
canvas = t1.si() | t2.si() | group(t1.si(), t1.si(), t2.si()) | t2.si()
d = analyze_canvas(canvas)
dg = DependencyGraph(it=d)
pipo = open("pipo.dot", "w+")
dg.to_dot(pipo)
In this example I just declare dummy tasks and chain/group them into a pretty canvas. I use the celery util DependencyGraph to get the object representation and the ability to dump the graph in dot format, which I do with the to_dot method.
And the beautiful result is:
I have updated the code from https://stackoverflow.com/a/29105701/928489 to work with celery4. It works with chain, group and chord.
from celery.canvas import _chain, group, chord
def analyze_canvas(canvas):
return _analyze_canvas(canvas)[0]
def _analyze_canvas(canvas, previous=[], i=0):
dependencies = []
if isinstance(canvas, _chain):
for i, t in enumerate(canvas.tasks, i):
dep, previous = _analyze_canvas(t, previous, i)
dependencies.extend(dep)
elif isinstance(canvas, group) or isinstance(canvas, chord):
new_previous = []
for i, t in enumerate(canvas.tasks, i):
dep, p = _analyze_canvas(t, previous, i)
dependencies.extend(dep)
new_previous.extend(p)
if isinstance(canvas, chord):
dep, p = _analyze_canvas(canvas.body, new_previous, i)
return dependencies + dep, p
else:
t = canvas.name + " - (" + str(i) + ")"
dependencies = [(t, previous)]
previous = [t]
return dependencies, previous
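Usage is the same as in the answer above; the only caveat (my assumption about Celery 4, worth checking against your version) is that DependencyGraph now lives in celery.utils.graph rather than celery.datastructures:
from celery import Celery, group
from celery.utils.graph import DependencyGraph  # moved here in Celery 4

app = Celery()

@app.task
def t1():
    pass

@app.task
def t2():
    pass

canvas = t1.si() | t2.si() | group(t1.si(), t1.si(), t2.si()) | t2.si()
# analyze_canvas is the celery4 version defined above.
with open("pipo.dot", "w+") as pipo:
    DependencyGraph(it=analyze_canvas(canvas)).to_dot(pipo)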