Any way to speedup itertool.product - python

I am using itertools.product to find the possible weights an asset can take given that the sum of all weights adds up to 100.
min_wt = 10
max_wt = 50
step = 10
nb_Assets = 5
weight_mat = []
for i in itertools.product(range(min_wt, (max_wt+1), step), repeat=nb_Assets):
    if sum(i) == 100:
        weight = [i]
        if np.shape(weight_mat)[0] == 0:
            weight_mat = weight
        else:
            weight_mat = np.concatenate((weight_mat, weight), axis=0)
The above code works, but it is too slow because it goes through combinations that are not acceptable, for example [50, 50, 50, 50, 50], eventually testing 3125 combinations instead of the 121 possible ones. Is there any way to add the 'sum' condition within the loop to speed things up?

Many improvements are possible.
For starters, the search space can be reduced using itertools.combinations_with_replacement() because summation is commutative.
Also, the last addend should be computed rather than tested. For example, if t[:4] was (10, 20, 30, 35), you could compute t[4] as 100 - sum(t[:4]), giving a value of 5. This gives a speed-up proportional to the number of candidate values of x that would otherwise be tried in (10, 20, 30, 35, x).
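As a rough illustration of both suggestions (my sketch, not code from this answer): enumerate only the first nb_assets - 1 weights with combinations_with_replacement and derive the final one; requiring the derived weight to be at least as large as the others keeps each multiset exactly once. The grid check and that final filter are my additions to make the counting well defined.
import itertools

min_wt, max_wt, step, nb_assets, target = 10, 50, 10, 5, 100
weights = range(min_wt, max_wt + 1, step)

results = []
for head in itertools.combinations_with_replacement(weights, nb_assets - 1):
    last = target - sum(head)  # compute the final addend instead of looping over it
    on_grid = min_wt <= last <= max_wt and (last - min_wt) % step == 0
    if on_grid and last >= head[-1]:  # keep each multiset once, in sorted order
        results.append(head + (last,))

print(len(results))  # 6 distinct multisets; their permutations are the 121 ordered tuples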

You can write up a recursive algorithm for that which prunes all the impossible options early on:
def make_weight_combs(min_wt, max_wt, step, nb_assets, req_wt):
    weights = range(min_wt, max_wt + 1, step)
    current = []
    yield from _make_weight_combs_rec(weights, nb_assets, req_wt, current)

def _make_weight_combs_rec(weights, nb_assets, req_wt, current):
    if nb_assets <= 0:
        yield tuple(current)
    else:
        # Discard weights that cannot possibly be used
        while weights and weights[0] + weights[-1] * (nb_assets - 1) < req_wt:
            weights = weights[1:]
        while weights and weights[-1] + weights[0] * (nb_assets - 1) > req_wt:
            weights = weights[:-1]
        # Add all possible weights
        for w in weights:
            current.append(w)
            yield from _make_weight_combs_rec(weights, nb_assets - 1, req_wt - w, current)
            current.pop()

min_wt = 10
max_wt = 50
step = 10
nb_assets = 5
req_wt = 100
for comb in make_weight_combs(min_wt, max_wt, step, nb_assets, req_wt):
    print(comb, sum(comb))
Output:
(10, 10, 10, 20, 50) 100
(10, 10, 10, 30, 40) 100
(10, 10, 10, 40, 30) 100
(10, 10, 10, 50, 20) 100
(10, 10, 20, 10, 50) 100
(10, 10, 20, 20, 40) 100
(10, 10, 20, 30, 30) 100
(10, 10, 20, 40, 20) 100
...
If order of the weights does not matter (so, for example, (10, 10, 10, 20, 50) and (50, 20, 10, 10, 10) are the same), then you can modify the for loop as follows:
for i, w in enumerate(weights):
    current.append(w)
    yield from _make_weight_combs_rec(weights[i:], nb_assets - 1, req_wt - w, current)
    current.pop()
Which gives the output:
(10, 10, 10, 20, 50) 100
(10, 10, 10, 30, 40) 100
(10, 10, 20, 20, 40) 100
(10, 10, 20, 30, 30) 100
(10, 20, 20, 20, 30) 100
(20, 20, 20, 20, 20) 100

Comparing performance of the offered solutions:
import itertools
import timeit
import numpy as np

# original code from question
def f1():
    min_wt = 10
    max_wt = 50
    step = 10
    nb_assets = 5
    weight_mat = []
    for i in itertools.product(range(min_wt, (max_wt+1), step), repeat=nb_assets):
        if sum(i) == 100:
            weight = [i, ]
            if np.shape(weight_mat)[0] == 0:
                weight_mat = weight
            else:
                weight_mat = np.concatenate((weight_mat, weight), axis=0)
    return weight_mat

# code from question using list instead of numpy array
def f1b():
    min_wt = 10
    max_wt = 50
    step = 10
    nb_assets = 5
    weight_list = []
    for i in itertools.product(range(min_wt, (max_wt+1), step), repeat=nb_assets):
        if sum(i) == 100:
            weight_list.append(i)
    return weight_list

# calculating the last element of each tuple
def f2():
    min_wt = 10
    max_wt = 50
    step = 10
    nb_assets = 5
    weight_list = []
    for i in itertools.product(range(min_wt, (max_wt+1), step), repeat=nb_assets-1):
        the_sum = sum(i)
        if the_sum < 100:
            last_elem = 100 - the_sum
            if min_wt <= last_elem <= max_wt:
                weight_list.append(i + (last_elem, ))
    return weight_list

# recursive solution from user kaya3 (https://stackoverflow.com/a/58823843/9225671)
def constrained_partitions(n, k, min_w, max_w, w_step=1):
    if k < 0:
        raise ValueError('Number of parts must be at least 0')
    elif k == 0:
        if n == 0:
            yield ()
    else:
        for w in range(min_w, max_w+1, w_step):
            for p in constrained_partitions(n-w, k-1, min_w, max_w, w_step):
                yield (w,) + p

def f3():
    return list(constrained_partitions(100, 5, 10, 50, 10))

# recursive solution from user jdehesa (https://stackoverflow.com/a/58823990/9225671)
def make_weight_combs(min_wt, max_wt, step, nb_assets, req_wt):
    weights = range(min_wt, max_wt + 1, step)
    current = []
    yield from _make_weight_combs_rec(weights, nb_assets, req_wt, current)

def _make_weight_combs_rec(weights, nb_assets, req_wt, current):
    if nb_assets <= 0:
        yield tuple(current)
    else:
        # Discard weights that cannot possibly be used
        while weights and weights[0] + weights[-1] * (nb_assets - 1) < req_wt:
            weights = weights[1:]
        while weights and weights[-1] + weights[0] * (nb_assets - 1) > req_wt:
            weights = weights[:-1]
        # Add all possible weights
        for w in weights:
            current.append(w)
            yield from _make_weight_combs_rec(weights, nb_assets - 1, req_wt - w, current)
            current.pop()

def f4():
    return list(make_weight_combs(10, 50, 10, 5, 100))
I tested these functions using timeit like this:
print(timeit.timeit('f()', 'from __main__ import f1 as f', number=100))
The results using the parameters from the question:
# min_wt = 10
# max_wt = 50
# step = 10
# nb_assets = 5
0.07021828400320373 # f1 - original code from question
0.041302188008558005 # f1b - code from question using list instead of numpy array
0.009902548001264222 # f2 - calculating the last element of each tuple
0.10601829699589871 # f3 - recursive solution from user kaya3
0.03329997700348031 # f4 - recursive solution from user jdehesa
If I expand the search space (reduced step and increased assets):
# min_wt = 10
# max_wt = 50
# step = 5
# nb_assets = 6
7.6620834979985375 # f1 - original code from question
7.31425816299452 # f1b - code from question using list instead of numpy array
0.809070186005556 # f2 - calculating the last element of each tuple
14.88188026699936 # f3 - recursive solution from user kaya3
0.39385621099791024 # f4 - recursive solution from user jdehesa
Seems like f2 and f4 are the fastest (for the tested size of the data).

Let's generalise this problem; you want to iterate over k-tuples whose sum is n, and whose elements are within range(min_w, max_w+1, w_step). This is a kind of integer partitioning problem, with some extra constraints on the size of the partition and the sizes of its components.
To do this, we can write a recursive generator function; for each w in the range, the remainder of the tuple is a (k - 1)-tuple whose sum is (n - w). The base case is a 0-tuple, which is possible only if the required sum is 0.
As Raymond Hettinger notes, you can also improve the efficiency when k = 1 by just testing whether the required sum is one of the allowed weights.
def constrained_partitions(n, k, min_w, max_w, w_step=1):
    if k < 0:
        raise ValueError('Number of parts must be at least 0')
    elif k == 0:
        if n == 0:
            yield ()
    elif k == 1:
        if n in range(min_w, max_w+1, w_step):
            yield (n,)
    elif min_w*k <= n <= max_w*k:
        for w in range(min_w, max_w+1, w_step):
            for p in constrained_partitions(n-w, k-1, min_w, max_w, w_step):
                yield (w,) + p
Usage:
>>> for p in constrained_partitions(5, 3, 1, 5, 1):
... print(p)
...
(1, 1, 3)
(1, 2, 2)
(1, 3, 1)
(2, 1, 2)
(2, 2, 1)
(3, 1, 1)
>>> len(list(constrained_partitions(100, 5, 10, 50, 10)))
121
Whenever you're iterating over all solutions to some sort of combinatorial problem, it's generally best to generate actual solutions directly, rather than generate more than you need (e.g. with product or combinations_with_replacement) and reject the ones you don't want. For larger inputs, the vast majority of time would be spent generating solutions which will get rejected, due to combinatorial explosion.
Note that if you don't want repeats in different orders (e.g. 1, 1, 3 and 1, 3, 1), you can change the recursive call to constrained_partitions(n-w, k-1, min_w, w, w_step) to only generate partitions where the weights are in non-increasing order.
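For concreteness, here is what that variant could look like (a sketch based on the description above, not separately posted code):
def constrained_partitions_nonincreasing(n, k, min_w, max_w, w_step=1):
    # identical to constrained_partitions, except the recursive call caps the
    # next part at w, so only non-increasing tuples are generated
    if k < 0:
        raise ValueError('Number of parts must be at least 0')
    elif k == 0:
        if n == 0:
            yield ()
    elif k == 1:
        if n in range(min_w, max_w+1, w_step):
            yield (n,)
    elif min_w*k <= n <= max_w*k:
        for w in range(min_w, max_w+1, w_step):
            for p in constrained_partitions_nonincreasing(n-w, k-1, min_w, w, w_step):
                yield (w,) + p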

Note that when you have N weights that sum to 100 and you have chosen N - 1 of them, the remaining weight is already determined as 100 minus the sum of the chosen weights, and it must itself be a valid weight. The same limitation applies after any number of already chosen weights.
Next, you don't want combinations that are just permutations of the same weights. That is why you can order the weights by value and require each next weight in the combination to be less than or equal to the previous one.
This immediately makes the search space much smaller, and you can abandon a particular branch of the search earlier.
Writing it with explicit loops first, or as a recursive algorithm, is probably the easiest way to understand and implement it.
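As an illustration of this answer's idea (my sketch, not the answerer's code): pick weights in non-increasing order, compute the final weight directly, and abandon a branch as soon as the remaining sum can no longer be reached.
def ordered_combos(remaining, parts, max_allowed, min_wt=10, step=10):
    if parts == 1:
        # the last weight is forced; keep it only if it lies on the allowed grid
        if min_wt <= remaining <= max_allowed and (remaining - min_wt) % step == 0:
            yield (remaining,)
        return
    # largest weight that still leaves room for the remaining parts
    w = min(max_allowed, remaining - min_wt * (parts - 1))
    w -= (w - min_wt) % step  # snap down onto the weight grid
    while w >= min_wt and w * parts >= remaining:  # prune impossible branches early
        for rest in ordered_combos(remaining - w, parts - 1, w, min_wt, step):
            yield (w,) + rest
        w -= step

print(list(ordered_combos(100, 5, 50)))  # the 6 non-increasing combinations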

LP Programming with python using Pulp library

I have a linear programming problem where I need to minimise the cost of manufacturing a number of items over a span of n months. Xi is the variable for the amount of items manufactured in month i. Now I want to include a constraint where, if Xi > 0, a number A is added to the objective function.
Obviously this can't be done with a boolean expression inside a for loop, for example, since Xi is a class object from the pulp library. Does anybody know how to help me?
Docplex is not working.
Thank you so much.
x = [LpVariable(name=f"x{i}", lowBound=0) for i in range(0, 12)]

# standards
manufacturing_time_per_unit = 1/3
cost_of_hour = 12
storage_cost_per_unit = 3

# these are monthly
cost_of_raw_materials_per_unit = [11, 10, 13, 9, 8, 7,
                                  10, 12, 12, 10, 9]
demand = [150, 200, 100, 300, 200,
          400, 300, 250, 150, 200, 300, 350]
avalaible_hours = [250, 250, 200, 150, 200, 200,
                   150, 200, 250, 150, 150, 200]

cost_sum = 0
stored = [100]
for i in range(1, 13):
    cost_constraint = manufacturing_time_per_unit*x[i-1] <= avalaible_hours[i-1]
    model += cost_constraint
    demand_constraint = x[i-1] + stored >= demand[i-1]
    model += demand_constraint
    stored.append(x[i-1] + stored - demand[i-1])
    cost_sum += manufacturing_time_per_unit*x[i-1] + stored[i-1]*storage_cost_per_unit
    storage_constraint = x[i-1] != 0
    # if x[i-1] > 0:
    #     cost_sum += 1000

model += cost_sum
model.solve()
Add a binary variable y[i] with y[i]=0 => x[i]=0. (This implies x[i]>0 => y[i]=1.) I.e.
min sum(i, 1000*y[i])
x[i] <= U*y[i]
x[i] >= 0
y[i] ∈ {0,1}
Here U is an upper bound on x[i].
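In PuLP terms, that construction could look roughly like the sketch below (my code, not the answerer's; the bound U and the fixed cost A are placeholders, and the rest of the original model is omitted).
from pulp import LpProblem, LpVariable, LpMinimize, LpBinary, lpSum

n_months = 12
U = 1000           # assumed upper bound on monthly production x[i]
A = 1000           # fixed cost incurred in any month with x[i] > 0

model = LpProblem("fixed_cost_sketch", LpMinimize)
x = [LpVariable(f"x{i}", lowBound=0) for i in range(n_months)]
y = [LpVariable(f"y{i}", cat=LpBinary) for i in range(n_months)]

# Linking constraints: y[i] = 0 forces x[i] = 0, so x[i] > 0 implies y[i] = 1
for i in range(n_months):
    model += x[i] <= U * y[i]

# The fixed cost enters the objective through y, never through "if x[i] > 0"
model += lpSum(A * y[i] for i in range(n_months))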

How to properly index scenarios in a two stage stochastic programming in Pyomo?

Main question:
How to define the wind scenario parameter so that it can provide a different wind scenario for each probability in the scenario set 'S'.
I am running the stochastic code with scenario set S = [1, 2, 3, ..., 10] and corresponding probabilities Prob = [0.1, 0.1, ..., 0.1], over timesteps t = [1, 2, ..., 48] with corresponding wind speeds m.wind.
I tried the following options to create a (s*t) matrix for the objective function below.
Objective function
Option 1:
m.wind = {dict: 48}; this is the dict with the wind speeds at 48 timesteps used in the deterministic model.
Option 2:
m.wind = {ndarray: (10, 48)} = array([[...], ..., [...]]); here I constructed an array of wind speeds at 48 timesteps for each of the 10 scenarios.
Option 3:
m.windd = {list: 10} = array[(...), ..., (...)]; in this option I put the different scenarios between parentheses, after having read that Pyomo sometimes doesn't recognize square brackets.
Option 4:
The last method for constructing a scenario set for wind was by creating a (s*t) dictionary.
All options resulted in the error:
ERROR: index '0' not valid for indexed component 'wind'
Do you know how this error can be resolved and how the wind should be properly indexed?
def build_model(price_data, horizon_length, scenario_length, load_calc, park_calc):
    m = pyo.ConcreteModel()

    ### BEGIN SOLUTION
    # test vector
    vector = np.array([0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1])
    vector = vector.reshape(10, 1)

    ## Sets
    # Save the number of timesteps
    m.N = horizon_length
    m.S = len(scenario_length)
    # Define the horizon set starting at hour 1 until horizon length +1
    m.HORIZON = pyo.Set(initialize=range(1, m.N + 1))
    # Define scenario set
    m.SCENARIO = pyo.Set(initialize=range(1, m.S + 1))

    ## Parameters
    # Round trip efficiency
    m.teta = pyo.Param(initialize=0.95)
    # Energy [MWh] in battery at t=0
    m.E0 = pyo.Param(initialize=2.0, mutable=True)
    # Guarantee of origin for local wind [€/MWh]
    m.goNL = pyo.Param(initialize=5)
    # Guarantee of origin for grid power [€/MWh]
    m.goBE = pyo.Param(initialize=150)
    # Maximum discharge power
    m.d_max = pyo.Param(initialize=5)
    # Maximum charge power
    m.c_max = pyo.Param(initialize=5)
    # Maximum export power
    m.im_max = pyo.Param(initialize=10)
    # Maximum import power
    m.ex_max = pyo.Param(initialize=100)

    ## CREATE DICTS FOR DATA: Price, Load & Calc
    # Create empty dictionary
    price_data_dict = {}
    # Loop over Price data elements of numpy array
    for i in range(0, len(price_data)):
        # Add element to data_dict
        price_data_dict[i + 1] = price_data[i]

    # Create empty dictionary
    load_data_dict = {}
    # Loop over Load data elements of numpy array
    for i in range(0, len(load_calc)):
        # Add element to data_dict
        load_data_dict[i + 1] = load_calc[i]

    # Create empty dictionary
    park_data_dict = {}
    # Loop over Wind park data elements of numpy array
    for i in range(0, len(park_calc)):
        # Add element to data_dict
        park_data_dict[i + 1] = park_calc[i]

    # Create empty dictionary
    prob_dict = {}
    # Loop over probability data elements of numpy array
    for i in range(0, len(vector)):
        # Add element to prob_dict
        prob_dict[i + 1] = vector[i]

    # Repeat the wind data to a matrix for 10 similar scenario's
    wind_matrix = np.tile(park_calc, (10, 1))
    # wind_matrix = np.tile(park_calc, (10, 1)) * vector
    park_data_dict_2 = {1: park_data_dict, 2: park_data_dict, 3: park_data_dict, 4: park_data_dict, 5: park_data_dict,
                        6: park_data_dict, 7: park_data_dict, 8: park_data_dict, 9: park_data_dict, 10: park_data_dict}

    # Price data
    m.price = pyo.Param(m.HORIZON, initialize=price_data_dict, domain=pyo.Reals, mutable=True)
    # Load data
    m.Load = pyo.Param(m.HORIZON, initialize=load_data_dict, domain=pyo.Reals, mutable=True)
    # Wind park data
    m.wind = pyo.Param(m.SCENARIO, m.HORIZON, initialize=park_data_dict_2, mutable=True)  # park_data_dict
    # Scenario probability
    m.prob = pyo.Param(m.SCENARIO, initialize=vector)  # Was Scen_prob
    # # New description of wind in 10 different scenarios
    # m.wind = pyo.Param(m.SCENARIO, m.HORIZON, initialize=wind_matrix_2)  # initialize=wind_matrix_2

    ## Variables
    ## Battery related variables
    # Charging rate [MW]
    m.c = pyo.Var(m.HORIZON, initialize=0.0, bounds=(0, 10), domain=pyo.NonNegativeReals)
    # Discharging rate [MW]
    m.d = pyo.Var(m.HORIZON, initialize=0.0, bounds=(0, 10), domain=pyo.NonNegativeReals)
    # Battery power
    m.Bat = pyo.Var(m.HORIZON, initialize=0.0, domain=pyo.NonNegativeReals)
    # Binary variables charging and grid
    m.u = pyo.Var(m.HORIZON, initialize=0.0, domain=pyo.Binary)
    m.v = pyo.Var(m.HORIZON, initialize=0.0, domain=pyo.Binary)
    # Energy (state-of-charge) [MWh]
    m.E = pyo.Var(m.HORIZON, initialize=2.0, bounds=(0, 5), domain=pyo.NonNegativeReals)
    m.G_im = pyo.Var(m.HORIZON, initialize=0, bounds=(0, 10), domain=pyo.NonNegativeReals)
    m.G_ex = pyo.Var(m.HORIZON, initialize=0, bounds=(0, 100), domain=pyo.NonNegativeReals)
    m.grid = pyo.Var(m.HORIZON, initialize=m.Load, bounds=(0, 10), domain=pyo.NonNegativeReals)

    # Objective function
    # def objfun(model):
    #     return sum((m.price[t] + m.goNL) * m.wind[t] + (m.price[t] + m.goBE) * m.G_im[t] for t in m.HORIZON)
    def objfun(model):
        return sum((m.price[t] + m.goBE) * m.G_im[t]
                   + (m.price[t] + m.goNL) * sum(m.prob[s] * m.wind[s, t] for s in m.SCENARIO)
                   for t in m.HORIZON)

    m.OBJ = pyo.Objective(rule=objfun, sense=pyo.minimize)

    def PowerBalance(m, t):
        return m.Load[t] + m.c[t] == m.grid[t] + m.d[t]

    # Define Energy Balance constraints. [MWh] = [MW]*[1 hr]
    # Note: assume 1-hour timestep in price data and control actions.
    def EnergyBalance(m, t):
        # First timestep
        if t == 1:
            return m.E[t] == m.E0 + m.c[t] * m.teta - m.d[t] / m.teta
        # Subsequent timesteps
        else:
            return m.E[t] == m.E[t - 1] + m.c[t] * m.teta - m.d[t] / m.teta

    # def ColdIroning(m, t):
    #     return m.c[t] + m.d[t] + m.Load[t] <= m.CI

    def GridBalance(m, t, s):
        return m.grid[t] == m.wind[t, s] + m.G_im[t] - m.G_ex[t]

    def ImMax(m, t):
        return m.G_ex[t] - m.v[t] * m.ex_max <= 0

    def ExMax(m, t):
        return m.G_im[t] + m.v[t] * m.im_max <= m.im_max

    # def BatteryBalance(m, t):
    #     return m.Bat[t] - m.d[t] + m.c[t] == 0

    def ChargeMax(m, t):
        return m.d[t] - m.u[t] * m.d_max <= 0

    def DischargeMax(m, t):
        return m.c[t] + m.u[t] * m.c_max <= m.c_max

    m.EnergyBalance_Con = pyo.Constraint(m.HORIZON, rule=EnergyBalance)
    m.PowerBalance_Con = pyo.Constraint(m.HORIZON, rule=PowerBalance)
    # m.ColdIroning_Con = pyo.Constraint(m.HORIZON, rule=ColdIroning)
    m.GridBalance_Con = pyo.Constraint(m.HORIZON, m.SCENARIO, rule=GridBalance)
    # m.BatteryBalance_Con = pyo.Constraint(m.HORIZON, rule=BatteryBalance)
    m.ChargeMax_Con = pyo.Constraint(m.HORIZON, rule=ChargeMax)
    m.DischargeMax_Con = pyo.Constraint(m.HORIZON, rule=DischargeMax)
    m.ImMax_Con = pyo.Constraint(m.HORIZON, rule=ImMax)
    m.ExMax_Con = pyo.Constraint(m.HORIZON, rule=ExMax)
    ## END SOLUTION

    return m
Hey Thomas welcome to the site.
2 quick things on your model before we talk wind...
You don't need to (and probably shouldn't) initialize variables, just let the solver do its work.
You have an egregious typo wind[t, s] (instead of wind[s, t]) in your GridBalance constraint, which would be a devil to find if |T| == |S|.
You didn't say what format the wind data came to you in. Perhaps you are just hand-jamming in some table data. So let's start with what pyomo wants... It wants a tuple-indexed dictionary to initialize the parameter. Meaning that the key values of the dictionary are tuples of (s, t) values. This is also known as a "flat" data structure where all of the keys are enumerated and the data value of interest is in 1 column (vice a matrix format or something). So you want to initialize your parameter from something like this:
import pyomo.environ as pyo

# what we want: a "flat" dictionary
# s, t : w
wind = {(1, 1): 12,
        (1, 2): 11,
        (1, 3): 10,
        (2, 1): 9,
        (2, 2): 13,
        (2, 3): 14}

m = pyo.ConcreteModel()

# SETS
m.S = pyo.Set(initialize=[1, 2])
m.T = pyo.Set(initialize=[1, 2, 3])

# PARAMS
m.wind = pyo.Param(m.S, m.T, initialize=wind)

m.pprint()
Output:
3 Set Declarations
    S : Size=1, Index=None, Ordered=Insertion
        Key  : Dimen : Domain : Size : Members
        None :     1 :    Any :    2 : {1, 2}
    T : Size=1, Index=None, Ordered=Insertion
        Key  : Dimen : Domain : Size : Members
        None :     1 :    Any :    3 : {1, 2, 3}
    wind_index : Size=1, Index=None, Ordered=True
        Key  : Dimen : Domain : Size : Members
        None :     2 :    S*T :    6 : {(1, 1), (1, 2), (1, 3), (2, 1), (2, 2), (2, 3)}

1 Param Declarations
    wind : Size=6, Index=wind_index, Domain=Any, Default=None, Mutable=False
        Key    : Value
        (1, 1) :    12
        (1, 2) :    11
        (1, 3) :    10
        (2, 1) :     9
        (2, 2) :    13
        (2, 3) :    14

4 Declarations: S T wind_index wind
There are several techniques to make a tuple-indexed dictionary from data, dependent on the structure of the source data, obviously. Before that though, I see you converting lists to dicts with loops. Certainly doable, or you could use a shortcut with enumerate, which generates index:value pairs and just pass that to a dictionary constructor. Note that enumerate takes an optional start argument, if you like your data 1-indexed vice 0.
prices = [3.5, 4.2, 9.8]
price_dict = dict(enumerate(prices, start=1))
print(price_dict)
# {1: 3.5, 2: 4.2, 3: 9.8}
So if you are purely hand-jamming in the values, you could type out a flat dictionary as shown above, or if you have a list-of-lists (aka matrix) of wind data you can convert it several ways, depending on your comfort level with dictionary comprehensions and such. All 3 of these below generate the same flat dictionary usable in your model:
raw_wind_data = [[4, 5, 9],
                 [3, 0, 12]]

wind_1 = {}
for i in range(len(raw_wind_data)):
    for j in range(len(raw_wind_data[0])):
        wind_1[(i+1, j+1)] = raw_wind_data[i][j]

wind_2 = {(r+1, c+1): raw_wind_data[r][c]
          for r in range(len(raw_wind_data))
          for c in range(len(raw_wind_data[0]))}

wind_3 = {(r_idx, c_idx): w
          for r_idx, row in enumerate(raw_wind_data, 1)
          for c_idx, w in enumerate(row, 1)}

print(wind_1)
print(wind_2)
print(wind_3)
# {(1, 1): 4, (1, 2): 5, (1, 3): 9, (2, 1): 3, (2, 2): 0, (2, 3): 12}
# {(1, 1): 4, (1, 2): 5, (1, 3): 9, (2, 1): 3, (2, 2): 0, (2, 3): 12}
# {(1, 1): 4, (1, 2): 5, (1, 3): 9, (2, 1): 3, (2, 2): 0, (2, 3): 12}

Conditional minimum of 4D matrix: find minimizing vectors

I'm looking for a way to conditionally minimize a 4D matrix.
Let's start creating some toy data (which is close to my real-world problem):
import numpy as np
t = np.arange(1960,1981,1)
N = np.arange(0,3,1)
k = np.arange(0,5,0.1)
k_matrix = ( np.tile(k,(len(N),1)).T * (N+1)/(N+2) ).T
p = np.arange(0.1,2.01,0.1)
theory = np.random.normal(10,1,[len(N),len(t),len(p)])
res2 = np.zeros([len(N),len(t),len(k),len(p)])
def calc_res2(N, t, k_matrix, p, theory):
    for N_ind, N_val in enumerate(N):
        for t_ind, t_val in enumerate(t):
            for k_ind, k_val in enumerate(k_matrix[N_ind]):
                for p_ind, p_val in enumerate(p):
                    res2[N_ind, t_ind, k_ind, p_ind] = (N_val*t_val - k_val*theory[N_ind, t_ind, p_ind])**2
    return res2

test = calc_res2(N, t, k_matrix, p, theory)
I want to find the indices/values of k_matrix (as a function of N) and p (as a function of t) such that the sum of test over t and N is minimal.
Right now I see that this problem can be solved using for loops:
def k_multi_N(test, k_matrix, p):
    SUM_best = 1e99
    k0i_b, k1i_b, k2i_b = 0, 0, 0
    for k0_ind, k0 in enumerate(k_matrix[0]):
        temp = test[0, :, k0_ind, :]
        for k1_ind, k1 in enumerate(k_matrix[1]):
            temp += test[1, :, k1_ind, :]
            for k2_ind, k2 in enumerate(k_matrix[2]):
                temp += test[2, :, k2_ind, :]
                SUM = sum(temp.min(axis=1))
                if SUM < SUM_best:
                    SUM_best = SUM
                    p_min_ind = np.argmin(temp, axis=1)
                    k0i_b, k1i_b, k2i_b = k0_ind, k1_ind, k2_ind
                temp -= test[2, :, k2_ind, :]
            temp -= test[1, :, k1_ind, :]
        temp -= test[0, :, k0_ind, :]
    return p_min_ind, (k0i_b, k1i_b, k2i_b)

k_multi_N(test, k_matrix, p)
So the expected output is:
(array([12, 16, 14, 8, 14, 18, 1, 18, 9, 9, 15, 18, 9, 13, 9, 3, 3,
18, 13, 6, 19]),
(0, 49, 49))
but the computational efficiency will be very poor for big vectors of N and k (my real-world case is 16*200 for N*k plus 800*200 for t*k, so it would be on the order of 200^16 iterations, each involving 800*200 matrices :(
Of course, I considered a numba solution, but it does not significantly speed up the calculation (i.e. it still takes a lot of time!).
I'm wondering about alternative, more computationally efficient ways to solve the problem.
Thanks!
EDIT: The question was significantly changed to clarify the problem. I appreciate the people who helped me to do it!

How to generate random numbers to satisfy a specific mean and median in python?

I would like to generate n random numbers e.g., n=200, where the range of possible values is between 2 and 40 with a mean of 12 and median is 6.5.
I searched everywhere and I could not find a solution for this. I tried the following script, but it only works for small numbers such as 20; for big numbers it takes ages and no result is returned.
n = 200
x = np.random.randint(0, 1, size=n)  # initialisation only
while True:
    if x.mean() == 12 and np.median(x) == 6.5:
        break
    else:
        x = np.random.randint(2, 40, size=n)
Could anyone help me by improving this to get a quick result even when n=5000 or so?
One way to get a result really close to what you want is to generate two separate random arrays of length 100 that satisfy your median constraint and cover the desired range of numbers. Then, by concatenating the arrays, the mean will be around 12 but not exactly 12. Since it's just the mean you're dealing with, you can then produce your expected result by tweaking one of these arrays.
In [162]: arr1 = np.random.randint(2, 7, 100)
In [163]: arr2 = np.random.randint(7, 40, 100)
In [164]: np.mean(np.concatenate((arr1, arr2)))
Out[164]: 12.22
In [166]: np.median(np.concatenate((arr1, arr2)))
Out[166]: 6.5
The following is a vectorized and heavily optimized solution compared with any solution that uses for loops or Python-level code; it works by constraining how the random sequence is created:
import numpy as np
import math

def gen_random():
    arr1 = np.random.randint(2, 7, 99)
    arr2 = np.random.randint(7, 40, 99)
    mid = [6, 7]
    i = ((np.sum(arr1 + arr2) + 13) - (12 * 200)) / 40
    decm, intg = math.modf(i)
    args = np.argsort(arr2)
    arr2[args[-41:-1]] -= int(intg)
    arr2[args[-1]] -= int(np.round(decm * 40))
    return np.concatenate((arr1, mid, arr2))
Demo:
arr = gen_random()
print(np.median(arr))
print(arr.mean())
6.5
12.0
The logic behind the function:
In order to have a random array meeting these criteria, we can concatenate three arrays: arr1, mid and arr2. arr1 and arr2 each hold 99 items, and mid holds the two items 6 and 7, which makes the median of the final result 6.5. Now we create two random arrays, each of length 99. All we need to do to make the result have a mean of 12 is to find the difference between the current sum and 12 * 200 and subtract it, spread over the N largest numbers, which in this case we can choose from arr2 with N = 40.
Edit:
If it's not a problem to have float numbers in your result, you can actually shorten the function as follows:
import numpy as np

def gen_random():
    arr1 = np.random.randint(2, 7, 99).astype(float)
    arr2 = np.random.randint(7, 40, 99).astype(float)
    mid = [6, 7]
    i = ((np.sum(arr1 + arr2) + 13) - (12 * 200)) / 40
    args = np.argsort(arr2)
    arr2[args[-40:]] -= i
    return np.concatenate((arr1, mid, arr2))
Here, you want a median value less than the mean value. That means a uniform distribution is not appropriate: you want many small values and fewer large ones.
Specifically, you want as many values less than or equal to 6 as values greater than or equal to 7.
A simple way to ensure that the median will be 6.5 is to have the same number of values in the range [2, 6] as in [7, 40]. If you chose uniform distributions in both ranges, you would have a theoretical mean of 13.75, which is not that far from the required 12.
A slight variation on the weights can make the theoretical mean even closer: if we use [5, 4, 3, 2, 1, 1, ..., 1] for the relative weights of the random.choices over the [7, 8, ..., 40] range, we get a theoretical mean of 19.98 for that range, which is close enough to the required 20.
Example code:
>>> pop1 = list(range(2, 7))
>>> pop2 = list(range(7, 41))
>>> w2 = [ 5, 4, 3, 2 ] + ( [1] * 30)
>>> r1 = random.choices(pop1, k=2500)
>>> r2 = random.choices(pop2, w2, k=2500)
>>> r = r1 + r2
>>> random.shuffle(r)
>>> statistics.mean(r)
12.0358
>>> statistics.median(r)
6.5
>>>
So we now have a 5000-value distribution with a median of exactly 6.5 and a mean of 12.0358 (this is random, and another run will give a slightly different value). If we want an exact mean of 12, we just have to tweak some values. Here sum(r) is 60179 when it should be 60000, so we have to decrease 179 values that are neither 2 (which would go out of range) nor 7 (which would change the median).
In the end, a possible generator function could be:
def gendistrib(n):
    if n % 2 != 0:
        raise ValueError("gendistrib needs an even parameter")
    n2 = n // 2                           # n / 2 in Python 2
    pop1 = list(range(2, 7))              # lower range
    pop2 = list(range(7, 41))             # upper range
    w2 = [5, 4, 3, 2] + ([1] * 30)        # weights for upper range
    r1 = random.choices(pop1, k=n2)       # lower part of the distrib.
    r2 = random.choices(pop2, w2, k=n2)   # upper part
    r = r1 + r2
    random.shuffle(r)                     # randomize order
    # time to force an exact mean
    tot = sum(r)
    expected = 12 * n
    if tot > expected:                    # too high: decrease some values
        for i, val in enumerate(r):
            if val != 2 and val != 7:
                r[i] = val - 1
                tot -= 1
                if tot == expected:
                    random.shuffle(r)     # shuffle again the decreased values
                    break
    elif tot < expected:                  # too low: increase some values
        for i, val in enumerate(r):
            if val != 6 and val != 40:
                r[i] = val + 1
                tot += 1
                if tot == expected:
                    random.shuffle(r)     # shuffle again the increased values
                    break
    return r
It is really fast: I could timeit gendistrib(10000) at less than 0.02 seconds. But it should not be used for small distributions (fewer than 1000 values).
OK, you're looking at a distribution with no fewer than 4 parameters: two defining the range and two responsible for the required mean and median.
I can think of two possibilities off the top of my head:
Truncated normal distribution, look here for details. You already have the range defined, and you have to recover μ and σ from the mean and median. It will require solving a couple of nonlinear equations, but it is quite doable in Python. Sampling could be done using https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.truncnorm.html
4-parameter Beta distribution, see here for details. Again, recovering α and β of the Beta distribution from the mean and median will require solving a couple of nonlinear equations. Knowing them, sampling would be easy via https://docs.scipy.org/doc/numpy/reference/generated/numpy.random.beta.html
UPDATE
Here is how you could do it for the truncated normal, going from the mean to μ: Truncated normal with a given mean
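For the second (Beta) option, here is a rough numerical sketch of the recovery step (my code, not the answerer's): it fixes the mean by construction and solves a single equation for the median; the bracket passed to brentq is a guess and assumes it contains a root.
import numpy as np
from scipy import stats, optimize

low, high = 2, 40
target_mean, target_median = 12, 6.5
scale = high - low
m = (target_mean - low) / scale          # mean of the standard beta on [0, 1]

# Parametrize (alpha, beta) = (m*c, (1-m)*c) so the mean is fixed at target_mean,
# then solve one equation in c so that the median matches target_median.
def median_gap(c):
    return stats.beta(m * c, (1 - m) * c, loc=low, scale=scale).median() - target_median

c = optimize.brentq(median_gap, 0.01, 100.0)
samples = stats.beta(m * c, (1 - m) * c, loc=low, scale=scale).rvs(5000)
print(samples.mean(), np.median(samples))  # close to 12 and 6.5, up to sampling noise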
If you have a bunch of smaller arrays with the right median and mean, you can combine them to produce a larger array.
So... you can pre-generate smaller arrays as you are currently doing, and then combine them randomly for larger n. Of course, this will result in a biased random sample, but it sounds like you just want something that's approximately random.
Here's working (py3) code that generates a sample of size 5000 with your desired properties, which it builds from smaller samples of size 4, 6, 8, 10, ..., 18.
Note, that I changed how the smaller random samples are built: half of the numbers must be <= 6 and half >= 7 if the median is to be 6.5, so we generate those halves independently. This speeds things up massively.
import collections
import numpy as np
import random

rs = collections.defaultdict(list)
for i in range(50):
    n = random.randrange(4, 20, 2)
    while True:
        x = np.append(np.random.randint(2, 7, size=n//2), np.random.randint(7, 41, size=n//2))
        if x.mean() == 12 and np.median(x) == 6.5:
            break
    rs[len(x)].append(x)

def random_range(n):
    if n % 2:
        raise AssertionError("%d must be even" % n)
    r = []
    while n:
        i = random.randrange(4, min(20, n+1), 2)
        # Don't be left with only 2 slots left.
        if n - i == 2:
            continue
        xs = random.choice(rs[i])
        r.extend(xs)
        n -= i
    random.shuffle(r)
    return r

xs = np.array(random_range(5000))
print([(i, list(xs).count(i)) for i in range(2, 41)])
print(len(xs))
print(xs.mean())
print(np.median(xs))
Output:
[(2, 620), (3, 525), (4, 440), (5, 512), (6, 403), (7, 345), (8, 126), (9, 111), (10, 78), (11, 25), (12, 48), (13, 61), (14, 117), (15, 61), (16, 62), (17, 116), (18, 49), (19, 73), (20, 88), (21, 48), (22, 68), (23, 46), (24, 75), (25, 77), (26, 49), (27, 83), (28, 61), (29, 28), (30, 59), (31, 73), (32, 51), (33, 113), (34, 72), (35, 33), (36, 51), (37, 44), (38, 25), (39, 38), (40, 46)]
5000
12.0
6.5
The first line of the output shows that there are 620 2's, 525 3's, 440 4's, etc. in the final array.
While this post already has an accepted answer, I'd like to contribute a general non-integer approach. It does not need loops or testing. The idea is to take a PDF with compact support. Following the idea of Kasrâmvd's accepted answer, make two distributions, one on the left interval and one on the right. Choose the shape parameters such that the mean comes out at the given value. The interesting opportunity here is that one can create a continuous PDF, i.e. without a jump where the intervals join.
As an example I have chosen the beta distribution. To have finite non-zero values at the border I've chosen beta = 1 for the left part and alpha = 1 for the right part.
Looking at the definition of the PDF, continuity at the join and the requirement on the mean give two equations:
4.5 / alpha = 33.5 / beta
2 + 4.5 * alpha / (alpha + 1) + 6.5 + 33.5 / (1 + beta) = 24
This reduces to a quadratic equation that is rather easy to solve. Then, using scipy.stats.beta like
from scipy.stats import beta
import matplotlib.pyplot as plt
import numpy as np

x1 = np.linspace(2, 6.5, 200)
x2 = np.linspace(6.5, 40, 200)

# I use s and t, not alpha and beta
s = 1. / 737 * (np.sqrt(294118) - 418)
t = 1. / 99 * (np.sqrt(294118) - 418)

data1 = beta.rvs(s, 1, loc=2, scale=4.5, size=20000)
data2 = beta.rvs(1, t, loc=6.5, scale=33.5, size=20000)
data = np.concatenate((data1, data2))

print(np.mean(data1), 2 + 4.5 * s / (1. + s))
print(np.mean(data2), 6.5 + 33.5 / (1. + t))
print(np.mean(data))
print(np.median(data))

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.hist(data1, bins=13, density=True)
ax.hist(data2, bins=67, density=True)
ax.plot(x1, beta.pdf(x1, s, 1, loc=2, scale=4.5))
ax.plot(x2, beta.pdf(x2, 1, t, loc=6.5, scale=33.5))
ax.set_yscale('log')
plt.show()
provides
>> 2.661366939244768 2.6495436216856976
>> 21.297348804473618 21.3504563783143
>> 11.979357871859191
>> 6.5006779033245135
so the results are as required. (The original answer ends with a log-scale histogram of data1 and data2 with the two fitted PDFs overlaid; the figure is not reproduced here.)

"Sensibly" remove points in a Python list

Suppose I have two arrays indicating the x and y coordinates of a calibration curve.
X = [1,2,3,4,5,6,7,8,9,10,12,14,16,18,20,30,40,50]
Y = [2,4,6,8,10,12,14,16,18,20,24,28,32,36,40,60,80,100]
My example arrays above contain 18 points. You'll notice that the x values are not linearly spaced; there are more points at lower values of x.
Let's suppose I need to reduce the number of points in my calibration curve to 13 points. Obviously, I could just remove the first five or the last five points, but that would shorten my overall range of x values. To maintain range and minimise the space between x values I would preferentially remove values x= 2,4,6,8,10. Removing these x points and their respective y values would leave 13 points in the curve as required.
How could I do this point selection and removal automatically in Python? I.e. Is there an algorithm to pick the best x points from a list, where "best" is defined as keeping the points as close as possible while keeping the overall range and adhering to the new number of points.
Please note that the points remaining must be in the original lists, so I can't interpolate the 18 points on to a 13 point grid.
This maximizes the sum of squared distances between consecutive chosen points. In some sense it spreads the points as far apart as possible.
import itertools

list(max(itertools.combinations(sorted(X), 13),
         key=lambda l: sum((a - b) ** 2 for a, b in zip(l, l[1:]))))
Note that this is only feasible for small problems. The time complexity for selecting k points is O(k * (len(X) choose k)), so basically O(exp(len(X))). So don't even think about using this for, e.g., len(X) == 100 and k == 10.
X = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 30, 40, 50]
Y = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 24, 28, 32, 36, 40, 60, 80, 100]

assert len(X) == len(set(X)), "Duplicate X values found"

points = list(zip(X, Y))
points.sort()  # sorts by X

while len(points) > 13:
    # Find index whose neighbouring X values are closest together
    i = min(range(1, len(points) - 1), key=lambda p: points[p + 1][0] - points[p - 1][0])
    points.pop(i)

print(points)
Output:
[(1, 2), (3, 6), (5, 10), (7, 14), (10, 20), (12, 24), (14, 28), (16, 32), (18, 36), (20, 40), (30, 60), (40, 80), (50, 100)]
If you want the original series again:
X, Y = zip(*points)
An algorithm that would achieve that:
1. Convert each number into the sum of the absolute differences to the numbers to its left and right. If a neighbour is missing (first or last element), use MAX_INT. For example, 1 would become MAX_INT, 2 would become 2, and 10 would become 3.
2. Remove the first element with the lowest sum.
3. If you need to remove more numbers, go to 1.
This would remove 2, 4, 6, 8, ... and so on; a short sketch of this procedure follows.
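Here is a literal sketch of those steps (my code, not the answerer's); MAX_INT is emulated with float('inf') so the endpoints are never removed:
def thin_points(xs, target_len):
    xs = sorted(xs)
    while len(xs) > target_len:
        def score(i):
            left = xs[i] - xs[i - 1] if i > 0 else float('inf')
            right = xs[i + 1] - xs[i] if i < len(xs) - 1 else float('inf')
            return left + right
        # remove the first element whose neighbour distances sum to the minimum
        xs.pop(min(range(len(xs)), key=score))
    return xs

X = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 30, 40, 50]
print(thin_points(X, 13))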
Here is a recursive approach that repeatedly removes the point which will be the least missed:
def mostRedundantPoint(x):
    # returns the index, i, in the range 0 < i < len(x) - 1
    # that minimizes x[i+1] - x[i-1]
    # assumes len(x) > 2 and that x
    # is sorted in ascending order
    gaps = [x[i+1] - x[i-1] for i in range(1, len(x)-1)]
    i = gaps.index(min(gaps))
    return i + 1

def reduceList(x, k):
    if len(x) <= k:
        return x
    else:
        i = mostRedundantPoint(x)
        return reduceList(x[:i] + x[i+1:], k)

X = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 30, 40, 50]
print(reduceList(X, 13))
# prints [1, 3, 5, 7, 10, 12, 14, 16, 18, 20, 30, 40, 50]
This list essentially agrees with your intended output since 7 vs. 8 have the same net effect. It is reasonably quick in the sense that it is almost instantaneous in reducing sorted([random.randint(1,10**6) for i in range(1000)]) from 1000 elements to 100 elements. The fact that it is recursive implies that it will blow the stack if you try to remove many more points than that, but with what seems to be your intended problem size that shouldn't be an issue. If need be, you could of course replace the recursion by a loop.
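For reference, one possible loop-based rewrite (my sketch, reusing mostRedundantPoint from above) that avoids recursion-depth limits when many points have to be removed:
def reduceListIterative(x, k):
    x = list(x)
    while len(x) > k:
        x.pop(mostRedundantPoint(x))
    return x

print(reduceListIterative(X, 13))  # same result as the recursive version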
