I would like to optimize over all 30 by 30 matrices with entries that are 0 or 1. My objective function is the determinant. One way to do this would be some sort of stochastic gradient descent or simulated annealing.
I looked at scipy.optimize but it doesn't seem to support this sort of optimization as far as I can tell. scipy.optimize.basinhopping looked very tempting but it seems to require continuous variables.
Are there any tools in Python for this sort of general discrete optimization?
I think a genetic algorithm might work quite well in this case. Here's a quick example thrown together using deap, based loosely on their example here:
import numpy as np
import deap
from deap import algorithms, base, tools
import imp
class GeneticDetMinimizer(object):
def __init__(self, N=30, popsize=500):
# we want the creator module to be local to this instance, since
# creator.create() directly adds new classes to the module's globals()
# (yuck!)
cr = imp.load_module('cr', *imp.find_module('creator', deap.__path__))
self._cr = cr
self._cr.create("FitnessMin", base.Fitness, weights=(-1.0,))
self._cr.create("Individual", np.ndarray, fitness=self._cr.FitnessMin)
self._tb = base.Toolbox()
# an 'individual' consists of an (N^2,) flat numpy array of 0s and 1s
self.N = N
self.indiv_size = N * N
self._tb.register("attr_bool", np.random.random_integers, 0, 1)
self._tb.register("individual", tools.initRepeat, self._cr.Individual,
self._tb.attr_bool, n=self.indiv_size)
# the 'population' consists of a list of such individuals
self._tb.register("population", tools.initRepeat, list,
self._tb.individual)
self._tb.register("evaluate", self.fitness)
self._tb.register("mate", self.crossover)
self._tb.register("mutate", tools.mutFlipBit, indpb=0.025)
self._tb.register("select", tools.selTournament, tournsize=3)
# create an initial population, and initialize a hall-of-fame to store
# the best individual
self.pop = self._tb.population(n=popsize)
self.hof = tools.HallOfFame(1, similar=np.array_equal)
# print summary statistics for the population on each iteration
self.stats = tools.Statistics(lambda ind: ind.fitness.values)
self.stats.register("avg", np.mean)
self.stats.register("std", np.std)
self.stats.register("min", np.min)
self.stats.register("max", np.max)
def fitness(self, individual):
"""
assigns a fitness value to each individual, based on the determinant
"""
return np.linalg.det(individual.reshape(self.N, self.N)),
def crossover(self, ind1, ind2):
"""
randomly swaps a subset of array values between two individuals
"""
size = self.indiv_size
cx1 = np.random.random_integers(0, size - 2)
cx2 = np.random.random_integers(cx1, size - 1)
ind1[cx1:cx2], ind2[cx1:cx2] = (
ind2[cx1:cx2].copy(), ind1[cx1:cx2].copy())
return ind1, ind2
def run(self, ngen=int(1E6), mutation_rate=0.3, crossover_rate=0.7, seed=0):
np.random.seed(seed)
pop, log = algorithms.eaSimple(self.pop, self._tb,
cxpb=crossover_rate,
mutpb=mutation_rate,
ngen=ngen,
stats=self.stats,
halloffame=self.hof)
self.log = log
return self.hof[0].reshape(self.N, self.N), log
if __name__ == "__main__":
np.random.seed(0)
gd = GeneticDetMinimizer()
best, log = gd.run()
It takes about 40 seconds to run 1000 generations on my laptop, which gets me from a minimum determinant value of about -5.7845×10^8 to -6.41504×10^11. I haven't really played around much with the meta-parameters (population size, mutation rate, crossover rate etc.), so I'm sure it's possible to do a lot better.
Here's a greatly improved version that implements a much smarter crossover function that swaps blocks of rows or columns across individuals, and uses a cachetools.LRUCache to guarantee that each mutation step produces a novel configuration, and to skip evaluation of the determinant for configurations that have already been tried:
import numpy as np
import deap
from deap import algorithms, base, tools
import imp
from cachetools import LRUCache
# used to control the size of the cache so that it doesn't exceed system memory
MAX_MEM_BYTES = 11E9
class GeneticDetMinimizer(object):
def __init__(self, N=30, popsize=500, cachesize=None, seed=0):
# an 'individual' consists of an (N^2,) flat numpy array of 0s and 1s
self.N = N
self.indiv_size = N * N
if cachesize is None:
cachesize = int(np.ceil(8 * MAX_MEM_BYTES / self.indiv_size))
self._gen = np.random.RandomState(seed)
# we want the creator module to be local to this instance, since
# creator.create() directly adds new classes to the module's globals()
# (yuck!)
cr = imp.load_module('cr', *imp.find_module('creator', deap.__path__))
self._cr = cr
self._cr.create("FitnessMin", base.Fitness, weights=(-1.0,))
self._cr.create("Individual", np.ndarray, fitness=self._cr.FitnessMin)
self._tb = base.Toolbox()
self._tb.register("attr_bool", self.random_bool)
self._tb.register("individual", tools.initRepeat, self._cr.Individual,
self._tb.attr_bool, n=self.indiv_size)
# the 'population' consists of a list of such individuals
self._tb.register("population", tools.initRepeat, list,
self._tb.individual)
self._tb.register("evaluate", self.fitness)
self._tb.register("mate", self.crossover)
self._tb.register("mutate", self.mutate, rate=0.002)
self._tb.register("select", tools.selTournament, tournsize=3)
# create an initial population, and initialize a hall-of-fame to store
# the best individual
self.pop = self._tb.population(n=popsize)
self.hof = tools.HallOfFame(1, similar=np.array_equal)
# print summary statistics for the population on each iteration
self.stats = tools.Statistics(lambda ind: ind.fitness.values)
self.stats.register("avg", np.mean)
self.stats.register("std", np.std)
self.stats.register("min", np.min)
self.stats.register("max", np.max)
# keep track of configurations that have already been visited
self.tabu = LRUCache(cachesize)
def random_bool(self, *args):
return self._gen.rand(*args) < 0.5
def mutate(self, ind, rate=1E-3):
"""
mutate an individual by bit-flipping one or more randomly chosen
elements
"""
# ensure that each mutation always introduces a novel configuration
while np.packbits(ind.astype(np.uint8)).tostring() in self.tabu:
n_flip = self._gen.binomial(self.indiv_size, rate)
if not n_flip:
continue
idx = self._gen.random_integers(0, self.indiv_size - 1, n_flip)
ind[idx] = ~ind[idx]
return ind,
def fitness(self, individual):
"""
assigns a fitness value to each individual, based on the determinant
"""
h = np.packbits(individual.astype(np.uint8)).tostring()
# look up the fitness for this configuration if it has already been
# encountered
if h not in self.tabu:
fitness = np.linalg.det(individual.reshape(self.N, self.N))
self.tabu.update({h: fitness})
else:
fitness = self.tabu[h]
return fitness,
def crossover(self, ind1, ind2):
"""
randomly swaps a block of rows or columns between two individuals
"""
cx1 = self._gen.random_integers(0, self.N - 2)
cx2 = self._gen.random_integers(cx1, self.N - 1)
ind1.shape = ind2.shape = self.N, self.N
if self._gen.rand() < 0.5:
# row swap
ind1[cx1:cx2, :], ind2[cx1:cx2, :] = (
ind2[cx1:cx2, :].copy(), ind1[cx1:cx2, :].copy())
else:
# column swap
ind1[:, cx1:cx2], ind2[:, cx1:cx2] = (
ind2[:, cx1:cx2].copy(), ind1[:, cx1:cx2].copy())
ind1.shape = ind2.shape = self.indiv_size,
return ind1, ind2
def run(self, ngen=int(1E6), mutation_rate=0.3, crossover_rate=0.7):
pop, log = algorithms.eaSimple(self.pop, self._tb,
cxpb=crossover_rate,
mutpb=mutation_rate,
ngen=ngen,
stats=self.stats,
halloffame=self.hof)
self.log = log
return self.hof[0].reshape(self.N, self.N), log
if __name__ == "__main__":
np.random.seed(0)
gd = GeneticDetMinimizer(seed=0)
best, log = gd.run()
My best score thus far is about -3.92366×10^13 after 1000 generations, which takes about 45 seconds on my machine.
Based on the solution cthonicdaemon linked to in the comments, the maximum determinant for a 31x31 Hadamard matrix must be at least 75960984159088×2^30 ≈ 8.1562×10^22 (it's not yet proven whether that solution is optimal). The maximum determinant for an (n−1)×(n−1) binary matrix is 2^(1−n) times the value for an (n×n) Hadamard matrix, i.e. 8.1562×10^22 × 2^−30 ≈ 7.5961×10^13, so the genetic algorithm gets within an order of magnitude of the current best known solution.
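That arithmetic can be checked in a couple of lines (the numbers below are just the ones quoted in the previous paragraph):
# quick check of the quoted bound
hadamard_31 = 75960984159088 * 2**30   # lower bound on the max determinant of a 31x31 Hadamard matrix
bound_30x30 = hadamard_31 * 2.0**-30   # corresponding bound for a 30x30 {0, 1} matrix
print("%.4e %.4e" % (hadamard_31, bound_30x30))   # 8.1562e+22 7.5961e+13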
However, the fitness function seems to plateau around here, and I'm having a hard time breaking -4×10^13. Since it's a heuristic search there is no guarantee that it will eventually find the global optimum.
I don't know of any straightforward method for discrete optimization in scipy. One alternative is the simanneal package (available from pip or GitHub), which lets you supply your own move function, so you can restrict the moves to your domain:
import random
import numpy as np
import simanneal
class BinaryAnnealer(simanneal.Annealer):
def move(self):
# choose a random entry in the matrix
i = random.randrange(self.state.size)
# flip the entry 0 <=> 1
self.state.flat[i] = 1 - self.state.flat[i]
def energy(self):
# evaluate the function to minimize
return -np.linalg.det(self.state)
matrix = np.zeros((5, 5))
opt = BinaryAnnealer(matrix)
print(opt.anneal())
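For the actual 30x30 problem you would presumably start from a random 0/1 matrix and lengthen the annealing schedule. A minimal usage sketch, assuming the BinaryAnnealer class above (Tmax, Tmin and steps are the schedule attributes simanneal exposes; the values here are just guesses):
import numpy as np

matrix = np.random.randint(0, 2, size=(30, 30)).astype(float)  # random 0/1 starting matrix
opt = BinaryAnnealer(matrix)
opt.Tmax = 25000.0   # starting temperature (guess)
opt.Tmin = 0.1       # final temperature (guess)
opt.steps = 200000   # number of proposed moves
state, energy = opt.anneal()
print(-energy)       # the largest determinant found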
I have looked into this a bit.
A couple of things first off: 1) 56 million is the max value when the size of the matrix is 21x21, not 30x30: https://en.wikipedia.org/wiki/Hadamard%27s_maximal_determinant_problem#Connection_of_the_maximal_determinant_problems_for_.7B1.2C.C2.A0.E2.88.921.7D_and_.7B0.2C.C2.A01.7D_matrices.
But that is also an upper bound for {1, −1} matrices, not {0, 1} matrices.
EDIT: Reading more carefully from that link:
The maximal determinants of {1, −1} matrices up to size n = 21 are given in the following table. Size 22 is the smallest open case. In the table, D(n) represents the maximal determinant divided by 2^(n−1). Equivalently, D(n) represents the maximal determinant of a {0, 1} matrix of size n−1.
So that table can be used for upper bounds, but remember the values are divided by 2^(n−1). Also note that 22 is the smallest open case, so finding the maximum of a 30x30 matrix has not been done, and is not even close to being done just yet.
2) The reason David Zwicker's code gives an answer of 30 million is probably that he is minimising, not maximising.
return -np.linalg.det(self.state)
See how he's got the minus sign there?
3) Also, the solution space for this problem is very big. I calculate the number of different matrices to be 2^(30*30), i.e. on the order of 10^270. So looking at each matrix is simply impossible, and even looking at most of them is too.
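As a quick check of that count:
import math
n_matrices = 2 ** (30 * 30)                 # number of distinct 30x30 0/1 matrices
print(math.floor(math.log10(n_matrices)))   # 270, i.e. about 10^270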
I have a bit of code here (adapted from David Zwicker's code) that runs, but I have no idea how close it is to the actual maximum. It takes around 45 minutes to do 10 million iterations on my PC, or only about 2 minutes for 1 million iterations. I get a max value of around 3.4 billion, but again, I have no idea how close this is to the theoretical maximum.
import numpy as np
import random
import time
MATRIX_SIZE = 30
def Main():
startTime = time.time()
mat = np.zeros((MATRIX_SIZE, MATRIX_SIZE), dtype = int)
for i in range(MATRIX_SIZE):
for j in range(MATRIX_SIZE):
mat[i,j] = random.randrange(2)
print("Starting matrix:\n", mat)
maxDeterminant = 0
for i in range(1000000):
# choose a random entry in the matrix
x = random.randrange(MATRIX_SIZE)
y = random.randrange(MATRIX_SIZE)
mat[x,y] = 1 - mat[x,y]
#print(mat)
detValue = np.linalg.det(mat)
if detValue > maxDeterminant:
maxDeterminant = detValue
timeTakenStr = "\nTotal time to complete: " + str(round(time.time() - startTime, 4)) + " seconds"
print(timeTakenStr)
print(maxDeterminant)
Main()
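A possible refinement of the loop above (not part of the original code) is to keep a flip only when it does not decrease the determinant, which turns the pure random walk into a simple hill climb. A rough sketch:
import numpy as np
import random

MATRIX_SIZE = 30

def hill_climb(iterations=1000000, seed=0):
    rng = random.Random(seed)
    mat = np.random.randint(0, 2, size=(MATRIX_SIZE, MATRIX_SIZE))
    best = np.linalg.det(mat)
    for _ in range(iterations):
        x = rng.randrange(MATRIX_SIZE)
        y = rng.randrange(MATRIX_SIZE)
        mat[x, y] = 1 - mat[x, y]      # propose a single bit flip
        det = np.linalg.det(mat)
        if det >= best:
            best = det                 # keep the flip
        else:
            mat[x, y] = 1 - mat[x, y]  # revert it
    return best, mat

best, mat = hill_climb()
print(best)
Like the original loop, this can easily get stuck in a local maximum, which is where the simulated annealing and genetic approaches above come in.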
Does this help?
import numpy as np
from time import time
import matplotlib.pyplot as plt
np.random.seed(27)
mysetup = "from math import sqrt"
begin=time()
i=int(input("Number of rows in first matrix"))
k=int(input("Number of column in first and rows in second matrix"))
j=int(input("Number of columns in second matrix"))
A = np.random.randint(1,10,size = (i,k))
B = np.random.randint(1,10,size = (k,j))
def multiply_matrix(A,B):
global C
if A.shape[1]==B.shape[0]:
C=np.zeros((A.shape[0],B.shape[1]),dtype=int)
for row in range(i):
for col in range(j):
for elt in range(0,len(B)):
C[row,col] += A[row,elt]*B[elt,col]
return C
else:
return "Cannot multiply A and B"
print(f"Matrix A:\n {A}\n")
print(f"Matrix B:\n {B}\n")
D=print(multiply_matrix(A, B))
end=time()
t=print(end-begin)
x=[0,100,10]
y=[100,100,1000]
plt.plot(x,y)
plt.xlabel('Time taken for the program to run')
plt.ylabel('Order of the matrix multiplication')
plt.show()
In the program, I have generated random elements for the matrices to be multiplied. Basically I am trying to compute the time it takes to multiply two matrices. The i, j and k are the orders used for the matrices. Since we cannot multiply matrices where the number of columns of the first does not equal the number of rows of the second, I have given that shared dimension the single variable k.
Initially I considered incrementing the order of the matrix using a for loop but wasn't able to do so. I want the graph to display the time it took to multiply the matrices on the x axis and the order of the resultant matrix on the y axis.
There is a problem in the logic I applied, but I am not able to work out how to solve it, as I am a beginner in programming.
I was expecting the y axis to have a scale ranging from 0 to 100 with a difference of 10 and the x axis to have a scale of 100 to 1000 with a difference of 100.
The thousandth entry on the x axis would correspond to the time it took to compute the multiplication of two matrices with 1000 rows and columns.
Suppose the time it took to compute this was 200 seconds; then the graph should show the point (1000, 200).
Some problematic points I'd like to address -
1. You're starting the timer before the user provides any input, and that input time can vary. We want to be as precise as possible, so we should only measure how long the multiply_matrix function itself takes to run.
2. Because you're taking an input, each run gives you one result, and one result is only a single point, not a full graph, so we need to get rid of the user input and generate our own data.
3. Moreover, regarding point #2: we are not interested in giving each matrix order only "one shot". When we want to test how much time it takes to multiply two matrices of order 300 (for example), we need to do it N times and take the average in order to be more precise. Not to mention that we are generating random numbers, and some randomly generated matrices may be easier to compute than others. Taking the average over N tests is not 100% accurate either, but it does help.
4. You don't need to make C a global variable, as it can be a local variable of multiply_matrix, which we return anyway. This is also not the right use of global: even with global C, C is undefined at module level until the function has run.
5. This is not a must, but it can improve your program a little: use time.perf_counter(), as it uses the clock with the highest available resolution to measure short durations and avoids the precision loss of a float timestamp.
6. You need to swap the axes, because we want to see how the time is affected by the order of the matrices, not the opposite (so our X axis is now the order and the Y axis is the average time it took to multiply them).
Those fixes translate to this code:
Calculating how much time it takes for multiply_matrix only:
begin = time.perf_counter()
C = multiply_matrix(A, B)
end = time.perf_counter()
2+3. Generating our own data, looping from order 1 to order maximum_order, taking 50 tests for each order:
maximum_order = 50
tests_number_for_each_order = 50
def generate_matrices_to_graph():
matrix_orders = [] # our X
multiply_average_time = [] # our Y
for order in range(1, maximum_order):
print(order)
times_for_each_order = []
for _ in range(tests_number_for_each_order):
# generating random square matrices of size order.
A = np.random.randint(1, 10, size=(order, order))
B = np.random.randint(1, 10, size=(order, order))
# getting the time it took to compute
begin = time.perf_counter()
multiply_matrix(A, B)
end = time.perf_counter()
# adding it to the times list
times_for_each_order.append(end - begin)
# adding the data about the order and the average time it took to compute
matrix_orders.append(order)
multiply_average_time.append(sum(times_for_each_order) / tests_number_for_each_order) # average
return matrix_orders, multiply_average_time
Minor changes to multiply_matrix as we don't need i, j, k from the user:
def multiply_matrix(A, B):
matrix_order = A.shape[1]
C = np.zeros((matrix_order, matrix_order), dtype=int)
for row in range(matrix_order):
for col in range(matrix_order):
for elt in range(0, len(B)):
C[row, col] += A[row, elt] * B[elt, col]
return C
and finally call generate_matrices_to_graph
# calling the generate_matrices_to_graph function
plt.plot(*generate_matrices_to_graph())
plt.xlabel('Matrix order')
plt.ylabel('Time [in seconds]')
plt.show()
Some outputs:
We can see that when tests_number_for_each_order is small, the graph loses precision and crispness.
Going from order 1-40 with 1 test for each order:
Going from order 1-40 with 30 tests for each order:
Going from order 1-40 with 80 tests for each order:
I love this kind of question:
import numpy as np
from time import time
import matplotlib.pyplot as plt
np.random.seed(27)
dim = []
times = []
for i in range(1,10001,10):
A = np.random.randint(1,10,size=(1,i))
B = np.random.randint(1,10,size=(i,1))
begin = time()
C = A @ B  # matrix product: (1, i) @ (i, 1) -> (1, 1)
times.append(time()-begin)
dim.append(i)
plt.plot(times,dim)
This is a simplified test in which I timed products of "one-dimensional" matrices: (1,1)x(1,1), (1,10)x(10,1), (1,20)x(20,1) and so on...
But you can use a double iteration to also change the "outer" dimension of the matrices and see how this affects the computation time (a sketch of that follows below).
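A sketch of that double iteration, growing both dimensions together (i.e. timing square matrix products of increasing order):
import numpy as np
from time import time
import matplotlib.pyplot as plt

np.random.seed(27)

orders, times = [], []
for order in range(1, 1001, 10):
    A = np.random.randint(1, 10, size=(order, order))
    B = np.random.randint(1, 10, size=(order, order))
    begin = time()
    C = A @ B                    # square matrix product of increasing order
    times.append(time() - begin)
    orders.append(order)

plt.plot(orders, times)
plt.xlabel('Matrix order')
plt.ylabel('Time [seconds]')
plt.show()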
I have a dataset carrying N arrays; each array may have a fixed or variable length. I am going to extract a window from the dataset and count the occurrences of a given number. To keep it simple, I assume each array contains 90 numbers. The model picks a random position in each array and extracts M consecutive numbers starting from that position as a window strip, so N strips are obtained in total. I assume each strip has the same size, so all N strips compose a view (window). I have C++ code that performs the job very well: for a dataset of 5x90, repeating the window sampling 1,000,000 times, it takes roughly 0.002 seconds to finish every 10,000 iterations. I am trying to port the C++ to Python so that I can use the powerful data frame (Pandas) and other tools for related analysis. After some research, it seems that a NumPy array is a good representation of a data array, but since the size of each array varies, I need to wrap all the arrays in a list. The code is shown below (the majority of unrelated code has been removed).
import numpy as np
import random
import time
class dataSource:
_data = None
_NCOLS = 0
_NROWS = 0
# NCOLS, NROWS is the dimension of a window (view) of the data
# _data is a list of NCOLS Numpy array, each array carry 90 numbers
def __init__(self, NCOLS, NROWS):
# assuming each array has 90 numbers for this example; it could vary in the actual case
src = [[1]*90]*NCOLS # just assume all numbers in the array to be 1 for this example
self._data = [np.array(cc) for cc in src]
self._NCOLS = NCOLS
self._NROWS = NROWS
def extractView(self, view):
pos = [random.randint(0, len(cc)-1-self._NROWS) for cc in self._data]
for col in range(len(self._data)):
view[:, col] = self._data[col][pos[col]:pos[col]+self._NROWS]
class dataView:
_NCOLS = 0
_NROWS = 0
_view = None
def __init__(self, NCOLS, NROWS):
self._NCOLS = NCOLS
self._NROWS = NROWS
self._view = np.zeros([self._NROWS, self._NCOLS], order='F')
def __setitem__(self, key, value):
self._view[key] = value
def count(self, elem):
return np.count_nonzero(self._view==elem)
if __name__ == '__main__':
ds = dataSource(5, 3)
dv = dataView(5, 3)
batchCount, totalTime, startTime = 1, 0, time.process_time()
for n in range(10000000):
if ((n+1)%10000==0):
dt = time.process_time() - startTime
totalTime += dt
print(totalTime / batchCount, ' ', dt)
batchCount += 1
startTime = time.process_time()
ds.extractView(dv)
cnt = dv.count(1) # this could then be used for other analysis, dv may be used by other functions
Running this code in Python 3.8.8 shows that it takes 0.23 seconds to finish every 10,000 iterations, so the Python code is about 100 times slower than the C++. In an actual case, the data source could have up to 50 arrays and each may have 50 to 500 numbers, so the run times in Python could be even longer. Applying the C++ code to the real data takes about 5 hours to finish all iterations; based on the above estimate, it would take 500 hours to do the same job in Python. I understand that there may be some limits in Python that make it hard to boost the speed, but anything I could do to optimize my code to reduce the time (even by 10%) would help a lot.
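A hedged sketch of how the window extraction could be vectorized if all arrays were stacked to a common length (an assumption; the real data has varying lengths), replacing the per-column Python loop in extractView with one fancy-indexing operation:
import numpy as np

rng = np.random.default_rng(0)
NCOLS, NROWS, L = 5, 3, 90
data = np.ones((NCOLS, L), dtype=int)   # stand-in for the real arrays, stacked row-wise

def extract_view(data, NROWS, rng):
    # one random start per column, then a (NCOLS, NROWS) grid of indices
    starts = rng.integers(0, data.shape[1] - NROWS, size=data.shape[0])
    idx = starts[:, None] + np.arange(NROWS)
    return data[np.arange(data.shape[0])[:, None], idx].T   # shape (NROWS, NCOLS)

view = extract_view(data, NROWS, rng)
print(np.count_nonzero(view == 1))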
I was wondering how to speed up the following code in where I compute a probability function which involves numerical integrals and then I compute some confidence margins.
Some possibilities that I have thought about are Numba or vectorization of the code
EDIT:
I have made minor modifications because there was a mistake. I am looking for modifications that provide major time improvements (I know that there are some minor changes that would provide small time improvements, such as avoiding repeated function evaluations, but I am not concerned about those).
The code is:
# -*- coding: utf-8 -*-
"""
Created on Tue Jan 26 17:05:46 2021
@author: Ignacio
"""
import numpy as np
from scipy.integrate import simps
def pdf(V,alfa_points):
alfa=np.linspace(0,2*np.pi,alfa_points)
return simps(1/np.sqrt(2*np.pi)/np.sqrt(sigma_R2)*np.exp(-(V*np.cos(alfa)-eR)**2/2/sigma_R2)*1/np.sqrt(2*np.pi)/np.sqrt(sigma_I2)*np.exp(-(V*np.sin(alfa)-eI)**2/2/sigma_I2),alfa)
def find_nearest(array,value):
array=np.asarray(array)
idx = (np.abs(array-value)).argmin()
return array[idx]
N = 20
n=np.linspace(0,N-1,N)
d=1
sigma_An=0.1
sigma_Pn=0.2
An=np.ones(N)
Pn=np.zeros(N)
Vs=np.linspace(0,30,1000)
inc=np.max(Vs)/len(Vs)
th=np.linspace(0,np.pi/2,250)
R=np.sum(An*np.cos(Pn+2*np.pi*np.sin(th[:,np.newaxis])*n*d),axis=1)
I=np.sum(An*np.sin(Pn+2*np.pi*np.sin(th[:,np.newaxis])*n*d),axis=1)
fmin=np.zeros(len(th))
fmax=np.zeros(len(th))
for tt in range(len(th)):
eR=np.exp(-sigma_Pn**2/2)*np.sum(An*np.cos(Pn+2*np.pi*np.sin(th[tt])*n*d))
eI=np.exp(-sigma_Pn**2/2)*np.sum(An*np.sin(Pn+2*np.pi*np.sin(th[tt])*n*d))
sigma_R2=1/2*np.sum(An*sigma_An**2)+1/2*(1-np.exp(-sigma_Pn**2))*np.sum(An**2)+1/2*np.sum(np.cos(2*(Pn+2*np.pi*np.sin(th[tt])*n*d))*((An**2+sigma_An**2)*np.exp(-2*sigma_Pn**2)-An**2*np.exp(-sigma_Pn**2)))
sigma_I2=1/2*np.sum(An*sigma_An**2)+1/2*(1-np.exp(-sigma_Pn**2))*np.sum(An**2)-1/2*np.sum(np.cos(2*(Pn+2*np.pi*np.sin(th[tt])*n*d))*((An**2+sigma_An**2)*np.exp(-2*sigma_Pn**2)-An**2*np.exp(-sigma_Pn**2)))
PDF=np.zeros(len(Vs))
for vv in range(len(Vs)):
PDF[vv]=pdf(Vs[vv],100)
total=simps(PDF,Vs)
values=np.cumsum(PDF)*inc/total
xval_05=find_nearest(values,0.05)
fmin[tt]=Vs[values==xval_05]
xval_95=find_nearest(values,0.95)
fmax[tt]=Vs[values==xval_95]
This version's speedup: 31x
A simple profiling (%%prun) reveals that most of the time is spent in simps.
You are in control of the integration done in pdf(): for example, you can use the trapezoidal rule instead of Simpson's rule with negligible numerical difference if you increase the resolution of alpha a bit. In fact, the higher resolution obtained by a denser sampling of alpha more than makes up for the difference between simps and the trapezoidal rule (see the picture at the bottom as to why). This is by far the biggest speedup. We go a bit further by implementing the trapezoidal method ourselves instead of using scipy, since it is so simple. This alone yields a marginal gain, but it opens the door for a more drastic optimization (below, about pdf2D).
Also, the remaining simps(PDF, ...) goes faster when it knows that the dx step is constant, so we can just say so instead of passing the whole Vs array.
You can avoid doing the loop to compute PDF and use np.vectorize(pdf) directly on Vs, or better (as in the code below), do a 2-D version of that calculation.
There are some other minor things (such as using an index directly fmin[tt] = Vs[closest(values, 0.05)] instead of finding the index, returning the value, and then using a boolean mask for where values == xval_05), or taking all the constants (including alpha) outside functions and avoid recalculating every time.
All of the above gives us a 5.2x improvement. There are a number of things I don't understand in your code, e.g. why have An (ones) and Pn (zeros)?
But, importantly, another ~6x speedup comes from the observation that, since we are implementing our own trapezoidal method with numpy primitives, we can actually do it in 2D in one go for the whole PDF.
The final speed up of the code below is 31x. I believe that a better understanding of "the big picture" of what you want to do would yield additional, perhaps substantial, speed gains.
Modified code:
import numpy as np
from scipy.integrate import simps
alpha_points = 200 # more points as we'll use trapeze not simps
alpha = np.linspace(0, 2*np.pi, alpha_points)
cosalpha = np.cos(alpha)
sinalpha = np.sin(alpha)
d_alpha = np.mean(np.diff(alpha)) # constant dx
coeff = 1 / np.sqrt(2*np.pi)
Vs=np.linspace(0,30,1000)
d_Vs = np.mean(np.diff(Vs)) # constant dx
inc=np.max(Vs)/len(Vs)
def f2D(Vs, eR, sigma_R2, eI, sigma_I2):
a = coeff / np.sqrt(sigma_R2)
b = coeff / np.sqrt(sigma_I2)
y = a * np.exp(-(np.outer(cosalpha, Vs) - eR)**2 / 2 / sigma_R2) * b * np.exp(-(np.outer(sinalpha, Vs) - eI)**2 / 2 / sigma_I2)
return y
def pdf2D(Vs, eR, sigma_R2, eI, sigma_I2):
y = f2D(Vs, eR, sigma_R2, eI, sigma_I2)
s = y.sum(axis=0) - (y[0] + y[-1]) / 2 # our own impl of trapeze, on 2D y
return s * d_alpha
def closest(a, val):
return np.abs(a - val).argmin()
N = 20
n = np.linspace(0,N-1,N)
d = 1
sigma_An = 0.1
sigma_Pn = 0.2
An=np.ones(N)
Pn=np.zeros(N)
th = np.linspace(0,np.pi/2,250)
R = np.sum(An*np.cos(Pn+2*np.pi*np.sin(th[:,np.newaxis])*n*d),axis=1)
I = np.sum(An*np.sin(Pn+2*np.pi*np.sin(th[:,np.newaxis])*n*d),axis=1)
fmin=np.zeros(len(th))
fmax=np.zeros(len(th))
for tt in range(len(th)):
eR=np.exp(-sigma_Pn**2/2)*np.sum(An*np.cos(Pn+2*np.pi*np.sin(th[tt])*n*d))
eI=np.exp(-sigma_Pn**2/2)*np.sum(An*np.sin(Pn+2*np.pi*np.sin(th[tt])*n*d))
sigma_R2=1/2*np.sum(An*sigma_An**2)+1/2*(1-np.exp(-sigma_Pn**2))*np.sum(An**2)+1/2*np.sum(np.cos(2*(Pn+2*np.pi*np.sin(th[tt])*n*d))*((An**2+sigma_An**2)*np.exp(-2*sigma_Pn**2)-An**2*np.exp(-sigma_Pn**2)))
sigma_I2=1/2*np.sum(An*sigma_An**2)+1/2*(1-np.exp(-sigma_Pn**2))*np.sum(An**2)-1/2*np.sum(np.cos(2*(Pn+2*np.pi*np.sin(th[tt])*n*d))*((An**2+sigma_An**2)*np.exp(-2*sigma_Pn**2)-An**2*np.exp(-sigma_Pn**2)))
PDF=pdf2D(Vs, eR, sigma_R2, eI, sigma_I2)
total = simps(PDF, dx=d_Vs)
values = np.cumsum(PDF) * inc / total
fmin[tt] = Vs[closest(values, 0.05)]
fmax[tt] = Vs[closest(values, 0.95)]
Note: most of the fmin and fmax values are np.allclose() when compared with the original function, but some of them have a small error: after some digging, it turns out that the implementation here is more precise, because the function f() can be pretty abrupt, and more alpha points actually help (and more than compensate for the minuscule loss of precision from using the trapezoidal rule instead of Simpson's).
For example, at index tt=244, vv=400:
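To illustrate the point about sampling resolution, here is a small standalone comparison on a sharply peaked stand-in integrand (a plain Gaussian, not the actual f() above), comparing Simpson's rule at 100 points with the trapezoidal rule at 200 points against a high-accuracy reference:
import numpy as np
from scipy.integrate import simps, quad

# stand-in for a sharply peaked integrand (illustrative only)
f = lambda x: np.exp(-((x - 1.0) / 0.05) ** 2)

reference, _ = quad(f, 0, 2 * np.pi)   # high-accuracy reference

x100 = np.linspace(0, 2 * np.pi, 100)
x200 = np.linspace(0, 2 * np.pi, 200)

print(abs(simps(f(x100), x100) - reference))      # Simpson's rule, 100 points
print(abs(np.trapz(f(x200), x200) - reference))   # trapezoidal rule, 200 points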
Considering several methods, the one that provides the largest time improvement is the Numba method. The method proposed by Pierre is very interesting and does not require installing other packages, which is an asset.
However, in the examples that I have computed, the time improvement is not as large as with the Numba example, especially when the number of points in th grows to a few tens of thousands (which is my actual case). I post the Numba code here in case someone is interested:
import numpy as np
from numba import njit
@njit
def margins(val_min,val_max):
fmin=np.zeros(len(th))
fmax=np.zeros(len(th))
for tt in range(len(th)):
eR=np.exp(-sigma_Pn**2/2)*np.sum(An*np.cos(Pn+2*np.pi*np.sin(th[tt])*n*d))
eI=np.exp(-sigma_Pn**2/2)*np.sum(An*np.sin(Pn+2*np.pi*np.sin(th[tt])*n*d))
sigma_R2=1/2*np.sum(An*sigma_An**2)+1/2*(1-np.exp(-sigma_Pn**2))*np.sum(An**2)+1/2*np.sum(np.cos(2*(Pn+2*np.pi*np.sin(th[tt])*n*d))*((An**2+sigma_An**2)*np.exp(-2*sigma_Pn**2)-An**2*np.exp(-sigma_Pn**2)))
sigma_I2=1/2*np.sum(An*sigma_An**2)+1/2*(1-np.exp(-sigma_Pn**2))*np.sum(An**2)-1/2*np.sum(np.cos(2*(Pn+2*np.pi*np.sin(th[tt])*n*d))*((An**2+sigma_An**2)*np.exp(-2*sigma_Pn**2)-An**2*np.exp(-sigma_Pn**2)))
Vs=np.linspace(0,30,1000)
inc=np.max(Vs)/len(Vs)
integration_points=200
PDF=np.zeros(len(Vs))
for vv in range(len(Vs)):
PDF[vv]=np.trapz(1/np.sqrt(2*np.pi)/np.sqrt(sigma_R2)*np.exp(-(Vs[vv]*np.cos(np.linspace(0,2*np.pi,integration_points))-eR)**2/2/sigma_R2)*1/np.sqrt(2*np.pi)/np.sqrt(sigma_I2)*np.exp(-(Vs[vv]*np.sin(np.linspace(0,2*np.pi,integration_points))-eI)**2/2/sigma_I2),np.linspace(0,2*np.pi,integration_points))
total=np.trapz(PDF,Vs)
values=np.cumsum(PDF)*inc/total
idx = (np.abs(values-val_min)).argmin()
xval_05=values[idx]
fmin[tt]=Vs[np.where(values==xval_05)[0][0]]
idx = (np.abs(values-val_max)).argmin()
xval_95=values[idx]
fmax[tt]=Vs[np.where(values==xval_95)[0][0]]
return fmin,fmax
N = 20
n=np.linspace(0,N-1,N)
d=1
sigma_An=1/2**6
sigma_Pn=2*np.pi/2**6
An=np.ones(N)
Pn=np.zeros(N)
th=np.linspace(0,np.pi/2,250)
R=np.sum(An*np.cos(Pn+2*np.pi*np.sin(th[:,np.newaxis])*n*d),axis=1)
I=np.sum(An*np.sin(Pn+2*np.pi*np.sin(th[:,np.newaxis])*n*d),axis=1)
F=R+1j*I
Fa=np.abs(F)/np.max(np.abs(F))
fmin, fmax = margins(0.05,0.95)
I'm trying to solve an optimization problem in Python. At this point, I'm already familiar with SciPy, but I didn't manage to use it properly with a constraint of unique integer values.
The example below might fit the mlrose tag better.
At a high level, I'm trying to create a Swiss pairing. I have seen a few articles, and one of them suggested building a "penalty matrix" and minimizing the penalty. Here is what I have done:
import mlrose
import numpy as np
# Create Penalty Matrix
x = np.round(np.random.rand(8,8)*50)
z = np.eye(8, dtype=int)*100 + x
print(z)
# fitness problem given a penalty matrix and an order
def pairing_fittness(order, panalty):
print(order)
order = np.array(order)
a = np.bincount(order)
order = order.reshape(-1, 2)
PF = 0
for pair in order:
print("{}, {}: {}".format(pair[0],pair[1], panalty[pair[0],pair[1]]))
PF = PF + panalty[pair[0],pair[1]]
print("Real PF: ",PF)
print("order penalty: {}".format((np.max(a) - 1) * 1000))
return (PF + (np.max(a) - 1) * 1000)
One of the constraints this tries to enforce is that the array has unique values (the same player cannot play twice in the same round), which is why the penalty for duplicated values is high (1000).
kwargs = {'panalty': z}
fitness_cust_problem_fun = mlrose.CustomFitness(pairing_fittness, **kwargs)
problem = mlrose.DiscreteOpt(length = 8,
fitness_fn = fitness_cust_problem_fun,
maximize = False,
max_val = 8,
)
best_state, best_fitness = mlrose.simulated_annealing(
problem,
max_attempts = 300,
max_iters = 100000,
random_state = 1)
print(best_state)
print(best_fitness)
No matter what I do, with more than 6 variables it cannot find a unique-valued array to optimize, while I can do it in Excel (Solver > Evolutionary).
I'm looking for a better tool (I used scipy.optimize, but I'm not sure it works well for integer problems, as I got better results with mlrose) or for suggestions on how to improve my optimization problem so that it is solvable.
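One way to sidestep the duplicate-value problem entirely is to encode the state as a permutation of the player indices, so uniqueness holds by construction, and let a simulated annealer swap two positions per move. A rough sketch using the simanneal package (the class and schedule here are assumptions, not part of the original code; z is a penalty matrix built as in the question):
import numpy as np
import random
import simanneal

class SwissPairing(simanneal.Annealer):
    """State is a permutation of player indices; consecutive entries form a pair."""

    def __init__(self, state, penalty):
        self.penalty = penalty
        super(SwissPairing, self).__init__(state)

    def move(self):
        # swap two random positions, so the state stays a valid permutation
        i, j = random.sample(range(len(self.state)), 2)
        self.state[i], self.state[j] = self.state[j], self.state[i]

    def energy(self):
        pairs = np.array(self.state).reshape(-1, 2)
        return sum(self.penalty[a, b] for a, b in pairs)

z = np.eye(8) * 100 + np.round(np.random.rand(8, 8) * 50)   # penalty matrix as in the question
opt = SwissPairing(list(range(8)), z)
best_state, best_energy = opt.anneal()
print(best_state, best_energy)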
I am writing some code that chooses n random hyperplanes in 5 dimensions that go through the origin. It then samples no_points points uniformly at random on the unit sphere and counts how many of the regions created by the hyperplanes have at least one point in them. This is relatively simple to do using the following Python code.
import numpy as np
def points_on_sphere(dim, N, norm=np.random.normal):
"""
http://en.wikipedia.org/wiki/N-sphere#Generating_random_points
"""
normal_deviates = norm(size=(N, dim))
radius = np.sqrt((normal_deviates ** 2).sum(axis=0))
points = normal_deviates / radius
return points
n = 100
d = 5
hpoints = points_on_sphere(n, d).T
for no_points in range(0, 10000000, 100000):
test_points = points_on_sphere(no_points,d).T
#The next two lines count how many of the test_points are in different regions created by the hyperplanes
signs = np.sign(np.inner(test_points, hpoints))
print(no_points, len(set(map(tuple, signs))))
Unfortunately, my naive method of counting how many of the points are in different regions is slow. Overall the method takes O(no_points * n * d) time, and in practice it is too slow and too RAM-hungry once no_points reaches about 1,000,000. In particular, it reaches 4GB of RAM at no_points = 900,000.
Can this be done more efficiently so that no_points can get all the way to 10,000,000 (actually it would be great if it could go to 10 times that) fairly quickly and using less than 4GB of RAM?
Storing how each test point classifies with respect to each hyperplane is a lot of data. I would suggest an implicit radix sort on the point labels, e.g.,
import numpy as np
d = 5
n = 100
N = 100000
is_boundary = np.zeros(N, dtype=bool)
tpoints = np.random.normal(size=(N, d))
tperm = np.arange(N)
for i in range(n):
hpoint = np.random.normal(size=d)
region = np.cumsum(is_boundary) * 2 + (np.inner(hpoint, tpoints) < 0.0)[tperm]
region_order = np.argsort(region)
is_boundary[1:] = np.diff(region[region_order])
tperm = tperm[region_order]
print(np.sum(is_boundary))
This code keeps a permutation of the test points (tperm) such that all points in the same region are consecutive. is_boundary indicates whether each point is in a different region from the previous one in permutation order. For each successive hyperplane, we partition each of the existing regions and effectively discard the empty regions to avoid storing all 2^100 of them.
Actually, since you have so many points and so few hyperplanes, it makes more sense not to store the points. The following code packs the region signature into two doubles using binary encoding.
import numpy as np
d = 5
hpoints = np.random.normal(size=(100, d))
bits = np.zeros((2, 100))
bits[0, :50] = 2.0 ** np.arange(50)
bits[1, 50:] = 2.0 ** np.arange(50)
N = 100000
uniques = set()
for i in range(0, N, 1000):
tpoints = np.random.normal(size=(1000, d))
signatures = np.inner(np.inner(tpoints, hpoints) < 0.0, bits)
uniques.update(map(tuple, signatures))
print(len(uniques))
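If you would rather avoid the bit/double bookkeeping, a hedged alternative (same idea, not from the original answer) is to let np.packbits compress each 100-bit signature into 13 bytes and hash those bytes directly:
import numpy as np

d = 5
hpoints = np.random.normal(size=(100, d))

N = 100000
uniques = set()
for i in range(0, N, 1000):
    tpoints = np.random.normal(size=(1000, d))
    signs = np.inner(tpoints, hpoints) < 0.0     # (1000, 100) boolean region signature
    packed = np.packbits(signs, axis=1)          # 13 bytes per point
    uniques.update(row.tobytes() for row in packed)
print(len(uniques))
This keeps per-point storage tiny and avoids splitting the signature across two floats (a double can hold at most 53 exact integer bits, which is why the code above uses two of them for 100 hyperplanes).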