Sinkhorn algorithm for optimal transport - python

I'm trying to code the Sinkhorn algorithm; in particular, I want to see whether I can recover the optimal transport plan between two measures as the strength of the entropic regularization goes to 0.
For example, let's transport the uniform measure $U$ over $[0;1]$ onto the uniform measure $V$ over $[1;2]$.
The optimal plan for the quadratic cost is $(x, x+1)_{\#} U$, the push-forward of $U$ under $x \mapsto (x, x+1)$.
Let's discretize $[0;1]$, the measure $U$, $[1;2]$ and the measure $V$. Using Sinkhorn, I'm supposed to get a measure whose support is contained in the graph of the line $y = x+1$. But that's not what I get, so I'm trying to find the problem. I'm going to show you my code and my results; maybe someone can help me.
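For reference, the update the code below implements is the standard Sinkhorn iteration on the Gibbs kernel $K = e^{-\lambda C}$, with entrywise divisions:
$$a^{(k+1)} = \frac{\mu}{K b^{(k)}}, \qquad b^{(k+1)} = \frac{\nu}{K^{T} a^{(k+1)}}, \qquad \gamma^{*} \approx \operatorname{diag}(a)\, K \,\operatorname{diag}(b).$$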
import numpy as np
import math
from mpl_toolkits import mplot3d
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
import matplotlib.colors as colors
#Parameters
N = 10 #Number of points in the discretization of [0,1]
stop = 10**-3
Niter = 10**3
def Sinkhorn(C, mu, nu, lamb):
    # lamb : entropic regularization parameter (Gibbs kernel exp(-lamb*C))
    #Initialization
    a1 = np.zeros(N)
    b1 = np.zeros(N)
    a2 = np.ones(N)
    b2 = np.ones(N)
    Iter = 0
    GammaB = np.exp(-lamb*C)
    #Sinkhorn iterations
    while (np.linalg.norm(a2) > stop and np.linalg.norm(b2) > stop
           and np.linalg.norm(a2) < 1/stop and np.linalg.norm(b2) < 1/stop
           and Iter < Niter
           and np.linalg.norm(a1-a2) + np.linalg.norm(b1-b2) > stop):
        a1 = a2
        b1 = b2
        a2 = mu/np.dot(GammaB, b1)
        b2 = nu/np.dot(GammaB.T, a2)
        Iter += 1
    # Compute gamma_star
    Gamma = np.zeros((N, N))
    for i in range(N):
        for j in range(N):
            Gamma[i][j] = a2[i]*b2[j]*GammaB[i][j]
    Gamma /= Gamma.sum()
    return Gamma
## Test between uniform([0;1]) and uniform([1;2])
S = np.linspace(0,1,N, False) #discretization of [0,1]
T = np.linspace(1,2,N, False) #discretization of [1,2]
# Discretization of uniform([0;1])
U01 = np.ones(N)
Mass = np.sum(U01)
U01 = U01/Mass
# Discretization of uniform([1;2])
U12 = np.ones(N)
Mass = np.sum(U12)
U12 = U12/Mass
# Cost function
X,Y = np.meshgrid(S,T)
C = (X-Y)**2 #Cost matrix; with the default meshgrid indexing this is C[i,j] = (S[j]-T[i])**2
def plot_Sinkhorn_U01_U12():
    #plot the optimal measure for several values of lamb
    fig = plt.figure()
    for i in range(4):
        ax = fig.add_subplot(2, 2, i+1, projection='3d')
        Gamma_star = Sinkhorn(C, U01, U12, 1/10**i)
        ax.scatter(X, Y, Gamma_star, cmap='viridis', linewidth=0.5)
        plt.title("Gamma_bar({}) between uniform([0,1]) and uniform([1,2])".format(1/10**i))
    plt.show()
    plt.figure()
    for i in range(4):
        plt.subplot(2, 2, i+1)
        Gamma_star = Sinkhorn(C, U01, U12, 1/10**i)
        plt.imshow(Gamma_star, interpolation='none')
        plt.title("Gamma_bar({}) between uniform([0,1]) and uniform([1,2])".format(1/10**i))
    plt.show()
    return
# The transport map between U01 and U12 is x -> x+1, so the support of gamma^* is contained in the graph of the function x -> (x, x+1), i.e. the line y = x+1
plot_Sinkhorn_U01_U12()
And here is what I get.
As advised, here is the output of my code when I work with 1/lamb instead. It's much better, but still not correct. Here is Gamma_star(125):
[[0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.09 0.01]
[0. 0. 0. 0. 0. 0. 0. 0. 0.01 0.01]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.01]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.01]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.01]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.01]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.01]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.01]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.01]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.01]]
We can see that the support of the measure gamma_star is not contained in the line $y = x+1$ (the diagonal of the matrix).
Thanks and regards.

It's not the final answer, but we're getting closer.
As advised, I simplified my while condition. For example, with the single condition
while (Iter < Niter):
this is the matrix I get for Gamma_star(125):
[[0.08 0.02 0. 0. 0. 0. 0. 0. 0. 0. ]
[0.02 0.06 0.02 0. 0. 0. 0. 0. 0. 0. ]
[0. 0.02 0.06 0.02 0. 0. 0. 0. 0. 0. ]
[0. 0. 0.02 0.06 0.02 0. 0. 0. 0. 0. ]
[0. 0. 0. 0.02 0.06 0.02 0. 0. 0. 0. ]
[0. 0. 0. 0. 0.02 0.06 0.02 0. 0. 0. ]
[0. 0. 0. 0. 0. 0.02 0.06 0.02 0. 0. ]
[0. 0. 0. 0. 0. 0. 0.02 0.06 0.02 0. ]
[0. 0. 0. 0. 0. 0. 0. 0.02 0.06 0.02]
[0. 0. 0. 0. 0. 0. 0. 0. 0.02 0.08]]
It's closer to my expectation, which is $\text{Gamma\_star}(i,j) = 0$ for $j \ne i$ (the discrete support should be the diagonal, since $y_i = x_i + 1$); the remaining off-diagonal mass is the entropic blurring, which should shrink as lamb grows.
The new code is :
import numpy as np
import math
from mpl_toolkits import mplot3d
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
import matplotlib.colors as colors
#Parameters
N = 10 #Number of points in the discretization of [0,1]
Niter = 10**5
def Sinkhorn(C, mu, nu, lamb):
    # lamb : entropic regularization parameter (Gibbs kernel exp(-lamb*C))
    #Initialization
    a1 = np.zeros(N)
    b1 = np.zeros(N)
    a2 = np.ones(N)
    b2 = np.ones(N)
    Iter = 0
    GammaB = np.exp(-lamb*C)
    #Sinkhorn iterations
    while (Iter < Niter):
        a1 = a2
        b1 = b2
        a2 = mu/np.dot(GammaB, b1)
        b2 = nu/np.dot(GammaB.T, a2)
        Iter += 1
    # Compute gamma_star
    Gamma = np.zeros((N, N))
    for i in range(N):
        for j in range(N):
            Gamma[i][j] = a2[i]*b2[j]*GammaB[i][j]
    Gamma /= Gamma.sum()
    return Gamma
## Test between uniform([0;1]) and uniform([1;2])
S = np.linspace(0,1,N, False) #discretization of [0,1]
T = np.linspace(1,2,N, False) #discretization of [1,2]
# Discretization of uniform([0;1])
U01 = np.ones(N)
Mass = np.sum(U01)
U01 = U01/Mass
# Discretization of uniform([1;2])
U12 = np.ones(N)
Mass = np.sum(U12)
U12 = U12/Mass
# Cost function
X,Y = np.meshgrid(S,T)
C = (X-Y)**2 #Cost matrix; with the default meshgrid indexing this is C[i,j] = (S[j]-T[i])**2
def plot_Sinkhorn_U01_U12():
    #plot the optimal measure for several values of lamb
    fig = plt.figure()
    for i in range(4):
        ax = fig.add_subplot(2, 2, i+1, projection='3d')
        Gamma_star = Sinkhorn(C, U01, U12, 5**i)
        ax.scatter(X, Y, Gamma_star, cmap='viridis', linewidth=0.5)
        plt.title("Gamma_bar({}) between uniform([0,1]) and uniform([1,2])".format(5**i))
    plt.show()
    plt.figure()
    for i in range(4):
        plt.subplot(2, 2, i+1)
        Gamma_star = Sinkhorn(C, U01, U12, 5**i)
        plt.imshow(Gamma_star, interpolation='none')
        plt.title("Gamma_bar({}) between uniform([0,1]) and uniform([1,2])".format(5**i))
    plt.show()
    return
# The transport map between U01 and U12 is x -> x+1, so the support of gamma^* is contained in the graph of the function x -> (x, x+1), i.e. the line y = x+1
plot_Sinkhorn_U01_U12()
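As a quick sanity check (a sketch of my own, not part of the algorithm): the row sums of the returned plan should recover U01 and the column sums should recover U12.
Gamma_star = Sinkhorn(C, U01, U12, 125)
print(np.abs(Gamma_star.sum(axis=1) - U01).max())  # should be close to 0
print(np.abs(Gamma_star.sum(axis=0) - U12).max())  # should be close to 0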

Related

sklearn for LASSO (BIC tuning)

We encounter a problem when using the LASSO-related functions in sklearn. Since LASSO with BIC tuning only selects the alpha, the results of LASSO with BIC (1) should be equivalent to those of LASSO with that fixed optimal alpha (2).
linear_model.LassoLarsIC
linear_model.Lasso
First, we could consider the simple DGP setting:
################## DGP ##################
import numpy as np
from sklearn import linear_model

np.random.seed(10)
T = 200 # sample size
p = 100 # number of regressors
X = np.random.normal(size = (T, p))
u = np.random.normal(size = T)
beta = np.hstack((np.array([5, 0, 3, 0, 1, 0, 0, 0, 0, 0]), np.zeros(p-10)))
y = np.dot(X, beta) + u
Then we use the LASSO with BIC. linear_model.LassoLarsIC
# LASSO with BIC
lasso = linear_model.LassoLarsIC(criterion='bic')
lasso.fit(X,y)
print("lasso coef = \n {}".format(lasso.coef_))
print("lasso optimal alpha = {}".format(lasso.alpha_))
lasso coef =
[ 4.81934044 0. 2.87574831 0. 0.90031582 0.
0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0.01705965 0.
0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0.
-0.07789506 0. 0.05817856 0. 0. 0.
0. 0. 0. 0. 0. 0.
0. 0. 0. 0. ]
lasso optimal alpha = 0.010764484244859006
Then we use the optimal alpha here with LASSO. linear_model.Lasso
# LASSO with fixed alpha
clf = linear_model.Lasso(alpha=lasso.alpha_)
clf.fit(X,y)
print("lasso coef = \n {}".format(clf.coef_))
lasso coef =
[ 4.93513468e+00 5.42491624e-02 3.00412571e+00 -3.83394653e-02
9.87262697e-01 5.21693412e-03 -2.89977454e-02 -1.40952930e-01
5.18653123e-02 -7.66271662e-02 -1.99074552e-02 2.72228580e-02
-1.01217167e-01 -4.69445223e-02 1.74378470e-01 2.52655725e-02
1.84902632e-02 -7.11030674e-02 -4.15940817e-03 1.98229236e-02
-8.81779536e-02 -3.59094431e-02 5.53212537e-03 9.23031418e-02
1.21577471e-01 -4.73932893e-03 5.15459727e-02 4.17136419e-02
4.49561794e-02 -4.74874460e-03 0.00000000e+00 -3.56968194e-02
-4.43094631e-02 0.00000000e+00 1.00390051e-03 7.17980301e-02
-7.39058574e-02 1.73139031e-02 7.88996602e-02 1.04325618e-01
-4.10356303e-02 5.94564069e-02 0.00000000e+00 9.28354383e-02
0.00000000e+00 4.57453873e-02 0.00000000e+00 0.00000000e+00
-1.94113178e-02 1.97056365e-02 -1.17381604e-01 5.13943798e-02
2.11245596e-01 4.24124220e-02 1.16573094e-01 1.19551223e-02
-0.00000000e+00 -0.00000000e+00 -8.35210244e-02 -8.29230887e-02
-3.16409003e-02 8.43274240e-02 -2.90949577e-02 -0.00000000e+00
1.24697858e-01 -3.07120380e-02 -4.34558350e-02 -0.00000000e+00
1.30491858e-01 -2.04573808e-02 6.72141775e-02 -6.85563204e-02
5.64781612e-02 -7.43380132e-02 1.88610065e-01 -5.53155313e-04
0.00000000e+00 2.43191722e-02 9.10973250e-02 -4.49945551e-02
3.36006276e-02 -0.00000000e+00 -3.85862475e-02 -9.63711465e-02
-2.07015665e-01 8.67164869e-02 1.30776709e-01 -0.00000000e+00
5.42630086e-02 -1.44763258e-01 -0.00000000e+00 -3.29485283e-02
-2.35245212e-02 -6.19975427e-02 -8.83892134e-03 -1.60523703e-01
9.63008989e-02 -1.06953313e-01 4.60206741e-02 6.02880434e-02]
-0.06321829752708413
The two sets of coefficients are different.
Why does this happen?
So the main difference I could find off the bat is the max_iter parameter, which is at 1000 with the Lasso model and at 500 with the LassoLarsIC model.
Other hyperparameters such as tol and selection are not adjustable in the LassoLarsIC implementation.
There might be more nuanced differences in the exact implementation of the two models though.
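One way to narrow this down (a hedged sketch, not a definitive diagnosis): refit at the BIC-selected alpha with LassoLars, which uses the same LARS solver as LassoLarsIC, and also rerun Lasso with a tighter tolerance and more iterations, then compare the coefficient gaps.
from sklearn import linear_model

# Same LARS solver as LassoLarsIC, but at a fixed alpha
lars = linear_model.LassoLars(alpha=lasso.alpha_)
lars.fit(X, y)

# Coordinate descent with stricter convergence settings
cd = linear_model.Lasso(alpha=lasso.alpha_, max_iter=100000, tol=1e-8)
cd.fit(X, y)

print("max |LassoLars - LassoLarsIC| coefficient gap:", abs(lars.coef_ - lasso.coef_).max())
print("max |Lasso - LassoLarsIC| coefficient gap:", abs(cd.coef_ - lasso.coef_).max())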

matplotlib get bitmap from a scatter plot

I have coordinates of some points that I need to plot and then convert the plot to a black & white bitmap:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from PIL import Image
plt.scatter(x,y)
plt.tight_layout()
fig1 = plt.gcf()
plt.show()
type(fig1)
matplotlib.figure.Figure
How can I get a black & white bitmap from this figure as a numpy array, similar to this one:
side = 5
image = np.random.choice([0, 1], size=side*side, p=[.1, .9])
image = image.reshape(side,side)
image = np.expand_dims(image, axis=-1)
print("image.shape: ",image.shape)
plt.imshow(image, cmap=plt.get_cmap('gray'))
image.shape: (5, 5, 1)
print(image.reshape(side,side))
[[1 1 1 0 1]
[1 1 1 1 1]
[1 0 1 1 0]
[1 1 1 1 0]
[1 1 1 1 1]]
Update 1
I also need to get the resulting bitmap as a numpy array. How can I get it?
In case I use the solution given by Zephyr:
fig, ax = plt.subplots(figsize = (5,5))
ax.hist2d(x, y, cmap = 'Greys', cmin = 0, cmax = 1)
plt.show()
I get an image different from the scatter plot, but they should be similar:
You can create a grid and use it to define a map in which the cells closest to your points are marked. Here is my try with random data in the range 0 to 1:
import matplotlib.pyplot as plt
import numpy as np
n_points = 10
# create random coordinates
x, y = np.random.rand(n_points,2).T
fig, ax = plt.subplots()
ax.scatter(x,y)
ax.set_xlim([0,1])
ax.set_ylim([0,1])
ax.set_aspect(1.0)
# create a grid
grid_points = 10
grid_x = np.linspace(0,1,grid_points)
grid_y = grid_x.copy()
# initiate array of ones (white)
image = np.ones([grid_points, grid_points])
for xp, yp in zip(x, y):
    # selecting the closest point in the grid
    index_x = np.argmin(np.abs(xp - grid_x))
    index_y = np.argmin(np.abs(yp - grid_y))
    # setting to black
    image[index_x, index_y] = 0
# you need to transpose it so x is represented
# by the columns and y by the rows
fig, ax = plt.subplots()
ax.imshow(
    image.T,
    origin='lower',
    cmap=plt.get_cmap('gray'))
Note that taking the closest grid point may not always be accurate; it gets better with a more refined grid.
First of all, I generate N random points within the ranges (x_min, x_max) and (y_min, y_max):
np.random.seed(42)
N = 10
x_min = 0
x_max = 40
y_min = -20
y_max = 20
x = np.random.uniform(x_min, x_max, N)
y = np.random.uniform(y_min, y_max, N)
Then I prepare:
a grid (the bitmap) of dimension (size, size)
two vectors x_grid and y_grid which resample (x_min, x_max) and (y_min, y_max) into size + 1 points, i.e. size intervals: one interval for each grid cell
size = 10
grid = np.zeros((size, size))
x_grid = np.linspace(x_min, x_max, size + 1)
y_grid = np.linspace(y_min, y_max, size + 1)
Then I loop over each grid cell; in each iteration I check if there is at least 1 point of (x, y) which stays within the limits of that cell. If so, I set the corresponding value of grid to 1:
for i in range(size):
    for j in range(size):
        for x_i, y_i in zip(x, y):
            if (x_grid[i] < x_i <= x_grid[i + 1]) and (y_grid[j] < y_i <= y_grid[j + 1]):
                grid[i, j] = 1
                break
Resulting numpy matrix:
[[0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
[0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
[0. 0. 1. 0. 0. 0. 0. 0. 1. 0.]
[0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]
Complete Code
import matplotlib.pyplot as plt
import numpy as np
np.random.seed(42)
N = 10
x_min = 0
x_max = 40
y_min = -20
y_max = 20
x = np.random.uniform(x_min, x_max, N)
y = np.random.uniform(y_min, y_max, N)
size = 10
grid = np.zeros((size, size))
x_grid = np.linspace(x_min, x_max, size + 1)
y_grid = np.linspace(y_min, y_max, size + 1)
for i in range(size):
    for j in range(size):
        for x_i, y_i in zip(x, y):
            if (x_grid[i] < x_i <= x_grid[i + 1]) and (y_grid[j] < y_i <= y_grid[j + 1]):
                grid[i, j] = 1
                break
fig, ax = plt.subplots(1, 2, figsize = (10, 5))
ax[0].scatter(x, y)
ax[0].set_xlim(x_min, x_max)
ax[0].set_ylim(y_min, y_max)
ax[0].grid()
ax[0].set_xticks(x_grid)
ax[0].set_yticks(y_grid)
ax[1].imshow(grid.T, cmap = 'Greys', extent = (x_min, x_max, y_min, y_max))
ax[1].invert_yaxis()
plt.show()
NOTE
Pay attention to the fact that for ax.imshow you need to transpose the matrix (grid.T) and then invert the y axis in order to compare the ax.imshow output with the ax.scatter plot.
If you want the grid matrix itself to match ax.imshow, then you need to rotate it counterclockwise by 90°:
grid = np.rot90(grid, k=1, axes=(0, 1))
Rotated grid, which corresponds to the above plot:
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
[0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
[0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
[1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 1. 0. 1. 0. 0.]
[0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]]
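The nested loop above essentially builds a 2D histogram, so as a hedged alternative with the same data, np.histogram2d can produce the bitmap directly by binarizing the counts (H and grid2 are names introduced here):
H, xedges, yedges = np.histogram2d(x, y, bins=size, range=[[x_min, x_max], [y_min, y_max]])
grid2 = (H > 0).astype(float)  # 1 where at least one point falls in the cell
fig, ax = plt.subplots()
ax.imshow(grid2.T, cmap='Greys', origin='lower', extent=(x_min, x_max, y_min, y_max))
plt.show()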

choosing 5 random numbers / coordinates on a grid in python

I need help with making this so I can convert a string into a number in a list. I have done this, but doing it this way I would have to write a dictionary with 100 definitions, which I do not want to do. The code below is just to show what I have already found; as you can see, it would take 100 definitions if I did it this way.
x1 = [0,0,0,0,0,0,0,0,0,0]
x2 = [0,0,0,0,0,0,0,0,0,0]
x3 = [0,0,0,0,0,0,0,0,0,0]
x4 = [0,0,0,0,0,0,0,0,0,0]
x5 = [0,0,0,0,0,0,0,0,0,0]
x6 = [0,0,0,0,0,0,0,0,0,0]
x7 = [0,0,0,0,0,0,0,0,0,0]
x8 = [0,0,0,0,0,0,0,0,0,0]
x9 = [0,0,0,0,0,0,0,0,0,0]
x10 = [0,0,0,0,0,0,0,0,0,0]
my_dict_grid = {
    'x2[3]' : x2[3]
}
x = 'x2[3]'
print(my_dict_grid[x])
If you have multiple arrays you are managing all at once, create a multi-dimensional array:
x = [
[0,0,0,0,0,0,0,0,0,0],
[0,0,0,0,0,0,0,0,0,0],
[0,0,0,0,0,0,0,0,0,0],
[0,0,0,0,0,0,0,0,0,0],
[0,0,0,0,0,0,0,0,0,0],
[0,0,0,0,0,0,0,0,0,0],
[0,0,0,0,0,0,0,0,0,0],
[0,0,0,0,0,0,0,0,0,0],
[0,0,0,0,0,0,0,0,0,0],
[0,0,0,0,0,0,0,0,0,0]
]
In that case, you can just index by row then column:
x[2][3]
Based on your comment, you want to randomly change values in the array. In that case, the approach above is not at all what you want. You want to pick two random numbers, and index to them in x to change them:
import random
for _ in range(5):
    updated = False
    while not updated:
        i = random.randrange(10)
        j = random.randrange(10)
        if x[i][j] == 0:
            x[i][j] = 1
            updated = True
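A hedged variation without the retry loop: since the grid has 100 cells and starts all-zero, random.sample can pick 5 distinct flat indices in one shot (flat and cell are names introduced here):
flat = random.sample(range(100), 5)   # 5 distinct cell numbers in [0, 100)
for cell in flat:
    x[cell // 10][cell % 10] = 1      # row = cell // 10, column = cell % 10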
Original answer to the initial question:
(this is here more as an interesting thing, not as a viable approach)
Okay. Assuming that you have to do it the way you have described, you can generate a dictionary with all of the string keys:
my_dict_grid = {
    f"x{i + 1}[{j}]": arr[j]
    for i, arr in enumerate([x1, x2, x3, x4, x5, x6, x7, x8, x9, x10])
    for j in range(10)
}
However, I have to stress that this is not a good idea.
Three different ways to solve this with one-liners, depending on the output you want:
my_list = [[ 0 for _ in range(10)] for _ in range(10)]
my_dict = {"x"+str(i+1):[ 0 for _ in range(10)] for i in range(10)}
my_dict2 = {"x"+str((i+1)%10)+"["+str(int((i+1)/10))+"]": 0 for i in range(100)}
print(my_list) #[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0,...
print(my_dict) #{'x10': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'x9': [0,...
print(my_dict2)#{'x4[3]': 0, 'x1[9]': 0, 'x6[6]': 0, 'x2[8]': 0,...
A more mathematical, but very practical way to do this with numpy:
import numpy as np
grid_shape = [10, 10] # define 10 x 10 grid
num_ones = 5
cells = np.zeros(grid_shape[0]*grid_shape[1]) # define 10*10 = 100 cells as flat array
cells[0:num_ones] = 1 # Set the first 5 entries to 1
np.random.shuffle(cells) # Shuffle the entries, such that the 1's are at random position
grid = cells.reshape(grid_shape) # shape the grid into the desired shape
Running the code above will, for example, result in grid =
[[0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
[0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]]
Note that, by changing grid_shape you can resize your grid, and by changing num_ones you adapt the number of ones in your grid. It is also guaranteed that there will always be exactly num_ones ones in your grid (given that num_ones is smaller than or equal to the number of elements in the grid).
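An equivalent hedged variant of the same idea, picking num_ones distinct flat positions directly instead of shuffling (flat_idx is a name introduced here):
grid = np.zeros(grid_shape)
flat_idx = np.random.choice(grid_shape[0] * grid_shape[1], size=num_ones, replace=False)
grid.flat[flat_idx] = 1  # set exactly num_ones distinct cells to 1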

Sparse vectors for training data

I have a training data like this:
x_train = np.random.randint(100, size=(1000, 25))
where each row is a sample and thus we have 1000 samples.
Now I need the training data to be such that each sample/row has at most 3 non-zero elements out of the 25.
Can you all please suggest how I can implement that? Thanks!
I am assuming that you want to turn a majority of your data into zeros, except that 0 to 3 non-zero elements are retained (randomly) for each row. If this is the case, a possible way to do this is as follows.
Code
import numpy as np
max_ = 3
nrows = 1000
ncols = 25
np.random.seed(7)
X = np.zeros((nrows,ncols))
data = np.random.randint(100, size=(nrows, ncols))
# max number of non-zeros to be generated for each row
vmax = np.random.randint(low=0, high=4, size=(nrows,))
for i in range(nrows):
    if vmax[i] > 0:
        # indices for setting non-zeros
        col = np.random.randint(low=0, high=ncols, size=(1, vmax[i]))
        # set non-zero elements
        X[i][col] = data[i][col]
print(X)
Output
[[ 0. 68. 25. ... 0. 0. 0.]
[ 0. 0. 0. ... 0. 0. 0.]
[ 0. 0. 0. ... 0. 0. 0.]
...
[ 0. 0. 0. ... 0. 0. 0.]
[88. 0. 0. ... 0. 0. 0.]
[ 0. 0. 0. ... 0. 0. 0.]]
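If the Python loop becomes a bottleneck, here is a hedged vectorized sketch of the same idea (rng, ranks and keep are names introduced here; the double argsort turns i.i.d. uniforms into random ranks per row, so ranks < keep selects exactly keep[i] distinct positions):
import numpy as np

rng = np.random.default_rng(7)
nrows, ncols, max_ = 1000, 25, 3
data = rng.integers(100, size=(nrows, ncols))

ranks = rng.random((nrows, ncols)).argsort(axis=1).argsort(axis=1)  # random ranks 0..ncols-1 in each row
keep = rng.integers(0, max_ + 1, size=(nrows, 1))                   # number of entries kept per row (0 to 3)
X = np.where(ranks < keep, data, 0)                                 # zero out everything else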

create 3D binary image

I have a 2D array, a, comprising a set of 100 x,y,z coordinates:
[[ 0.81 0.23 0.52]
[ 0.63 0.45 0.13]
...
[ 0.51 0.41 0.65]]
I would like to create a 3D binary image, b, with 101 pixels in each of the x,y,z dimensions, of coordinates ranging between 0.00 and 1.00.
Pixels at locations defined by a should take on a value of 1, all other pixels should have a value of 0.
I can create an array of zeros of the right shape with b = np.zeros((101,101,101)), but how do I index into it using a to set those pixels to one?
First, start off by safely rounding your floats to ints. In context, see this question.
a_indices = np.rint(a * 100).astype(int)
Next, assign those indices in b to 1. But be careful to use an ordinary list instead of the array itself, or else it will be treated as a single index array along the first axis. Performance of this method seems comparable to that of the alternatives (thanks @Divakar!):
b[list(a_indices.T)] = 1
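As a hedged side note for newer NumPy versions, where indexing with a sequence of arrays has been tightened, the same assignment can be written with a tuple of index arrays:
b[tuple(a_indices.T)] = 1  # one index array per dimension of b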
I created a small example with size 10 instead of 100, and 2 dimensions instead of 3, to illustrate:
>>> a = np.array([[0.8, 0.2], [0.6, 0.4], [0.5, 0.6]])
>>> a_indices = np.rint(a * 10).astype(int)
>>> b = np.zeros((10, 10))
>>> b[list(a_indices.T)] = 1
>>> print(b)
[[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
[ 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[ 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
You could do something like this -
# Get the XYZ indices
idx = np.round(100 * a).astype(int)
# Initialize o/p array
b = np.zeros((101,101,101))
# Assign into o/p array based on linear index equivalents from indices array
np.put(b,np.ravel_multi_index(idx.T,b.shape),1)
Runtime on the assignment part -
Let's use a bigger grid for timing purposes.
In [82]: # Setup input and get indices array
...: a = np.random.randint(0,401,(100000,3))/400.0
...: idx = np.round(400 * a).astype(int)
...:
In [83]: b = np.zeros((401,401,401))
In [84]: %timeit b[list(idx.T)] = 1 ##Praveen soln
The slowest run took 42.16 times longer than the fastest. This could mean that an intermediate result is being cached.
1 loop, best of 3: 6.28 ms per loop
In [85]: b = np.zeros((401,401,401))
In [86]: %timeit np.put(b,np.ravel_multi_index(idx.T,b.shape),1) # From this post
The slowest run took 45.34 times longer than the fastest. This could mean that an intermediate result is being cached.
1 loop, best of 3: 5.71 ms per loop
In [87]: b = np.zeros((401,401,401))
In [88]: %timeit b[idx[:,0],idx[:,1],idx[:,2]] = 1 #Subscripted indexing
The slowest run took 40.48 times longer than the fastest. This could mean that an intermediate result is being cached.
1 loop, best of 3: 6.38 ms per loop
