Speeding up a pytorch tensor operation - python

I am trying to speed up the operation below with some sort of matrix/vector multiplication; can anyone see a nice, quick solution?
It should also work for the special case where a tensor has shape torch.Size([]) (0-dimensional), but I am not able to initialize such a tensor.
See the image below for the type of tensor I am referring to:
[image: tensor to add to test]
def adstock_geometric(x: torch.Tensor, theta: float):
    x_decayed = torch.zeros_like(x)
    x_decayed[0] = x[0]
    for xi in range(1, len(x_decayed)):
        x_decayed[xi] = x[xi] + theta * x_decayed[xi - 1]
    return x_decayed
def adstock_multiple_samples(x: torch.Tensor, theta: torch.Tensor):
    listtheta = theta.tolist()
    if isinstance(listtheta, float):
        return adstock_geometric(x=x, theta=theta)
    x_decayed = torch.zeros((100, 112, 1))
    for idx, theta_ in enumerate(listtheta):
        x_decayed_one_entry = adstock_geometric(x=x, theta=theta_)
        x_decayed[idx] = x_decayed_one_entry
    return x_decayed
if __name__ == '__main__':
    ones = torch.tensor([1])
    hundreds = torch.tensor([idx for idx in range(100)])
    x = torch.tensor([[idx] for idx in range(112)])
    ones = adstock_multiple_samples(x=x, theta=ones)
    hundreds = adstock_multiple_samples(x=x, theta=hundreds)
    print(ones)
    print(hundreds)

I came up with the following, which is 40 times faster on your example:
import torch

def adstock_multiple_samples(x: torch.Tensor, theta: torch.Tensor):
    arange = torch.arange(len(x))
    powers = (arange[:, None] - arange).clip(0)
    return ((theta[:, None, None] ** powers[None, :, :]).tril() * x).sum(-1)
It behaves as expected:
>>> x = torch.arange(112)
>>> theta = torch.arange(100)
>>> adstock_multiple_samples(x, theta)
... # the same output
Note that I treated x as a 1D tensor, since in your example the second dimension was not needed.
It also works with theta = torch.empty((0,)), and it returns an empty tensor.
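As a quick sanity check, one can compare the vectorized version against the loop implementation from the question (a minimal sketch, assuming both adstock_geometric and the new adstock_multiple_samples are in scope):
import torch

# Compare the vectorized result against the original loop, one theta at
# a time; the two should agree up to floating-point tolerance.
x = torch.rand(112)
theta = torch.rand(100)
expected = torch.stack([adstock_geometric(x, t.item()) for t in theta])
actual = adstock_multiple_samples(x, theta)
assert torch.allclose(expected, actual, rtol=1e-4, atol=1e-6)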

Related

Must inputs have the same numpy shape in numba's @vectorize?

Windows 10, Python 3.9, Numba 0.53.1, NumPy 1.22.2
I'm using Numba with Python to make use of my GPU.
This is my code sample.
import numpy as np
from numba import vectorize, guvectorize
from timeit import default_timer as timer

@vectorize(["boolean(float64, int64, int64)"], target="cuda")
def vector_add_gpu(a, b, c):
    """
    Do something
    """
    return True
def main():
    a_source = np.ones(10, dtype=np.float64)
    b_source = np.ones(100000, dtype=np.int64)
    d_source = 10
    # Time the GPU function
    start = timer()
    vector_add_gpu(a_source, b_source, d_source)
    vector_add_gpu_time = timer() - start
    print("GPU function took %f seconds." % vector_add_gpu_time)
    return 0

if __name__ == "__main__":
    main()
But I got this error:
failed to broadcast argument #1
If I pass arguments of the same shape, it works, like:
def main():
    a_source = np.ones(100000, dtype=np.float64)
    b_source = np.ones(100000, dtype=np.int64)
    d_source = 10
Sadly, I must use numpy arrays of different shapes in my code.
So, can vectorize only be used when the numpy inputs have the same shape?
I apologize for my lack of explanation. I just want to run the function using Numba with CUDA, because my code is slow...
Here is my code:
# @vectorize(["boolean(int64, uint8, int64, int64, int64)"], target="cuda")
# @guvectorize(["void(int64, uint8, int64, int64, int64)"], '(), (), (), ()->()', target="cuda")
def _deleting_from_endpoints(coords, input_image, ar_x, ar_y, max_value):
    for (x, y) in coords:
        ar_x[0], ar_y[0] = x, y
        count = 0
        for i in range(1, max_value):
            count += 1
            x_, y_ = ar_x[i - 1], ar_y[i - 1]
            input_image[x_, y_] = 0
            if count > max_value:
                # input_image[ar_x[:count], ar_y[:count]] = 1
                for v in range(count):
                    input_image[ar_x[count], ar_y[count]] = 1
                break
            x__, y__ = np.where(input_image[x_ - 1:x_ + 2, y_ - 1:y_ + 2])
            if len(x__) != 0:
                ar_x[i] = x_ + x__[0] - 1
                ar_y[i] = y_ + y__[0] - 1
            else:
                break
    return True
Argument types:
coords: numpy array, dtype=int64
input_image: numpy array, dtype=uint8
ar_x: numpy array, dtype=int64
ar_y: numpy array, dtype=int64
max_value: int
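For what it's worth, @vectorize builds a NumPy ufunc, so its inputs do not have to be the same shape, only broadcast-compatible. A minimal sketch illustrating this (CPU target so it runs without a GPU; the function and shapes here are just for illustration):
import numpy as np
from numba import vectorize

# The inputs only need to be broadcast-compatible, not identical:
# (10, 1) and (100000,) broadcast to (10, 100000).
@vectorize(["float64(float64, int64)"])
def add(a, b):
    return a + b

a = np.ones((10, 1), dtype=np.float64)
b = np.ones(100000, dtype=np.int64)
print(add(a, b).shape)  # (10, 100000)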

Scipy optimize unable to find the correct results

I am trying to use scipy.optimize.minimize to fit the parameters of a multivariate function; however, regardless of how many noise-free data points I provide to the optimizer, it cannot converge to a correct (or even close) answer.
I wonder if there is a mistake in the way I am using the optimizer, but I have been scratching my head trying to find it. I would appreciate any advice or guesses, thanks!
import numpy as np
from scipy.optimize import minimize
import math

def get_transform(ai, aj, ak, x, y, z):
    i, j, k = 0, 1, 2
    si, sj, sk = math.sin(ai), math.sin(aj), math.sin(ak)
    ci, cj, ck = math.cos(ai), math.cos(aj), math.cos(ak)
    cc, cs = ci*ck, ci*sk
    sc, ss = si*ck, si*sk
    M = np.identity(4)
    M[i, i] = cj*ck
    M[i, j] = sj*sc - cs
    M[i, k] = sj*cc + ss
    M[j, i] = cj*sk
    M[j, j] = sj*ss + cc
    M[j, k] = sj*cs - sc
    M[k, i] = -sj
    M[k, j] = cj*si
    M[k, k] = cj*ci
    M[0, 3] = x
    M[1, 3] = y
    M[2, 3] = z
    return M

def camera_intrinsic(fx, ppx, fy, ppy):
    K = np.zeros((3, 3), dtype='float64')
    K[0, 0], K[0, 2] = fx, ppx
    K[1, 1], K[1, 2] = fy, ppy
    K[2, 2] = 1
    return K

def apply_transform(p, matrix):
    rotation = matrix[0:3, 0:3]
    T = np.array([matrix[0][3], matrix[1][3], matrix[2][3]])
    transformed = (np.dot(rotation, p.T).T) + T
    return transformed

def project(points_3D, internal_calibration):
    points_3D = points_3D.T
    projections_2d = np.zeros((2, points_3D.shape[1]), dtype='float32')
    camera_projection = (internal_calibration).dot(points_3D)
    projections_2d[0, :] = camera_projection[0, :] / camera_projection[2, :]
    projections_2d[1, :] = camera_projection[1, :] / camera_projection[2, :]
    return projections_2d.T

def error(x):
    global points, pixels
    transform = get_transform(x[0], x[1], x[2], x[3], x[4], x[5])
    points_transfered = apply_transform(points, transform)
    internal_calibration = camera_intrinsic(x[6], x[7], x[8], x[9])
    projected = project(points_transfered, internal_calibration)
    # print(((projected - pixels)**2).mean())
    return ((projected - pixels)**2).mean()

def generate(points, x):
    transform = get_transform(x[0], x[1], x[2], x[3], x[4], x[5])
    points_transfered = apply_transform(points, transform)
    internal_calibration = camera_intrinsic(x[6], x[7], x[8], x[9])
    projected = project(points_transfered, internal_calibration)
    return projected

points = np.random.rand(100, 3)
x_initial = np.random.rand(10)
pixels = generate(points, x_initial)
x_guess = np.random.rand(10)
results = minimize(error, x_guess, method='nelder-mead', tol=1e-15)
x = results.x
print(x_initial)
print(x)
You are solving a least-squares problem, but trying to optimize it with a solver that minimizes a scalar function. While such a solver can possibly find the solution, it does so very inefficiently: it can require many more iterations, or can fail to converge at all.
The better way is to use least_squares instead of minimize.
For it to work properly, you should modify the error function to return a 1D numpy array instead of a scalar:
def error(x):
    ...
    return (projected - pixels).flatten()
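For clarity, the complete modified function would look like this (a sketch reusing the helpers defined in the question):
def error(x):
    global points, pixels
    transform = get_transform(x[0], x[1], x[2], x[3], x[4], x[5])
    points_transfered = apply_transform(points, transform)
    internal_calibration = camera_intrinsic(x[6], x[7], x[8], x[9])
    projected = project(points_transfered, internal_calibration)
    # Return the per-coordinate residuals as a flat 1D array.
    return (projected - pixels).flatten()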
Then call least_squares (it also comes from scipy.optimize):
from scipy.optimize import least_squares

results = least_squares(error, x_guess)
x = results.x
print(x_initial)
print(x)
print('error:', np.linalg.norm(error(x)))
Also, error(x) currently returns an array of float32, because a float32 array is created inside project. It should be float64 instead; otherwise the minimization fails to converge, because most of the gradients become zero at 32-bit precision.
def project(points_3D, internal_calibration):
    ...
    projections_2d = np.zeros((2, points_3D.shape[1]), dtype='float64')
With these modifications the solver converges to the solution most of the time, but it can sometimes fail. This happens because you generate the problem randomly, so in some cases it may be degenerate or make no physical sense; such cases should be investigated on their own.
It can also help to use a robust loss, such as 'arctan', instead of the default linear loss:
results = least_squares(error, x_guess, loss='arctan')
Result:
[0.68589904 0.68782115 0.83299068 0.02360941 0.19367124 0.54715374
0.37609235 0.62190714 0.98824796 0.88385802]
[0.68589904 0.68782115 0.83299068 0.02360941 0.19367124 0.54715374
0.37609235 0.62190714 0.98824796 0.88385802]
error: 1.2269443642313758e-12

How to get shape of previous layer and pass it to next layer?

I want to take the shape of the input data that is passed to the Input layer (which has shape (None,)) and use it in a for loop.
Here's part of my code implementation:
lst_rfrm = []
Inpt_lyr = keras.Input(shape=(None,))
for k in range(tm_stp):
    F = keras.layers.Lambda(lambda x, i, j: x[:, None, j : j + i])
    F.arguments = {'i': sub_len, 'j': k}
    tmp_rfrm = F(Inpt_lyr)
    lst_rfrm.append(tmp_rfrm)
cnctnt_lyr = keras.layers.merge.Concatenate(axis=1)(lst_rfrm)
# defining other layers ...
Because the input shape is (None,), I don't know what to give the for loop as its range (in the code above I call it tm_stp). How can I get the shape of the input layer (that is, of the data passed to it) in this situation?
Any help is deeply appreciated.
You can try a different type of loop. It seems you are building sliding windows, right?
You don't know the "length" to run over, but you do know the window size and how much of the borders to remove... so...
This function gets the slices following that principle:
windowSize = sub_len

def getWindows(x):
    borderCut = windowSize - 1  # lost length in the length dimension
    leftCut = range(windowSize)  # start of sequence
    rightCut = [i - borderCut for i in leftCut]  # end of sequence - negative
    rightCut[-1] = None  # because it can't be zero for slicing
    croppedSequences = K.stack([x[:, l: r] for l, r in zip(leftCut, rightCut)], axis=-1)
    return croppedSequences
Running test:
from keras.layers import *
from keras.models import Model
import keras.backend as K
import numpy as np

windowSize = 3
batchSize = 5
randomLength = np.random.randint(5, 10)
inputData = np.arange(randomLength * batchSize).reshape((batchSize, randomLength))

def getWindows(x):
    borderCut = windowSize - 1
    leftCut = range(windowSize)
    rightCut = [i - borderCut for i in leftCut]
    rightCut[-1] = None
    croppedSequences = K.stack([x[:, l: r] for l, r in zip(leftCut, rightCut)], axis=-1)
    return croppedSequences

inputs = Input((None,))
outputs = Lambda(getWindows)(inputs)
model = Model(inputs, outputs)

preds = model.predict(inputData)
for i, (inData, pred) in enumerate(zip(inputData, preds)):
    print('sample: ', i)
    print('input sequence: ', inData)
    print('output sequence: \n', pred, '\n\n')
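If you do also need the dynamic length of the (None,) axis itself, it can be read symbolically inside the graph with K.shape; a small sketch (illustrative only, using the same imports as above):
# Each sample is mapped to its own sequence length (cast to float so it
# can be multiplied against the float input).
inputs = Input((None,))
length = Lambda(lambda x: K.cast(K.shape(x)[1], 'float32') * K.ones_like(x[:, :1]))(inputs)
model = Model(inputs, length)
print(model.predict(np.zeros((2, 7))))  # -> [[7.], [7.]]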

Is there an approach in Python faster than a double "for" loop to evaluate a function characterized by an internal vector?

I have the following function:
import numpy as np

k = np.linspace(0, 5, 100)

def f(x, y):
    m = k
    return sum(np.sin(m - x)*np.exp(-y**2))
I would like to obtain a 2D grid of values of f(x,y) evaluated on these two arrays:
x = np.linspace(0, 4, 30)
y = np.linspace(0, 2, 70)
Is there a faster way to calculate this than a double "for" loop like the one below?
matrix = np.zeros((len(x), len(y)))
for i in range(len(x)):
    for j in range(len(y)):
        matrix[i, j] = f(x[i], y[j])
z = matrix.T
I tried to use numpy's meshgrid function in this way:
xx, yy = np.meshgrid(x, y)
z = f(xx, yy)
however, I got the following error message:
ValueError: operands could not be broadcast together with shapes (100,) (70,30).
Here's a numpy approach. If we start with your original array setup,
k = np.linspace(0, 5, 100)
x = np.linspace(0, 4, 30)
y = np.linspace(0, 2, 70)
then
matrix = np.sin(k[:, np.newaxis] - x).sum(axis=0)[:, np.newaxis] * np.exp(-y**2)
returns the same (30, 70) "matrix" calculated with the double "for" loop.
For reference, https://docs.scipy.org/doc/numpy/user/basics.broadcasting.html outlines the broadcasting rules in numpy, and https://www.numpy.org/devdocs/user/theory.broadcasting.html gives a nice illustration of using these rules.
k = np.linspace(0, 5, 100)
x = np.linspace(0, 4, 30)
y = np.linspace(0, 2, 70)

def f(x, y):
    # m = k  (not needed; k can be used directly)
    return sum(np.sin(k - x)*np.exp(-y**2))

# original
def g():
    m = np.zeros((len(x), len(y)))
    for i in range(len(x)):
        for j in range(len(y)):
            m[i, j] = f(x[i], y[j])
    return m.T

# k.shape, x.shape, y.shape -> (100,), (30,), (70,)
# sine of each k minus each x
q = np.sin(k[:, None] - x)   # q.shape -> (100, 30)
# [sine of each k minus each x] times [e to the negative (y squared)]
r = np.exp(-y**2)            # r.shape -> (70,)
s = q[..., None] * r         # s.shape -> (100, 30, 70)
t = s.sum(0)
v = t.T                      # v.shape -> (70, 30)

assert np.all(np.isclose(v, g()))
# Note: an exact comparison (v == g()) is not guaranteed to hold,
# because the two versions sum the terms in different orders.

How to use an if statement in PyTorch with torch.FloatTensor

I am trying to use an if statement in my PyTorch code, with torch.FloatTensor as the data type, to speed it up on the GPU.
This is my code:
import torch
import time

def fitness(x):
    return torch.pow(x, 2)

def velocity(v, gxbest, pxbest, pybest, x, pop):
    return torch.rand(pop).type(dtype)*v + \
           torch.rand(pop).type(dtype)*(pxbest - x) + \
           torch.rand(pop).type(dtype)*(gxbest.expand(x.size(0)) - x)

dtype = torch.cuda.FloatTensor

def main():
    pop, xmax, xmin, niter = 300000, 50, -50, 100
    v = torch.rand(pop).type(dtype)
    x = (xmax - xmin)*torch.rand(pop).type(dtype) + xmin
    y = fitness(x)
    [miny, indexminy] = y.min(0)
    gxbest = x[indexminy]
    pxbest = x
    pybest = y
    for K in range(niter):
        vnext = velocity(v, gxbest, pxbest, pybest, x, pop)
        xnext = x + vnext
        ynext = fitness(x)
        [minynext, indexminynext] = ynext.min(0)
        if (minynext < miny):
            miny = minynext
            gxbest = xnext[indexminynext]
        indexpbest = (ynext < pybest)
        pxbest[indexpbest] = xnext[indexpbest]
        pybest[indexpbest] = ynext[indexpbest]
        x = xnext
        v = vnext

main()
Unfortunately, it is not working. It gives me an error message and I cannot figure out what the problem is:
RuntimeError: bool value of non-empty torch.cuda.ByteTensor objects is ambiguous
How can I use the if in PyTorch? I tried to convert the cuda tensors into numpy arrays, but that did not work either:
minynext = minynext.cpu().numpy()
miny = miny.cpu().numpy()
PS: Am I writing the code in the most efficient way possible, or should I change something to get faster results?
If you look at the following simple example:
import torch

a = torch.LongTensor([1])
b = torch.LongTensor([5])
print(a > b)
Output:
0
[torch.ByteTensor of size 1]
Comparing tensors a and b results in a torch.ByteTensor, which is obviously not equivalent to a boolean. So, you can do the following:
print(a[0] > b[0])  # False
So, you should change your if condition as follows:
if minynext[0] < miny[0]:
When you compare PyTorch tensors, the output is usually a ByteTensor. This data type is not suitable for if statements.
Change the condition inside the if:
if minynext[0] < miny[0]:
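Note that these answers target an old PyTorch release, where indexing a 1-element tensor returns a Python number. In PyTorch 0.4 and later, the idiomatic way to get a Python scalar out of a tensor is .item(); a minimal sketch:
# In newer PyTorch, comparisons return a BoolTensor; extract Python
# scalars with .item() before using them in an if statement.
if minynext.item() < miny.item():
    miny = minynext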
