Custom torch.nn.Module not learning, even though grad_fn=MmBackward

Custom torch.nn.Module not learning, even though grad_fn=MmBackward - python

I am training a model to predict pose using a custom Pytorch model. However, V1 below never learns (params don't change). The output is connected to the backdrop graph and grad_fn=MmBackward.
I can't understand why V1 isn't learning but V2 is?
V1
class cam_pose_transform_V1(torch.nn.Module):
def __init__(self):
super(cam_pose_transform, self).__init__()
self.elevation_x_rotation_radians = torch.nn.Parameter(torch.normal(0., 1e-6, size=()))
self.azimuth_y_rotation_radians = torch.nn.Parameter(torch.normal(0., 1e-6, size=()))
self.z_rotation_radians = torch.nn.Parameter(torch.normal(0., 1e-6, size=()))
def forward(self, x):
exp_i = torch.zeros((4,4))
c1 = torch.cos(self.elevation_x_rotation_radians)
s1 = torch.sin(self.elevation_x_rotation_radians)
c2 = torch.cos(self.azimuth_y_rotation_radians)
s2 = torch.sin(self.azimuth_y_rotation_radians)
c3 = torch.cos(self.z_rotation_radians)
s3 = torch.sin(self.z_rotation_radians)
rotation_in_matrix = torch.tensor([
[c2, s2 * s3, c3 * s2],
[s1 * s2, c1 * c3 - c2 * s1 * s3, -c1 * s3 - c2 * c3 * s1],
[-c1 * s2, c3 * s1 + c1 * c2 * s3, c1 * c2 * c3 - s1 * s3]
], requires_grad=True)
exp_i[:3, :3] = rotation_in_matrix
exp_i[3, 3] = 1.
return torch.matmul(exp_i, x)
However, this version learns as expected (params and loss change) and also has grad_fn=MmBackward on the output:
V2
def vec2ss_matrix(vector): # vector to skewsym. matrix
ss_matrix = torch.zeros((3,3))
ss_matrix[0, 1] = -vector[2]
ss_matrix[0, 2] = vector[1]
ss_matrix[1, 0] = vector[2]
ss_matrix[1, 2] = -vector[0]
ss_matrix[2, 0] = -vector[1]
ss_matrix[2, 1] = vector[0]
return ss_matrix
class cam_pose_transform_V2(torch.nn.Module):
def __init__(self):
super(camera_transf, self).__init__()
self.w = torch.nn.Parameter(torch.normal(0., 1e-6, size=(3,)))
self.v = torch.nn.Parameter(torch.normal(0., 1e-6, size=(3,)))
self.theta = torch.nn.Parameter(torch.normal(0., 1e-6, size=()))
def forward(self, x):
exp_i = torch.zeros((4,4))
w_skewsym = vec2ss_matrix(self.w)
v_skewsym = vec2ss_matrix(self.v)
exp_i[:3, :3] = torch.eye(3) + torch.sin(self.theta) * w_skewsym + (1 - torch.cos(self.theta)) * torch.matmul(w_skewsym, w_skewsym)
exp_i[:3, 3] = torch.matmul(torch.eye(3) * self.theta + (1 - torch.cos(self.theta)) * w_skewsym + (self.theta - torch.sin(self.theta)) * torch.matmul(w_skewsym, w_skewsym), self.v)
exp_i[3, 3] = 1.
return torch.matmul(exp_i, x)
Update #1
In the training loop I printed the .grad attributes using:
print([i.grad for i in list(cam_pose.parameters())])
loss.backward()
print([i.grad for i in list(cam_pose.parameters())])
Results:
# V1
[None, None, None]
[None, None, None]
# V2
[None, None, None]
[tensor([-0.0032, 0.0025, -0.0053]), tensor([ 0.0016, -0.0013, 0.0054]), tensor(-0.0559)]
Nothing else in the code was changed, just swapped V1 model for V2.

this is your problem right here:
rotation_in_matrix = torch.tensor([
[c2, s2 * s3, c3 * s2],
[s1 * s2, c1 * c3 - c2 * s1 * s3, -c1 * s3 - c2 * c3 * s1],
[-c1 * s2, c3 * s1 + c1 * c2 * s3, c1 * c2 * c3 - s1 * s3]], requires_grad=True)
you are creating a tensor out of a list of tensors, which is not a differentiable operation -- i.e. there's no gradient flow from rotation_in_matrix to its elements c1..c3
the solution would be to create the rotation_in_matrix using tensor operations like stack and cat instead

Related

Python library for dot product classification

I have the following python pyseudo-code:
A1 = "101000001111"
A2 = "110000010101"
B2 = "000111010000"
B2 = "000110100000"
# TODO get X = [x1, x2, ..., x12]
assert(A1 * X > .5)
assert(A2 * X > .5)
assert(B1 * X < .5)
assert(B2 * X < .5)
So this will basically be a regression based classification.
0.5 is my threshold but how to get X?

You need to find 12 coefficients. You can try to use LogisticRegression or LinearRegression
When you have linear coefficients you can use np.dot or # operator to get a dot product.
Example:
import numpy as np
from sklearn.linear_model import LogisticRegression
A1 = "101000001111"
A2 = "110000010101"
B1 = "000111010000"
B2 = "000110100000"
A1 = np.array(list(A1), np.float32)
A2 = np.array(list(A2), np.float32)
B1 = np.array(list(B1), np.float32)
B2 = np.array(list(B2), np.float32)
X = np.array((A1, A2, B1, B2))
y = np.array([1, 1, 0, 0])
w = model = LogisticRegression(fit_intercept=False).fit(X, y).coef_.flatten()
print(A1.dot(w))
print(A2.dot(w))
print(B1.dot(w))
print(B2.dot(w))
assert A1 # w > 0.5
assert A2 # w > 0.5
assert B1 # w < 0.5
assert B2 # w < 0.5
Results:
1.7993630995882384
1.5032155788245702
-1.0190643734998346
-1.0385501901808816

Solve_ivp not updating a parameter every iteration

I have this function for solve_ivp, which solves a series of differential equations.
For every time step, the variable position [Pos] should be updated depending on a probability of it jumping. However, solve_ivp is not remembering the previous variable.
def jump_fun(t, Y, P, flux = 670):
global Pos_list
k1, k2 , k3, k4, k5, k6 , k , kay, q , K1, c5_tot, c6_tot, phi, Nc, V, qs, Cx, Ks, F, Cs0, flux, Pos, Pos_list = P
c1, c2, c3, c4, c5 ,c6, c4ex1, c4ex2, Cs1, Cs2= Y.reshape([10,-1])
#print(c1, c2, c3, c4, c5 ,c6, c4ex, Cs, Pos)
print(f'Pos: {Pos}')
# function heterogeneous_environment
prob = heterogeneous_environment(Nc, flux, V, 10, 10)
print(f'prob: {prob}')
#These arrays keep track of where the cells are
R1 = np.where(Pos == 1, 1, 0 )
R2 = np.where(Pos == 2, 1, 0 )
# We need to know the external concentrations of each reactor
rs1 = (Cx*qs*Cs1/ (Ks+Cs1))
dCs1 = F/V + flux*Cs2/V - flux*Cs1/V - rs1 # Position of cell is reactor 1, C1 = Cs
rs2 = (Cx*qs*Cs2/ (Ks+Cs2))
dCs2 = flux*Cs1/V - flux*Cs2/V - rs2 # Position of cell is reactor 2, C2 = Cs
j0 = rs1*R1 + rs2*R2
jm1 = kay * (c4 - c4ex1)*R1
jm2 = kay * (c4 - c4ex2)*R2
dc1= j0 - (k1 * c1 * c6 * (1/( 1 + ((c6/K1 )**q))))
dc2 = 2* k1 * c1 * c6 * (1/( 1 + ((c6/K1 )**q))) - k2 * c2 * (c5_tot - c5) - k6 *c2 *c5
dc3 = k2 * c2 * (c5_tot - c5) - k3 * c3* (c6_tot - c6)
dc4 = k3 * c3* (c6_tot - c6) - k4 * c4 *c5 - (jm1*R1 + jm2*R2)
dc5= k2 * c2 * (c5_tot - c5) - k4 * c4 *c5 - k6 *c2 *c5
dc6 = -2 * k1 * c1 * c6 * (1/( 1 + ((c6/K1 )**q))) + 2 * k3 * c3* (c6_tot - c6) - k5 * c6
cell1 = np.count_nonzero(R1 == 1)
cell2 = np.count_nonzero(R2 == 1)
dc4ex1= - k * c4ex1 + (phi / cell1) * np.sum(jm1) - (flux*c4ex1)/V + (flux*c4ex2)/V
dc4ex2= - k * c4ex2 + (phi / cell2) * np.sum(jm2) + (flux*c4ex1)/V - (flux*c4ex2)/V
#print('1')
#print(Pos)
#print('2')
Pos = Pos + prob*R1
Pos = Pos - prob*R2
#print(Pos)
Pos_list.append(Pos)
#iterate through the Pos vector, if Pos =1 do the balances for reactor 1 and then if the probability says to change
#reactor, change the position of the cell. Same for POS=2
#UPDATE THE MASS BALANCE FOR EACH REACTOR
return dc1, dc2, dc3, dc4, dc5 , dc6, dc4ex1, dc4ex2, dCs1, dCs2
I tried passing Pos as a return of the function, but it also didn't seem to work. Please help.

Issue on Runge Kutta Fehlberg algorithm

I have wrote a code for Runge-Kutta 4th order, which works perfectly fine for a system of differential equations:
import numpy as np
import matplotlib.pyplot as plt
import numba
import time
start_time = time.clock()
#numba.jit()
def V(u,t):
x1,dx1, x2, dx2=u
ddx1=-w**2 * x1 -b * dx1
ddx2=-(w+0.5)**2 * x2 -(b+0.1) * dx2
return np.array([dx1,ddx1,dx2,ddx2])
#numba.jit()
def rk4(f, u0, t0, tf , n):
t = np.linspace(t0, tf, n+1)
u = np.array((n+1)*[u0])
h = t[1]-t[0]
for i in range(n):
k1 = h * f(u[i], t[i])
k2 = h * f(u[i] + 0.5 * k1, t[i] + 0.5*h)
k3 = h * f(u[i] + 0.5 * k2, t[i] + 0.5*h)
k4 = h * f(u[i] + k3, t[i] + h)
u[i+1] = u[i] + (k1 + 2*(k2 + k3) + k4) / 6
return u, t
u, t = rk4(V,np.array([0,0.2,0,0.3]) ,0,100, 20000)
print("Execution time:",time.clock() - start_time, "seconds")
x1,dx1,x2,dx2 = u.T
plt.plot(x1,x2)
plt.xlabel('X1')
plt.ylabel('X2')
plt.show()
The above code, returns the desired result:
And thanks to Numba JIT, this code works really fast. However, this method doesn't use adaptive step size and hence, it is not very suitable for a system of stiff differential equations. Runge Kutta Fehlberg method, solves this problem by using a straight forward algorithm. Based on the algorithm (https://en.wikipedia.org/wiki/Runge%E2%80%93Kutta%E2%80%93Fehlberg_method) I wrote this code which works for only one differential equation :
import numpy as np
def rkf( f, a, b, x0, tol, hmax, hmin ):
a2 = 2.500000000000000e-01 # 1/4
a3 = 3.750000000000000e-01 # 3/8
a4 = 9.230769230769231e-01 # 12/13
a5 = 1.000000000000000e+00 # 1
a6 = 5.000000000000000e-01 # 1/2
b21 = 2.500000000000000e-01 # 1/4
b31 = 9.375000000000000e-02 # 3/32
b32 = 2.812500000000000e-01 # 9/32
b41 = 8.793809740555303e-01 # 1932/2197
b42 = -3.277196176604461e+00 # -7200/2197
b43 = 3.320892125625853e+00 # 7296/2197
b51 = 2.032407407407407e+00 # 439/216
b52 = -8.000000000000000e+00 # -8
b53 = 7.173489278752436e+00 # 3680/513
b54 = -2.058966861598441e-01 # -845/4104
b61 = -2.962962962962963e-01 # -8/27
b62 = 2.000000000000000e+00 # 2
b63 = -1.381676413255361e+00 # -3544/2565
b64 = 4.529727095516569e-01 # 1859/4104
b65 = -2.750000000000000e-01 # -11/40
r1 = 2.777777777777778e-03 # 1/360
r3 = -2.994152046783626e-02 # -128/4275
r4 = -2.919989367357789e-02 # -2197/75240
r5 = 2.000000000000000e-02 # 1/50
r6 = 3.636363636363636e-02 # 2/55
c1 = 1.157407407407407e-01 # 25/216
c3 = 5.489278752436647e-01 # 1408/2565
c4 = 5.353313840155945e-01 # 2197/4104
c5 = -2.000000000000000e-01 # -1/5
t = a
x = np.array(x0)
h = hmax
T = np.array( [t] )
X = np.array( [x] )
while t < b:
if t + h > b:
h = b - t
k1 = h * f( x, t )
k2 = h * f( x + b21 * k1, t + a2 * h )
k3 = h * f( x + b31 * k1 + b32 * k2, t + a3 * h )
k4 = h * f( x + b41 * k1 + b42 * k2 + b43 * k3, t + a4 * h )
k5 = h * f( x + b51 * k1 + b52 * k2 + b53 * k3 + b54 * k4, t + a5 * h )
k6 = h * f( x + b61 * k1 + b62 * k2 + b63 * k3 + b64 * k4 + b65 * k5, \
t + a6 * h )
r = abs( r1 * k1 + r3 * k3 + r4 * k4 + r5 * k5 + r6 * k6 ) / h
if len( np.shape( r ) ) > 0:
r = max( r )
if r <= tol:
t = t + h
x = x + c1 * k1 + c3 * k3 + c4 * k4 + c5 * k5
T = np.append( T, t )
X = np.append( X, [x], 0 )
h = h * min( max( 0.84 * ( tol / r )**0.25, 0.1 ), 4.0 )
if h > hmax:
h = hmax
elif h < hmin:
raise RuntimeError("Error: Could not converge to the required tolerance %e with minimum stepsize %e." % (tol,hmin))
break
return ( T, X )
but I'm struggling to convert it to a function like the first code, where I can input a system of differential equations. The most confusing part for me, is how can I vectorize everything in the second code without messing things up. In other words, I cannot reproduce the first result using the RKF algorithm. Can anyone point me in the right direction?

I'm not really sure where your problem lies. Setting the not given parameters to w=1; b=0.1 and calling, without changing anything
T, X = rkf( f=V, a=0, b=100, x0=[0,0.2,0,0.3], tol=1e-6, hmax=1e1, hmin=1e-16 )
gives the phase plot
The step sizes grow as the system slows down as
which is the expected behavior for an unfiltered step size controller.

Bi-cubic interpolation for image scaling

I'm trying to implement a very basic Bi-Cubic algo to use it for image scaling. I used other questions on stack overflow and tried to just translate the code into the Python.
It executes correctly w/o any errors, I've checked the math a couple of times but I cant find the reason of that kind of problem seems like everything should be running smooth but as a result I get this:
Any help appreciated.
Here's the source:
def getBicPixelChannel(img,x,y,channel):
if x < img.shape[1] & y < img.shape[0]:
return img[y,x,channel]
return 0
def Bicubic(img, rate):
new_w = int(math.ceil(float(img.shape[1]) * rate))
new_h = int(math.ceil(float(img.shape[0]) * rate))
new_img = np.zeros((new_w, new_h, 3))
x_rate = float(img.shape[1]) / new_img.shape[1]
y_rate = float(img.shape[0]) / new_img.shape[0]
C = np.zeros(5)
for hi in range(new_img.shape[0]):
for wi in range(new_img.shape[1]):
x_int = int(wi * x_rate)
y_int = int(hi * y_rate)
dx = x_rate * wi - x_int
dy = y_rate * hi - y_int
for channel in range(new_img.shape[2]):
for jj in range(0,4):
o_y = y_int - 1 + jj
a0 = getBicPixelChannel(img,x_int,o_y, channel)
d0 = getBicPixelChannel(img,x_int - 1,o_y, channel) - a0
d2 = getBicPixelChannel(img,x_int + 1,o_y, channel) - a0
d3 = getBicPixelChannel(img,x_int + 2,o_y, channel) - a0
a1 = -1./3 * d0 + d2 - 1./6 * d3
a2 = 1./2 * d0 + 1./2 * d2
a3 = -1./6 * d0 - 1./2 * d2 + 1./6 * d3
C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx
d0 = C[0] - C[1]
d2 = C[2] - C[1]
d3 = C[3] - C[1]
a0 = C[1]
a1 = -1. / 3 * d0 + d2 - 1. / 6 * d3
a2 = 1. / 2 * d0 + 1. / 2 * d2
a3 = -1. / 6 * d0 - 1. / 2 * d2 + 1. / 6 * d3
new_img[hi, wi, channel] = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy
return new_img

I think your problem is in this line of code:
if x < img.shape[1] & y < img.shape[0]:
From what I gather, & is the bit-wise AND operator in Python, and it has a higher precedence than <. Thus what you are computing is:
if x < ( img.shape[1] & y ) < img.shape[0]:
(whatever that means...)
Instead, try this:
if (x < img.shape[1]) and (y < img.shape[0]):
and is Python's boolean AND operator.

Neural Network XOR with numpy not converging

I have trained a Neural Net to solve the XOR problem. The problem with my network is that it is not converging. I am using Andrew Ng's methods and notations as taught in the DeepLearning.ai course.
Here's the code :
import numpy as np
from __future__ import print_function
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
Y = np.array([[0, 1, 1, 0]])
np.random.seed(1)
W1 = np.random.randn(3, 2) * 0.0001
b1 = np.ones((3, 1))
W2 = np.random.randn(1, 3) * 0.0001
b2 = np.ones((1, 1))
The next part for the Backpropagation:
learning_rate = 0.01
m = 4
for iteration in range(100000):
# forward propagation
# layer1
Z1 = np.dot(W1, X.T) + b1
A1 = sigmoid(Z1)
# layer2
Z2 = np.dot(W2, A1) + b2
A2 = sigmoid(Z2)
# backpropagation
dZ2 = Y - A2
dW2 = (1 / m) * np.dot(dZ2, A1.T)
db2 = (1 / m) * np.sum(dZ2, axis=1, keepdims=True)
dZ1 = np.dot(dW2.T, dZ2) * sigmoid_gradient(Z1)
dW1 = (1 / m) * np.dot(dZ1, X)
db1 = (1 / m) * np.sum(dZ1, axis=1, keepdims=True)
# checking if shapes are correctly preserved
assert (dZ2.shape == Z2.shape)
assert (dW2.shape == W2.shape)
assert (db2.shape == b2.shape)
assert (dZ1.shape == Z1.shape)
assert (dW1.shape == W1.shape)
assert (db1.shape == b1.shape)
# update parameters
W1 = W1 + learning_rate * dW1
W2 = W2 + learning_rate * dW2
b1 = b1 + learning_rate * db1
b2 = b2 + learning_rate * db2
# print every 10k
if (iteration % 10000 == 0):
print(A2)

You have made a couple of mistakes in your code. For example, in computing the W2.
...
dZ2 = Y - A2
dW2 = (1 / m) * np.dot(dZ2, A1.T)
...
W2 = W2 + learning_rate * dW2
We want to calculate the derivative of Cost with respect to W2 using the chain rule.
We can write the derivatives as follows:
You haven't implemented the middle part which computes the derivative of the Z2.
You can check out this video, it explains the math part of backpropagation. Moreover, you can check out this simple implementation of the neural network.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Custom torch.nn.Module not learning, even though grad_fn=MmBackward - python

Related

Python library for dot product classification

Solve_ivp not updating a parameter every iteration

Issue on Runge Kutta Fehlberg algorithm

Bi-cubic interpolation for image scaling

Neural Network XOR with numpy not converging

Categories

Resources