I've got zero experience with Python. I have looked through some tutorial material, but advanced code is still difficult for me to understand, so I came here for a more specific answer.
My goal is to get this code running on my computer.
Here is the scenario:
I'm a graduate student studying tensor factorization for relational learning. A paper [1] provides code to run this algorithm, as follows:
import logging, time
from numpy import dot, zeros, kron, array, eye, argmax
from numpy.linalg import qr, pinv, norm, inv
from scipy.linalg import eigh
from numpy.random import rand
__version__ = "0.1"
__all__ = ['rescal', 'rescal_with_random_restarts']
__DEF_MAXITER = 500
__DEF_INIT = 'nvecs'
__DEF_PROJ = True
__DEF_CONV = 1e-5
__DEF_LMBDA = 0
_log = logging.getLogger('RESCAL')
def rescal_with_random_restarts(X, rank, restarts=10, **kwargs):
    """
    Restarts RESCAL multiple times from random starting points and
    returns the factorization with the best fit.
    """
    models = []
    fits = []
    for i in range(restarts):
        res = rescal(X, rank, init='random', **kwargs)
        models.append(res)
        fits.append(res[2])
    return models[argmax(fits)]
def rescal(X, rank, **kwargs):
    """
    RESCAL

    Factors a three-way tensor X such that each frontal slice
    X_k = A * R_k * A.T. The frontal slices of a tensor are
    N x N matrices that correspond to the adjacency matrices
    of the relational graph for a particular relation.

    For a full description of the algorithm see:
      Maximilian Nickel, Volker Tresp, Hans-Peter Kriegel,
      "A Three-Way Model for Collective Learning on Multi-Relational Data",
      ICML 2011, Bellevue, WA, USA

    Parameters
    ----------
    X : list
        List of frontal slices X_k of the tensor X. The shape of each X_k is ('N', 'N')
    rank : int
        Rank of the factorization
    lmbda : float, optional
        Regularization parameter for A and R_k factor matrices. 0 by default
    init : string, optional
        Initialization method of the factor matrices. 'nvecs' (default)
        initializes A based on the eigenvectors of X. 'random' initializes
        the factor matrices randomly.
    proj : boolean, optional
        Whether or not to use the QR decomposition when computing R_k.
        True by default
    maxIter : int, optional
        Maximum number of iterations of the ALS algorithm. 500 by default.
    conv : float, optional
        Stop when residual of factorization is less than conv. 1e-5 by default

    Returns
    -------
    A : ndarray
        array of shape ('N', 'rank') corresponding to the factor matrix A
    R : list
        list of 'M' arrays of shape ('rank', 'rank') corresponding to the factor matrices R_k
    f : float
        function value of the factorization
    iter : int
        number of iterations until convergence
    exectimes : ndarray
        execution times to compute the updates in each iteration
    """

    # init options
    ainit = kwargs.pop('init', __DEF_INIT)
    proj = kwargs.pop('proj', __DEF_PROJ)
    maxIter = kwargs.pop('maxIter', __DEF_MAXITER)
    conv = kwargs.pop('conv', __DEF_CONV)
    lmbda = kwargs.pop('lmbda', __DEF_LMBDA)
    if not len(kwargs) == 0:
        raise ValueError('Unknown keywords (%s)' % (kwargs.keys()))

    sz = X[0].shape
    dtype = X[0].dtype
    n = sz[0]
    k = len(X)

    _log.debug('[Config] rank: %d | maxIter: %d | conv: %7.1e | lmbda: %7.1e' % (rank,
        maxIter, conv, lmbda))
    _log.debug('[Config] dtype: %s' % dtype)

    # precompute norms of X
    normX = [norm(M)**2 for M in X]
    Xflat = [M.flatten() for M in X]
    sumNormX = sum(normX)

    # initialize A
    if ainit == 'random':
        A = array(rand(n, rank), dtype=dtype)
    elif ainit == 'nvecs':
        S = zeros((n, n), dtype=dtype)
        T = zeros((n, n), dtype=dtype)
        for i in range(k):
            T = X[i]
            S = S + T + T.T
        evals, A = eigh(S, eigvals=(n-rank, n-1))
    else:
        raise ValueError('Unknown init option ("%s")' % ainit)

    # initialize R
    if proj:
        Q, A2 = qr(A)
        X2 = __projectSlices(X, Q)
        R = __updateR(X2, A2, lmbda)
    else:
        R = __updateR(X, A, lmbda)

    # compute factorization
    fit = fitchange = fitold = f = 0
    exectimes = []
    ARAt = zeros((n, n), dtype=dtype)
    for iter in xrange(maxIter):
        tic = time.clock()
        fitold = fit
        A = __updateA(X, A, R, lmbda)
        if proj:
            Q, A2 = qr(A)
            X2 = __projectSlices(X, Q)
            R = __updateR(X2, A2, lmbda)
        else:
            R = __updateR(X, A, lmbda)

        # compute fit value
        f = lmbda*(norm(A)**2)
        for i in range(k):
            ARAt = dot(A, dot(R[i], A.T))
            f += normX[i] + norm(ARAt)**2 - 2*dot(Xflat[i], ARAt.flatten()) + lmbda*(R[i].flatten()**2).sum()
        f *= 0.5

        fit = 1 - f / sumNormX
        fitchange = abs(fitold - fit)

        toc = time.clock()
        exectimes.append(toc - tic)
        _log.debug('[%3d] fit: %.5f | delta: %7.1e | secs: %.5f' % (iter,
            fit, fitchange, exectimes[-1]))
        if iter > 1 and fitchange < conv:
            break
    return A, R, f, iter+1, array(exectimes)
def __updateA(X, A, R, lmbda):
    n, rank = A.shape
    F = zeros((n, rank), dtype=X[0].dtype)
    E = zeros((rank, rank), dtype=X[0].dtype)
    AtA = dot(A.T, A)
    for i in range(len(X)):
        F += dot(X[i], dot(A, R[i].T)) + dot(X[i].T, dot(A, R[i]))
        E += dot(R[i], dot(AtA, R[i].T)) + dot(R[i].T, dot(AtA, R[i]))
    A = dot(F, inv(lmbda * eye(rank) + E))
    return A

def __updateR(X, A, lmbda):
    r = A.shape[1]
    R = []
    At = A.T
    if lmbda == 0:
        ainv = dot(pinv(dot(At, A)), At)
        for i in range(len(X)):
            R.append( dot(ainv, dot(X[i], ainv.T)) )
    else:
        AtA = dot(At, A)
        tmp = inv(kron(AtA, AtA) + lmbda * eye(r**2))
        for i in range(len(X)):
            AtXA = dot(At, dot(X[i], A))
            R.append( dot(AtXA.flatten(), tmp).reshape(r, r) )
    return R

def __projectSlices(X, Q):
    q = Q.shape[1]
    X2 = []
    for i in range(len(X)):
        X2.append( dot(Q.T, dot(X[i], Q)) )
    return X2
I'm sorry to paste such a long piece of code, but there was no other way to explain my problems.
I import this module and pass it arguments according to the author's website:
import pickle, sys
from rescal import rescal

rank = int(sys.argv[1])                   # rank must be an integer, not the raw command-line string
with open('us-presidents.pickle', 'rb') as fh:
    X = pickle.load(fh)                   # pickle.load expects a file object, not a filename
A, R, f, iter, exectimes = rescal(X, rank, lmbda=1.0)
The dataset us-presidents.rdf can be found here.
My questions are:
According to the code comments, the tensor X is a list. I don't quite understand this: how do I relate a list to a tensor in Python? Can I assume tensor = list in Python?
Should I convert the RDF data to (subject, predicate, object) triples first? I'm not sure of the data structure of X. How do I assign values to X by hand?
Then, how do I run it?
I pasted the author's code without his authorization; is that an act of infringement? If so, I am sorry and will delete it soon.
These questions may be a little tedious, but they are important to me. Any help would be greatly appreciated.
[1] Maximilian Nickel, Volker Tresp, Hans-Peter Kriegel, "A Three-Way Model for Collective Learning on Multi-Relational Data", in Proceedings of the 28th International Conference on Machine Learning (ICML 2011), Bellevue, WA, USA.
To answer Q2: you need to transform the RDF data and save it before you can load it from the file 'us-presidents.pickle'. The author of that code probably did this once, because Python's native pickle format loads faster. Since the pickle format also stores the datatype of the data, X may well be an instance of some numpy class. You would need either an example pickle file as used by this code, or the code that does the pickle.dump, to figure out how to convert from RDF to the particular pickle file that rescal expects.
This might also answer Q1: the tensor consists of a list of elements. From the code you can see that the X parameter to rescal has a length (k = len(X)) and can be indexed (T = X[i]). So its elements are used as a list (even if X might be some other datatype that just behaves like one).
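Purely as an illustration (I don't have the author's pickle file, so the filename, sizes and values below are made up), here is a minimal sketch of how a list of N x N slices could be dumped once and then loaded the way the rescal script does:

import pickle
import numpy as np

N, M = 4, 2                                 # hypothetical: 4 entities, 2 relations
X = [np.zeros((N, N)) for _ in range(M)]    # one N x N adjacency slice per relation
X[0][0, 1] = 1                              # e.g. relation 0 holds between entity 0 and entity 1

with open('example.pickle', 'wb') as fh:    # dump once ...
    pickle.dump(X, fh)

with open('example.pickle', 'rb') as fh:    # ... and later load it like the rescal script does
    X_loaded = pickle.load(fh)
print(len(X_loaded), X_loaded[0].shape)     # 2 (4, 4)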
As an aside: if you are not familiar with Python and are just interested in the result of the computation, you might get more help by contacting the author of the software.
According to the code comments, the tensor X is a list. I don't quite understand this: how do I relate a list to a tensor in Python? Can I assume tensor = list in Python?
Not necessarily, but the author of the code has decided to represent the tensor data as a list data structure. As the docstring indicates, the list X contains:
List of frontal slices X_k of the tensor X. The shape of each X_k is ('N', 'N')
That means the tensor is represented as a list of frontal slices, one per relation: [X_1, ..., X_M]. Each element of the list is a two-dimensional N x N numpy array (the adjacency matrix of the relational graph for that relation), so X is an ordinary Python list whose elements are N x N arrays, not a single numpy tensor object.
I'm not sure of the data structure of X. How do I assign values to X by hand?
Now that we know the data structure of X, we can fill it with ordinary list indexing and numpy assignment. The following assigns an N x N array to the first position in the list X (the first position is at index 0, the second at index 1, et cetera):
X[0] = np.zeros((N, N))
and the following sets a single entry of that slice, stating that the first relation holds between entity 1 and entity 3:
X[0][1, 3] = 1
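To connect this with the RDF question: here is a hedged sketch of how X could be built from (subject, predicate, object) triples, assuming the entities and relations have already been mapped to integer indices (the triples and sizes below are invented for illustration):

import numpy as np

n_entities, n_relations = 3, 2
# hypothetical triples, already encoded as integer ids: (subject, predicate, object)
triples = [(0, 0, 1), (1, 0, 2), (2, 1, 0)]

# one N x N frontal slice per relation, as rescal expects
X = [np.zeros((n_entities, n_entities)) for _ in range(n_relations)]
for s, p, o in triples:
    X[p][s, o] = 1          # relation p holds between entity s and entity o

# X can now be passed to rescal(X, rank, ...)
print([Xk.shape for Xk in X])   # [(3, 3), (3, 3)]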
Related
I was reading about attention and came across this equation:
import einops
from fancy_einsum import einsum
import torch
x = torch.rand((200, 10, 768))
y = torch.rand((20, 768, 64))
res = einsum("batch query_pos d_model, n_heads d_model d_head -> batch query_pos n_heads d_head", x, y)
And I am not able to understand the underlying operations that produce the result res.
I thought it might be matmul and tried this:
import torch
x_ = x.unsqueeze(dim = 2).unsqueeze(dim = 2)
y_ = torch.broadcast_to(y, (1, 1, 20, 768, 64))
res2 = x_ @ y_
res2 = res2.squeeze(dim = -2)
(res == res2).all() # Prints False
But that does not seem to be right.
Any help regarding this is greatly appreciated.
So whenever you use einsum, it is best to think about the meaning of the dimensions. Basically, we perform a multiplication between the two inputs in this case. The signature passed to einsum shows which dimensions will be preserved and which ones will be "summed away". I have simplified the signature to single letters here:
res = einsum("b q m, n m h -> b q n h", x, y)
We can read from this that both x and y have three dimensions. Furthermore, both have a dimension called m, and this one does not appear in the output, so we can conclude that it gets "summed away". For each entry of the output we therefore have the following formula (for simplicity I reused the dimension names as indices), so for every b, q, n, h we get
$$\mathrm{res}[b,q,n,h] = \sum_{m} x[b,q,m] \cdot y[n,m,h]$$
Doing this with any function other than einsum is usually more cumbersome. First we need to reorder and unsqueeze the dimensions so that they are compatible for (broadcast) multiplication; we can do the following (the shapes are annotated in the comment above the expression):
#(b,q,m,n,h) (b, q, m, 1, 1) (m, n, h)
product = x[:, :, :, None, None] * y.permute([1,0,2])
Due to the broadcasting rules, the second (y-) term will implicitly get the required leading dummy dimensions.
Then we can "sum away" the dimension m:
res = product.sum(dim=2) # (b,q,n,h)
So you can interpret this as a matrix multiplication if you want, or just as a scalar (dot) product, but of course with many "batch" dimensions.
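For completeness, here is a small self-contained check that the explicit broadcast-and-sum matches einsum. It is a sketch using torch.einsum with the simplified single-letter signature rather than fancy_einsum, and the shapes are reduced from the question so the intermediate (b, q, m, n, h) tensor stays small:

import torch

x = torch.rand((20, 5, 32))   # (b, q, m)
y = torch.rand((4, 32, 8))    # (n, m, h)

# einsum with the simplified single-letter signature
res = torch.einsum("bqm,nmh->bqnh", x, y)

# explicit broadcasting followed by summing away m
product = x[:, :, :, None, None] * y.permute([1, 0, 2])  # (b, q, m, n, h)
res_manual = product.sum(dim=2)                           # (b, q, n, h)

print(torch.allclose(res, res_manual, atol=1e-6))         # expected: True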
I am using the qndiag library to try to find a joint diagonalisation of 2 given matrices.
The GitHub repository is here: qndiag library
I am using this Python script to compute these 2 diagonalisations as closely as possible:
import os, sys
import numpy as np
from qndiag import qndiag
# dimension
m=7
# number of matrices
n=2
# Load spectro and WL+GCph+XC
FISH_GCsp = np.loadtxt('Fisher_GCsp_flat.txt')
FISH_XC = np.loadtxt('Fisher_XC_GCph_WL_flat.txt')
# Marginalizing over uncommon parameters between the two matrices
COV_GCsp_first = np.linalg.inv(FISH_GCsp)
COV_XC_first = np.linalg.inv(FISH_XC)
COV_GCsp = COV_GCsp_first[0:m,0:m]
COV_XC = COV_XC_first[0:m,0:m]
# Invert to get Fisher matrix
FISH_sp = np.linalg.inv(COV_GCsp);
FISH_xc = np.linalg.inv(COV_XC);
# Drawing a random set of commuting matrices
C=np.zeros((n,m,m));
C[0,:,:] = FISH_sp
C[1,:,:] = FISH_xc
[D, B] = qndiag(C, 'max_iter', 1000, 'tol', 1e-3);
# Print expected diagonal matrices
B*C[0,:,:]*B.T
B*C[1,:,:]*B.T
Given my two 7x7 matrices FISH_sp and FISH_xc, I get an error of this kind:
approximate_joint_diagonalization_qndiag/qndiag/qndiag/qndiag.py", line 90, in qndiag
D = transform_set(B, C)
and then:
approximate_joint_diagonalization_qndiag/qndiag/qndiag/qndiag.py", line 151, in transform_set
op[i] = M.dot(d.dot(M.T))
AttributeError: 'str' object has no attribute 'dot'
Here is the transform_set function in question:
def transform_set(M, D, diag_only=False):
    n, p, _ = D.shape
    if not diag_only:
        op = np.zeros((n, p, p))
        for i, d in enumerate(D):
            op[i] = M.dot(d.dot(M.T))
and the main function that is called in my script:
def qndiag(C, B0=None, weights=None, max_iter=1000, tol=1e-6,
           lambda_min=1e-4, max_ls_tries=10, diag_only=False,
           return_B_list=False, verbose=False):
    """Joint diagonalization of matrices using the quasi-Newton method

    Parameters
    ----------
    C : array-like, shape (n_samples, n_features, n_features)
        Set of matrices to be jointly diagonalized. C[0] is the first matrix,
        etc...
    B0 : None | array-like, shape (n_features, n_features)
        Initial point for the algorithm. If None, a whitener is used.
    weights : None | array-like, shape (n_samples,)
        Weights for each matrix in the loss:
        L = sum(weights * KL(C, C')) / sum(weights).
        No weighting (weights = 1) by default.
    max_iter : int, optional
        Maximum number of iterations to perform.
    tol : float, optional
        A positive scalar giving the tolerance at which the
        algorithm is considered to have converged. The algorithm stops when
        |gradient| < tol.
    lambda_min : float, optional
        A positive regularization scalar. Each eigenvalue of the Hessian
        approximation below lambda_min is set to lambda_min.
    max_ls_tries : int, optional
        Maximum number of line-search tries to perform.
    diag_only : bool, optional
        If true, the line search is done by computing only the diagonals of the
        dataset. The dataset is then computed after the line search.
        Taking diag_only = True might be faster than diag_only=False
        when the matrices are large (n_features > 200)
    return_B_list : bool, optional
        Chooses whether or not to return the list of iterates.
    verbose : bool, optional
        Prints informations about the state of the algorithm if True.

    Returns
    -------
    D : array-like, shape (n_samples, n_features, n_features)
        Set of matrices jointly diagonalized
    B : array, shape (n_features, n_features)
        Estimated joint diagonalizer matrix.
    infos : dict
        Dictionnary of monitoring informations, containing the times,
        gradient norms and objective values.

    References
    ----------
    P. Ablin, J.F. Cardoso and A. Gramfort. Beyond Pham's algorithm
    for joint diagonalization. Proc. ESANN 2019.
    https://www.elen.ucl.ac.be/Proceedings/esann/esannpdf/es2019-119.pdf
    https://hal.archives-ouvertes.fr/hal-01936887v1
    https://arxiv.org/abs/1811.11433
    """
    t0 = time()
    n_samples, n_features, _ = C.shape
    if B0 is None:
        C_mean = np.mean(C, axis=0)
        d, p = np.linalg.eigh(C_mean)
        B = p.T / np.sqrt(d[:, None])
    else:
        B = B0
    if weights is not None:  # normalize
        weights_ = weights / np.mean(weights)
    else:
        weights_ = None
    D = transform_set(B, C)
Why this error on the dot operator? It seems to be a matrix product (like np.dot), but the way it is used makes me think that is not the case.
The issue lies in [D, B] = qndiag(C, 'max_iter', 1000, 'tol', 1e-3): qndiag is being called with MATLAB-style name/value pairs, so B0 (the second parameter) gets assigned the string 'max_iter' instead of an array. Eventually B is that string, hence the error message 'str' object has no attribute 'dot'. If you only want to pass the C matrix as a parameter, just do [D, B] = qndiag(C); options such as max_iter and tol must be passed as Python keyword arguments.
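For completeness, a self-contained sketch of the corrected call on small synthetic 7x7 matrices standing in for FISH_sp and FISH_xc. I am assuming the return values unpack as in the question (D, then B), so adjust the unpacking if your version of qndiag returns them in a different order:

import numpy as np
from qndiag import qndiag

# two symmetric matrices built from a common mixing matrix, so that they
# are (approximately) jointly diagonalisable
rng = np.random.RandomState(0)
m = 7
A = rng.randn(m, m)
C = np.array([A @ np.diag(rng.rand(m)) @ A.T for _ in range(2)])

# options are ordinary Python keyword arguments, not MATLAB-style pairs
D, B = qndiag(C, max_iter=1000, tol=1e-3)

# the transformed matrices should be approximately diagonal; note the
# matrix products (@), not the element-wise * used in the question
print(np.round(B @ C[0] @ B.T, 3))
print(np.round(B @ C[1] @ B.T, 3))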
I have been playing around with numba and trying to implement a simple element-wise matrix multiplication. When using vectorize I get the same result as the numpy multiplication, but when using cuda.jit they are not the same: many of the elements are zero. I'm providing a minimal working example below. Any help with the problem will be appreciated. I'm using numba 0.35.0 and Python 2.7.
from __future__ import division
from __future__ import print_function
import numpy as np
from numba import vectorize, cuda, jit
M = 80
N = 40
P = 40
# Set the number of threads in a block
threadsperblock = 32
# Calculate the number of thread blocks in the grid
blockspergrid = (M*N*P + (threadsperblock - 1)) // threadsperblock
@vectorize(['float32(float32,float32)'], target='cuda')
def VectorMult3d(a, b):
    return a*b

@cuda.jit('void(float32[:, :, :], float32[:, :, :], float32[:, :, :])')
def mult_gpu_3d(a, b, c):
    [x, y, z] = cuda.grid(3)
    if x < c.shape[0] and y < c.shape[1] and z < c.shape[2]:
        c[x, y, z] = a[x, y, z] * b[x, y, z]

if __name__ == '__main__':
    A = np.random.normal(size=(M, N, P)).astype(np.float32)
    B = np.random.normal(size=(M, N, P)).astype(np.float32)
    numpy_C = A*B

    A_gpu = cuda.to_device(A)
    B_gpu = cuda.to_device(B)
    C_gpu = cuda.device_array((M,N,P), dtype=np.float32) # cuda.device_array_like(A_gpu)

    mult_gpu_3d[blockspergrid,threadsperblock](A_gpu,B_gpu,C_gpu)
    cudajit_C = C_gpu.copy_to_host()
    print('------- using cuda.jit -------')
    print('Is close?: {}'.format(np.allclose(numpy_C,cudajit_C)))
    print('{} of {} elements are close'.format(np.sum(np.isclose(numpy_C,cudajit_C)), M*N*P))
    print('------- using cuda.jit -------\n')

    vectorize_C_gpu = VectorMult3d(A_gpu, B_gpu)
    vectorize_C = vectorize_C_gpu.copy_to_host()
    print('------- using vectorize -------')
    print('Is close?: {}'.format(np.allclose(numpy_C,vectorize_C)))
    print('{} of {} elements are close'.format(np.sum(np.isclose(numpy_C,vectorize_C)), M*N*P))
    print('------- using vectorize -------\n')

    import numba; print("numba version: "+numba.__version__)
Here is how you could debug this.
Consider a smaller and simplified example with:
reduced array sizes, e.g. (2, 3, 1) (so you could actually print the values and be able to read them)
simple and deterministic contents, e.g. "all ones" (to compare across runs)
additional kernel arguments for debugging
from __future__ import (division, print_function)
import numpy as np
from numba import cuda
M = 2
N = 3
P = 1
threadsperblock = 1
blockspergrid = (M * N * P + (threadsperblock - 1)) // threadsperblock
@cuda.jit
def mult_gpu_3d(a, b, c, grid_ran, grid_multed):
    grid = cuda.grid(3)
    x, y, z = grid

    grid_ran[x] = 1

    if (x < c.shape[0]) and (y < c.shape[1]) and (z < c.shape[2]):
        grid_multed[x] = 1
        c[grid] = a[grid] * b[grid]

if __name__ == '__main__':
    A = np.ones((M, N, P), np.int32)
    B = np.ones((M, N, P), np.int32)

    A_gpu = cuda.to_device(A)
    B_gpu = cuda.to_device(B)
    C_gpu = cuda.to_device(np.zeros_like(A))

    # Tells whether the thread at index i has run
    grid_ran = cuda.to_device(np.zeros([blockspergrid], np.int32))

    # Tells whether the thread at index i has performed the multiplication
    grid_multed = cuda.to_device(np.zeros(blockspergrid, np.int32))

    mult_gpu_3d[blockspergrid, threadsperblock](
        A_gpu, B_gpu, C_gpu, grid_ran, grid_multed)

    print("grid_ran.shape : ", grid_ran.shape)
    print("grid_multed.shape : ", grid_multed.shape)
    print("C_gpu.shape : ", C_gpu.shape)

    print("grid_ran : ", grid_ran.copy_to_host())
    print("grid_multed : ", grid_multed.copy_to_host())

    C = C_gpu.copy_to_host()
    print("C transpose flat : ", C.T.flatten())
    print("C : \n", C)
Output:
grid_ran.shape : (6,)
grid_multed.shape : (6,)
C_gpu.shape : (2, 3, 1)
grid_ran : [1 1 1 1 1 1]
grid_multed : [1 1 0 0 0 0]
C transpose flat : [1 1 0 0 0 0]
C :
[[[1]
[0]
[0]]
[[1]
[0]
[0]]]
You can see that the device grid shape does not correspond to the shape of the arrays: the grid is flat (M*N*P), while the arrays are all 3-dimensional (M, N, P). That is, the first dimension of the grid has indices in the range 0..M*N*P-1 (0..5, totaling 6 values in my example), while the first dimension of the array is only in 0..M-1 (0..1, totaling 2 values in my example). This mistake typically leads to out-of-bounds access, but you have protected your kernel with a conditional which cuts down the offending threads:
if (x < c.shape[0])
This condition does not allow threads with indices above M-1 (1 in my example) to write (well, sort of [1]); that is why no values are written and you get many zeros in the resulting array.
Possible solutions:
In general, you could use a multidimensional kernel grid configuration, i.e. a 3D vector for blockspergrid instead of a scalar [2]; see the sketch after this list.
In particular, as element-wise multiplication is a map operation and does not depend on array shapes, you could flatten all 3 arrays to 1D arrays, run your kernel as it is on a 1D grid, then reshape the result back [3], [4].
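Here is a minimal sketch of option 1 (a 3D grid configuration), assuming the original (M, N, P) sizes from the question; the (8, 8, 4) block shape is an arbitrary choice:

import numpy as np
from numba import cuda

M, N, P = 80, 40, 40

@cuda.jit
def mult_gpu_3d(a, b, c):
    x, y, z = cuda.grid(3)
    if x < c.shape[0] and y < c.shape[1] and z < c.shape[2]:
        c[x, y, z] = a[x, y, z] * b[x, y, z]

# 3D thread blocks and a 3D grid that together cover the (M, N, P) array
threadsperblock = (8, 8, 4)
blockspergrid = tuple((dim + tpb - 1) // tpb
                      for dim, tpb in zip((M, N, P), threadsperblock))

A = np.random.normal(size=(M, N, P)).astype(np.float32)
B = np.random.normal(size=(M, N, P)).astype(np.float32)
C_gpu = cuda.device_array((M, N, P), dtype=np.float32)

mult_gpu_3d[blockspergrid, threadsperblock](cuda.to_device(A), cuda.to_device(B), C_gpu)
print(np.allclose(A * B, C_gpu.copy_to_host()))  # expected: True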
References:
[1] How to understand “All threads in a warp execute the same instruction at the same time.” in GPU?
[2] Understanding CUDA grid dimensions, block dimensions and threads organization (simple explanation)
[3] numpy.ndarray.flatten
[4] numpy.ravel
Is there any Python implementation of the MINRES pseudoinversion algorithm that can deal with Hermitian matrices?
I have found a few sources, but all of them are only capable of working with real matrices and do not seem to be easily generalizable to the complex case:
https://searchcode.com/codesearch/view/89958680/
https://github.com/pascanur/theano_optimize
(there are a couple of other links, but my reputation does not allow me to post them)
A Hermitian system of size $n$
$$\mathbf y = \mathbf H^{-1}\mathbf v$$
can be embedded in a real, symmetric system of size $2n$:
\begin{equation}
\begin{bmatrix}
\Re(\mathbf y)\\ \Im(\mathbf y)
\end{bmatrix}=
\begin{bmatrix}
\Re(\mathbf H) & -\Im(\mathbf H)\\ \Im(\mathbf H) & \Re(\mathbf H)
\end{bmatrix}^{-1}
\begin{bmatrix}
\Re(\mathbf v)\\ \Im(\mathbf v)
\end{bmatrix}.
\end{equation}
Minimum-residual methods are often used for large problems, where constructing $\mathbf H$ explicitly is impractical. In that case we may only have an operation that computes a matrix-vector product, $f: \mathbb C^n \to \mathbb C^n,\; f(\mathbf v) = \mathbf H\mathbf v$. This function can be wrapped to operate on $\mathbf x \in \mathbb R^{2n}$ by converting $\mathbf x$ back to a complex vector, applying $f$, and then embedding the result back in $\mathbb R^{2n}$.
Here is an example in python / numpy / scipy:
from scipy.sparse.linalg import minres, LinearOperator
from pylab import *
# Problem size
N = 100
# error helper
er = lambda t,a,b:print('%s error:'%t,mean(abs(a-b)))
# random Hermitian matrix
Q = randn(N,N) + 1j*randn(N,N)
H = Q@conj(Q.T)
# random complex vector
v = randn(N) + 1j*randn(N)
# ground-truth solution
x0 = inv(H)@v
# Pack/unpack complex vector as stacked real vector
c2r = lambda v:block([real(v),imag(v)])
r2c = lambda v:kron([1,1j],eye(N))@v
# Verify that we can embed C^n in R^(2N)
Hr = real(H)
Hi = imag(H)
Hs = block([[Hr,-Hi],[Hi,Hr]])
vs = c2r(v)
xs = inv(Hs)@vs
x1 = r2c(xs)
er('Embed',x0,x1)
# Verify that minres works as expected in R-embed
x2 = r2c(minres(Hs,vs,tol=1e-12)[0])
er('Minres 1',x0,x2)
# Demonstrate using operators
Av = lambda u:c2r( H @ r2c(u) )
A = LinearOperator((N*2,)*2,Av,Av)
# Minres, converting input/output to/from complex/real
x3 = r2c(minres(A,vs,tol=1e-12)[0])
er('Minres 2',x0,x3)
>>> Embed error: 5.317184726020268e-12
>>> Minres 1 error: 6.641342200989796e-11
>>> Minres 2 error: 6.641342200989796e-11
I am wondering how the GEMM transpose works. I have a matrix which I want to multiply by the same matrix transposed, such as A.T * A.
I have something like this:
def bptrs(a):
    return gpuarray.arange(a.ptr,a.ptr+a.shape[0]*a.strides[0],a.strides[0],dtype=ctypes.c_void_p)
handle=cublasCreate()
A=np.ones((s,3)).astype(np.float64)
B=A.T # transposed
m,k=A.shape
k,n=B.shape
a_gpu = gpuarray.to_gpu(A)
b_gpu = gpuarray.to_gpu(B) # I am guessing I need to do a copy since A.T is a view
c_gpu = gpuarray.empty((m,n), np.float64) #Not 100% sure if this is right. I want to get a view returned, so I can save on memory
alpha = np.float64(1.0)
beta = np.float64(0.0)
cublasDgemmBatched(handle, 't','n',
n, m, k, alpha,
b_arr.gpudata, m,
a_arr.gpudata, k,
beta, c_arr.gpudata, m, 1)
I am using cuBLAS 7.
To do A^T A, use the cuBLAS syrk function instead of gemm.
If you want to understand gemm, then you have to carefully read the documentation. Don't bother performing the transposition in Python, as gemm has arguments to do it on the fly. Something like this should get you what you want:
s = ...
k = 3
handle = cublasCreate()
A = np.ones((s, k)).astype(np.float64)
a_gpu = gpuarray.to_gpu(A)
c_gpu = gpuarray.empty((k, k), np.float64)
alpha = np.float64(1.0)
beta = np.float64(0.0)
cublasDgemm(handle,
            't', 'n',          # A^T A
            k,                 # m: number of rows of matrix op(A) and C
            k,                 # n: number of columns of matrix op(B) and C
            s,                 # k: number of columns of op(A) and rows of op(B)
            alpha,
            a_gpu.gpudata, s,  # lda x m with lda>=max(1,k)
            a_gpu.gpudata, k,  # ldb x k with ldb>=max(1,n)
            beta,
            c_gpu.gpudata, k)  # ldc x n with ldc>=max(1,m)
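Whichever routine you end up using (gemm or syrk), it is worth validating the device result against NumPy on a small case. Below is a minimal sketch of such a check; the GPU lines are commented out because they assume the c_gpu object and the cublasDgemm call from the snippet above:

import numpy as np

s, k = 5, 3                       # hypothetical small sizes (s rows, k columns)
A = np.ones((s, k), np.float64)   # same host matrix that would be sent to the GPU
C_ref = A.T @ A                   # CPU reference for A^T A, shape (k, k)
print(C_ref)                      # for an all-ones A this is a k x k matrix filled with s

# after the cublasDgemm (or cublasDsyrk) call:
# C_dev = c_gpu.get()             # copy the device result back to the host
# print(np.allclose(C_ref, C_dev))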