Thank you for the sample. It turns out that `cublasSetAtomicsMode (handle, cublasAtomicsMode.CUBLAS_ATOMICS_ALLOWED)`

breaks `cublasGemmEx`

. I blindly included it because the cuBLAS docs said it could boost performance.

I expanded the sample to be a bit more thorough and to use CUDA events:

```
/*
* JCuda - Java bindings for NVIDIA CUDA
*
* Copyright 2008-2016 Marco Hutter - http://www.jcuda.org
*/
package jcuda.jcublas.samples;
import static jcuda.cudaDataType.CUDA_R_32F;
import static jcuda.jcublas.JCublas2.*;
import static jcuda.jcublas.cublasGemmAlgo.*;
import static jcuda.jcublas.cublasOperation.CUBLAS_OP_N;
import static jcuda.jcublas.cublasOperation.CUBLAS_OP_T;
import static jcuda.runtime.JCuda.*;
import java.util.Arrays;
import java.util.List;
import jcuda.Pointer;
import jcuda.Sizeof;
import jcuda.jcublas.JCublas;
import jcuda.jcublas.JCublas2;
import jcuda.jcublas.cublasHandle;
import jcuda.jcublas.cublasAtomicsMode;
import jcuda.jcublas.cublasPointerMode;
import jcuda.runtime.cudaStream_t;
import jcuda.runtime.cudaEvent_t;
import jcuda.runtime.JCuda;
import jcuda.driver.CUcontext;
import jcuda.driver.CUdevice;
import jcuda.driver.JCudaDriver;
import jcuda.samples.utils.JCudaSamplesUtils;
/**
* This is a sample class demonstrating the application of JCublas2 for
* performing a BLAS 'sgemm' operation, i.e. for computing the matrix <br>
* <code>C = alpha * A * B + beta * C</code> <br>
* for single-precision floating point values alpha and beta, and matrices
* A, B and C, using the extended CUBLAS GEMM function
*/
public class JCublas2SgemmExSample
{
public static void main(String args[])
{
JCuda.setExceptionsEnabled(true);
CUdevice device = new CUdevice();
CUcontext context = new CUcontext();
JCudaDriver.cuInit(0);
JCudaDriver.cuDeviceGet(device, 0);
JCudaDriver.cuCtxCreate(context, 0, device);
testSgemm();
JCudaDriver.cuCtxDestroy (context);
}
// Remember, matrices are in column-major order
// and pitch is also column-major
final static boolean TRANSPOSE_A = true;
final static boolean TRANSPOSE_B = true;
final static int M = 2048; // number of rows of matrix op(A) and rows of matrix C
final static int N = 2048; // number of columns of matrix op(B) and number of columns of C
final static int K = 2048; // number of columns of matrix op(A) and number of rows of op(B)
final static int LDA = 2048; // pitch of matrix A (before the transpose)
final static int LDB = 2048; // pitch of matrix B (before the transpose)
final static int LDC = 2048; // pitch of matrix C
final static float ALPHA = 1f;
final static float BETA = 0f;
// The list of CUBLAS GEMM algorithms to use. Note that the set of
// supported algorithms will likely depend on the platform, the
// size of the matrix, and other factors.
private static final List<Integer> GEMM_ALGORITHMS = Arrays.asList(
CUBLAS_GEMM_DFALT,
CUBLAS_GEMM_ALGO0, // doesn't seem to ever work
CUBLAS_GEMM_ALGO1, // doesn't seem to ever work
CUBLAS_GEMM_ALGO2,
CUBLAS_GEMM_ALGO3,
CUBLAS_GEMM_ALGO4,
CUBLAS_GEMM_ALGO5,
CUBLAS_GEMM_ALGO6,
CUBLAS_GEMM_ALGO7
);
/**
* Test the JCublas sgemm operation for matrices of size n x x
*
* @param n The matrix size
*/
public static void testSgemm ()
{
System.out.println("Creating input data...");
float h_A[] = JCudaSamplesUtils.createRandomFloatData(K * LDA);
float h_B[] = JCudaSamplesUtils.createRandomFloatData(N * LDB);
float h_C[] = JCudaSamplesUtils.createRandomFloatData(N * LDC);
System.out.println("Performing Sgemm with cuBLAS v1...");
{
try
{
sgemmJCublas (h_A, h_B, h_C, false, 0);
}
catch (Exception e)
{
e.printStackTrace();
}
}
System.out.println("Performing Sgemm with cuBLAS v2...");
for (int algo : GEMM_ALGORITHMS)
{
try
{
sgemmJCublas (h_A, h_B, h_C, true, algo);
}
catch (Exception e)
{
e.printStackTrace();
}
}
System.out.println();
System.out.println("Timing is inaccurate because the GPU doesn't have time to enter high-performance state.");
System.out.println("Abnormally quick execution time indicates failure and incompatibility of the given algorithm.");
}
/**
* Implementation of sgemm using JCublas
*/
private static void sgemmJCublas (float A[], float B[], float C[], boolean useJCublas2, int algo)
{
// Create a CUBLAS handle
cublasHandle handle = null;
if (useJCublas2)
{
handle = new cublasHandle();
cublasCreate(handle);
// Causes CUBLAS v2 to fail (Linux, CUDA 8)
// cublasSetAtomicsMode (handle, cublasAtomicsMode.CUBLAS_ATOMICS_ALLOWED);
cublasSetPointerMode (handle, cublasPointerMode.CUBLAS_POINTER_MODE_HOST);
}
else
JCublas.cublasInit();
// Allocate memory on the device
Pointer d_A = new Pointer();
Pointer d_B = new Pointer();
Pointer d_C = new Pointer();
cudaMalloc(d_A, K * LDA * Sizeof.FLOAT);
cudaMalloc(d_B, N * LDB * Sizeof.FLOAT);
cudaMalloc(d_C, N * LDC * Sizeof.FLOAT);
// Copy the memory from the host to the device
cublasSetVector (K * LDA, Sizeof.FLOAT, Pointer.to(A), 1, d_A, 1);
cublasSetVector (N * LDB, Sizeof.FLOAT, Pointer.to(B), 1, d_B, 1);
cublasSetVector (N * LDC, Sizeof.FLOAT, Pointer.to(C), 1, d_C, 1);
// Execute sgemm
Pointer pAlpha = Pointer.to(new float[] { ALPHA });
Pointer pBeta = Pointer.to(new float[] { BETA });
cudaEvent_t startEvent = new cudaEvent_t();
cudaEvent_t stopEvent = new cudaEvent_t();
cudaEventCreate (startEvent);
cudaEventCreate (stopEvent);
// run twice for a bit more stable numbers
for (int i = 0; i < 2; i++)
{
cudaEventRecord (startEvent, cudaStreamLegacy);
if (useJCublas2)
cublasGemmEx(handle,
TRANSPOSE_A ? CUBLAS_OP_T : CUBLAS_OP_N,
TRANSPOSE_B ? CUBLAS_OP_T : CUBLAS_OP_N,
M, N, K,
pAlpha, d_A, CUDA_R_32F, LDA, d_B, CUDA_R_32F, LDB,
pBeta, d_C, CUDA_R_32F, LDC, CUDA_R_32F, algo);
else
JCublas.cublasSgemm
(TRANSPOSE_A ? 't' : 'n',
TRANSPOSE_B ? 't' : 'n',
M, N, K,
ALPHA, d_A, LDA, d_B, LDB,
BETA, d_C, LDC);
cudaEventRecord (stopEvent, cudaStreamLegacy);
}
cudaEventSynchronize (stopEvent);
float[] time = new float [1];
cudaEventElapsedTime (time, startEvent, stopEvent);
if (useJCublas2)
System.out.println(
"cuBLAS v2 algorithm " + algo + " took " + time[0] * 1000 + " us");
else
System.out.println(
"cuBLAS v1 took " + time[0] * 1000 + " us");
// Copy the result from the device to the host
cublasGetVector (N * LDC, Sizeof.FLOAT, d_C, 1, Pointer.to(C), 1);
// Clean up
cudaEventDestroy (startEvent);
cudaEventDestroy (stopEvent);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
if (useJCublas2)
cublasDestroy(handle);
else
JCublas.cublasShutdown();
}
}
```

There does seem to be a bit of a performance difference between the algos, especially when matrices are transposed:

```
Performing Sgemm with cuBLAS v1...
cuBLAS v1 took 1600.096 us
Performing Sgemm with cuBLAS v2...
cuBLAS v2 algorithm -1 took 1571.264 us
cuBLAS v2 algorithm 0 took 7.328 us
cuBLAS v2 algorithm 1 took 6.784 us
cuBLAS v2 algorithm 2 took 1523.84 us
cuBLAS v2 algorithm 3 took 1593.0559 us
cuBLAS v2 algorithm 4 took 1719.872 us
cuBLAS v2 algorithm 5 took 1660.3201 us
cuBLAS v2 algorithm 6 took 2125.248 us
cuBLAS v2 algorithm 7 took 2459.808 us
Timing is inaccurate because the GPU doesn't have time to enter high-performance state.
Abnormally quick execution time indicates failure and incompatibility of the given algorithm.
```