Commit 50da3839 authored by Henning Fehrmann's avatar Henning Fehrmann Committed by Henning Fehrmann
Browse files

test different precisions for tensor core

parent 327dcff7
......@@ -3,9 +3,10 @@ GPU=AMD
GPU=NVIDIA
OBJ_blas = blas.o
OBJ_blas_hp = blas_hp.o
OBJ_tensor_core = tensor_core.o
OBJ_fftw = fftw.o
OBJ_tensor_core = tensor_core.o
OBJ_tensor2 = tensor2.o
ifeq ($(GPU), AMD)
......@@ -20,22 +21,22 @@ else ifeq ($(GPU), NVIDIA)
LDFLAGS_blas = -lcublas
LDFLAGS_fftw = -lcufft
INCLUDE= -I/usr/lib/x86_64-linux-gnu/openmpi/include/
CFLAGS = ${INCLUDE} -arch sm_70 --compile -O3 -pg -Xcompiler -fopenmp -DCUDA
CUDAFLAGS = -arch sm_70 --Werror cross-execution-space-call --Wno-deprecated-gpu-targets
CFLAGS = ${INCLUDE} -arch sm_75 --compile -O3 -pg -Xcompiler -fopenmp -DCUDA
CUDAFLAGS = -arch sm_75 --Werror cross-execution-space-call --Wno-deprecated-gpu-targets
else
unknown_HW:
endif
all: blas fftw blas_hp
all: blas fftw tensor_core
blas: ${OBJ_blas}
${CC} -o blas ${OBJ_blas} ${LDFLAGS} ${LDFLAGS_blas} ${CUDAFLAGS}
blas_hp: ${OBJ_blas_hp}
${CC} -o blas_hp ${OBJ_blas_hp} ${LDFLAGS} ${LDFLAGS_blas} ${CUDAFLAGS}
tensor_core: ${OBJ_tensor_core}
${CC} -o tensor_core ${OBJ_tensor_core} ${LDFLAGS} ${LDFLAGS_blas} ${CUDAFLAGS}
${CC} -o tensor_core ${OBJ_tensor_core} ${LDFLAGS} ${LDFLAGS_fftw} ${LDFLAGS_blas} ${CUDAFLAGS}
tensor2: ${OBJ_tensor2}
${CC} -o tensor2 ${OBJ_tensor2} ${LDFLAGS} ${LDFLAGS_fftw} ${LDFLAGS_blas} ${CUDAFLAGS}
fftw: ${OBJ_fftw}
${CC} -o fftw ${OBJ_fftw} ${LDFLAGS} ${LDFLAGS_fftw} ${CUDAFLAGS}
......
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <time.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
/**
 * Map a cuBLAS status code to its symbolic name.
 *
 * @param status  Status value returned by a cuBLAS call.
 * @return A pointer to a static, NUL-terminated string holding the name of
 *         the enumerator, or "unknown error" for values not listed here.
 */
const char* cublasGetErrorString(cublasStatus_t status) {
    switch (status) {
        case CUBLAS_STATUS_SUCCESS:          return "CUBLAS_STATUS_SUCCESS";
        case CUBLAS_STATUS_NOT_INITIALIZED:  return "CUBLAS_STATUS_NOT_INITIALIZED";
        case CUBLAS_STATUS_ALLOC_FAILED:     return "CUBLAS_STATUS_ALLOC_FAILED";
        case CUBLAS_STATUS_INVALID_VALUE:    return "CUBLAS_STATUS_INVALID_VALUE";
        case CUBLAS_STATUS_ARCH_MISMATCH:    return "CUBLAS_STATUS_ARCH_MISMATCH";
        case CUBLAS_STATUS_MAPPING_ERROR:    return "CUBLAS_STATUS_MAPPING_ERROR";
        case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
        case CUBLAS_STATUS_INTERNAL_ERROR:   return "CUBLAS_STATUS_INTERNAL_ERROR";
        case CUBLAS_STATUS_NOT_SUPPORTED:    return "CUBLAS_STATUS_NOT_SUPPORTED";
        // Previously missing, so a licensing failure was reported as "unknown error".
        case CUBLAS_STATUS_LICENSE_ERROR:    return "CUBLAS_STATUS_LICENSE_ERROR";
    }
    // Reached only for values outside the enumerators handled above.
    return "unknown error";
}
int main(void) {
// matrix A
size_t m = 1 << 10;
size_t n = 1 << 17;
size_t k = 1 << 9;
m = 1024;
n = 1024;
k = 512;
int rowA = m;
int colA = k;
// matrix B
int rowB = colA;
int colB = n;
// matrix C
int rowC = rowA;
int colC = colB;
thrust::device_vector<float> A(rowA * colA);
thrust::device_vector<float> B(rowB * colB);
thrust::device_vector<float> C(rowC * colC);
/*
for (size_t i = 0; i < rowA; i++){
for (size_t j = 0; j < colA; j++){
A[i * rowA + j] = i + j;
}
}
for (size_t i = 0; i < rowB; i++){
for (size_t j = 0; j < colB; j++){
B[i * rowA + j] = i + j;
}
}
*/
cublasHandle_t handle;
cublasStatus_t status = cublasCreate(&handle);
if (status != CUBLAS_STATUS_SUCCESS) {
std::cerr << "cublasCreate failed. error is: " << cublasGetErrorString(status) << std::endl;;
}
struct timespec start;
struct timespec stop;
int alpha = 1;
int beta = 0;
float alphaf = 1.f;
float betaf = 0.f;
// A * B + C
/*
*/
//cublasSetMathMode(handle, CUBLAS_PEDANTIC_MATH);
//cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH);
cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH);
for (int r = 0; r < 10; r++)
{
clock_gettime(CLOCK_REALTIME , &start);
/*
status = cublasGemmEx
(
handle, CUBLAS_OP_N, CUBLAS_OP_N,
rowA, colB, colA,
&alpha, thrust::raw_pointer_cast(&A[0]),
CUDA_R_8I,
rowA,
thrust::raw_pointer_cast(&B[0]),
CUDA_R_8I,
colB,
&beta, thrust::raw_pointer_cast(&C[0]), CUDA_R_32I,
colB,
CUDA_R_32I, CUBLAS_GEMM_ALGO0
);
*/
status = cublasSgemmEx
(
handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
rowA,
colB,
colA,
&alphaf,
thrust::raw_pointer_cast(&A[0]),
CUDA_R_16F,
rowA,
thrust::raw_pointer_cast(&B[0]),
CUDA_R_16F,
colB,
&betaf,
thrust::raw_pointer_cast(&C[0]),
CUDA_R_32F,
colB
);
if (status != CUBLAS_STATUS_SUCCESS) {
std::cerr << "cublasGemmEx execution error is: " << cublasGetErrorString(status) << std::endl;
exit(0);
}
cudaDeviceSynchronize();
clock_gettime(CLOCK_REALTIME , &stop);
double res= (double)
(
stop.tv_sec - start.tv_sec
)*1000.
+
(double)
(
stop.tv_nsec - start.tv_nsec
)/1000000.
;
printf("hp %d %d %d %d %g [ms]\n",r, m, n, k, res);
}
status = cublasDestroy(handle);
if (status != CUBLAS_STATUS_SUCCESS) {
std::cerr << "shutdown error code is: " << cublasGetErrorString(status) << std::endl;
}
return 0;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment