diff --git a/Makefile b/Makefile
index ace3c81b6d270652589bead06c3a1e7bd15b4347..77c82306d54372b2405ec98a8d89e8e000bcd537 100644
--- a/Makefile
+++ b/Makefile
@@ -1,9 +1,10 @@
-GPU=NVIDIA
 GPU=AMD
+GPU=NVIDIA
 
 
 OBJ_blas = blas.o
 OBJ_fftw = fftw.o
+OBJ_tensor_core = tensor_core.o
 
 
 ifeq ($(GPU), AMD)
@@ -29,6 +30,9 @@ all: blas fftw
 blas: ${OBJ_blas}
 	${CC} -o blas ${OBJ_blas} ${LDFLAGS} ${LDFLAGS_blas} ${CUDAFLAGS}
 
+tensor_core: ${OBJ_tensor_core}
+	${CC} -o tensor_core ${OBJ_tensor_core} ${LDFLAGS} ${LDFLAGS_blas} ${CUDAFLAGS}
+
 fftw: ${OBJ_fftw}
 	${CC} -o fftw ${OBJ_fftw} ${LDFLAGS} ${LDFLAGS_fftw} ${CUDAFLAGS}
 
diff --git a/tensor_core.c b/tensor_core.c
index 6feebfc9d5fb4ae37c1be3397c67898cb5d859e5..b815bedc10526297fdb59ec5da07701f3efae56a 100644
--- a/tensor_core.c
+++ b/tensor_core.c
@@ -77,9 +77,9 @@ void
 multiplication
 (
     __HANDLE__ handle,
-    const __COMPLEX8__ *A,
-    const __COMPLEX8__ *B,
-    __COMPLEX8__ *C,
+    const float *A,
+    const float *B,
+    float *C,
     size_t m,
     size_t n,
     size_t k
@@ -87,8 +87,26 @@ multiplication
 {
     __BLAS_OPERATION__ transA = __NO_TRANSFORM__;
     __BLAS_OPERATION__ transB = __CT_TRANSFORM__;
-    const __COMPLEX8__ alpha = {.x = 1.f, .y = 0.f};
-    const __COMPLEX8__ beta = {.x = 0.f, .y = 0.f};
+    const float alpha = 1.f;
+    const float beta = 0.f;
+
+
+    check_status( cublasSgemm(
+        handle,
+        transA,
+        transB,
+        m,
+        n,
+        k,
+        &alpha,
+        A,
+        m,
+        B,
+        n,
+        &beta,
+        C,
+        m
+    ));
 
     cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP;
     check_status( cublasGemmEx(
@@ -112,6 +130,7 @@ multiplication
         CUDA_C_32F,
         algo
     ));
+    exit(0);
 
     // cublasIcamax(handle,m * n, C, 1, &result);
     cudaDeviceSynchronize();
@@ -120,8 +139,8 @@ multiplication
 void
 prepare_matrices
 (
-    __COMPLEX8__ * hA,
-    __COMPLEX8__ * hB,
+    float * hA,
+    float * hB,
     size_t m,
     size_t n,
     size_t k
@@ -134,9 +153,10 @@ prepare_matrices
         for (size_t j = 0; j < k; j++)
         {
             size_t ind = j + k * i;
-            hA[ind].x = (float)xorshf96()*fact;
-            hA[ind].y = (float)xorshf96()*fact;
+            //hA[ind] = (float)xorshf96()*fact;
+            hA[ind] = 0.f;
         }
+        hA[k * (i+1)] = 1.f;
     }
 #pragma omp parallel for
     for (size_t i = 0; i < n; i++)
@@ -144,9 +164,10 @@ prepare_matrices
         for (size_t j = 0; j < k; j++)
         {
             size_t ind = j + k * i;
-            hB[ind].x = (float)xorshf96()*fact;
-            hB[ind].y = (float)xorshf96()*fact;
+            //hB[ind] = (float)xorshf96()*fact;
+            hB[ind] = 0.f;
         }
+        hB[k * (i+1)] = 1.f;
     }
 }
 
@@ -154,7 +175,7 @@ prepare_matrices
 void
 print_result
 (
-    __COMPLEX8__ * hC,
+    float * hC,
     size_t m,
     size_t n,
     size_t k
@@ -166,7 +187,7 @@ print_result
         for (size_t j = 0; j < k; j++)
         {
             size_t ind = j + k * i;
-            printf("%1.2f %1.2f\t", hC[ind].x, hC[ind].y);
+            printf("%1.2f\t", hC[ind]);
         }
         printf("\n");
     }
@@ -187,9 +208,9 @@ run_test
 {
     struct runtime * timer;
     __MALLOC(timer, sizeof(*timer));
-    __COMPLEX8__ *A;
-    __COMPLEX8__ *B;
-    __COMPLEX8__ *C;
+    float *A;
+    float *B;
+    float *C;
     __ASSERT(__PREFIX(Malloc)((void **)&A, sizeof(*A) * (size_t)(m * k)));
     __ASSERT(__PREFIX(Malloc)((void **)&B, sizeof(*B) * (size_t)(n * k)));
     __ASSERT(__PREFIX(Malloc)((void **)&C, sizeof(*C) * (size_t)(m * n)));
@@ -199,11 +220,11 @@ run_test
         exit(1);
     }
 
-    __COMPLEX8__ *hA;
+    float *hA;
     __MALLOC( hA, sizeof(*hA) * (size_t)(m * k));
-    __COMPLEX8__ *hB;
+    float *hB;
     __MALLOC( hB, sizeof(*hB) * (size_t)(k * n));
-    __COMPLEX8__ *hC;
+    float *hC;
     __MALLOC( hC, sizeof(*hC) * (size_t)(n * m));
 
     // timer_start(timer, "Prepare matrices");
@@ -287,6 +308,7 @@ main
     __CREATE_HANDLE(&handle);
 
+    cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH);
     __MALLOC(res, sizeof(*res) * (size_t)( (m_max - m_min + 1) * (n_max - n_min + 1) *
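For reference only (not part of the patch above): a minimal standalone sketch of the float-precision, tensor-op GEMM path this change switches tensor_core.c to, with the project's __HANDLE__/__ASSERT macros replaced by direct CUDA and cuBLAS calls. It assumes a CUDA 10-era cuBLAS where CUBLAS_TENSOR_OP_MATH, CUBLAS_GEMM_DEFAULT_TENSOR_OP, and a cudaDataType_t compute type are still accepted; matrix sizes, initialization, and error handling are reduced to the bare minimum.

/* tensor_op_sketch.c: illustrative only, mirrors the cublasGemmEx call pattern
 * used in the patch (column-major, FP32 data, tensor-op algorithm).          */
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>

int main(void)
{
    const int m = 128, n = 128, k = 128;
    const float alpha = 1.f, beta = 0.f;

    /* Device buffers, column-major as cuBLAS expects. */
    float *A, *B, *C;
    cudaMalloc((void **)&A, sizeof(*A) * m * k);
    cudaMalloc((void **)&B, sizeof(*B) * k * n);
    cudaMalloc((void **)&C, sizeof(*C) * m * n);
    cudaMemset(A, 0, sizeof(*A) * m * k);
    cudaMemset(B, 0, sizeof(*B) * k * n);

    cublasHandle_t handle;
    cublasCreate(&handle);
    /* Opt in to tensor cores; with FP32 inputs cuBLAS may down-convert
     * internally when this mode is set.                                  */
    cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH);

    cublasStatus_t st = cublasGemmEx(
        handle,
        CUBLAS_OP_N, CUBLAS_OP_N,        /* plain real GEMM, no transposes */
        m, n, k,
        &alpha,
        A, CUDA_R_32F, m,                /* lda = m (column-major A: m x k) */
        B, CUDA_R_32F, k,                /* ldb = k (B: k x n)              */
        &beta,
        C, CUDA_R_32F, m,                /* ldc = m (C: m x n)              */
        CUDA_R_32F,                      /* accumulate in FP32              */
        CUBLAS_GEMM_DEFAULT_TENSOR_OP);  /* request the tensor-op algorithm */
    cudaDeviceSynchronize();

    printf("cublasGemmEx status: %d\n", (int)st);

    cublasDestroy(handle);
    cudaFree(A);
    cudaFree(B);
    cudaFree(C);
    return st == CUBLAS_STATUS_SUCCESS ? EXIT_SUCCESS : EXIT_FAILURE;
}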