Commit 72043435 authored and committed by Henning Fehrmann

tensor core code

parent c3a18e8d
Makefile:

-GPU=NVIDIA
 GPU=AMD
+GPU=NVIDIA
 OBJ_blas = blas.o
 OBJ_fftw = fftw.o
+OBJ_tensor_core = tensor_core.o
 ifeq ($(GPU), AMD)
@@ -29,6 +30,9 @@ all: blas fftw
 blas: ${OBJ_blas}
 	${CC} -o blas ${OBJ_blas} ${LDFLAGS} ${LDFLAGS_blas} ${CUDAFLAGS}
+tensor_core: ${OBJ_tensor_core}
+	${CC} -o tensor_core ${OBJ_tensor_core} ${LDFLAGS} ${LDFLAGS_blas} ${CUDAFLAGS}
 fftw: ${OBJ_fftw}
 	${CC} -o fftw ${OBJ_fftw} ${LDFLAGS} ${LDFLAGS_fftw} ${CUDAFLAGS}
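The new tensor_core binary reuses LDFLAGS_blas, since it links against the same cuBLAS library as the blas target. Note that the all target (per the hunk context still only blas fftw) does not pick it up, so it has to be built and run explicitly:

    make tensor_core
    ./tensor_core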
@@ -77,9 +77,9 @@ void
 multiplication
 (
 	__HANDLE__ handle,
-	const __COMPLEX8__ *A,
-	const __COMPLEX8__ *B,
-	__COMPLEX8__ *C,
+	const float *A,
+	const float *B,
+	float *C,
 	size_t m,
 	size_t n,
 	size_t k
@@ -87,8 +87,26 @@ multiplication
 {
 	__BLAS_OPERATION__ transA = __NO_TRANSFORM__;
 	__BLAS_OPERATION__ transB = __CT_TRANSFORM__;
-	const __COMPLEX8__ alpha = {.x = 1.f, .y = 0.f};
-	const __COMPLEX8__ beta = {.x = 0.f, .y = 0.f};
+	const float alpha = 1.f;
+	const float beta = 0.f;
+	check_status( cublasSgemm(
+		handle,
+		transA,
+		transB,
+		m,
+		n,
+		k,
+		&alpha,
+		A,
+		m,
+		B,
+		n,
+		&beta,
+		C,
+		m
+	));
 	cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP;
 	check_status( cublasGemmEx(
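The __-prefixed placeholders are backend macros resolved at build time; a minimal sketch of the assumed NVIDIA-side mapping (not shown in this diff) is:

    /* assumed macro mapping for the NVIDIA/cuBLAS path -- an illustration, not part of the commit */
    #define __HANDLE__          cublasHandle_t
    #define __BLAS_OPERATION__  cublasOperation_t
    #define __NO_TRANSFORM__    CUBLAS_OP_N
    #define __CT_TRANSFORM__    CUBLAS_OP_C   /* for real float data this behaves like CUBLAS_OP_T */

Under that mapping the added cublasSgemm computes C = alpha * A * op(B) + beta * C in column-major order, with A an m-by-k matrix (lda = m), op(B) = B^T a k-by-n matrix (ldb = n), and C an m-by-n matrix (ldc = m); conjugation is a no-op for real inputs.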
@@ -112,6 +130,7 @@ multiplication
 		CUDA_C_32F,
 		algo
 	));
+	exit(0);
 //	cublasIcamax(handle,m * n, C, 1, &result);
 	cudaDeviceSynchronize();
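Two details in this hunk are worth flagging. First, exit(0) returns straight after the cublasGemmEx call, so cudaDeviceSynchronize() and everything below it are now unreachable, presumably a temporary debugging shortcut. Second, the surviving cublasGemmEx call still passes CUDA_C_32F type descriptors even though A, B, and C are now float buffers; a type-consistent call, sketched here under the pre-CUDA-11 cublasGemmEx signature, would use the real-valued descriptors instead:

    check_status( cublasGemmEx(
        handle,
        transA, transB,
        m, n, k,
        &alpha,
        A, CUDA_R_32F, m,    /* real 32-bit floats, lda = m */
        B, CUDA_R_32F, n,    /* ldb = n */
        &beta,
        C, CUDA_R_32F, m,    /* ldc = m */
        CUDA_R_32F,          /* compute type */
        algo                 /* CUBLAS_GEMM_DEFAULT_TENSOR_OP */
    ));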
@@ -120,8 +139,8 @@ multiplication
 void
 prepare_matrices
 (
-	__COMPLEX8__ * hA,
-	__COMPLEX8__ * hB,
+	float * hA,
+	float * hB,
 	size_t m,
 	size_t n,
 	size_t k
@@ -134,9 +153,10 @@ prepare_matrices
 		for (size_t j = 0; j < k; j++)
 		{
 			size_t ind = j + k * i;
-			hA[ind].x = (float)xorshf96()*fact;
-			hA[ind].y = (float)xorshf96()*fact;
+			//hA[ind] = (float)xorshf96()*fact;
+			hA[ind] = 0.f;
 		}
+		hA[k * (i+1)] = 1.f;
 	}
 #pragma omp parallel for
 	for (size_t i = 0; i < n; i++)
@@ -144,9 +164,10 @@ prepare_matrices
 		for (size_t j = 0; j < k; j++)
 		{
 			size_t ind = j + k * i;
-			hB[ind].x = (float)xorshf96()*fact;
-			hB[ind].y = (float)xorshf96()*fact;
+			//hB[ind] = (float)xorshf96()*fact;
+			hB[ind] = 0.f;
 		}
+		hB[k * (i+1)] = 1.f;
 	}
 }
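The random complex fill is replaced by zeroed matrices with a single 1.f marker per row, which makes the product easy to verify by eye. Note, though, that hA[k * (i+1)] addresses the first element of row i+1, so for the last row (i == m-1) the index k*m lands one full row past the m*k-element buffer; the same applies to hB with its n rows. If the intent was a unit entry within row i, an in-bounds variant (a guess, not from the commit) would be:

    /* hypothetical fix: put the unit marker on the "diagonal" of row i, wrapped to stay inside the row */
    hA[k * i + (i % k)] = 1.f;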
@@ -154,7 +175,7 @@ prepare_matrices
 void
 print_result
 (
-	__COMPLEX8__ * hC,
+	float * hC,
 	size_t m,
 	size_t n,
 	size_t k
@@ -166,7 +187,7 @@ print_result
 		for (size_t j = 0; j < k; j++)
 		{
 			size_t ind = j + k * i;
-			printf("%1.2f %1.2f\t", hC[ind].x, hC[ind].y);
+			printf("%1.2f\t", hC[ind]);
 		}
 		printf("\n");
 	}
@@ -187,9 +208,9 @@ run_test
 {
 	struct runtime * timer;
 	__MALLOC(timer, sizeof(*timer));
-	__COMPLEX8__ *A;
-	__COMPLEX8__ *B;
-	__COMPLEX8__ *C;
+	float *A;
+	float *B;
+	float *C;
 	__ASSERT(__PREFIX(Malloc)((void **)&A, sizeof(*A) * (size_t)(m * k)));
 	__ASSERT(__PREFIX(Malloc)((void **)&B, sizeof(*B) * (size_t)(n * k)));
 	__ASSERT(__PREFIX(Malloc)((void **)&C, sizeof(*C) * (size_t)(m * n)));
@@ -199,11 +220,11 @@ run_test
 		exit(1);
 	}
-	__COMPLEX8__ *hA;
+	float *hA;
 	__MALLOC( hA, sizeof(*hA) * (size_t)(m * k));
-	__COMPLEX8__ *hB;
+	float *hB;
 	__MALLOC( hB, sizeof(*hB) * (size_t)(k * n));
-	__COMPLEX8__ *hC;
+	float *hC;
 	__MALLOC( hC, sizeof(*hC) * (size_t)(n * m));
 //	timer_start(timer, "Prepare matrices");
@@ -287,6 +308,7 @@ main
 	__CREATE_HANDLE(&handle);
+	cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH);
 	__MALLOC(res, sizeof(*res) * (size_t)(
 		(m_max - m_min + 1) *
 		(n_max - n_min + 1) *
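Setting the math mode on the handle is what actually opts the FP32 GEMMs in to the tensor cores; sketched in isolation (cuBLAS 10.x naming, deprecated in favor of the TF32 math modes in cuBLAS 11):

    cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH);  /* allow tensor-core (reduced-precision) paths */
    cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH);    /* restore the default FP32 behaviour */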