Commit 72043435 authored by Henning Fehrmann, committed by Henning Fehrmann

tensor core code

parent c3a18e8d
-GPU=NVIDIA
GPU=AMD
+GPU=NVIDIA
OBJ_blas = blas.o
OBJ_fftw = fftw.o
+OBJ_tensor_core = tensor_core.o
ifeq ($(GPU), AMD)
@@ -29,6 +30,9 @@ all: blas fftw
blas: ${OBJ_blas}
	${CC} -o blas ${OBJ_blas} ${LDFLAGS} ${LDFLAGS_blas} ${CUDAFLAGS}
+tensor_core: ${OBJ_tensor_core}
+	${CC} -o tensor_core ${OBJ_tensor_core} ${LDFLAGS} ${LDFLAGS_blas} ${CUDAFLAGS}
fftw: ${OBJ_fftw}
	${CC} -o fftw ${OBJ_fftw} ${LDFLAGS} ${LDFLAGS_fftw} ${CUDAFLAGS}
…
@@ -77,9 +77,9 @@ void
multiplication
(
__HANDLE__ handle,
-const __COMPLEX8__ *A,
-const __COMPLEX8__ *B,
-__COMPLEX8__ *C,
+const float *A,
+const float *B,
+float *C,
size_t m,
size_t n,
size_t k
@@ -87,8 +87,26 @@ multiplication
{
__BLAS_OPERATION__ transA = __NO_TRANSFORM__;
__BLAS_OPERATION__ transB = __CT_TRANSFORM__;
-const __COMPLEX8__ alpha = {.x = 1.f, .y = 0.f};
-const __COMPLEX8__ beta = {.x = 0.f, .y = 0.f};
+const float alpha = 1.f;
+const float beta = 0.f;
+check_status( cublasSgemm(
+	handle,
+	transA,
+	transB,
+	m,
+	n,
+	k,
+	&alpha,
+	A,
+	m,
+	B,
+	n,
+	&beta,
+	C,
+	m
+));
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP;
check_status( cublasGemmEx(
@@ -112,6 +130,7 @@ multiplication
CUDA_C_32F,
algo
));
+exit(0); // early exit while testing: skips the synchronize and result check below
// cublasIcamax(handle,m * n, C, 1, &result);
cudaDeviceSynchronize();
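The two hunks above add a plain cublasSgemm call in front of the pre-existing cublasGemmEx call (whose argument list is collapsed in this view), plus an exit(0) so only the new path runs. Note that the surviving context line still passes CUDA_C_32F to cublasGemmEx even though the buffers are now float; a real-valued call would use CUDA_R_32F throughout. A minimal self-contained sketch of such a tensor-op GemmEx call, with illustrative names and the same shapes as the Sgemm above (A: m x k, B: n x k used transposed, C: m x n, column-major):

#include <cublas_v2.h>

static cublasStatus_t gemm_tensor_op(cublasHandle_t handle,
                                     const float *dA, const float *dB,
                                     float *dC, int m, int n, int k)
{
	const float alpha = 1.f;
	const float beta = 0.f;
	return cublasGemmEx(handle,
	                    CUBLAS_OP_N, CUBLAS_OP_T,
	                    m, n, k,
	                    &alpha,
	                    dA, CUDA_R_32F, m,   /* A, element type, lda */
	                    dB, CUDA_R_32F, n,   /* B, element type, ldb */
	                    &beta,
	                    dC, CUDA_R_32F, m,   /* C, element type, ldc */
	                    CUDA_R_32F,          /* compute type; cuBLAS >= 11 also accepts CUBLAS_COMPUTE_32F */
	                    CUBLAS_GEMM_DEFAULT_TENSOR_OP); /* let cuBLAS pick a tensor-core kernel */
}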
@@ -120,8 +139,8 @@ multiplication
void
prepare_matrices
(
-__COMPLEX8__ * hA,
-__COMPLEX8__ * hB,
+float * hA,
+float * hB,
size_t m,
size_t n,
size_t k
@@ -134,9 +153,10 @@ prepare_matrices
for (size_t j = 0; j < k; j++)
{
size_t ind = j + k * i;
-hA[ind].x = (float)xorshf96()*fact;
-hA[ind].y = (float)xorshf96()*fact;
+//hA[ind] = (float)xorshf96()*fact;
+hA[ind] = 0.f;
}
+if (i < k) hA[i + k * i] = 1.f; // unit diagonal; hA[k * (i+1)] pointed at the next row and overflows at i == m - 1
}
#pragma omp parallel for
for (size_t i = 0; i < n; i++)
@@ -144,9 +164,10 @@ prepare_matrices
for (size_t j = 0; j < k; j++)
{
size_t ind = j + k * i;
-hB[ind].x = (float)xorshf96()*fact;
-hB[ind].y = (float)xorshf96()*fact;
+//hB[ind] = (float)xorshf96()*fact;
+hB[ind] = 0.f;
}
+if (i < k) hB[i + k * i] = 1.f; // unit diagonal; hB[k * (i+1)] likewise runs one row past the intended element
}
}
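With the random fill commented out, prepare_matrices now produces (near-)identity test matrices, so the product computed in multiplication should itself be the identity and is easy to eyeball in print_result. A small hedged helper that could automate that check; the name, tolerance, and indexing convention (row-major, as in print_result) are illustrative, not part of this commit:

#include <math.h>
#include <stddef.h>

static int is_identity(const float *hC, size_t rows, size_t cols, float tol)
{
	for (size_t i = 0; i < rows; i++)
		for (size_t j = 0; j < cols; j++)
		{
			/* expect 1 on the diagonal, 0 elsewhere */
			float expected = (i == j) ? 1.f : 0.f;
			if (fabsf(hC[j + cols * i] - expected) > tol)
				return 0;
		}
	return 1;
}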
@@ -154,7 +175,7 @@ prepare_matrices
void
print_result
(
-__COMPLEX8__ * hC,
+float * hC,
size_t m,
size_t n,
size_t k
@@ -166,7 +187,7 @@ print_result
for (size_t j = 0; j < k; j++)
{
size_t ind = j + k * i;
printf("%1.2f %1.2f\t", hC[ind].x, hC[ind].y);
printf("%1.2f\t", hC[ind]);
}
printf("\n");
}
@@ -187,9 +208,9 @@ run_test
{
struct runtime * timer;
__MALLOC(timer, sizeof(*timer));
-__COMPLEX8__ *A;
-__COMPLEX8__ *B;
-__COMPLEX8__ *C;
+float *A;
+float *B;
+float *C;
__ASSERT(__PREFIX(Malloc)((void **)&A, sizeof(*A) * (size_t)(m * k)));
__ASSERT(__PREFIX(Malloc)((void **)&B, sizeof(*B) * (size_t)(n * k)));
__ASSERT(__PREFIX(Malloc)((void **)&C, sizeof(*C) * (size_t)(m * n)));
@@ -199,11 +220,11 @@ run_test
exit(1);
}
-__COMPLEX8__ *hA;
+float *hA;
__MALLOC( hA, sizeof(*hA) * (size_t)(m * k));
-__COMPLEX8__ *hB;
+float *hB;
__MALLOC( hB, sizeof(*hB) * (size_t)(k * n));
-__COMPLEX8__ *hC;
+float *hC;
__MALLOC( hC, sizeof(*hC) * (size_t)(n * m));
// timer_start(timer, "Prepare matrices");
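run_test keeps its structure and only narrows the element type: the device buffers A, B, C and their host mirrors hA, hB, hC go from __COMPLEX8__ to float. On a GPU=NVIDIA build the __ASSERT(__PREFIX(Malloc)(...)) line presumably expands to a checked cudaMalloc; a sketch of that assumed expansion (the helper name is invented here):

#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

/* Assumed equivalent of __ASSERT(__PREFIX(Malloc)((void **)&p, ...)) on the
 * NVIDIA path: allocate device memory and abort on failure. */
static float *device_alloc_f32(size_t count)
{
	float *ptr = NULL;
	cudaError_t status = cudaMalloc((void **)&ptr, sizeof(*ptr) * count);
	if (status != cudaSuccess)
	{
		fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(status));
		exit(1);
	}
	return ptr;
}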
@@ -287,6 +308,7 @@ main
__CREATE_HANDLE(&handle);
+cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH);
__MALLOC(res, sizeof(*res) * (size_t)(
(m_max - m_min + 1) *
(n_max - n_min + 1) *
…
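The single added line in main, cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH), opts every subsequent call on the handle into tensor-op math. A standalone sketch of that handle setup; note that CUBLAS_TENSOR_OP_MATH is a deprecated spelling from cuBLAS 11 on, where CUBLAS_TF32_TENSOR_OP_MATH or per-call compute types replace it:

#include <cublas_v2.h>

static cublasHandle_t create_tensor_op_handle(void)
{
	cublasHandle_t handle;
	cublasCreate(&handle);
	/* Allow tensor-core kernels for all calls on this handle; on
	 * cuBLAS >= 11 this enum is deprecated in favour of
	 * CUBLAS_TF32_TENSOR_OP_MATH / per-call compute types. */
	cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH);
	return handle;
}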