Henning Fehrmann / NVidia_AMD_Bench / Commits / 50da3839

Commit 50da3839, authored Feb 12, 2021 by Henning Fehrmann, committed by Henning Fehrmann on Feb 12, 2021

test different precisions for tensor core

Parent: 327dcff7
Changes: 2
Makefile

@@ -3,9 +3,10 @@ GPU=AMD
 GPU = NVIDIA
 OBJ_blas = blas.o
-OBJ_blas_hp = blas_hp.o
+OBJ_tensor_core = tensor_core.o
 OBJ_fftw = fftw.o
 OBJ_tensor_core = tensor_core.o
+OBJ_tensor2 = tensor2.o
 ifeq ($(GPU), AMD)
@@ -20,22 +21,22 @@ else ifeq ($(GPU), NVIDIA)
 LDFLAGS_blas = -lcublas
 LDFLAGS_fftw = -lcufft
 INCLUDE = -I /usr/lib/x86_64-linux-gnu/openmpi/include/
-CFLAGS = ${INCLUDE} -arch sm_70 --compile -O3 -pg -Xcompiler -fopenmp -DCUDA
-CUDAFLAGS = -arch sm_70 --Werror cross-execution-space-call --Wno-deprecated-gpu-targets
+CFLAGS = ${INCLUDE} -arch sm_75 --compile -O3 -pg -Xcompiler -fopenmp -DCUDA
+CUDAFLAGS = -arch sm_75 --Werror cross-execution-space-call --Wno-deprecated-gpu-targets
 else
 unknown_HW:
 endif

-all: blas fftw blas_hp
+all: blas fftw tensor_core

 blas: ${OBJ_blas}
 	${CC} -o blas ${OBJ_blas} ${LDFLAGS} ${LDFLAGS_blas} ${CUDAFLAGS}

 blas_hp: ${OBJ_blas_hp}
 	${CC} -o blas_hp ${OBJ_blas_hp} ${LDFLAGS} ${LDFLAGS_blas} ${CUDAFLAGS}

 tensor_core: ${OBJ_tensor_core}
-	${CC} -o tensor_core ${OBJ_tensor_core} ${LDFLAGS} ${LDFLAGS_blas} ${CUDAFLAGS}
+	${CC} -o tensor_core ${OBJ_tensor_core} ${LDFLAGS} ${LDFLAGS_fftw} ${LDFLAGS_blas} ${CUDAFLAGS}

+tensor2: ${OBJ_tensor2}
+	${CC} -o tensor2 ${OBJ_tensor2} ${LDFLAGS} ${LDFLAGS_fftw} ${LDFLAGS_blas} ${CUDAFLAGS}

 fftw: ${OBJ_fftw}
 	${CC} -o fftw ${OBJ_fftw} ${LDFLAGS} ${LDFLAGS_fftw} ${CUDAFLAGS}
tensor2.cu (new file, 0 → 100644)
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cublas_v2.h>
#include <thrust/device_vector.h>

const char *cublasGetErrorString(cublasStatus_t status)
{
    switch (status)
    {
    case CUBLAS_STATUS_SUCCESS:
        return "CUBLAS_STATUS_SUCCESS";
    case CUBLAS_STATUS_NOT_INITIALIZED:
        return "CUBLAS_STATUS_NOT_INITIALIZED";
    case CUBLAS_STATUS_ALLOC_FAILED:
        return "CUBLAS_STATUS_ALLOC_FAILED";
    case CUBLAS_STATUS_INVALID_VALUE:
        return "CUBLAS_STATUS_INVALID_VALUE";
    case CUBLAS_STATUS_ARCH_MISMATCH:
        return "CUBLAS_STATUS_ARCH_MISMATCH";
    case CUBLAS_STATUS_MAPPING_ERROR:
        return "CUBLAS_STATUS_MAPPING_ERROR";
    case CUBLAS_STATUS_EXECUTION_FAILED:
        return "CUBLAS_STATUS_EXECUTION_FAILED";
    case CUBLAS_STATUS_INTERNAL_ERROR:
        return "CUBLAS_STATUS_INTERNAL_ERROR";
    case CUBLAS_STATUS_NOT_SUPPORTED:
        return "CUBLAS_STATUS_NOT_SUPPORTED";
    }
    return "unknown error";
}

int main(void)
{
    // matrix A
    size_t m = 1 << 10;
    size_t n = 1 << 17;
    size_t k = 1 << 9;
    m = 1024;
    n = 1024;
    k = 512;
    int rowA = m;
    int colA = k;
    // matrix B
    int rowB = colA;
    int colB = n;
    // matrix C
    int rowC = rowA;
    int colC = colB;

    thrust::device_vector<float> A(rowA * colA);
    thrust::device_vector<float> B(rowB * colB);
    thrust::device_vector<float> C(rowC * colC);

    /*
    for (size_t i = 0; i < rowA; i++){
        for (size_t j = 0; j < colA; j++){
            A[i * rowA + j] = i + j;
        }
    }
    for (size_t i = 0; i < rowB; i++){
        for (size_t j = 0; j < colB; j++){
            B[i * rowA + j] = i + j;
        }
    }
    */

    cublasHandle_t handle;
    cublasStatus_t status = cublasCreate(&handle);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        std::cerr << "cublasCreate failed. error is: " << cublasGetErrorString(status) << std::endl;
    }

    struct timespec start;
    struct timespec stop;

    int alpha = 1;
    int beta = 0;
    float alphaf = 1.f;
    float betaf = 0.f;

    // A * B + C
    //cublasSetMathMode(handle, CUBLAS_PEDANTIC_MATH);
    //cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH);
    cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH);

    for (int r = 0; r < 10; r++)
    {
        clock_gettime(CLOCK_REALTIME, &start);
        /*
        status = cublasGemmEx
        (
            handle, CUBLAS_OP_N, CUBLAS_OP_N,
            rowA, colB, colA,
            &alpha, thrust::raw_pointer_cast(&A[0]),
            CUDA_R_8I,
            rowA,
            thrust::raw_pointer_cast(&B[0]),
            CUDA_R_8I,
            colB,
            &beta, thrust::raw_pointer_cast(&C[0]), CUDA_R_32I,
            colB,
            CUDA_R_32I, CUBLAS_GEMM_ALGO0
        );
        */
        // FP32 thrust buffers are handed to cuBLAS with A and B declared as CUDA_R_16F
        status = cublasSgemmEx
        (
            handle, CUBLAS_OP_N, CUBLAS_OP_N,
            rowA, colB, colA,
            &alphaf,
            thrust::raw_pointer_cast(&A[0]), CUDA_R_16F, rowA,
            thrust::raw_pointer_cast(&B[0]), CUDA_R_16F, colB,
            &betaf,
            thrust::raw_pointer_cast(&C[0]), CUDA_R_32F, colB
        );
        if (status != CUBLAS_STATUS_SUCCESS)
        {
            std::cerr << "cublasGemmEx execution error is: " << cublasGetErrorString(status) << std::endl;
            exit(0);
        }
        cudaDeviceSynchronize();
        clock_gettime(CLOCK_REALTIME, &stop);
        double res = (double)(stop.tv_sec - start.tv_sec) * 1000.
                   + (double)(stop.tv_nsec - start.tv_nsec) / 1000000.;
        printf("hp %d %zu %zu %zu %g [ms]\n", r, m, n, k, res);
    }

    status = cublasDestroy(handle);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        std::cerr << "shutdown error code is: " << cublasGetErrorString(status) << std::endl;
    }
    return 0;
}
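The active cublasSgemmEx call above hands cuBLAS FP32 thrust buffers while declaring the A and B operands as CUDA_R_16F, so the library reads the float storage as half-precision words; the commented-out cublasGemmEx block sketches the same experiment for INT8 inputs with INT32 accumulation. For a run with genuinely half-precision inputs, a variant along the following lines could be used. This is a minimal sketch, not part of the commit: it assumes the CUDA 11 form of cublasGemmEx (cublasComputeType_t and cublasGemmAlgo_t as the last two arguments), a GPU with tensor cores (sm_70 or newer), and textbook column-major leading dimensions; data initialization is omitted.

// Minimal sketch (assumption, not from this commit): GEMM with true FP16
// input buffers and FP32 accumulation via cublasGemmEx (CUDA 11 API).
#include <stdio.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cublas_v2.h>

int main(void)
{
    int m = 1024, n = 1024, k = 512;

    // Column-major operands: A is m x k, B is k x n, C is m x n.
    // Contents are left uninitialized, as in the timing loop above.
    __half *A;
    __half *B;
    float *C;
    cudaMalloc((void **)&A, sizeof(__half) * m * k);
    cudaMalloc((void **)&B, sizeof(__half) * k * n);
    cudaMalloc((void **)&C, sizeof(float) * m * n);

    cublasHandle_t handle;
    cublasCreate(&handle);

    float alphaf = 1.f;
    float betaf = 0.f;

    // FP16 inputs, FP32 output and compute type; tensor cores are used
    // where the hardware and cuBLAS heuristics allow it.
    cublasStatus_t status = cublasGemmEx(
        handle, CUBLAS_OP_N, CUBLAS_OP_N,
        m, n, k,
        &alphaf,
        A, CUDA_R_16F, m,   // lda = m
        B, CUDA_R_16F, k,   // ldb = k
        &betaf,
        C, CUDA_R_32F, m,   // ldc = m
        CUBLAS_COMPUTE_32F,
        CUBLAS_GEMM_DEFAULT);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "cublasGemmEx failed with status %d\n", (int)status);
    }
    cudaDeviceSynchronize();

    cublasDestroy(handle);
    cudaFree(A);
    cudaFree(B);
    cudaFree(C);
    return 0;
}

The leading dimensions here follow the column-major convention (lda = m, ldb = k, ldc = m), which differs from the colB values passed for ldb and ldc in the benchmark loop above.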