Henning Fehrmann / NVidia_AMD_Bench / Commits

Commit 327dcff7
authored Feb 09, 2021 by Henning Fehrmann, committed by Henning Fehrmann on Feb 09, 2021

tensor core benchmark

parent d281ea67
Changes: 2
tensor_core.c deleted (100644 → 0)
/*
 * =====================================================================================
 *
 *    Description:  BLAS Benchmark
 *
 *        Version:  1.0
 *        Created:  27.01.2021 12:45:18
 *       Revision:  none
 *       Compiler:  hipcc or nvcc
 *
 *         Author:  Henning Fehrmann (), henning.fehrmann@aei.mpg.de
 *   Organization:  AEI Hannover
 *        License:  GNU General Public License v2
 *
 * =====================================================================================
 */
#include "hardware_settings.h"
#include "profiler.h"
#include <stdlib.h>
#include <math.h>
#include <omp.h>
#include <mma.h>
#include <string.h>
#define __MALLOC(P, size) P = malloc(size); \
if (P == NULL) \
{\
fprintf(stderr, "Allocation failed at line %d in %s\n", __LINE__, __FILE__); \
exit(EXIT_FAILURE); \
}\
void
check_status(cublasStatus_t status)
{
	switch (status)
	{
	case CUBLAS_STATUS_SUCCESS:
		break;
	case CUBLAS_STATUS_NOT_INITIALIZED:
		printf("not initialized\n");
		break;
	case CUBLAS_STATUS_ALLOC_FAILED:
		printf("CUBLAS_STATUS_ALLOC_FAILED\n");
		break;
	case CUBLAS_STATUS_INVALID_VALUE:
		printf("CUBLAS_STATUS_INVALID_VALUE\n");
		break;
	case CUBLAS_STATUS_ARCH_MISMATCH:
		printf("CUBLAS_STATUS_ARCH_MISMATCH\n");
		break;
	case CUBLAS_STATUS_MAPPING_ERROR:
		printf("CUBLAS_STATUS_MAPPING_ERROR\n");
		break;
	case CUBLAS_STATUS_EXECUTION_FAILED:
		printf("CUBLAS_STATUS_EXECUTION_FAILED\n");
		break;
	case CUBLAS_STATUS_INTERNAL_ERROR:
		printf("CUBLAS_STATUS_INTERNAL_ERROR\n");
		break;
	case CUBLAS_STATUS_NOT_SUPPORTED:
		printf("CUBLAS_STATUS_NOT_SUPPORTED\n");
		break;
	case CUBLAS_STATUS_LICENSE_ERROR:
		printf("CUBLAS_STATUS_LICENSE_ERROR\n");
		break;
	}
}
void
multiplication(
	__HANDLE__ handle,
	const float *A,
	const float *B,
	float *C,
	size_t m,
	size_t n,
	size_t k
)
{
	__BLAS_OPERATION__ transA = __NO_TRANSFORM__;
	__BLAS_OPERATION__ transB = __CT_TRANSFORM__;
	const float alpha = 1.f;
	const float beta = 0.f;
	/* plain FP32 GEMM */
	check_status(cublasSgemm(handle, transA, transB,
			m, n, k,
			&alpha, A, m, B, n, &beta, C, m));
	/* same GEMM through cublasGemmEx with the tensor-op algorithm; the
	 * buffers are float, so the real 32-bit data type is used and cuBLAS
	 * may use tensor cores when tensor-op math is enabled on the handle */
	cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP;
	check_status(cublasGemmEx(handle, transA, transB,
			m, n, k,
			&alpha,
			A, CUDA_R_32F, m,
			B, CUDA_R_32F, n,
			&beta,
			C, CUDA_R_32F, m,
			CUDA_R_32F, algo));
	// cublasIcamax(handle, m * n, C, 1, &result);
	cudaDeviceSynchronize();
}
void
prepare_matrices(
	float *hA,
	float *hB,
	size_t m,
	size_t n,
	size_t k
)
{
	/* hA holds m * k values and hB holds n * k values; the random fill is
	 * disabled, so each matrix is zero except for a single 1 per row */
	/* scale factor for the (disabled) random fill; x and y are not defined in this file */
	//float fact = 1.f / (float)n / (float)x / (float)y / 20.f;
#pragma omp parallel for
	for (size_t i = 0; i < m; i++)
	{
		for (size_t j = 0; j < k; j++)
		{
			size_t ind = j + k * i;
			//hA[ind] = (float)xorshf96() * fact;
			hA[ind] = 0.f;
		}
		hA[k * i] = 1.f;	/* single 1 at the start of each row, inside the buffer */
	}
#pragma omp parallel for
	for (size_t i = 0; i < n; i++)
	{
		for (size_t j = 0; j < k; j++)
		{
			size_t ind = j + k * i;
			//hB[ind] = (float)xorshf96() * fact;
			hB[ind] = 0.f;
		}
		hB[k * i] = 1.f;
	}
}
void
print_result(
	float *hC,
	size_t m,
	size_t n,
	size_t k
)
{
	printf("-------- %zu %zu\n", m, k);
	for (size_t i = 0; i < m; i++)
	{
		for (size_t j = 0; j < k; j++)
		{
			size_t ind = j + k * i;
			printf("%1.2f\t", hC[ind]);
		}
		printf("\n");
	}
	printf("--------\n");
}
int
run_test(
	size_t m,
	size_t n,
	size_t k,
	unsigned rep,
	float *res,
	__HANDLE__ handle
)
{
	struct runtime *timer;
	__MALLOC(timer, sizeof(*timer));

	/* device buffers */
	float *A;
	float *B;
	float *C;
	__ASSERT(__PREFIX(Malloc)((void **)&A, sizeof(*A) * (size_t)(m * k)));
	__ASSERT(__PREFIX(Malloc)((void **)&B, sizeof(*B) * (size_t)(n * k)));
	__ASSERT(__PREFIX(Malloc)((void **)&C, sizeof(*C) * (size_t)(m * n)));
	if (C == NULL)
	{
		fprintf(stderr, "C not allocated\n");
		exit(1);
	}

	/* host buffers */
	float *hA;
	__MALLOC(hA, sizeof(*hA) * (size_t)(m * k));
	float *hB;
	__MALLOC(hB, sizeof(*hB) * (size_t)(k * n));
	float *hC;
	__MALLOC(hC, sizeof(*hC) * (size_t)(n * m));

	// timer_start(timer, "Prepare matrices");
	// timer_stop(timer);
	// timer_start(timer, "Memcopy");
	// timer_stop(timer);
	// timer_start(timer, "Create Handle");
	// if (rocblas_create_handle(&handle) != rocblas_status_success) return EXIT_FAILURE;
	// timer_stop(timer);

	prepare_matrices(hA, hB, m, n, k);

	/* copy the inputs and time the multiplication rep times */
	for (unsigned r = 0; r < rep; r++)
	{
		__ASSERT(__PREFIX(Memcpy)(A, hA, sizeof(*A) * (size_t)(m * k), __PREFIX(MemcpyHostToDevice)));
		__ASSERT(__PREFIX(Memcpy)(B, hB, sizeof(*B) * (size_t)(k * n), __PREFIX(MemcpyHostToDevice)));
		float res_r = 0.f;
		char mes[128];
		sprintf(mes, "m %zu n %zu k %zu run %u", m, n, k, r);
		timer_start(timer, mes);
		multiplication(handle, A, B, C, m, n, k);
		res_r += timer_stop(timer);
		res[r] = res_r;
	}
	printf("dimensions: %zu %zu %zu\t-- ", n, m, k);
	printf("required size: %f GB\n",
		(m * n * sizeof(*A) + k * n * sizeof(*B) + k * m * sizeof(*C)) / 1.e+9);

	// __ASSERT(__PREFIX(Memcpy)(hC, C, sizeof(*hC) * (size_t)(k * m), __PREFIX(MemcpyDeviceToHost)));
	// print_result(hC, 1 << em, 1 << en, 1 << ek);
	// timer_start(timer, "Destroy Handle");
	// if (rocblas_destroy_handle(handle) != rocblas_status_success) return EXIT_FAILURE;
	// timer_stop(timer);

	__PREFIX(Free)(A);
	__PREFIX(Free)(B);
	__PREFIX(Free)(C);
	free(hA);
	free(hB);
	free(hC);
	free(timer);
	return 0;
}
int
main()
{
	int rep = 10;
	/* exponents of the power-of-two matrix dimensions */
	size_t m_min = 8;	// 13
	size_t m_max = 11;	// 16
	size_t n_min = 11;	// 11
	size_t n_max = 19;	// 19
	size_t k_min = 5;	// 7
	size_t k_max = 11;	// 11
	float *res;

	// cudaSetDevice(0);
	__HANDLE__ handle;
	__CREATE_HANDLE(&handle);
	check_status(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));

	__MALLOC(res, sizeof(*res) * (size_t)((m_max - m_min + 1) * (n_max - n_min + 1) * (k_max - k_min + 1) * rep));

	/* sweep all size combinations as powers of two */
	for (int em = m_min; em <= m_max; em++)
	{
		for (int en = n_min; en <= n_max; en++)
		{
			for (int ek = k_min; ek <= k_max; ek++)
			{
				run_test(1 << em, 1 << en, 1 << ek, rep, &res[0], handle);
			}
		}
	}
	if (__DESTROY_HANDLE(handle) != __PREFIX(Success))
		return EXIT_FAILURE;
	exit(0);

	// store the results
	/*
	FILE *f;
	char name[128];
	sprintf(name, "runtimes");
	f = fopen(name, "w");
	if (f == NULL)
	{
		fprintf(stderr, "Couldn't open %s\n", name);
	}
	for (int i = min_dim; i < max_dim; i++)
	{
		size_t dim = 1 << i;
		fprintf(f, "%zu\t", dim);
	}
	fprintf(f, "\n");
	for (int r = 0; r < rep; r++)
	{
		for (int i = min_dim; i < max_dim; i++)
		{
			size_t pos = (i - min_dim) * rep + r;
			fprintf(f, "%1.6f\t", res[pos]);
		}
		fprintf(f, "\n");
	}
	fclose(f);
	*/
	return 0;
}
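
tensor_core.c relies on a portability layer from hardware_settings.h (__HANDLE__, __PREFIX, __ASSERT, __CREATE_HANDLE, __DESTROY_HANDLE, __BLAS_OPERATION__, __NO_TRANSFORM__, __CT_TRANSFORM__), and that header is not part of this diff. The following is a minimal sketch of how the CUDA side of such a header could look, assuming cuBLAS and the CUDA runtime API; it is illustrative only and may differ from the header actually shipped in the repository.

/* Hypothetical sketch of the CUDA branch of hardware_settings.h; not the
 * header in this repository, just one way the macros used by tensor_core.c
 * could be mapped onto cuBLAS and the CUDA runtime. */
#ifndef HARDWARE_SETTINGS_H
#define HARDWARE_SETTINGS_H

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>

#define __HANDLE__            cublasHandle_t
#define __BLAS_OPERATION__    cublasOperation_t
#define __NO_TRANSFORM__      CUBLAS_OP_N
#define __CT_TRANSFORM__      CUBLAS_OP_C

/* __PREFIX(Malloc) -> cudaMalloc, __PREFIX(Success) -> cudaSuccess, ... */
#define __PREFIX(name)        cuda##name
#define __CREATE_HANDLE(h)    cublasCreate(h)
/* note: comparing the cublasStatus_t result against cudaSuccess, as main()
 * does, only works because both success codes are zero */
#define __DESTROY_HANDLE(h)   cublasDestroy(h)

/* abort on any CUDA runtime error */
#define __ASSERT(call)                                                     \
	do {                                                               \
		cudaError_t __err = (call);                                \
		if (__err != cudaSuccess) {                                \
			fprintf(stderr, "%s:%d: %s\n", __FILE__, __LINE__, \
			        cudaGetErrorString(__err));                \
			exit(EXIT_FAILURE);                                \
		}                                                          \
	} while (0)

#endif /* HARDWARE_SETTINGS_H */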
blas_hp.cu → tensor_core.cu (file moved)
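
The second change in this commit only renames blas_hp.cu to tensor_core.cu; its contents are not shown in the diff. For reference, a half-precision tensor-core GEMM through cublasGemmEx typically takes the shape sketched below (FP16 inputs with FP32 accumulation, CUDA 10.x-style enums). The wrapper name and the device buffers are illustrative assumptions, not code from the repository.

/* Illustrative only: FP16 inputs, FP32 accumulation on tensor cores.
 * d_A, d_B, d_C are hypothetical device buffers in column-major layout. */
#include <cuda_fp16.h>
#include <cublas_v2.h>

static cublasStatus_t
gemm_fp16_tensor_core(cublasHandle_t handle,
                      const __half *d_A, const __half *d_B, float *d_C,
                      int m, int n, int k)
{
	const float alpha = 1.f;
	const float beta  = 0.f;
	/* C (m x n) = A (m x k) * B (k x n), all column-major */
	return cublasGemmEx(handle,
	                    CUBLAS_OP_N, CUBLAS_OP_N,
	                    m, n, k,
	                    &alpha,
	                    d_A, CUDA_R_16F, m,
	                    d_B, CUDA_R_16F, k,
	                    &beta,
	                    d_C, CUDA_R_32F, m,
	                    CUDA_R_32F,
	                    CUBLAS_GEMM_DEFAULT_TENSOR_OP);
}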