Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
N
NVidia_AMD_Bench
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Henning Fehrmann
NVidia_AMD_Bench
Commits
9a07a73b
Commit
9a07a73b
authored
4 years ago
by
Henning Fehrmann
Committed by
Henning Fehrmann
4 years ago
Browse files
Options
Downloads
Patches
Plain Diff
test tensor_core
parent
246b9d52
No related branches found
No related tags found
No related merge requests found
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
tensor_core.c
+335
-0
335 additions, 0 deletions
tensor_core.c
with
335 additions
and
0 deletions
tensor_core.c
0 → 100644
+
335
−
0
View file @
9a07a73b
/*
* =====================================================================================
*
* Description: BLAS Benchmark
*
* Version: 1.0
* Created: 27.01.2021 12:45:18
* Revision: none
* Compiler: hipc or nvcc
*
* Author: Henning Fehrmann (), henning.fehrmann@aei.mpg.de
* Organization: AEI Hannover
* License: GNU General Public License v2
*
* =====================================================================================
*/
#include
"hardware_settings.h"
#include
"profiler.h"
#include
<stdlib.h>
#include
<math.h>
#include
<omp.h>
#include
<mma.h>
#include
<string.h>
#define __MALLOC(P, size) P = malloc(size); \
if (P == NULL) \
{\
fprintf(stderr, "Allocation failed at line %d in %s\n", __LINE__, __FILE__); \
exit(EXIT_FAILURE); \
}\
void
check_status
(
cublasStatus_t
status
)
{
switch
(
status
)
{
case
CUBLAS_STATUS_SUCCESS
:
break
;
case
CUBLAS_STATUS_NOT_INITIALIZED
:
printf
(
"not initialized
\n
"
);
break
;
case
CUBLAS_STATUS_ALLOC_FAILED
:
printf
(
"CUBLAS_STATUS_ALLOC_FAILED
\n
"
);
break
;
case
CUBLAS_STATUS_INVALID_VALUE
:
printf
(
"CUBLAS_STATUS_INVALID_VALUE
\n
"
);
break
;
case
CUBLAS_STATUS_ARCH_MISMATCH
:
printf
(
"CUBLAS_STATUS_ARCH_MISMATCH
\n
"
);
break
;
case
CUBLAS_STATUS_MAPPING_ERROR
:
printf
(
"CUBLAS_STATUS_MAPPING_ERROR
\n
"
);
break
;
case
CUBLAS_STATUS_EXECUTION_FAILED
:
printf
(
"CUBLAS_STATUS_EXECUTION_FAILED
\n
"
);
break
;
case
CUBLAS_STATUS_INTERNAL_ERROR
:
printf
(
"CUBLAS_STATUS_INTERNAL_ERROR
\n
"
);
break
;
case
CUBLAS_STATUS_NOT_SUPPORTED
:
printf
(
"CUBLAS_STATUS_NOT_SUPPORTED
\n
"
);
break
;
case
CUBLAS_STATUS_LICENSE_ERROR
:
printf
(
"CUBLAS_STATUS_LICENSE_ERROR
\n
"
);
break
;
}
}
void
multiplication
(
__HANDLE__
handle
,
const
__COMPLEX8__
*
A
,
const
__COMPLEX8__
*
B
,
__COMPLEX8__
*
C
,
size_t
m
,
size_t
n
,
size_t
k
)
{
__BLAS_OPERATION__
transA
=
__NO_TRANSFORM__
;
__BLAS_OPERATION__
transB
=
__CT_TRANSFORM__
;
const
__COMPLEX8__
alpha
=
{.
x
=
1
.
f
,
.
y
=
0
.
f
};
const
__COMPLEX8__
beta
=
{.
x
=
0
.
f
,
.
y
=
0
.
f
};
cublasGemmAlgo_t
algo
=
CUBLAS_GEMM_DEFAULT_TENSOR_OP
;
check_status
(
cublasGemmEx
(
handle
,
transA
,
transB
,
m
,
n
,
k
,
&
alpha
,
A
,
CUDA_C_16F
,
m
,
B
,
CUDA_C_16F
,
n
,
&
beta
,
C
,
CUDA_C_16F
,
m
,
CUDA_C_32F
,
algo
));
exit
(
0
);
// cublasIcamax(handle,m * n, C, 1, &result);
cudaDeviceSynchronize
();
}
void
prepare_matrices
(
__COMPLEX8__
*
hA
,
__COMPLEX8__
*
hB
,
size_t
m
,
size_t
n
,
size_t
k
)
{
float
fact
=
1
.
f
/
(
float
)
n
/
(
float
)
x
/
(
float
)
y
/
20
.
f
;
#pragma omp parallel for
for
(
size_t
i
=
0
;
i
<
m
;
i
++
)
{
for
(
size_t
j
=
0
;
j
<
k
;
j
++
)
{
size_t
ind
=
j
+
k
*
i
;
hA
[
ind
].
x
=
(
float
)
xorshf96
()
*
fact
;
hA
[
ind
].
y
=
(
float
)
xorshf96
()
*
fact
;
}
}
#pragma omp parallel for
for
(
size_t
i
=
0
;
i
<
n
;
i
++
)
{
for
(
size_t
j
=
0
;
j
<
k
;
j
++
)
{
size_t
ind
=
j
+
k
*
i
;
hB
[
ind
].
x
=
(
float
)
xorshf96
()
*
fact
;
hB
[
ind
].
y
=
(
float
)
xorshf96
()
*
fact
;
}
}
}
void
print_result
(
__COMPLEX8__
*
hC
,
size_t
m
,
size_t
n
,
size_t
k
)
{
printf
(
"-------- %zu %zu
\n
"
,
m
,
k
);
for
(
size_t
i
=
0
;
i
<
m
;
i
++
)
{
for
(
size_t
j
=
0
;
j
<
k
;
j
++
)
{
size_t
ind
=
j
+
k
*
i
;
printf
(
"%1.2f %1.2f
\t
"
,
hC
[
ind
].
x
,
hC
[
ind
].
y
);
}
printf
(
"
\n
"
);
}
printf
(
"--------
\n
"
);
}
int
run_test
(
size_t
m
,
size_t
n
,
size_t
k
,
unsigned
rep
,
float
*
res
,
__HANDLE__
handle
)
{
struct
runtime
*
timer
;
__MALLOC
(
timer
,
sizeof
(
*
timer
));
__COMPLEX8__
*
A
;
__COMPLEX8__
*
B
;
__COMPLEX8__
*
C
;
__ASSERT
(
__PREFIX
(
Malloc
)((
void
**
)
&
A
,
sizeof
(
*
A
)
*
(
size_t
)(
m
*
k
)));
__ASSERT
(
__PREFIX
(
Malloc
)((
void
**
)
&
B
,
sizeof
(
*
B
)
*
(
size_t
)(
n
*
k
)));
__ASSERT
(
__PREFIX
(
Malloc
)((
void
**
)
&
C
,
sizeof
(
*
C
)
*
(
size_t
)(
m
*
n
)));
if
(
C
==
NULL
)
{
fprintf
(
stderr
,
"C not allocated
\n
"
);
exit
(
1
);
}
__COMPLEX8__
*
hA
;
__MALLOC
(
hA
,
sizeof
(
*
hA
)
*
(
size_t
)(
m
*
k
));
__COMPLEX8__
*
hB
;
__MALLOC
(
hB
,
sizeof
(
*
hB
)
*
(
size_t
)(
k
*
n
));
__COMPLEX8__
*
hC
;
__MALLOC
(
hC
,
sizeof
(
*
hC
)
*
(
size_t
)(
n
*
m
));
// timer_start(timer, "Prepare matrices");
// timer_stop(timer);
//timer_start(timer, "Memcopy");
// timer_stop(timer);
//timer_start(timer, "Create Handle");
//if(rocblas_create_handle(&handle) != rocblas_status_success) return EXIT_FAILURE;
//timer_stop(timer);
prepare_matrices
(
hA
,
hB
,
m
,
n
,
k
);
for
(
unsigned
r
=
0
;
r
<
rep
;
r
++
)
{
__ASSERT
(
__PREFIX
(
Memcpy
)(
A
,
hA
,
sizeof
(
*
A
)
*
(
size_t
)(
m
*
k
),
__PREFIX
(
MemcpyHostToDevice
)));
__ASSERT
(
__PREFIX
(
Memcpy
)(
B
,
hB
,
sizeof
(
*
B
)
*
(
size_t
)(
k
*
n
),
__PREFIX
(
MemcpyHostToDevice
)));
float
res_r
=
0
.
f
;
char
mes
[
128
];
sprintf
(
mes
,
"m %zu n %zu k %zu run %d"
,
m
,
n
,
k
,
r
);
timer_start
(
timer
,
mes
);
multiplication
(
handle
,
A
,
B
,
C
,
m
,
n
,
k
);
res_r
+=
timer_stop
(
timer
);
res
[
r
]
=
res_r
/
1
.
f
;
}
printf
(
"dimensions: %zu %zu %zu
\t
-- "
,
n
,
m
,
k
);
printf
(
"required size: %f GB
\n
"
,
(
m
*
n
*
sizeof
(
*
A
)
+
k
*
n
*
sizeof
(
*
B
)
+
k
*
m
*
sizeof
(
*
C
)
)
/
1.e+9
);
//__ASSERT(__PREFIX(Memcpy)(hC, C, sizeof(*hC) * (size_t)(k * m), __PREFIX(MemcpyDeviceToHost)));
//print_result(hC, 1 << em, 1 << en, 1 << ek);
// timer_start(timer, "Destroy Handle");
//if(rocblas_destroy_handle(handle) != rocblas_status_success) return EXIT_FAILURE;
// timer_stop(timer);
__PREFIX
(
Free
)(
A
);
__PREFIX
(
Free
)(
B
);
__PREFIX
(
Free
)(
C
);
free
(
hA
);
free
(
hB
);
free
(
hC
);
free
(
timer
);
return
0
;
}
int
main
(
)
{
int
rep
=
10
;
size_t
m_min
=
8
;
// 13
size_t
m_max
=
11
;
// 16
size_t
n_min
=
11
;
// 11
size_t
n_max
=
19
;
// 19
size_t
k_min
=
5
;
// 7
size_t
k_max
=
11
;
// 11
float
*
res
;
// cudaSetDevice(0);
__HANDLE__
handle
;
__CREATE_HANDLE
(
&
handle
);
__MALLOC
(
res
,
sizeof
(
*
res
)
*
(
size_t
)(
(
m_max
-
m_min
+
1
)
*
(
n_max
-
n_min
+
1
)
*
(
k_max
-
k_min
+
1
)
*
rep
));
for
(
int
em
=
m_min
;
em
<=
m_max
;
em
++
)
{
for
(
int
en
=
n_min
;
en
<=
n_max
;
en
++
)
{
for
(
int
ek
=
k_min
;
ek
<=
k_max
;
ek
++
)
{
run_test
(
1
<<
em
,
1
<<
en
,
1
<<
ek
,
rep
,
&
res
[
0
],
handle
);
}
}
}
if
(
__DESTROY_HANDLE
(
handle
)
!=
__PREFIX
(
Success
))
return
EXIT_FAILURE
;
exit
(
0
);
// store the results
/*
FILE * f;
char name[128];
sprintf(name, "runtimes");
f= fopen(name, "w");
if (f == NULL)
{
fprintf(stderr, "Couldn't open %s\n", name);
}
for (int i = min_dim; i < max_dim; i++)
{
size_t dim = 1 << i;
fprintf(f, "%zu\t", dim);
}
fprintf(f, "\n");
for (int r = 0; r < rep; r++)
{
for (int i = min_dim; i < max_dim; i++)
{
size_t pos = (i - min_dim) * rep + r;
fprintf(f, "%1.6f\t", res[pos]);
}
fprintf(f, "\n");
}
fclose(f);
*/
return
0
;
}
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment