Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Henning Fehrmann
NVidia_AMD_Bench
Commits
0626ce84
Commit
0626ce84
authored
Jan 28, 2021
by
Henning Fehrmann
Committed by
Henning Fehrmann
Jan 28, 2021
Browse files
platform agnostic code
parent
21c8a6d9
Changes
4
Hide whitespace changes
Inline
Side-by-side
Makefile
0 → 100644
View file @
0626ce84
GPU
=
NVIDIA
GPU
=
AMD
OBJ
=
blas.o
ifeq
($(GPU), AMD)
LDFLAGS
=
-L
/opt/rocm/lib
-lhipblas
-lrocblas
-fopenmp
CFLAGS
=
-g
-Wall
-O3
-fopenmp
-I
/opt/rocm/include
-I
/opt/rocm/hip/include
-DROC
CC
=
hipcc
else
ifeq
($(GPU), NVIDIA)
CC
=
nvcc
LDFLAGS
=
-lcublas
-lm
-lgomp
INCLUDE
=
-I
/usr/lib/x86_64-linux-gnu/openmpi/include/
CFLAGS
=
${INCLUDE}
--compile
-O3
-pg
-Xcompiler
-fopenmp
-DCUDA
CUDAFLAGS
=
--Werror
cross-execution-space-call
--Wno-deprecated-gpu-targets
else
unknown_HW
:
endif
all
:
${OBJ}
${CC}
-o
blas
${OBJ}
${LDFLAGS}
${CUDAFLAGS}
%.o
:
%.c ${HEADER}
${CC}
${CFLAGS}
-c
$<
clean
:
rm
*
.o
unknown_HW
:
@
echo
"hardware not detected"
cuda_NVidia
.c
→
blas
.c
View file @
0626ce84
...
...
@@ -6,7 +6,7 @@
* Version: 1.0
* Created: 27.01.2021 12:45:18
* Revision: none
* Compiler:
nvc
c
* Compiler:
hip
c
*
* Author: Henning Fehrmann (), henning.fehrmann@aei.mpg.de
* Organization: AEI Hannover
...
...
@@ -15,18 +15,14 @@
* =====================================================================================
*/
#include "hardware_settings.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include <cublas_v2.h>
#include <time.h>
#include <omp.h>
#define __ASSERT(x) (assert((x)==cudaSuccess))
size_t
m
=
10000
;
...
...
@@ -35,12 +31,20 @@ size_t k = 10000;
static
unsigned
long
x
=
123456789
,
y
=
362436069
,
z
=
521288629
;
#define __MALLOC(P, size) P = malloc(size); \
if (P == NULL) \
{\
fprintf(stderr, "Allocation if failed at line %d in %s\n", __LINE__, __FILE__); \
exit(EXIT_FAILURE); \
}\
unsigned
long
xorshf96
(
void
)
{
// NOT thread save
unsigned
long
t
;
x
^=
x
<<
16
;
x
^=
x
>>
5
;
...
...
@@ -102,22 +106,22 @@ timer_stop
void
multiplication
(
cublasHandle_t
handle
,
const
cuComplex
*
A
,
const
cuComplex
*
B
,
cuComplex
*
C
__HANDLE__
handle
,
const
__COMPLEX8__
*
A
,
const
__COMPLEX8__
*
B
,
__COMPLEX8__
*
C
)
{
cublasOperation_t
transA
=
CUBLAS_OP_N
;
cublasOperation_t
transB
=
CUBLAS_OP_C
;
const
cuComplex
alpha
=
{.
x
=
1
.
f
,
.
y
=
0
.
f
};
const
cuComplex
beta
=
{.
x
=
0
.
f
,
.
y
=
0
.
f
};
__BLAS_OPERATION__
transA
=
__NO_TRANSFORM__
;
__BLAS_OPERATION__
transB
=
__CT_TRANSFORM__
;
const
__COMPLEX8__
alpha
=
{.
x
=
1
.
f
,
.
y
=
0
.
f
};
const
__COMPLEX8__
beta
=
{.
x
=
0
.
f
,
.
y
=
0
.
f
};
int
lda
=
n
;
int
ldb
=
n
;
int
ldc
=
k
;
cublasCgemm
__CGMEM__
(
handle
,
transA
,
...
...
@@ -139,8 +143,8 @@ multiplication
void
prepare_matrices
(
cuComplex
*
hA
,
cuComplex
*
hB
__COMPLEX8__
*
hA
,
__COMPLEX8__
*
hB
)
{
float
fact
=
1
.
f
/
(
float
)
n
/
(
float
)
x
/
(
float
)
y
/
20
.
f
;
...
...
@@ -170,7 +174,7 @@ prepare_matrices
void
print_result
(
cuComplex
*
hC
__COMPLEX8__
*
hC
)
{
printf
(
"-------- %zu %zu
\n
"
,
m
,
k
);
...
...
@@ -198,45 +202,27 @@ run_test
m
=
dim
;
n
=
dim
;
k
=
dim
;
struct
runtime
*
timer
=
malloc
(
sizeof
(
*
timer
));
cuComplex
*
A
;
cuComplex
*
B
;
cuComplex
*
C
;
__ASSERT
(
cudaMalloc
((
void
**
)
&
A
,
sizeof
(
*
A
)
*
(
size_t
)(
m
*
n
)));
if
(
A
==
NULL
)
{
fprintf
(
stderr
,
"A not allocated
\n
"
);
exit
(
1
);
}
__ASSERT
(
cudaMalloc
((
void
**
)
&
B
,
sizeof
(
*
B
)
*
(
size_t
)(
n
*
k
)));
if
(
B
==
NULL
)
{
fprintf
(
stderr
,
"B not allocated
\n
"
);
exit
(
1
);
}
__ASSERT
(
cudaMalloc
((
void
**
)
&
C
,
sizeof
(
*
C
)
*
(
size_t
)(
m
*
k
)));
struct
runtime
*
timer
;
__MALLOC
(
timer
,
sizeof
(
*
timer
));
__COMPLEX8__
*
A
;
__COMPLEX8__
*
B
;
__COMPLEX8__
*
C
;
__ASSERT
(
__PREFIX
(
Malloc
)((
void
**
)
&
A
,
sizeof
(
*
A
)
*
(
size_t
)(
m
*
n
)));
__ASSERT
(
__PREFIX
(
Malloc
)((
void
**
)
&
B
,
sizeof
(
*
B
)
*
(
size_t
)(
n
*
k
)));
__ASSERT
(
__PREFIX
(
Malloc
)((
void
**
)
&
C
,
sizeof
(
*
C
)
*
(
size_t
)(
m
*
k
)));
if
(
C
==
NULL
)
{
fprintf
(
stderr
,
"C not allocated
\n
"
);
exit
(
1
);
}
cuComplex
*
hA
=
malloc
(
sizeof
(
*
hA
)
*
(
size_t
)(
m
*
n
));
if
(
hA
==
NULL
)
{
fprintf
(
stderr
,
"hA not allocated
\n
"
);
exit
(
1
);
}
cuComplex
*
hB
=
malloc
(
sizeof
(
*
hB
)
*
(
size_t
)(
k
*
n
));
if
(
hB
==
NULL
)
{
fprintf
(
stderr
,
"hB not allocated
\n
"
);
exit
(
1
);
}
cuComplex
*
hC
=
malloc
(
sizeof
(
*
hC
)
*
(
size_t
)(
m
*
k
));
__COMPLEX8__
*
hA
;
__MALLOC
(
hA
,
sizeof
(
*
hA
)
*
(
size_t
)(
m
*
n
));
__COMPLEX8__
*
hB
;
__MALLOC
(
hB
,
sizeof
(
*
hB
)
*
(
size_t
)(
k
*
n
));
__COMPLEX8__
*
hC
;
__MALLOC
(
hC
,
sizeof
(
*
hC
)
*
(
size_t
)(
m
*
k
));
if
(
hC
==
NULL
)
{
fprintf
(
stderr
,
"hC not allocated
\n
"
);
...
...
@@ -248,14 +234,16 @@ run_test
// timer_stop(timer);
//timer_start(timer, "Memcopy");
__ASSERT
(
cuda
Memcpy
(
A
,
hA
,
sizeof
(
*
A
)
*
(
size_t
)(
m
*
n
),
cuda
MemcpyHostToDevice
));
__ASSERT
(
cuda
Memcpy
(
B
,
hB
,
sizeof
(
*
B
)
*
(
size_t
)(
k
*
n
),
cuda
MemcpyHostToDevice
));
__ASSERT
(
__PREFIX
(
Memcpy
)
(
A
,
hA
,
sizeof
(
*
A
)
*
(
size_t
)(
m
*
n
),
__PREFIX
(
MemcpyHostToDevice
))
)
;
__ASSERT
(
__PREFIX
(
Memcpy
)
(
B
,
hB
,
sizeof
(
*
B
)
*
(
size_t
)(
k
*
n
),
__PREFIX
(
MemcpyHostToDevice
))
)
;
// timer_stop(timer);
cudaSetDevice
(
0
);
cublasHandle_t
handle
;
//
cudaSetDevice(0);
__HANDLE__
handle
;
//timer_start(timer, "Create Handle");
cublasCreate
(
&
handle
);
//if(rocblas_create_handle(&handle) != rocblas_status_success) return EXIT_FAILURE;
__CREATE_HANDLE
(
&
handle
);
//timer_stop(timer);
for
(
unsigned
r
=
0
;
r
<
rep
;
r
++
)
...
...
@@ -294,16 +282,17 @@ run_test
+
k
*
m
*
sizeof
(
*
C
)
)
/
1.e+9
);
__ASSERT
(
cuda
Memcpy
(
hC
,
C
,
sizeof
(
*
hC
)
*
(
size_t
)(
k
*
m
),
cuda
MemcpyDeviceToHost
));
__ASSERT
(
__PREFIX
(
Memcpy
)
(
hC
,
C
,
sizeof
(
*
hC
)
*
(
size_t
)(
k
*
m
),
__PREFIX
(
MemcpyDeviceToHost
))
)
;
//print_result(hC);
// timer_start(timer, "Destroy Handle");
if
(
cublasDestroy
(
handle
)
!=
cudaSuccess
)
return
EXIT_FAILURE
;
//if(rocblas_destroy_handle(handle) != rocblas_status_success) return EXIT_FAILURE;
if
(
__DESTROY_HANDLE
(
handle
)
!=
__PREFIX
(
Success
))
return
EXIT_FAILURE
;
// timer_stop(timer);
cuda
Free
(
A
);
cuda
Free
(
B
);
cuda
Free
(
C
);
__PREFIX
(
Free
)
(
A
);
__PREFIX
(
Free
)
(
B
);
__PREFIX
(
Free
)
(
C
);
free
(
hA
);
free
(
hB
);
free
(
hC
);
...
...
@@ -320,12 +309,8 @@ main
int
min_dim
=
1
;
int
max_dim
=
14
;
float
*
res
=
malloc
(
sizeof
(
*
res
)
*
(
size_t
)((
max_dim
-
min_dim
)
*
rep
));
if
(
res
==
NULL
)
{
fprintf
(
stderr
,
"Couldn't allocate res
\n
"
);
exit
(
1
);
}
float
*
res
;
__MALLOC
(
res
,
sizeof
(
*
res
)
*
(
size_t
)((
max_dim
-
min_dim
)
*
rep
));
for
(
int
i
=
min_dim
;
i
<
max_dim
;
i
++
)
{
size_t
dim
=
1
<<
i
;
...
...
hardware_settings.h
0 → 100644
View file @
0626ce84
/*
* =====================================================================================
*
* Filename: hardware_settings.h
*
* Description:
*
* Version: 1.0
* Created: 28.01.2021 16:15:56
* Revision: none
* Compiler: gcc
*
* Author: Henning Fehrmann (), henning.fehrmann@aei.mpg.de
* Organization: AEI Hannover
* Copyright: Copyright (c) 2021, Henning Fehrmann
*
* =====================================================================================
*/
#ifdef ROC
#define __HIP_PLATFORM_HCC__
#include <rocblas.h>
#include <hip/hip_runtime.h>
#include <hip/hip_vector_types.h>
#define __ASSERT(x) (assert((x)==hipSuccess))
#define __HANDLE__ rocblas_handle
#define __COMPLEX8__ rocblas_float_complex
#define __BLAS_OPERATION__ rocblas_operation
#define __NO_TRANSFORM__ rocblas_operation_none
#define __CT_TRANSFORM__ rocblas_operation_conjugate_transpose
#define __CGMEM__ rocblas_cgemm
#define __PREFIX(c) hip##c
#define __CREATE_HANDLE(h) rocblas_create_handle(h)
#define __DESTROY_HANDLE(h) rocblas_destroy_handle(h)
#endif
#ifdef CUDA
#include <assert.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include <cublas_v2.h>
#define __ASSERT(x) (assert((x)==cudaSuccess))
#define __HANDLE__ cublasHandle_t
#define __COMPLEX8__ cuComplex
#define __BLAS_OPERATION__ cublasOperation_t
#define __NO_TRANSFORM__ CUBLAS_OP_N
#define __CT_TRANSFORM__ CUBLAS_OP_C
#define __CGMEM__ cublasCgemm
#define __PREFIX(c) cuda##c
#define __CREATE_HANDLE(h) cublasCreate(h)
#define __DESTROY_HANDLE(h) cublasDestroy(h)
#endif
rocmblas_AMD.c
deleted
100644 → 0
View file @
21c8a6d9
/*
* =====================================================================================
*
* Description: BLAS Benchmark
*
* Version: 1.0
* Created: 27.01.2021 12:45:18
* Revision: none
* Compiler: hipc
*
* Author: Henning Fehrmann (), henning.fehrmann@aei.mpg.de
* Organization: AEI Hannover
* License: GNU General Public License v2
*
* =====================================================================================
*/
#define __HIP_PLATFORM_HCC__
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <rocblas.h>
#include <hip/hip_runtime.h>
#include <hip/hip_vector_types.h>
#include <time.h>
#include <omp.h>
#define __ASSERT(x) (assert((x)==hipSuccess))
size_t
m
=
10000
;
size_t
n
=
10000
;
size_t
k
=
10000
;
static
unsigned
long
x
=
123456789
,
y
=
362436069
,
z
=
521288629
;
unsigned
long
xorshf96
(
void
)
{
// NOT thread save
unsigned
long
t
;
x
^=
x
<<
16
;
x
^=
x
>>
5
;
x
^=
x
<<
1
;
t
=
x
;
x
=
y
;
y
=
z
;
z
=
t
^
x
^
y
;
return
z
;
}
struct
runtime
{
struct
timespec
start
;
struct
timespec
stop
;
char
tag
[
128
];
};
void
timer_start
(
struct
runtime
*
timer
,
char
tag
[
128
]
)
{
struct
timespec
start
;
sprintf
(
timer
->
tag
,
"%s"
,
tag
);
clock_gettime
(
CLOCK_REALTIME
,
&
start
);
timer
->
start
=
start
;
// printf("--------> start timer: %s\n", timer->tag);
}
double
timer_stop
(
struct
runtime
*
timer
)
{
struct
timespec
stop
;
clock_gettime
(
CLOCK_REALTIME
,
&
stop
);
timer
->
stop
=
stop
;
double
res
=
(
double
)
(
(
timer
->
stop
).
tv_sec
-
(
timer
->
start
).
tv_sec
)
*
1000
.
+
(
double
)
(
(
timer
->
stop
).
tv_nsec
-
(
timer
->
start
).
tv_nsec
)
/
1000000
.
;
// printf("--------> stop timer %s: %g ms\n", timer->tag, res );
return
res
;
}
void
multiplication
(
rocblas_handle
handle
,
const
rocblas_float_complex
*
A
,
const
rocblas_float_complex
*
B
,
rocblas_float_complex
*
C
)
{
rocblas_operation
transA
=
rocblas_operation_none
;
rocblas_operation
transB
=
rocblas_operation_conjugate_transpose
;
const
rocblas_float_complex
alpha
=
{.
x
=
1
.
f
,
.
y
=
0
.
f
};
const
rocblas_float_complex
beta
=
{.
x
=
0
.
f
,
.
y
=
0
.
f
};
rocblas_int
lda
=
n
;
rocblas_int
ldb
=
n
;
rocblas_int
ldc
=
k
;
rocblas_cgemm
(
handle
,
transA
,
transB
,
m
,
n
,
k
,
&
alpha
,
A
,
lda
,
B
,
ldb
,
&
beta
,
C
,
ldc
);
}
void
prepare_matrices
(
rocblas_float_complex
*
hA
,
rocblas_float_complex
*
hB
)
{
float
fact
=
1
.
f
/
(
float
)
n
/
(
float
)
x
/
(
float
)
y
/
20
.
f
;
#pragma omp parallel for
for
(
size_t
i
=
0
;
i
<
n
;
i
++
)
{
for
(
size_t
j
=
0
;
j
<
m
;
j
++
)
{
size_t
ind
=
j
+
m
*
i
;
hA
[
ind
].
x
=
(
float
)
xorshf96
()
*
fact
;
hA
[
ind
].
y
=
(
float
)
xorshf96
()
*
fact
;
}
}
#pragma omp parallel for
for
(
size_t
i
=
0
;
i
<
n
;
i
++
)
{
for
(
size_t
j
=
0
;
j
<
k
;
j
++
)
{
size_t
ind
=
j
+
k
*
i
;
hB
[
ind
].
x
=
(
float
)
xorshf96
()
*
fact
;
hB
[
ind
].
y
=
(
float
)
xorshf96
()
*
fact
;
}
}
}
void
print_result
(
rocblas_float_complex
*
hC
)
{
printf
(
"-------- %zu %zu
\n
"
,
m
,
k
);
for
(
size_t
i
=
0
;
i
<
m
;
i
++
)
{
for
(
size_t
j
=
0
;
j
<
k
;
j
++
)
{
size_t
ind
=
j
+
k
*
i
;
printf
(
"%1.2f %1.2f
\t
"
,
hC
[
ind
].
x
,
hC
[
ind
].
y
);
}
printf
(
"
\n
"
);
}
printf
(
"--------
\n
"
);
}
int
run_test
(
size_t
dim
,
unsigned
rep
,
float
*
res
)
{
m
=
dim
;
n
=
dim
;
k
=
dim
;
struct
runtime
*
timer
=
malloc
(
sizeof
(
*
timer
));
rocblas_float_complex
*
A
;
rocblas_float_complex
*
B
;
rocblas_float_complex
*
C
;
__ASSERT
(
hipMalloc
((
void
**
)
&
A
,
sizeof
(
*
A
)
*
(
size_t
)(
m
*
n
)));
if
(
A
==
NULL
)
{
fprintf
(
stderr
,
"A not allocated
\n
"
);
exit
(
1
);
}
__ASSERT
(
hipMalloc
((
void
**
)
&
B
,
sizeof
(
*
B
)
*
(
size_t
)(
n
*
k
)));
if
(
B
==
NULL
)
{
fprintf
(
stderr
,
"B not allocated
\n
"
);
exit
(
1
);
}
__ASSERT
(
hipMalloc
((
void
**
)
&
C
,
sizeof
(
*
C
)
*
(
size_t
)(
m
*
k
)));
if
(
C
==
NULL
)
{
fprintf
(
stderr
,
"C not allocated
\n
"
);
exit
(
1
);
}
rocblas_float_complex
*
hA
=
malloc
(
sizeof
(
*
hA
)
*
(
size_t
)(
m
*
n
));
if
(
hA
==
NULL
)
{
fprintf
(
stderr
,
"hA not allocated
\n
"
);
exit
(
1
);
}
rocblas_float_complex
*
hB
=
malloc
(
sizeof
(
*
hB
)
*
(
size_t
)(
k
*
n
));
if
(
hB
==
NULL
)
{
fprintf
(
stderr
,
"hB not allocated
\n
"
);
exit
(
1
);
}
rocblas_float_complex
*
hC
=
malloc
(
sizeof
(
*
hC
)
*
(
size_t
)(
m
*
k
));
if
(
hC
==
NULL
)
{
fprintf
(
stderr
,
"hC not allocated
\n
"
);
exit
(
1
);
}
// timer_start(timer, "Prepare matrices");
prepare_matrices
(
hA
,
hB
);
// timer_stop(timer);
//timer_start(timer, "Memcopy");
__ASSERT
(
hipMemcpy
(
A
,
hA
,
sizeof
(
*
A
)
*
(
size_t
)(
m
*
n
),
hipMemcpyHostToDevice
));
__ASSERT
(
hipMemcpy
(
B
,
hB
,
sizeof
(
*
B
)
*
(
size_t
)(
k
*
n
),
hipMemcpyHostToDevice
));
// timer_stop(timer);
rocblas_handle
handle
;
//timer_start(timer, "Create Handle");
if
(
rocblas_create_handle
(
&
handle
)
!=
rocblas_status_success
)
return
EXIT_FAILURE
;
//timer_stop(timer);
for
(
unsigned
r
=
0
;
r
<
rep
;
r
++
)
{
float
res_r
=
0
.
f
;
char
mes
[
128
];
sprintf
(
mes
,
"dim %zu run %d a"
,
dim
,
r
);
timer_start
(
timer
,
mes
);
multiplication
(
handle
,
A
,
B
,
C
);
res_r
+=
timer_stop
(
timer
);
sprintf
(
mes
,
"dim %zu run %d b"
,
dim
,
r
);
/*
timer_start(timer, mes);
multiplication
(
handle,
B,
A,
C
);
res_r += timer_stop(timer);
*/
res
[
r
]
=
res_r
/
1
.
f
;
}
printf
(
"dimensions: %zu %zu %zu
\t
-- "
,
n
,
m
,
k
);
printf
(
"required size: %f GB
\n
"
,
(
m
*
n
*
sizeof
(
*
A
)
+
k
*
n
*
sizeof
(
*
B
)
+
k
*
m
*
sizeof
(
*
C
)
)
/
1.e+9
);
__ASSERT
(
hipMemcpy
(
hC
,
C
,
sizeof
(
*
hC
)
*
(
size_t
)(
k
*
m
),
hipMemcpyDeviceToHost
));
//print_result(hC);
// timer_start(timer, "Destroy Handle");
if
(
rocblas_destroy_handle
(
handle
)
!=
rocblas_status_success
)
return
EXIT_FAILURE
;
// timer_stop(timer);
hipFree
(
A
);
hipFree
(
B
);
hipFree
(
C
);
free
(
hA
);
free
(
hB
);
free
(
hC
);
free
(
timer
);
return
0
;
}
int
main
(
)
{
int
rep
=
512
;
int
min_dim
=
1
;
int
max_dim
=
14
;
float
*
res
=
malloc
(
sizeof
(
*
res
)
*
(
size_t
)((
max_dim
-
min_dim
)
*
rep
));
if
(
res
==
NULL
)
{
fprintf
(
stderr
,
"Couldn't allocate res
\n
"
);
<