Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Henning Fehrmann
NVidia_AMD_Bench
Commits
98a1cddc
Commit
98a1cddc
authored
Jan 27, 2021
by
Henning Fehrmann
Committed by
Henning Fehrmann
Jan 27, 2021
Browse files
Initial commit, not platform agnostic yet
parents
Changes
4
Hide whitespace changes
Inline
Side-by-side
Makefile_AMD
0 → 100644
View file @
98a1cddc
# Build the rocBLAS benchmark with the ROCm HIP compiler driver.
# The former "CC = /usr/bin/gcc" assignment was dead: it was overridden
# by the hipcc assignment on the very next line.
CC = /opt/rocm/hip/bin/hipcc
OBJ = rocmblas_AMD.o
LDFLAGS = -L/opt/rocm/lib -lhipblas -lrocblas -fopenmp
CFLAGS = -g -Wall -O3 -fopenmp -I/opt/rocm/include -I/opt/rocm/hip/include

.PHONY: all clean

all: rocmblas

# Libraries are listed after the objects so that linkers which resolve
# symbols left to right still find the BLAS entry points.
rocmblas: ${OBJ}
	${CC} -o $@ ${OBJ} ${LDFLAGS}

# HEADER is not set in this Makefile; it expands to nothing unless it is
# supplied on the command line or by the environment.
%.o: %.c ${HEADER}
	${CC} ${CFLAGS} -c $<

# -f keeps "make clean" from failing when there is nothing to remove.
clean:
	rm -f *.o
Makefile_NVidia
0 → 100644
View file @
98a1cddc
# Build the cuBLAS benchmark with nvcc.
CC = nvcc
OBJ = cuda_NVidia.o
LDFLAGS = -lcublas -lm -lgomp
INCLUDE= -I/usr/lib/x86_64-linux-gnu/openmpi/include/
# The former "CFLAGS = --compile -O3 -Xcompiler -fopenmp" assignment was
# dead: it was overridden by the assignment below.
CFLAGS = ${INCLUDE} --compile -O3 -pg -Xcompiler -fopenmp
CUDAFLAGS = --Werror cross-execution-space-call --Wno-deprecated-gpu-targets

.PHONY: all clean

all: cudablas

cudablas: ${OBJ}
	${CC} -o $@ ${OBJ} ${LDFLAGS} ${CUDAFLAGS}

# HEADER is not set in this Makefile; it expands to nothing unless it is
# supplied on the command line or by the environment.
%.o: %.c ${HEADER}
	${CC} ${CFLAGS} -c $<

# -f keeps "make clean" from failing when there is nothing to remove.
clean:
	rm -f *.o
cuda_NVidia.c
0 → 100644
View file @
98a1cddc
/*
* =====================================================================================
*
* Description: BLAS Benchmark
*
* Version: 1.0
* Created: 27.01.2021 12:45:18
* Revision: none
* Compiler: nvcc
*
* Author: Henning Fehrmann (), henning.fehrmann@aei.mpg.de
* Organization: AEI Hannover
* License: GNU General Public License v2
*
* =====================================================================================
*/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include <cublas_v2.h>
#include <time.h>
#include <omp.h>
#define __ASSERT(x) (assert((x)==cudaSuccess))
/* Problem dimensions shared by the whole benchmark; run_test() overwrites
 * all three with the size currently being measured. */
size_t m = 10000, n = 10000, k = 10000;
/* Generator state; every call to xorshf96() advances it. */
static unsigned long x = 123456789;
static unsigned long y = 362436069;
static unsigned long z = 521288629;

/*
 * Advance the three-word xorshift generator by one step and return the
 * new value of z.  The state lives in the file-scope variables x, y and z
 * and is updated without any locking.
 */
unsigned long xorshf96(void)
{
    unsigned long scrambled;

    /* three shift/xor rounds on the first state word */
    x = x ^ (x << 16);
    x = x ^ (x >> 5);
    x = x ^ (x << 1);
    scrambled = x;

    /* rotate the state words and fold the scrambled word back in */
    x = y;
    y = z;
    z = scrambled ^ x ^ y;

    return z;
}
struct
runtime
{
struct
timespec
start
;
struct
timespec
stop
;
char
tag
[
128
];
};
void
timer_start
(
struct
runtime
*
timer
,
char
tag
[
128
]
)
{
struct
timespec
start
;
sprintf
(
timer
->
tag
,
"%s"
,
tag
);
clock_gettime
(
CLOCK_REALTIME
,
&
start
);
timer
->
start
=
start
;
// printf("--------> start timer: %s\n", timer->tag);
}
double
timer_stop
(
struct
runtime
*
timer
)
{
struct
timespec
stop
;
clock_gettime
(
CLOCK_REALTIME
,
&
stop
);
timer
->
stop
=
stop
;
double
res
=
(
double
)
(
(
timer
->
stop
).
tv_sec
-
(
timer
->
start
).
tv_sec
)
*
1000
.
+
(
double
)
(
(
timer
->
stop
).
tv_nsec
-
(
timer
->
start
).
tv_nsec
)
/
1000000
.
;
// printf("--------> stop timer %s: %g ms\n", timer->tag, res );
return
res
;
}
/*
 * Compute C = A * B^H in single-precision complex arithmetic with
 * cublasCgemm, using the file-scope problem sizes m, n and k.
 *
 * handle:  initialised cuBLAS handle.
 * A, B, C: device buffers in column-major layout (the layout cuBLAS uses).
 *
 * With transA = N and transB = C the operands are:
 *   A is m x k  -> leading dimension m
 *   B is n x k  -> leading dimension n   (op(B) = B^H is k x n)
 *   C is m x n  -> leading dimension m
 * The previous code used n for lda and k for ldc, which is only valid
 * when all three sizes are equal.
 *
 * A failing GEMM call terminates the program: silently continuing would
 * record timings for work that was never done.
 */
void multiplication(cublasHandle_t handle, const cuComplex *A, const cuComplex *B, cuComplex *C)
{
    cublasOperation_t transA = CUBLAS_OP_N;
    cublasOperation_t transB = CUBLAS_OP_C;
    const cuComplex alpha = {.x = 1.f, .y = 0.f};
    const cuComplex beta = {.x = 0.f, .y = 0.f};

    int lda = (int)m;
    int ldb = (int)n;
    int ldc = (int)m;

    cublasStatus_t status = cublasCgemm(handle, transA, transB,
                                        (int)m, (int)n, (int)k,
                                        &alpha, A, lda, B, ldb,
                                        &beta, C, ldc);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "cublasCgemm failed with status %d\n", (int)status);
        exit(1);
    }
}
/*
 * Fill the host matrices with pseudo-random complex values.
 *
 * hA: host buffer with room for at least n * m elements.
 * hB: host buffer with room for at least n * k elements.
 *
 * Every component is xorshf96() converted to float and multiplied by a
 * scale factor derived from n and from the generator state words x and y
 * as they are when this function is entered.
 *
 * The loops are deliberately sequential: xorshf96() updates file-scope
 * state without synchronisation, so calling it from an OpenMP parallel
 * loop (as the previous version did) is a data race.
 */
void prepare_matrices(cuComplex *hA, cuComplex *hB)
{
    float fact = 1.f / (float)n / (float)x / (float)y / 20.f;

    for (size_t i = 0; i < n; i++)
    {
        for (size_t j = 0; j < m; j++)
        {
            size_t ind = j + m * i;
            hA[ind].x = (float)xorshf96() * fact;
            hA[ind].y = (float)xorshf96() * fact;
        }
    }

    for (size_t i = 0; i < n; i++)
    {
        for (size_t j = 0; j < k; j++)
        {
            size_t ind = j + k * i;
            hB[ind].x = (float)xorshf96() * fact;
            hB[ind].y = (float)xorshf96() * fact;
        }
    }
}
/*
 * Dump the first m * k elements of hC to stdout, k elements per output
 * line, each element printed as "real imag" followed by a tab.  The dump
 * is framed by dashed lines; the opening one also shows m and k.
 */
void print_result(cuComplex *hC)
{
    const cuComplex *cell = hC; /* walks the buffer in storage order */
    size_t row;
    size_t col;

    printf("-------- %zu %zu\n", m, k);
    for (row = 0; row != m; ++row)
    {
        for (col = 0; col != k; ++col, ++cell)
            printf("%1.2f %1.2f\t", cell->x, cell->y);
        printf("\n");
    }
    printf("--------\n");
}
int
run_test
(
size_t
dim
,
unsigned
rep
,
float
*
res
)
{
m
=
dim
;
n
=
dim
;
k
=
dim
;
struct
runtime
*
timer
=
malloc
(
sizeof
(
*
timer
));
cuComplex
*
A
;
cuComplex
*
B
;
cuComplex
*
C
;
__ASSERT
(
cudaMalloc
((
void
**
)
&
A
,
sizeof
(
*
A
)
*
(
size_t
)(
m
*
n
)));
if
(
A
==
NULL
)
{
fprintf
(
stderr
,
"A not allocated
\n
"
);
exit
(
1
);
}
__ASSERT
(
cudaMalloc
((
void
**
)
&
B
,
sizeof
(
*
B
)
*
(
size_t
)(
n
*
k
)));
if
(
B
==
NULL
)
{
fprintf
(
stderr
,
"B not allocated
\n
"
);
exit
(
1
);
}
__ASSERT
(
cudaMalloc
((
void
**
)
&
C
,
sizeof
(
*
C
)
*
(
size_t
)(
m
*
k
)));
if
(
C
==
NULL
)
{
fprintf
(
stderr
,
"C not allocated
\n
"
);
exit
(
1
);
}
cuComplex
*
hA
=
malloc
(
sizeof
(
*
hA
)
*
(
size_t
)(
m
*
n
));
if
(
hA
==
NULL
)
{
fprintf
(
stderr
,
"hA not allocated
\n
"
);
exit
(
1
);
}
cuComplex
*
hB
=
malloc
(
sizeof
(
*
hB
)
*
(
size_t
)(
k
*
n
));
if
(
hB
==
NULL
)
{
fprintf
(
stderr
,
"hB not allocated
\n
"
);
exit
(
1
);
}
cuComplex
*
hC
=
malloc
(
sizeof
(
*
hC
)
*
(
size_t
)(
m
*
k
));
if
(
hC
==
NULL
)
{
fprintf
(
stderr
,
"hC not allocated
\n
"
);
exit
(
1
);
}
// timer_start(timer, "Prepare matrices");
prepare_matrices
(
hA
,
hB
);
// timer_stop(timer);
//timer_start(timer, "Memcopy");
__ASSERT
(
cudaMemcpy
(
A
,
hA
,
sizeof
(
*
A
)
*
(
size_t
)(
m
*
n
),
cudaMemcpyHostToDevice
));
__ASSERT
(
cudaMemcpy
(
B
,
hB
,
sizeof
(
*
B
)
*
(
size_t
)(
k
*
n
),
cudaMemcpyHostToDevice
));
// timer_stop(timer);
cudaSetDevice
(
0
);
cublasHandle_t
handle
;
//timer_start(timer, "Create Handle");
cublasCreate
(
&
handle
);
//timer_stop(timer);
for
(
unsigned
r
=
0
;
r
<
rep
;
r
++
)
{
float
res_r
=
0
.
f
;
char
mes
[
128
];
sprintf
(
mes
,
"dim %zu run %d a"
,
dim
,
r
);
timer_start
(
timer
,
mes
);
multiplication
(
handle
,
A
,
B
,
C
);
res_r
+=
timer_stop
(
timer
);
sprintf
(
mes
,
"dim %zu run %d b"
,
dim
,
r
);
/*
timer_start(timer, mes);
multiplication
(
handle,
B,
A,
C
);
res_r += timer_stop(timer);
*/
res
[
r
]
=
res_r
/
1
.
f
;
}
printf
(
"dimensions: %zu %zu %zu
\t
-- "
,
n
,
m
,
k
);
printf
(
"required size: %f GB
\n
"
,
(
m
*
n
*
sizeof
(
*
A
)
+
k
*
n
*
sizeof
(
*
B
)
+
k
*
m
*
sizeof
(
*
C
)
)
/
1.e+9
);
__ASSERT
(
cudaMemcpy
(
hC
,
C
,
sizeof
(
*
hC
)
*
(
size_t
)(
k
*
m
),
cudaMemcpyDeviceToHost
));
//print_result(hC);
// timer_start(timer, "Destroy Handle");
if
(
cublasDestroy
(
handle
)
!=
cudaSuccess
)
return
EXIT_FAILURE
;
// timer_stop(timer);
cudaFree
(
A
);
cudaFree
(
B
);
cudaFree
(
C
);
free
(
hA
);
free
(
hB
);
free
(
hC
);
free
(
timer
);
return
0
;
}
/*
 * Run the benchmark for the sizes 2^min_dim .. 2^(max_dim - 1) with rep
 * repetitions each and write the timings to the file "runtimes": a header
 * line with the sizes, then one line per repetition with one tab-separated
 * column per size.
 *
 * Returns 0 on success and EXIT_FAILURE when a run reported a failure or
 * the result file could not be written.
 */
int main()
{
    int rep = 512;
    int min_dim = 1;
    int max_dim = 14;
    int status = 0;

    float *res = malloc(sizeof(*res) * (size_t)((max_dim - min_dim) * rep));
    if (res == NULL)
    {
        fprintf(stderr, "Couldn't allocate res\n");
        exit(1);
    }

    for (int i = min_dim; i < max_dim; i++)
    {
        size_t dim = (size_t)1 << i;
        int ind = (i - min_dim) * rep; /* first slot of this size's block */
        if (run_test(dim, (unsigned)rep, &res[ind]) != 0)
        {
            /* keep going so the remaining sizes are still measured, but
             * report the problem through the exit code */
            fprintf(stderr, "run_test reported a failure for dim %zu\n", dim);
            status = EXIT_FAILURE;
        }
    }

    // store the results
    const char *name = "runtimes";
    FILE *f = fopen(name, "w");
    if (f == NULL)
    {
        /* nothing can be written; the previous code fell through here and
         * passed the NULL stream to fprintf */
        fprintf(stderr, "Couldn't open %s\n", name);
        free(res);
        return EXIT_FAILURE;
    }

    for (int i = min_dim; i < max_dim; i++)
    {
        size_t dim = (size_t)1 << i;
        fprintf(f, "%zu\t", dim);
    }
    fprintf(f, "\n");

    for (int r = 0; r < rep; r++)
    {
        for (int i = min_dim; i < max_dim; i++)
        {
            size_t pos = (size_t)((i - min_dim) * rep + r);
            fprintf(f, "%1.6f\t", res[pos]);
        }
        fprintf(f, "\n");
    }

    if (fclose(f) != 0)
    {
        fprintf(stderr, "Couldn't close %s\n", name);
        status = EXIT_FAILURE;
    }
    free(res);
    return status;
}
rocmblas_AMD.c
0 → 100644
View file @
98a1cddc
/*
* =====================================================================================
*
* Description: BLAS Benchmark
*
* Version: 1.0
* Created: 27.01.2021 12:45:18
* Revision: none
* Compiler: hipcc
*
* Author: Henning Fehrmann (), henning.fehrmann@aei.mpg.de
* Organization: AEI Hannover
* License: GNU General Public License v2
*
* =====================================================================================
*/
#define __HIP_PLATFORM_HCC__
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <rocblas.h>
#include <hip/hip_runtime.h>
#include <hip/hip_vector_types.h>
#include <time.h>
#include <omp.h>
#define __ASSERT(x) (assert((x)==hipSuccess))
/* Problem dimensions used by the benchmark routines in this file. */
size_t m = 10000, n = 10000, k = 10000;
/* Generator state; every call to xorshf96() advances it. */
static unsigned long x = 123456789;
static unsigned long y = 362436069;
static unsigned long z = 521288629;

/*
 * Produce the next value of the three-word xorshift generator.  The state
 * is kept in the file-scope variables x, y and z and is modified without
 * any locking; the returned value is the new z.
 */
unsigned long xorshf96(void)
{
    unsigned long mixed;

    /* scramble the first state word with three shift/xor rounds */
    x = x ^ (x << 16);
    x = x ^ (x >> 5);
    x = x ^ (x << 1);
    mixed = x;

    /* shift the state words along and combine them into the new z */
    x = y;
    y = z;
    z = mixed ^ x ^ y;

    return z;
}
struct
runtime
{
struct
timespec
start
;
struct
timespec
stop
;
char
tag
[
128
];
};
void
timer_start
(
struct
runtime
*
timer
,
char
tag
[
128
]
)
{
struct
timespec
start
;
sprintf
(
timer
->
tag
,
"%s"
,
tag
);
clock_gettime
(
CLOCK_REALTIME
,
&
start
);
timer
->
start
=
start
;
// printf("--------> start timer: %s\n", timer->tag);
}
double
timer_stop
(
struct
runtime
*
timer
)
{
struct
timespec
stop
;
clock_gettime
(
CLOCK_REALTIME
,
&
stop
);
timer
->
stop
=
stop
;
double
res
=
(
double
)
(
(
timer
->
stop
).
tv_sec
-
(
timer
->
start
).
tv_sec
)
*
1000
.
+
(
double
)
(
(
timer
->
stop
).
tv_nsec
-
(
timer
->
start
).
tv_nsec
)
/
1000000
.
;
// printf("--------> stop timer %s: %g ms\n", timer->tag, res );
return
res
;
}
/*
 * Compute C = A * B^H in single-precision complex arithmetic with
 * rocblas_cgemm, using the file-scope problem sizes m, n and k.
 *
 * handle:  initialised rocBLAS handle.
 * A, B, C: device buffers in column-major layout.
 *
 * With transA = none and transB = conjugate transpose the operands are:
 *   A is m x k  -> leading dimension m
 *   B is n x k  -> leading dimension n   (op(B) = B^H is k x n)
 *   C is m x n  -> leading dimension m
 * The previous code used n for lda and k for ldc, which is only valid
 * when all three sizes are equal.
 *
 * A failing GEMM call terminates the program: silently continuing would
 * record timings for work that was never done.
 */
void multiplication(rocblas_handle handle, const rocblas_float_complex *A, const rocblas_float_complex *B, rocblas_float_complex *C)
{
    rocblas_operation transA = rocblas_operation_none;
    rocblas_operation transB = rocblas_operation_conjugate_transpose;
    const rocblas_float_complex alpha = {.x = 1.f, .y = 0.f};
    const rocblas_float_complex beta = {.x = 0.f, .y = 0.f};

    rocblas_int lda = (rocblas_int)m;
    rocblas_int ldb = (rocblas_int)n;
    rocblas_int ldc = (rocblas_int)m;

    rocblas_status status = rocblas_cgemm(handle, transA, transB,
                                          (rocblas_int)m, (rocblas_int)n, (rocblas_int)k,
                                          &alpha, A, lda, B, ldb,
                                          &beta, C, ldc);
    if (status != rocblas_status_success)
    {
        fprintf(stderr, "rocblas_cgemm failed with status %d\n", (int)status);
        exit(1);
    }
}
/*
 * Fill the host matrices with pseudo-random complex values.
 *
 * hA: host buffer with room for at least n * m elements.
 * hB: host buffer with room for at least n * k elements.
 *
 * Every component is xorshf96() converted to float and multiplied by a
 * scale factor derived from n and from the generator state words x and y
 * as they are when this function is entered.
 *
 * The loops are deliberately sequential: xorshf96() updates file-scope
 * state without synchronisation, so calling it from an OpenMP parallel
 * loop (as the previous version did) is a data race.
 */
void prepare_matrices(rocblas_float_complex *hA, rocblas_float_complex *hB)
{
    float fact = 1.f / (float)n / (float)x / (float)y / 20.f;

    for (size_t i = 0; i < n; i++)
    {
        for (size_t j = 0; j < m; j++)
        {
            size_t ind = j + m * i;
            hA[ind].x = (float)xorshf96() * fact;
            hA[ind].y = (float)xorshf96() * fact;
        }
    }

    for (size_t i = 0; i < n; i++)
    {
        for (size_t j = 0; j < k; j++)
        {
            size_t ind = j + k * i;
            hB[ind].x = (float)xorshf96() * fact;
            hB[ind].y = (float)xorshf96() * fact;
        }
    }
}
/*
 * Dump the first m * k elements of hC to stdout, k elements per output
 * line, each element printed as "real imag" followed by a tab.  The dump
 * is framed by dashed lines; the opening one also shows m and k.
 */
void print_result(rocblas_float_complex *hC)
{
    const rocblas_float_complex *cell = hC; /* walks the buffer in storage order */
    size_t row;
    size_t col;

    printf("-------- %zu %zu\n", m, k);
    for (row = 0; row != m; ++row)
    {
        for (col = 0; col != k; ++col, ++cell)
            printf("%1.2f %1.2f\t", cell->x, cell->y);
        printf("\n");
    }
    printf("--------\n");
}
int
run_test
(
size_t
dim
,
unsigned
rep
,
float
*
res
)
{
m
=
dim
;
n
=
dim
;
k
=
dim
;
struct
runtime
*
timer
=
malloc
(
sizeof
(
*
timer
));
rocblas_float_complex
*
A
;
rocblas_float_complex
*
B
;
rocblas_float_complex
*
C
;
__ASSERT
(
hipMalloc
((
void
**
)
&
A
,
sizeof
(
*
A
)
*
(
size_t
)(
m
*
n
)));
if
(
A
==
NULL
)
{
fprintf
(
stderr
,
"A not allocated
\n
"
);
exit
(
1
);
}
__ASSERT
(
hipMalloc
((
void
**
)
&
B
,
sizeof
(
*
B
)
*
(
size_t
)(
n
*
k
)));
if
(
B
==
NULL
)
{
fprintf
(
stderr
,
"B not allocated
\n
"
);
exit
(
1
);
}
__ASSERT
(
hipMalloc
((
void
**
)
&
C
,
sizeof
(
*
C
)
*
(
size_t
)(
m
*
k
)));
if
(
C
==
NULL
)
{
fprintf
(
stderr
,
"C not allocated
\n
"
);
exit
(
1
);
}
rocblas_float_complex
*
hA
=
malloc
(
sizeof
(
*
hA
)
*
(
size_t
)(
m
*
n
));
if
(
hA
==
NULL
)
{
fprintf
(
stderr
,
"hA not allocated
\n
"
);
exit
(
1
);
}
rocblas_float_complex
*
hB
=
malloc
(
sizeof
(
*
hB
)
*
(
size_t
)(
k
*
n
));
if
(
hB
==
NULL
)
{
fprintf
(
stderr
,
"hB not allocated
\n
"
);
<