Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Henning Fehrmann
NVidia_AMD_Bench
Commits
1cc7cf31
Commit
1cc7cf31
authored
Feb 05, 2021
by
Henning Fehrmann
Committed by
Henning Fehrmann
Feb 05, 2021
Browse files
add synchronization
parent
9a07a73b
Changes
2
Hide whitespace changes
Inline
Side-by-side
blas.c
View file @
1cc7cf31
...
...
@@ -22,24 +22,23 @@
#include <math.h>
#include <omp.h>
#define __MALLOC(P, size) P =
malloc(size); \
#define __MALLOC(P, size) P = malloc(size); \
if (P == NULL) \
{\
fprintf(stderr, "Allocation failed at line %d in %s\n", __LINE__, __FILE__); \
exit(EXIT_FAILURE); \
}\
size_t
m
=
10000
;
size_t
n
=
10000
;
size_t
k
=
10000
;
void
multiplication
(
__HANDLE__
handle
,
const
__COMPLEX8__
*
A
,
const
__COMPLEX8__
*
B
,
__COMPLEX8__
*
C
__COMPLEX8__
*
C
,
size_t
m
,
size_t
n
,
size_t
k
)
{
__BLAS_OPERATION__
transA
=
__NO_TRANSFORM__
;
...
...
@@ -47,10 +46,6 @@ multiplication
const
__COMPLEX8__
alpha
=
{.
x
=
1
.
f
,
.
y
=
0
.
f
};
const
__COMPLEX8__
beta
=
{.
x
=
0
.
f
,
.
y
=
0
.
f
};
int
lda
=
n
;
int
ldb
=
n
;
int
ldc
=
k
;
__CGMEM__
(
handle
,
...
...
@@ -61,29 +56,35 @@ multiplication
k
,
&
alpha
,
A
,
lda
,
m
,
B
,
ldb
,
n
,
&
beta
,
C
,
ldc
m
);
int
result
;
// cublasIcamax(handle,m * n, C, 1, &result);
cudaDeviceSynchronize
();
}
void
prepare_matrices
(
__COMPLEX8__
*
hA
,
__COMPLEX8__
*
hB
__COMPLEX8__
*
hB
,
size_t
m
,
size_t
n
,
size_t
k
)
{
float
fact
=
1
.
f
/
(
float
)
n
/
(
float
)
x
/
(
float
)
y
/
20
.
f
;
#pragma omp parallel for
for
(
size_t
i
=
0
;
i
<
n
;
i
++
)
for
(
size_t
i
=
0
;
i
<
m
;
i
++
)
{
for
(
size_t
j
=
0
;
j
<
m
;
j
++
)
for
(
size_t
j
=
0
;
j
<
k
;
j
++
)
{
size_t
ind
=
j
+
m
*
i
;
size_t
ind
=
j
+
k
*
i
;
hA
[
ind
].
x
=
(
float
)
xorshf96
()
*
fact
;
hA
[
ind
].
y
=
(
float
)
xorshf96
()
*
fact
;
}
...
...
@@ -104,7 +105,10 @@ prepare_matrices
void
print_result
(
__COMPLEX8__
*
hC
__COMPLEX8__
*
hC
,
size_t
m
,
size_t
n
,
size_t
k
)
{
printf
(
"-------- %zu %zu
\n
"
,
m
,
k
);
...
...
@@ -124,23 +128,22 @@ print_result
int
run_test
(
size_t
dim
,
size_t
m
,
size_t
n
,
size_t
k
,
unsigned
rep
,
float
*
res
float
*
res
,
__HANDLE__
handle
)
{
m
=
dim
;
n
=
dim
;
k
=
dim
;
struct
runtime
*
timer
;
__MALLOC
(
timer
,
sizeof
(
*
timer
));
__COMPLEX8__
*
A
;
__COMPLEX8__
*
B
;
__COMPLEX8__
*
C
;
__ASSERT
(
__PREFIX
(
Malloc
)((
void
**
)
&
A
,
sizeof
(
*
A
)
*
(
size_t
)(
m
*
n
)));
__ASSERT
(
__PREFIX
(
Malloc
)((
void
**
)
&
A
,
sizeof
(
*
A
)
*
(
size_t
)(
m
*
k
)));
__ASSERT
(
__PREFIX
(
Malloc
)((
void
**
)
&
B
,
sizeof
(
*
B
)
*
(
size_t
)(
n
*
k
)));
__ASSERT
(
__PREFIX
(
Malloc
)((
void
**
)
&
C
,
sizeof
(
*
C
)
*
(
size_t
)(
m
*
k
)));
__ASSERT
(
__PREFIX
(
Malloc
)((
void
**
)
&
C
,
sizeof
(
*
C
)
*
(
size_t
)(
m
*
n
)));
if
(
C
==
NULL
)
{
fprintf
(
stderr
,
"C not allocated
\n
"
);
...
...
@@ -148,57 +151,49 @@ run_test
}
__COMPLEX8__
*
hA
;
__MALLOC
(
hA
,
sizeof
(
*
hA
)
*
(
size_t
)(
m
*
n
));
__MALLOC
(
hA
,
sizeof
(
*
hA
)
*
(
size_t
)(
m
*
k
));
__COMPLEX8__
*
hB
;
__MALLOC
(
hB
,
sizeof
(
*
hB
)
*
(
size_t
)(
k
*
n
));
__COMPLEX8__
*
hC
;
__MALLOC
(
hC
,
sizeof
(
*
hC
)
*
(
size_t
)(
m
*
k
));
__MALLOC
(
hC
,
sizeof
(
*
hC
)
*
(
size_t
)(
n
*
m
));
// timer_start(timer, "Prepare matrices");
prepare_matrices
(
hA
,
hB
);
// timer_stop(timer);
//timer_start(timer, "Memcopy");
__ASSERT
(
__PREFIX
(
Memcpy
)(
A
,
hA
,
sizeof
(
*
A
)
*
(
size_t
)(
m
*
n
),
__PREFIX
(
MemcpyHostToDevice
)));
__ASSERT
(
__PREFIX
(
Memcpy
)(
B
,
hB
,
sizeof
(
*
B
)
*
(
size_t
)(
k
*
n
),
__PREFIX
(
MemcpyHostToDevice
)));
// timer_stop(timer);
// cudaSetDevice(0);
__HANDLE__
handle
;
//timer_start(timer, "Create Handle");
//if(rocblas_create_handle(&handle) != rocblas_status_success) return EXIT_FAILURE;
__CREATE_HANDLE
(
&
handle
);
//timer_stop(timer);
prepare_matrices
(
hA
,
hB
,
m
,
n
,
k
);
for
(
unsigned
r
=
0
;
r
<
rep
;
r
++
)
{
__ASSERT
(
__PREFIX
(
Memcpy
)(
A
,
hA
,
sizeof
(
*
A
)
*
(
size_t
)(
m
*
k
),
__PREFIX
(
MemcpyHostToDevice
)));
__ASSERT
(
__PREFIX
(
Memcpy
)(
B
,
hB
,
sizeof
(
*
B
)
*
(
size_t
)(
k
*
n
),
__PREFIX
(
MemcpyHostToDevice
)));
float
res_r
=
0
.
f
;
char
mes
[
128
];
sprintf
(
mes
,
"
di
m %zu
run %d a"
,
dim
,
r
);
sprintf
(
mes
,
"m %zu
n %zu k %zu run %d"
,
m
,
n
,
k
,
r
);
timer_start
(
timer
,
mes
);
multiplication
(
handle
,
A
,
B
,
C
);
res_r
+=
timer_stop
(
timer
);
sprintf
(
mes
,
"dim %zu run %d b"
,
dim
,
r
);
/*
timer_start(timer, mes);
multiplication
(
handle,
B,
A,
C
C
,
m
,
n
,
k
);
res_r
+=
timer_stop
(
timer
);
*/
res
[
r
]
=
res_r
/
1
.
f
;
}
printf
(
"dimensions: %zu %zu %zu
\t
-- "
,
n
,
m
,
k
);
printf
(
"required size: %f GB
\n
"
,
(
...
...
@@ -207,12 +202,11 @@ run_test
+
k
*
m
*
sizeof
(
*
C
)
)
/
1.e+9
);
__ASSERT
(
__PREFIX
(
Memcpy
)(
hC
,
C
,
sizeof
(
*
hC
)
*
(
size_t
)(
k
*
m
),
__PREFIX
(
MemcpyDeviceToHost
)));
//print_result(hC);
//
__ASSERT(__PREFIX(Memcpy)(hC, C, sizeof(*hC) * (size_t)(k * m), __PREFIX(MemcpyDeviceToHost)));
//print_result(hC
, 1 << em, 1 << en, 1 << ek
);
// timer_start(timer, "Destroy Handle");
//if(rocblas_destroy_handle(handle) != rocblas_status_success) return EXIT_FAILURE;
if
(
__DESTROY_HANDLE
(
handle
)
!=
__PREFIX
(
Success
))
return
EXIT_FAILURE
;
// timer_stop(timer);
__PREFIX
(
Free
)(
A
);
...
...
@@ -230,19 +224,39 @@ main
(
)
{
int
rep
=
512
;
int
min_dim
=
1
;
int
max_dim
=
14
;
int
rep
=
10
;
size_t
m_min
=
8
;
// 13
size_t
m_max
=
11
;
// 16
size_t
n_min
=
11
;
// 11
size_t
n_max
=
19
;
// 19
size_t
k_min
=
5
;
// 7
size_t
k_max
=
11
;
// 11
float
*
res
;
__MALLOC
(
res
,
sizeof
(
*
res
)
*
(
size_t
)((
max_dim
-
min_dim
)
*
rep
));
for
(
int
i
=
min_dim
;
i
<
max_dim
;
i
++
)
// cudaSetDevice(0);
__HANDLE__
handle
;
__CREATE_HANDLE
(
&
handle
);
__MALLOC
(
res
,
sizeof
(
*
res
)
*
(
size_t
)(
(
m_max
-
m_min
+
1
)
*
(
n_max
-
n_min
+
1
)
*
(
k_max
-
k_min
+
1
)
*
rep
));
for
(
int
em
=
m_min
;
em
<=
m_max
;
em
++
)
{
size_t
dim
=
1
<<
i
;
int
ind
=
(
i
-
min_dim
)
*
rep
;
run_test
(
dim
,
rep
,
&
res
[
ind
]);
for
(
int
en
=
n_min
;
en
<=
n_max
;
en
++
)
{
for
(
int
ek
=
k_min
;
ek
<=
k_max
;
ek
++
)
{
run_test
(
1
<<
em
,
1
<<
en
,
1
<<
ek
,
rep
,
&
res
[
0
],
handle
);
}
}
}
if
(
__DESTROY_HANDLE
(
handle
)
!=
__PREFIX
(
Success
))
return
EXIT_FAILURE
;
exit
(
0
);
// store the results
/*
FILE * f;
char name[128];
sprintf(name, "runtimes");
...
...
@@ -267,5 +281,6 @@ main
fprintf(f, "\n");
}
fclose(f);
*/
return
0
;
}
fftw.c
View file @
1cc7cf31
...
...
@@ -37,12 +37,28 @@
exit(EXIT_FAILURE); \
}\
void
prepare_data
(
__COMPLEX8__
*
hA
,
size_t
s
)
{
float
fact
=
1
.
f
/
(
float
)
x
/
(
float
)
y
/
20
.
f
;
#pragma omp parallel for
for
(
size_t
i
=
0
;
i
<
s
;
i
++
)
{
hA
[
i
].
x
=
(
float
)
xorshf96
()
*
fact
*
fact
;
hA
[
i
].
y
=
(
float
)
xorshf96
()
*
fact
*
fact
;
}
}
int
run_test
(
size_t
T
,
size_t
N
,
unsigned
rep
,
int
nofftws
unsigned
rep
)
{
...
...
@@ -51,19 +67,13 @@ run_test
// Create HIP device buffer
__COMPLEX8__
*
A
;
__COMPLEX8__
*
hB
;
__MALLOC
(
hB
,
sizeof
(
*
hB
)
*
N
);
__MALLOC
(
hB
,
sizeof
(
*
hB
)
*
N
*
T
);
__ASSERT
(
__PREFIX
(
Malloc
)((
void
**
)
&
A
,
sizeof
(
*
A
)
*
N
));
__ASSERT
(
__PREFIX
(
Malloc
)((
void
**
)
&
A
,
sizeof
(
*
A
)
*
N
*
T
));
// Initialize data
__COMPLEX8__
*
hA
;
__MALLOC
(
hA
,
sizeof
(
*
hA
)
*
N
);
float
fact
=
1
.
f
/
(
float
)
x
/
(
float
)
y
/
20
.
f
;
for
(
size_t
i
=
0
;
i
<
N
;
i
++
)
{
hA
[
i
].
x
=
(
float
)
xorshf96
()
*
fact
*
fact
;
hA
[
i
].
y
=
(
float
)
xorshf96
()
*
fact
*
fact
;
}
__MALLOC
(
hA
,
sizeof
(
*
hA
)
*
N
*
T
);
// Copy data to device
__ASSERT
(
__PREFIX
(
Memcpy
)(
A
,
hA
,
sizeof
(
*
hA
)
*
N
,
__PREFIX
(
MemcpyHostToDevice
)));
...
...
@@ -72,8 +82,8 @@ run_test
size_t
length
=
N
;
char
mes
[
128
];
sprintf
(
mes
,
"dim: %zu
\t
Plan generation."
,
N
);
timer_start
(
timer
,
mes
);
//
sprintf(mes, "dim: %zu\tPlan generation." ,N);
//
timer_start(timer, mes);
#ifdef ROC
rocfft_plan_create
...
...
@@ -88,34 +98,45 @@ run_test
NULL
);
#elif CUDA
cufftPlan1d
(
&
plan
,
N
,
CUFFT_C2C
,
1
);
int
batch
=
T
;
// --- Number of batched executions
int
rank
=
1
;
// --- 1D FFTs
int
na
[]
=
{
N
};
// --- Size of the Fourier transform
int
istride
=
1
,
ostride
=
1
;
// --- Distance between two successive input/output elements
int
idist
=
N
,
odist
=
N
;
// --- Distance between batches
int
inembed
[]
=
{
0
};
// --- Input size with pitch (ignored for 1D transforms)
int
onembed
[]
=
{
0
};
// --- Output size with pitch (ignored for 1D transforms)
cufftPlanMany
(
&
plan
,
rank
,
na
,
inembed
,
istride
,
idist
,
onembed
,
ostride
,
odist
,
CUFFT_C2C
,
batch
);
#endif
timer_stop
(
timer
);
for
(
int
r
=
0
;
r
<
2
;
r
++
)
prepare_data
(
hA
,
N
*
T
);
for
(
int
r
=
0
;
r
<
10
;
r
++
)
{
// Execute plan
sprintf
(
mes
,
"
dim: %zu
\t
Execute plan
round %d."
,
N
,
r
);
sprintf
(
mes
,
"
T = %zu n = %zu
\t
round %d."
,
T
,
N
,
r
);
timer_start
(
timer
,
mes
);
#ifdef ROC
rocfft_execute
(
plan
,
(
void
**
)
&
A
,
NULL
,
NULL
);
#elif CUDA
cufftExecC2C
(
plan
,
A
,
A
,
CUFFT_FORWARD
);
#endif
__PREFIX
(
DeviceSynchronize
)();
timer_stop
(
timer
);
// Wait for execution to finish
sprintf
(
mes
,
"dim: %zu
\t
Synchronize round %d."
,
N
,
r
);
timer_start
(
timer
,
mes
);
#ifdef ROC
hipDeviceSynchronize
();
#endif
timer_stop
(
timer
);
}
// Destroy plan
sprintf
(
mes
,
"dim: %zu
\t
Destroy plan."
,
N
);
timer_start
(
timer
,
mes
);
__DESTROY_PLAN
(
plan
);
timer_stop
(
timer
);
__ASSERT
(
__PREFIX
(
Memcpy
)(
hB
,
A
,
sizeof
(
*
A
)
*
N
,
__PREFIX
(
MemcpyDeviceToHost
)));
...
...
@@ -132,22 +153,26 @@ main
)
{
int
rep
=
1
;
int
min_dim
=
8
;
int
max_dim
=
28
;
int
nofftws
=
128
;
int
t_min
=
8
;
int
t_max
=
11
;
int
n_min
=
11
;
int
n_max
=
19
;
float
*
res
=
malloc
(
sizeof
(
*
res
)
*
(
size_t
)((
max
_dim
-
min_dim
)
*
rep
));
float
*
res
=
malloc
(
sizeof
(
*
res
)
*
(
size_t
)((
n_
max
-
n_min
+
1
)
*
rep
));
if
(
res
==
NULL
)
{
fprintf
(
stderr
,
"Couldn't allocate res
\n
"
);
exit
(
1
);
}
for
(
int
i
=
min
_dim
;
i
<
max_dim
;
i
++
)
for
(
int
et
=
t_
min
;
et
<=
t_max
;
et
++
)
{
size_t
dim
=
1
<<
i
;
//int ind = (i - min_dim) * rep;
run_test
(
dim
,
rep
,
nofftws
);
int
t
=
1
<<
et
;
for
(
int
en
=
n_min
;
en
<=
n_max
;
en
++
)
{
size_t
n
=
1
<<
en
;
run_test
(
t
,
n
,
rep
);
}
}
free
(
res
);
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment