Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Henning Fehrmann
NVidia_AMD_Bench
Commits
582cb735
Commit
582cb735
authored
Mar 08, 2021
by
Henning Fehrmann
Committed by
Henning Fehrmann
Mar 08, 2021
Browse files
index swapping
parent
2a2041d4
Changes
1
Hide whitespace changes
Inline
Side-by-side
blas.c
View file @
582cb735
...
...
@@ -22,24 +22,23 @@
#include
<math.h>
#include
<omp.h>
#define __MALLOC(P, size) P =
malloc(size); \
#define __MALLOC(P, size) P = malloc(size); \
if (P == NULL) \
{\
fprintf(stderr, "Allocation failed at line %d in %s\n", __LINE__, __FILE__); \
exit(EXIT_FAILURE); \
}\
size_t
m
=
10000
;
size_t
n
=
10000
;
size_t
k
=
10000
;
void
multiplication
(
__HANDLE__
handle
,
const
__COMPLEX8__
*
A
,
const
__COMPLEX8__
*
B
,
__COMPLEX8__
*
C
__COMPLEX8__
*
C
,
size_t
m
,
size_t
n
,
size_t
k
)
{
__BLAS_OPERATION__
transA
=
__NO_TRANSFORM__
;
...
...
@@ -47,10 +46,6 @@ multiplication
const
__COMPLEX8__
alpha
=
{.
x
=
1
.
f
,
.
y
=
0
.
f
};
const
__COMPLEX8__
beta
=
{.
x
=
0
.
f
,
.
y
=
0
.
f
};
int
lda
=
n
;
int
ldb
=
n
;
int
ldc
=
k
;
__CGMEM__
(
handle
,
...
...
@@ -61,29 +56,34 @@ multiplication
k
,
&
alpha
,
A
,
lda
,
m
,
B
,
ldb
,
n
,
&
beta
,
C
,
ldc
m
);
// cublasIcamax(handle,m * n, C, 1, &result);
__PREFIX
(
DeviceSynchronize
)();
}
void
prepare_matrices
(
__COMPLEX8__
*
hA
,
__COMPLEX8__
*
hB
__COMPLEX8__
*
hB
,
size_t
m
,
size_t
n
,
size_t
k
)
{
float
fact
=
1
.
f
/
(
float
)
n
/
(
float
)
x
/
(
float
)
y
/
20
.
f
;
#pragma omp parallel for
for
(
size_t
i
=
0
;
i
<
n
;
i
++
)
for
(
size_t
i
=
0
;
i
<
m
;
i
++
)
{
for
(
size_t
j
=
0
;
j
<
m
;
j
++
)
for
(
size_t
j
=
0
;
j
<
k
;
j
++
)
{
size_t
ind
=
j
+
m
*
i
;
size_t
ind
=
j
+
k
*
i
;
hA
[
ind
].
x
=
(
float
)
xorshf96
()
*
fact
;
hA
[
ind
].
y
=
(
float
)
xorshf96
()
*
fact
;
}
...
...
@@ -104,7 +104,10 @@ prepare_matrices
void
print_result
(
__COMPLEX8__
*
hC
__COMPLEX8__
*
hC
,
size_t
m
,
size_t
n
,
size_t
k
)
{
printf
(
"-------- %zu %zu
\n
"
,
m
,
k
);
...
...
@@ -124,23 +127,22 @@ print_result
int
run_test
(
size_t
dim
,
size_t
m
,
size_t
n
,
size_t
k
,
unsigned
rep
,
float
*
res
float
*
res
,
__HANDLE__
handle
)
{
m
=
dim
;
n
=
dim
;
k
=
dim
;
struct
runtime
*
timer
;
__MALLOC
(
timer
,
sizeof
(
*
timer
));
__COMPLEX8__
*
A
;
__COMPLEX8__
*
B
;
__COMPLEX8__
*
C
;
__ASSERT
(
__PREFIX
(
Malloc
)((
void
**
)
&
A
,
sizeof
(
*
A
)
*
(
size_t
)(
m
*
n
)));
__ASSERT
(
__PREFIX
(
Malloc
)((
void
**
)
&
A
,
sizeof
(
*
A
)
*
(
size_t
)(
m
*
k
)));
__ASSERT
(
__PREFIX
(
Malloc
)((
void
**
)
&
B
,
sizeof
(
*
B
)
*
(
size_t
)(
n
*
k
)));
__ASSERT
(
__PREFIX
(
Malloc
)((
void
**
)
&
C
,
sizeof
(
*
C
)
*
(
size_t
)(
m
*
k
)));
__ASSERT
(
__PREFIX
(
Malloc
)((
void
**
)
&
C
,
sizeof
(
*
C
)
*
(
size_t
)(
m
*
n
)));
if
(
C
==
NULL
)
{
fprintf
(
stderr
,
"C not allocated
\n
"
);
...
...
@@ -148,57 +150,49 @@ run_test
}
__COMPLEX8__
*
hA
;
__MALLOC
(
hA
,
sizeof
(
*
hA
)
*
(
size_t
)(
m
*
n
));
__MALLOC
(
hA
,
sizeof
(
*
hA
)
*
(
size_t
)(
m
*
k
));
__COMPLEX8__
*
hB
;
__MALLOC
(
hB
,
sizeof
(
*
hB
)
*
(
size_t
)(
k
*
n
));
__COMPLEX8__
*
hC
;
__MALLOC
(
hC
,
sizeof
(
*
hC
)
*
(
size_t
)(
m
*
k
));
__MALLOC
(
hC
,
sizeof
(
*
hC
)
*
(
size_t
)(
n
*
m
));
// timer_start(timer, "Prepare matrices");
prepare_matrices
(
hA
,
hB
);
// timer_stop(timer);
//timer_start(timer, "Memcopy");
__ASSERT
(
__PREFIX
(
Memcpy
)(
A
,
hA
,
sizeof
(
*
A
)
*
(
size_t
)(
m
*
n
),
__PREFIX
(
MemcpyHostToDevice
)));
__ASSERT
(
__PREFIX
(
Memcpy
)(
B
,
hB
,
sizeof
(
*
B
)
*
(
size_t
)(
k
*
n
),
__PREFIX
(
MemcpyHostToDevice
)));
// timer_stop(timer);
// cudaSetDevice(0);
__HANDLE__
handle
;
//timer_start(timer, "Create Handle");
//if(rocblas_create_handle(&handle) != rocblas_status_success) return EXIT_FAILURE;
__CREATE_HANDLE
(
&
handle
);
//timer_stop(timer);
prepare_matrices
(
hA
,
hB
,
m
,
n
,
k
);
for
(
unsigned
r
=
0
;
r
<
rep
;
r
++
)
{
__ASSERT
(
__PREFIX
(
Memcpy
)(
A
,
hA
,
sizeof
(
*
A
)
*
(
size_t
)(
m
*
k
),
__PREFIX
(
MemcpyHostToDevice
)));
__ASSERT
(
__PREFIX
(
Memcpy
)(
B
,
hB
,
sizeof
(
*
B
)
*
(
size_t
)(
k
*
n
),
__PREFIX
(
MemcpyHostToDevice
)));
float
res_r
=
0
.
f
;
char
mes
[
128
];
sprintf
(
mes
,
"
di
m %zu
run %d a"
,
dim
,
r
);
sprintf
(
mes
,
"m %zu
n %zu k %zu run %d"
,
m
,
n
,
k
,
r
);
timer_start
(
timer
,
mes
);
multiplication
(
handle
,
A
,
B
,
C
);
res_r
+=
timer_stop
(
timer
);
sprintf
(
mes
,
"dim %zu run %d b"
,
dim
,
r
);
/*
timer_start(timer, mes);
multiplication
(
handle,
B,
A,
C
C
,
m
,
n
,
k
);
res_r
+=
timer_stop
(
timer
);
*/
res
[
r
]
=
res_r
/
1
.
f
;
}
printf
(
"dimensions: %zu %zu %zu
\t
-- "
,
n
,
m
,
k
);
printf
(
"required size: %f GB
\n
"
,
(
...
...
@@ -207,12 +201,11 @@ run_test
+
k
*
m
*
sizeof
(
*
C
)
)
/
1.e+9
);
__ASSERT
(
__PREFIX
(
Memcpy
)(
hC
,
C
,
sizeof
(
*
hC
)
*
(
size_t
)(
k
*
m
),
__PREFIX
(
MemcpyDeviceToHost
)));
//print_result(hC);
//
__ASSERT(__PREFIX(Memcpy)(hC, C, sizeof(*hC) * (size_t)(k * m), __PREFIX(MemcpyDeviceToHost)));
//print_result(hC
, 1 << em, 1 << en, 1 << ek
);
// timer_start(timer, "Destroy Handle");
//if(rocblas_destroy_handle(handle) != rocblas_status_success) return EXIT_FAILURE;
if
(
__DESTROY_HANDLE
(
handle
)
!=
__PREFIX
(
Success
))
return
EXIT_FAILURE
;
// timer_stop(timer);
__PREFIX
(
Free
)(
A
);
...
...
@@ -230,19 +223,39 @@ main
(
)
{
int
rep
=
512
;
int
min_dim
=
1
;
int
max_dim
=
14
;
int
rep
=
10
;
size_t
m_min
=
8
;
// 13
size_t
m_max
=
11
;
// 16
size_t
n_min
=
11
;
// 11
size_t
n_max
=
19
;
// 19
size_t
k_min
=
5
;
// 7
size_t
k_max
=
11
;
// 11
float
*
res
;
__MALLOC
(
res
,
sizeof
(
*
res
)
*
(
size_t
)((
max_dim
-
min_dim
)
*
rep
));
for
(
int
i
=
min_dim
;
i
<
max_dim
;
i
++
)
// cudaSetDevice(0);
__HANDLE__
handle
;
__CREATE_HANDLE
(
&
handle
);
__MALLOC
(
res
,
sizeof
(
*
res
)
*
(
size_t
)(
(
m_max
-
m_min
+
1
)
*
(
n_max
-
n_min
+
1
)
*
(
k_max
-
k_min
+
1
)
*
rep
));
for
(
int
em
=
m_min
;
em
<=
m_max
;
em
++
)
{
size_t
dim
=
1
<<
i
;
int
ind
=
(
i
-
min_dim
)
*
rep
;
run_test
(
dim
,
rep
,
&
res
[
ind
]);
for
(
int
en
=
n_min
;
en
<=
n_max
;
en
++
)
{
for
(
int
ek
=
k_min
;
ek
<=
k_max
;
ek
++
)
{
run_test
(
1
<<
em
,
1
<<
en
,
1
<<
ek
,
rep
,
&
res
[
0
],
handle
);
}
}
}
if
(
__DESTROY_HANDLE
(
handle
)
!=
__PREFIX
(
Success
))
return
EXIT_FAILURE
;
exit
(
0
);
// store the results
/*
FILE * f;
char name[128];
sprintf(name, "runtimes");
...
...
@@ -267,5 +280,6 @@ main
fprintf(f, "\n");
}
fclose(f);
*/
return
0
;
}
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment