Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
N
NVidia_AMD_Bench
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Henning Fehrmann
NVidia_AMD_Bench
Commits
582cb735
Commit
582cb735
authored
Mar 8, 2021
by
Henning Fehrmann
Committed by
Henning Fehrmann
Mar 8, 2021
Browse files
Options
Downloads
Patches
Plain Diff
index swaping
parent
2a2041d4
No related branches found
No related tags found
No related merge requests found
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
blas.c
+74
-60
74 additions, 60 deletions
blas.c
with
74 additions
and
60 deletions
blas.c
+
74
−
60
View file @
582cb735
...
@@ -29,17 +29,16 @@
...
@@ -29,17 +29,16 @@
exit(EXIT_FAILURE); \
exit(EXIT_FAILURE); \
}\
}\
size_t
m
=
10000
;
size_t
n
=
10000
;
size_t
k
=
10000
;
void
void
multiplication
multiplication
(
(
__HANDLE__
handle
,
__HANDLE__
handle
,
const
__COMPLEX8__
*
A
,
const
__COMPLEX8__
*
A
,
const
__COMPLEX8__
*
B
,
const
__COMPLEX8__
*
B
,
__COMPLEX8__
*
C
__COMPLEX8__
*
C
,
size_t
m
,
size_t
n
,
size_t
k
)
)
{
{
__BLAS_OPERATION__
transA
=
__NO_TRANSFORM__
;
__BLAS_OPERATION__
transA
=
__NO_TRANSFORM__
;
...
@@ -47,10 +46,6 @@ multiplication
...
@@ -47,10 +46,6 @@ multiplication
const
__COMPLEX8__
alpha
=
{.
x
=
1
.
f
,
.
y
=
0
.
f
};
const
__COMPLEX8__
alpha
=
{.
x
=
1
.
f
,
.
y
=
0
.
f
};
const
__COMPLEX8__
beta
=
{.
x
=
0
.
f
,
.
y
=
0
.
f
};
const
__COMPLEX8__
beta
=
{.
x
=
0
.
f
,
.
y
=
0
.
f
};
int
lda
=
n
;
int
ldb
=
n
;
int
ldc
=
k
;
__CGMEM__
__CGMEM__
(
(
handle
,
handle
,
...
@@ -61,29 +56,34 @@ multiplication
...
@@ -61,29 +56,34 @@ multiplication
k
,
k
,
&
alpha
,
&
alpha
,
A
,
A
,
lda
,
m
,
B
,
B
,
ldb
,
n
,
&
beta
,
&
beta
,
C
,
C
,
ldc
m
);
);
// cublasIcamax(handle,m * n, C, 1, &result);
__PREFIX
(
DeviceSynchronize
)();
}
}
void
void
prepare_matrices
prepare_matrices
(
(
__COMPLEX8__
*
hA
,
__COMPLEX8__
*
hA
,
__COMPLEX8__
*
hB
__COMPLEX8__
*
hB
,
size_t
m
,
size_t
n
,
size_t
k
)
)
{
{
float
fact
=
1
.
f
/
(
float
)
n
/
(
float
)
x
/
(
float
)
y
/
20
.
f
;
float
fact
=
1
.
f
/
(
float
)
n
/
(
float
)
x
/
(
float
)
y
/
20
.
f
;
#pragma omp parallel for
#pragma omp parallel for
for
(
size_t
i
=
0
;
i
<
n
;
i
++
)
for
(
size_t
i
=
0
;
i
<
m
;
i
++
)
{
{
for
(
size_t
j
=
0
;
j
<
m
;
j
++
)
for
(
size_t
j
=
0
;
j
<
k
;
j
++
)
{
{
size_t
ind
=
j
+
m
*
i
;
size_t
ind
=
j
+
k
*
i
;
hA
[
ind
].
x
=
(
float
)
xorshf96
()
*
fact
;
hA
[
ind
].
x
=
(
float
)
xorshf96
()
*
fact
;
hA
[
ind
].
y
=
(
float
)
xorshf96
()
*
fact
;
hA
[
ind
].
y
=
(
float
)
xorshf96
()
*
fact
;
}
}
...
@@ -104,7 +104,10 @@ prepare_matrices
...
@@ -104,7 +104,10 @@ prepare_matrices
void
void
print_result
print_result
(
(
__COMPLEX8__
*
hC
__COMPLEX8__
*
hC
,
size_t
m
,
size_t
n
,
size_t
k
)
)
{
{
printf
(
"-------- %zu %zu
\n
"
,
m
,
k
);
printf
(
"-------- %zu %zu
\n
"
,
m
,
k
);
...
@@ -124,23 +127,22 @@ print_result
...
@@ -124,23 +127,22 @@ print_result
int
int
run_test
run_test
(
(
size_t
dim
,
size_t
m
,
size_t
n
,
size_t
k
,
unsigned
rep
,
unsigned
rep
,
float
*
res
float
*
res
,
__HANDLE__
handle
)
)
{
{
m
=
dim
;
n
=
dim
;
k
=
dim
;
struct
runtime
*
timer
;
struct
runtime
*
timer
;
__MALLOC
(
timer
,
sizeof
(
*
timer
));
__MALLOC
(
timer
,
sizeof
(
*
timer
));
__COMPLEX8__
*
A
;
__COMPLEX8__
*
A
;
__COMPLEX8__
*
B
;
__COMPLEX8__
*
B
;
__COMPLEX8__
*
C
;
__COMPLEX8__
*
C
;
__ASSERT
(
__PREFIX
(
Malloc
)((
void
**
)
&
A
,
sizeof
(
*
A
)
*
(
size_t
)(
m
*
n
)));
__ASSERT
(
__PREFIX
(
Malloc
)((
void
**
)
&
A
,
sizeof
(
*
A
)
*
(
size_t
)(
m
*
k
)));
__ASSERT
(
__PREFIX
(
Malloc
)((
void
**
)
&
B
,
sizeof
(
*
B
)
*
(
size_t
)(
n
*
k
)));
__ASSERT
(
__PREFIX
(
Malloc
)((
void
**
)
&
B
,
sizeof
(
*
B
)
*
(
size_t
)(
n
*
k
)));
__ASSERT
(
__PREFIX
(
Malloc
)((
void
**
)
&
C
,
sizeof
(
*
C
)
*
(
size_t
)(
m
*
k
)));
__ASSERT
(
__PREFIX
(
Malloc
)((
void
**
)
&
C
,
sizeof
(
*
C
)
*
(
size_t
)(
m
*
n
)));
if
(
C
==
NULL
)
if
(
C
==
NULL
)
{
{
fprintf
(
stderr
,
"C not allocated
\n
"
);
fprintf
(
stderr
,
"C not allocated
\n
"
);
...
@@ -148,57 +150,49 @@ run_test
...
@@ -148,57 +150,49 @@ run_test
}
}
__COMPLEX8__
*
hA
;
__COMPLEX8__
*
hA
;
__MALLOC
(
hA
,
sizeof
(
*
hA
)
*
(
size_t
)(
m
*
n
));
__MALLOC
(
hA
,
sizeof
(
*
hA
)
*
(
size_t
)(
m
*
k
));
__COMPLEX8__
*
hB
;
__COMPLEX8__
*
hB
;
__MALLOC
(
hB
,
sizeof
(
*
hB
)
*
(
size_t
)(
k
*
n
));
__MALLOC
(
hB
,
sizeof
(
*
hB
)
*
(
size_t
)(
k
*
n
));
__COMPLEX8__
*
hC
;
__COMPLEX8__
*
hC
;
__MALLOC
(
hC
,
sizeof
(
*
hC
)
*
(
size_t
)(
m
*
k
));
__MALLOC
(
hC
,
sizeof
(
*
hC
)
*
(
size_t
)(
n
*
m
));
// timer_start(timer, "Prepare matrices");
// timer_start(timer, "Prepare matrices");
prepare_matrices
(
hA
,
hB
);
// timer_stop(timer);
// timer_stop(timer);
//timer_start(timer, "Memcopy");
//timer_start(timer, "Memcopy");
__ASSERT
(
__PREFIX
(
Memcpy
)(
A
,
hA
,
sizeof
(
*
A
)
*
(
size_t
)(
m
*
n
),
__PREFIX
(
MemcpyHostToDevice
)));
__ASSERT
(
__PREFIX
(
Memcpy
)(
B
,
hB
,
sizeof
(
*
B
)
*
(
size_t
)(
k
*
n
),
__PREFIX
(
MemcpyHostToDevice
)));
// timer_stop(timer);
// timer_stop(timer);
// cudaSetDevice(0);
__HANDLE__
handle
;
//timer_start(timer, "Create Handle");
//timer_start(timer, "Create Handle");
//if(rocblas_create_handle(&handle) != rocblas_status_success) return EXIT_FAILURE;
//if(rocblas_create_handle(&handle) != rocblas_status_success) return EXIT_FAILURE;
__CREATE_HANDLE
(
&
handle
);
//timer_stop(timer);
//timer_stop(timer);
prepare_matrices
(
hA
,
hB
,
m
,
n
,
k
);
for
(
unsigned
r
=
0
;
r
<
rep
;
r
++
)
for
(
unsigned
r
=
0
;
r
<
rep
;
r
++
)
{
{
__ASSERT
(
__PREFIX
(
Memcpy
)(
A
,
hA
,
sizeof
(
*
A
)
*
(
size_t
)(
m
*
k
),
__PREFIX
(
MemcpyHostToDevice
)));
__ASSERT
(
__PREFIX
(
Memcpy
)(
B
,
hB
,
sizeof
(
*
B
)
*
(
size_t
)(
k
*
n
),
__PREFIX
(
MemcpyHostToDevice
)));
float
res_r
=
0
.
f
;
float
res_r
=
0
.
f
;
char
mes
[
128
];
char
mes
[
128
];
sprintf
(
mes
,
"
di
m %zu
run %d a"
,
dim
,
r
);
sprintf
(
mes
,
"m %zu
n %zu k %zu run %d"
,
m
,
n
,
k
,
r
);
timer_start
(
timer
,
mes
);
timer_start
(
timer
,
mes
);
multiplication
multiplication
(
(
handle
,
handle
,
A
,
A
,
B
,
B
,
C
C
,
);
m
,
res_r
+=
timer_stop
(
timer
);
n
,
sprintf
(
mes
,
"dim %zu run %d b"
,
dim
,
r
);
k
/*
timer_start(timer, mes);
multiplication
(
handle,
B,
A,
C
);
);
res_r
+=
timer_stop
(
timer
);
res_r
+=
timer_stop
(
timer
);
*/
res
[
r
]
=
res_r
/
1
.
f
;
res
[
r
]
=
res_r
/
1
.
f
;
}
}
printf
(
"dimensions: %zu %zu %zu
\t
-- "
,
n
,
m
,
k
);
printf
(
"dimensions: %zu %zu %zu
\t
-- "
,
n
,
m
,
k
);
printf
(
"required size: %f GB
\n
"
,
printf
(
"required size: %f GB
\n
"
,
(
(
...
@@ -207,12 +201,11 @@ run_test
...
@@ -207,12 +201,11 @@ run_test
+
k
*
m
*
sizeof
(
*
C
)
+
k
*
m
*
sizeof
(
*
C
)
)
/
1.e+9
);
)
/
1.e+9
);
__ASSERT
(
__PREFIX
(
Memcpy
)(
hC
,
C
,
sizeof
(
*
hC
)
*
(
size_t
)(
k
*
m
),
__PREFIX
(
MemcpyDeviceToHost
)));
//
__ASSERT(__PREFIX(Memcpy)(hC, C, sizeof(*hC) * (size_t)(k * m), __PREFIX(MemcpyDeviceToHost)));
//print_result(hC);
//print_result(hC
, 1 << em, 1 << en, 1 << ek
);
// timer_start(timer, "Destroy Handle");
// timer_start(timer, "Destroy Handle");
//if(rocblas_destroy_handle(handle) != rocblas_status_success) return EXIT_FAILURE;
//if(rocblas_destroy_handle(handle) != rocblas_status_success) return EXIT_FAILURE;
if
(
__DESTROY_HANDLE
(
handle
)
!=
__PREFIX
(
Success
))
return
EXIT_FAILURE
;
// timer_stop(timer);
// timer_stop(timer);
__PREFIX
(
Free
)(
A
);
__PREFIX
(
Free
)(
A
);
...
@@ -230,19 +223,39 @@ main
...
@@ -230,19 +223,39 @@ main
(
(
)
)
{
{
int
rep
=
512
;
int
rep
=
10
;
int
min_dim
=
1
;
size_t
m_min
=
8
;
// 13
int
max_dim
=
14
;
size_t
m_max
=
11
;
// 16
size_t
n_min
=
11
;
// 11
size_t
n_max
=
19
;
// 19
size_t
k_min
=
5
;
// 7
size_t
k_max
=
11
;
// 11
float
*
res
;
float
*
res
;
__MALLOC
(
res
,
sizeof
(
*
res
)
*
(
size_t
)((
max_dim
-
min_dim
)
*
rep
));
for
(
int
i
=
min_dim
;
i
<
max_dim
;
i
++
)
// cudaSetDevice(0);
__HANDLE__
handle
;
__CREATE_HANDLE
(
&
handle
);
__MALLOC
(
res
,
sizeof
(
*
res
)
*
(
size_t
)(
(
m_max
-
m_min
+
1
)
*
(
n_max
-
n_min
+
1
)
*
(
k_max
-
k_min
+
1
)
*
rep
));
for
(
int
em
=
m_min
;
em
<=
m_max
;
em
++
)
{
{
size_t
dim
=
1
<<
i
;
for
(
int
en
=
n_min
;
en
<=
n_max
;
en
++
)
int
ind
=
(
i
-
min_dim
)
*
rep
;
{
run_test
(
dim
,
rep
,
&
res
[
ind
]);
for
(
int
ek
=
k_min
;
ek
<=
k_max
;
ek
++
)
{
run_test
(
1
<<
em
,
1
<<
en
,
1
<<
ek
,
rep
,
&
res
[
0
],
handle
);
}
}
}
}
if
(
__DESTROY_HANDLE
(
handle
)
!=
__PREFIX
(
Success
))
return
EXIT_FAILURE
;
exit
(
0
);
// store the results
// store the results
/*
FILE * f;
FILE * f;
char name[128];
char name[128];
sprintf(name, "runtimes");
sprintf(name, "runtimes");
...
@@ -267,5 +280,6 @@ main
...
@@ -267,5 +280,6 @@ main
fprintf(f, "\n");
fprintf(f, "\n");
}
}
fclose(f);
fclose(f);
*/
return
0
;
return
0
;
}
}
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment