Skip to content
Snippets Groups Projects
Commit 582cb735 authored by Henning Fehrmann, committed by Henning Fehrmann
Browse files

index swapping

parent 2a2041d4
No related branches found
No related tags found
No related merge requests found
...@@ -29,17 +29,16 @@ ...@@ -29,17 +29,16 @@
exit(EXIT_FAILURE); \ exit(EXIT_FAILURE); \
}\ }\
size_t m = 10000;
size_t n = 10000;
size_t k = 10000;
void void
multiplication multiplication
( (
__HANDLE__ handle, __HANDLE__ handle,
const __COMPLEX8__ *A, const __COMPLEX8__ *A,
const __COMPLEX8__ *B, const __COMPLEX8__ *B,
__COMPLEX8__ *C __COMPLEX8__ *C,
size_t m,
size_t n,
size_t k
) )
{ {
__BLAS_OPERATION__ transA = __NO_TRANSFORM__; __BLAS_OPERATION__ transA = __NO_TRANSFORM__;
...@@ -47,10 +46,6 @@ multiplication ...@@ -47,10 +46,6 @@ multiplication
const __COMPLEX8__ alpha = {.x = 1.f, .y = 0.f}; const __COMPLEX8__ alpha = {.x = 1.f, .y = 0.f};
const __COMPLEX8__ beta = {.x = 0.f, .y = 0.f}; const __COMPLEX8__ beta = {.x = 0.f, .y = 0.f};
int lda = n;
int ldb = n;
int ldc = k;
__CGMEM__ __CGMEM__
( (
handle, handle,
...@@ -61,29 +56,34 @@ multiplication ...@@ -61,29 +56,34 @@ multiplication
k, k,
&alpha, &alpha,
A, A,
lda, m,
B, B,
ldb, n,
&beta, &beta,
C, C,
ldc m
); );
// cublasIcamax(handle,m * n, C, 1, &result);
__PREFIX(DeviceSynchronize)();
} }
void void
prepare_matrices prepare_matrices
( (
__COMPLEX8__ * hA, __COMPLEX8__ * hA,
__COMPLEX8__ * hB __COMPLEX8__ * hB,
size_t m,
size_t n,
size_t k
) )
{ {
float fact = 1.f/(float)n/(float)x/(float)y/20.f; float fact = 1.f/(float)n/(float)x/(float)y/20.f;
#pragma omp parallel for #pragma omp parallel for
for (size_t i = 0; i < n; i++) for (size_t i = 0; i < m; i++)
{ {
for (size_t j = 0; j < m; j++) for (size_t j = 0; j < k; j++)
{ {
size_t ind = j + m * i; size_t ind = j + k * i;
hA[ind].x = (float)xorshf96()*fact; hA[ind].x = (float)xorshf96()*fact;
hA[ind].y = (float)xorshf96()*fact; hA[ind].y = (float)xorshf96()*fact;
} }
...@@ -104,7 +104,10 @@ prepare_matrices ...@@ -104,7 +104,10 @@ prepare_matrices
void void
print_result print_result
( (
__COMPLEX8__ * hC __COMPLEX8__ * hC,
size_t m,
size_t n,
size_t k
) )
{ {
printf("-------- %zu %zu\n", m, k); printf("-------- %zu %zu\n", m, k);
...@@ -124,23 +127,22 @@ print_result ...@@ -124,23 +127,22 @@ print_result
int int
run_test run_test
( (
size_t dim, size_t m,
size_t n,
size_t k,
unsigned rep, unsigned rep,
float * res float * res,
__HANDLE__ handle
) )
{ {
m = dim;
n = dim;
k = dim;
struct runtime * timer; struct runtime * timer;
__MALLOC(timer, sizeof(*timer)); __MALLOC(timer, sizeof(*timer));
__COMPLEX8__ *A; __COMPLEX8__ *A;
__COMPLEX8__ *B; __COMPLEX8__ *B;
__COMPLEX8__ *C; __COMPLEX8__ *C;
__ASSERT(__PREFIX(Malloc)((void **)&A, sizeof(*A) * (size_t)(m * n))); __ASSERT(__PREFIX(Malloc)((void **)&A, sizeof(*A) * (size_t)(m * k)));
__ASSERT(__PREFIX(Malloc)((void **)&B, sizeof(*B) * (size_t)(n * k))); __ASSERT(__PREFIX(Malloc)((void **)&B, sizeof(*B) * (size_t)(n * k)));
__ASSERT(__PREFIX(Malloc)((void **)&C, sizeof(*C) * (size_t)(m * k))); __ASSERT(__PREFIX(Malloc)((void **)&C, sizeof(*C) * (size_t)(m * n)));
if (C == NULL) if (C == NULL)
{ {
fprintf(stderr, "C not allocated\n"); fprintf(stderr, "C not allocated\n");
...@@ -148,57 +150,49 @@ run_test ...@@ -148,57 +150,49 @@ run_test
} }
__COMPLEX8__ *hA; __COMPLEX8__ *hA;
__MALLOC( hA, sizeof(*hA) * (size_t)(m * n)); __MALLOC( hA, sizeof(*hA) * (size_t)(m * k));
__COMPLEX8__ *hB; __COMPLEX8__ *hB;
__MALLOC( hB, sizeof(*hB) * (size_t)(k * n)); __MALLOC( hB, sizeof(*hB) * (size_t)(k * n));
__COMPLEX8__ *hC; __COMPLEX8__ *hC;
__MALLOC( hC, sizeof(*hC) * (size_t)(m * k)); __MALLOC( hC, sizeof(*hC) * (size_t)(n * m));
// timer_start(timer, "Prepare matrices"); // timer_start(timer, "Prepare matrices");
prepare_matrices(hA, hB);
// timer_stop(timer); // timer_stop(timer);
//timer_start(timer, "Memcopy"); //timer_start(timer, "Memcopy");
__ASSERT(__PREFIX(Memcpy)(A, hA, sizeof(*A) * (size_t)(m * n), __PREFIX(MemcpyHostToDevice)));
__ASSERT(__PREFIX(Memcpy)(B, hB, sizeof(*B) * (size_t)(k * n), __PREFIX(MemcpyHostToDevice)));
// timer_stop(timer); // timer_stop(timer);
// cudaSetDevice(0);
__HANDLE__ handle;
//timer_start(timer, "Create Handle"); //timer_start(timer, "Create Handle");
//if(rocblas_create_handle(&handle) != rocblas_status_success) return EXIT_FAILURE; //if(rocblas_create_handle(&handle) != rocblas_status_success) return EXIT_FAILURE;
__CREATE_HANDLE(&handle);
//timer_stop(timer); //timer_stop(timer);
prepare_matrices(hA, hB, m, n, k);
for (unsigned r = 0; r < rep; r++) for (unsigned r = 0; r < rep; r++)
{ {
__ASSERT(__PREFIX(Memcpy)(A, hA, sizeof(*A) * (size_t)(m * k), __PREFIX(MemcpyHostToDevice)));
__ASSERT(__PREFIX(Memcpy)(B, hB, sizeof(*B) * (size_t)(k * n), __PREFIX(MemcpyHostToDevice)));
float res_r = 0.f; float res_r = 0.f;
char mes[128]; char mes[128];
sprintf(mes, "dim %zu run %d a" ,dim, r); sprintf(mes, "m %zu n %zu k %zu run %d", m, n, k, r);
timer_start(timer, mes); timer_start(timer, mes);
multiplication multiplication
( (
handle, handle,
A, A,
B, B,
C C,
); m,
res_r += timer_stop(timer); n,
sprintf(mes, "dim %zu run %d b" ,dim, r); k
/*
timer_start(timer, mes);
multiplication
(
handle,
B,
A,
C
); );
res_r += timer_stop(timer); res_r += timer_stop(timer);
*/
res[r] = res_r/1.f; res[r] = res_r/1.f;
} }
printf("dimensions: %zu %zu %zu\t -- ", n, m , k); printf("dimensions: %zu %zu %zu\t -- ", n, m , k);
printf("required size: %f GB\n", printf("required size: %f GB\n",
( (
...@@ -207,12 +201,11 @@ run_test ...@@ -207,12 +201,11 @@ run_test
+ k * m * sizeof(*C) + k * m * sizeof(*C)
)/1.e+9); )/1.e+9);
__ASSERT(__PREFIX(Memcpy)(hC, C, sizeof(*hC) * (size_t)(k * m), __PREFIX(MemcpyDeviceToHost))); //__ASSERT(__PREFIX(Memcpy)(hC, C, sizeof(*hC) * (size_t)(k * m), __PREFIX(MemcpyDeviceToHost)));
//print_result(hC); //print_result(hC, 1 << em, 1 << en, 1 << ek);
// timer_start(timer, "Destroy Handle"); // timer_start(timer, "Destroy Handle");
//if(rocblas_destroy_handle(handle) != rocblas_status_success) return EXIT_FAILURE; //if(rocblas_destroy_handle(handle) != rocblas_status_success) return EXIT_FAILURE;
if(__DESTROY_HANDLE(handle) != __PREFIX(Success)) return EXIT_FAILURE;
// timer_stop(timer); // timer_stop(timer);
__PREFIX(Free)(A); __PREFIX(Free)(A);
...@@ -230,19 +223,39 @@ main ...@@ -230,19 +223,39 @@ main
( (
) )
{ {
int rep = 512; int rep = 10;
int min_dim = 1; size_t m_min = 8; // 13
int max_dim = 14; size_t m_max = 11; // 16
size_t n_min = 11; // 11
size_t n_max = 19; // 19
size_t k_min = 5; // 7
size_t k_max = 11; // 11
float * res; float * res;
__MALLOC(res, sizeof(*res) * (size_t)((max_dim - min_dim) * rep));
for (int i = min_dim; i < max_dim; i++) // cudaSetDevice(0);
__HANDLE__ handle;
__CREATE_HANDLE(&handle);
__MALLOC(res, sizeof(*res) * (size_t)(
(m_max - m_min + 1) *
(n_max - n_min + 1) *
(k_max - k_min + 1) *
rep));
for (int em = m_min; em <= m_max; em++)
{ {
size_t dim = 1 << i; for (int en = n_min; en <= n_max; en++)
int ind = (i - min_dim) * rep; {
run_test(dim, rep, &res[ind]); for (int ek = k_min; ek <= k_max; ek++)
{
run_test(1 << em, 1 << en , 1 << ek, rep, &res[0], handle);
}
}
} }
if(__DESTROY_HANDLE(handle) != __PREFIX(Success)) return EXIT_FAILURE;
exit(0);
// store the results // store the results
/*
FILE * f; FILE * f;
char name[128]; char name[128];
sprintf(name, "runtimes"); sprintf(name, "runtimes");
...@@ -267,5 +280,6 @@ main ...@@ -267,5 +280,6 @@ main
fprintf(f, "\n"); fprintf(f, "\n");
} }
fclose(f); fclose(f);
*/
return 0; return 0;
} }
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment