Commit 582cb735 authored by Henning Fehrmann, committed by Henning Fehrmann
Browse files

index swapping

parent 2a2041d4
...@@ -22,24 +22,23 @@ ...@@ -22,24 +22,23 @@
#include <math.h> #include <math.h>
#include <omp.h> #include <omp.h>
#define __MALLOC(P, size) P = malloc(size); \ #define __MALLOC(P, size) P = malloc(size); \
if (P == NULL) \ if (P == NULL) \
{\ {\
fprintf(stderr, "Allocation failed at line %d in %s\n", __LINE__, __FILE__); \ fprintf(stderr, "Allocation failed at line %d in %s\n", __LINE__, __FILE__); \
exit(EXIT_FAILURE); \ exit(EXIT_FAILURE); \
}\ }\
size_t m = 10000;
size_t n = 10000;
size_t k = 10000;
void void
multiplication multiplication
( (
__HANDLE__ handle, __HANDLE__ handle,
const __COMPLEX8__ *A, const __COMPLEX8__ *A,
const __COMPLEX8__ *B, const __COMPLEX8__ *B,
__COMPLEX8__ *C __COMPLEX8__ *C,
size_t m,
size_t n,
size_t k
) )
{ {
__BLAS_OPERATION__ transA = __NO_TRANSFORM__; __BLAS_OPERATION__ transA = __NO_TRANSFORM__;
...@@ -47,10 +46,6 @@ multiplication ...@@ -47,10 +46,6 @@ multiplication
const __COMPLEX8__ alpha = {.x = 1.f, .y = 0.f}; const __COMPLEX8__ alpha = {.x = 1.f, .y = 0.f};
const __COMPLEX8__ beta = {.x = 0.f, .y = 0.f}; const __COMPLEX8__ beta = {.x = 0.f, .y = 0.f};
int lda = n;
int ldb = n;
int ldc = k;
__CGMEM__ __CGMEM__
( (
handle, handle,
...@@ -61,29 +56,34 @@ multiplication ...@@ -61,29 +56,34 @@ multiplication
k, k,
&alpha, &alpha,
A, A,
lda, m,
B, B,
ldb, n,
&beta, &beta,
C, C,
ldc m
); );
// cublasIcamax(handle,m * n, C, 1, &result);
__PREFIX(DeviceSynchronize)();
} }
void void
prepare_matrices prepare_matrices
( (
__COMPLEX8__ * hA, __COMPLEX8__ * hA,
__COMPLEX8__ * hB __COMPLEX8__ * hB,
size_t m,
size_t n,
size_t k
) )
{ {
float fact = 1.f/(float)n/(float)x/(float)y/20.f; float fact = 1.f/(float)n/(float)x/(float)y/20.f;
#pragma omp parallel for #pragma omp parallel for
for (size_t i = 0; i < n; i++) for (size_t i = 0; i < m; i++)
{ {
for (size_t j = 0; j < m; j++) for (size_t j = 0; j < k; j++)
{ {
size_t ind = j + m * i; size_t ind = j + k * i;
hA[ind].x = (float)xorshf96()*fact; hA[ind].x = (float)xorshf96()*fact;
hA[ind].y = (float)xorshf96()*fact; hA[ind].y = (float)xorshf96()*fact;
} }
...@@ -104,7 +104,10 @@ prepare_matrices ...@@ -104,7 +104,10 @@ prepare_matrices
void void
print_result print_result
( (
__COMPLEX8__ * hC __COMPLEX8__ * hC,
size_t m,
size_t n,
size_t k
) )
{ {
printf("-------- %zu %zu\n", m, k); printf("-------- %zu %zu\n", m, k);
...@@ -124,23 +127,22 @@ print_result ...@@ -124,23 +127,22 @@ print_result
int int
run_test run_test
( (
size_t dim, size_t m,
size_t n,
size_t k,
unsigned rep, unsigned rep,
float * res float * res,
__HANDLE__ handle
) )
{ {
m = dim;
n = dim;
k = dim;
struct runtime * timer; struct runtime * timer;
__MALLOC(timer, sizeof(*timer)); __MALLOC(timer, sizeof(*timer));
__COMPLEX8__ *A; __COMPLEX8__ *A;
__COMPLEX8__ *B; __COMPLEX8__ *B;
__COMPLEX8__ *C; __COMPLEX8__ *C;
__ASSERT(__PREFIX(Malloc)((void **)&A, sizeof(*A) * (size_t)(m * n))); __ASSERT(__PREFIX(Malloc)((void **)&A, sizeof(*A) * (size_t)(m * k)));
__ASSERT(__PREFIX(Malloc)((void **)&B, sizeof(*B) * (size_t)(n * k))); __ASSERT(__PREFIX(Malloc)((void **)&B, sizeof(*B) * (size_t)(n * k)));
__ASSERT(__PREFIX(Malloc)((void **)&C, sizeof(*C) * (size_t)(m * k))); __ASSERT(__PREFIX(Malloc)((void **)&C, sizeof(*C) * (size_t)(m * n)));
if (C == NULL) if (C == NULL)
{ {
fprintf(stderr, "C not allocated\n"); fprintf(stderr, "C not allocated\n");
...@@ -148,57 +150,49 @@ run_test ...@@ -148,57 +150,49 @@ run_test
} }
__COMPLEX8__ *hA; __COMPLEX8__ *hA;
__MALLOC( hA, sizeof(*hA) * (size_t)(m * n)); __MALLOC( hA, sizeof(*hA) * (size_t)(m * k));
__COMPLEX8__ *hB; __COMPLEX8__ *hB;
__MALLOC( hB, sizeof(*hB) * (size_t)(k * n)); __MALLOC( hB, sizeof(*hB) * (size_t)(k * n));
__COMPLEX8__ *hC; __COMPLEX8__ *hC;
__MALLOC( hC, sizeof(*hC) * (size_t)(m * k)); __MALLOC( hC, sizeof(*hC) * (size_t)(n * m));
// timer_start(timer, "Prepare matrices"); // timer_start(timer, "Prepare matrices");
prepare_matrices(hA, hB);
// timer_stop(timer); // timer_stop(timer);
//timer_start(timer, "Memcopy"); //timer_start(timer, "Memcopy");
__ASSERT(__PREFIX(Memcpy)(A, hA, sizeof(*A) * (size_t)(m * n), __PREFIX(MemcpyHostToDevice)));
__ASSERT(__PREFIX(Memcpy)(B, hB, sizeof(*B) * (size_t)(k * n), __PREFIX(MemcpyHostToDevice)));
// timer_stop(timer); // timer_stop(timer);
// cudaSetDevice(0);
__HANDLE__ handle;
//timer_start(timer, "Create Handle"); //timer_start(timer, "Create Handle");
//if(rocblas_create_handle(&handle) != rocblas_status_success) return EXIT_FAILURE; //if(rocblas_create_handle(&handle) != rocblas_status_success) return EXIT_FAILURE;
__CREATE_HANDLE(&handle);
//timer_stop(timer); //timer_stop(timer);
prepare_matrices(hA, hB, m, n, k);
for (unsigned r = 0; r < rep; r++) for (unsigned r = 0; r < rep; r++)
{ {
__ASSERT(__PREFIX(Memcpy)(A, hA, sizeof(*A) * (size_t)(m * k), __PREFIX(MemcpyHostToDevice)));
__ASSERT(__PREFIX(Memcpy)(B, hB, sizeof(*B) * (size_t)(k * n), __PREFIX(MemcpyHostToDevice)));
float res_r = 0.f; float res_r = 0.f;
char mes[128]; char mes[128];
sprintf(mes, "dim %zu run %d a" ,dim, r); sprintf(mes, "m %zu n %zu k %zu run %d", m, n, k, r);
timer_start(timer, mes); timer_start(timer, mes);
multiplication multiplication
( (
handle, handle,
A, A,
B, B,
C C,
); m,
res_r += timer_stop(timer); n,
sprintf(mes, "dim %zu run %d b" ,dim, r); k
/*
timer_start(timer, mes);
multiplication
(
handle,
B,
A,
C
); );
res_r += timer_stop(timer); res_r += timer_stop(timer);
*/
res[r] = res_r/1.f; res[r] = res_r/1.f;
} }
printf("dimensions: %zu %zu %zu\t -- ", n, m , k); printf("dimensions: %zu %zu %zu\t -- ", n, m , k);
printf("required size: %f GB\n", printf("required size: %f GB\n",
( (
...@@ -207,12 +201,11 @@ run_test ...@@ -207,12 +201,11 @@ run_test
+ k * m * sizeof(*C) + k * m * sizeof(*C)
)/1.e+9); )/1.e+9);
__ASSERT(__PREFIX(Memcpy)(hC, C, sizeof(*hC) * (size_t)(k * m), __PREFIX(MemcpyDeviceToHost))); //__ASSERT(__PREFIX(Memcpy)(hC, C, sizeof(*hC) * (size_t)(k * m), __PREFIX(MemcpyDeviceToHost)));
//print_result(hC); //print_result(hC, 1 << em, 1 << en, 1 << ek);
// timer_start(timer, "Destroy Handle"); // timer_start(timer, "Destroy Handle");
//if(rocblas_destroy_handle(handle) != rocblas_status_success) return EXIT_FAILURE; //if(rocblas_destroy_handle(handle) != rocblas_status_success) return EXIT_FAILURE;
if(__DESTROY_HANDLE(handle) != __PREFIX(Success)) return EXIT_FAILURE;
// timer_stop(timer); // timer_stop(timer);
__PREFIX(Free)(A); __PREFIX(Free)(A);
...@@ -230,19 +223,39 @@ main ...@@ -230,19 +223,39 @@ main
( (
) )
{ {
int rep = 512; int rep = 10;
int min_dim = 1; size_t m_min = 8; // 13
int max_dim = 14; size_t m_max = 11; // 16
size_t n_min = 11; // 11
size_t n_max = 19; // 19
size_t k_min = 5; // 7
size_t k_max = 11; // 11
float * res; float * res;
__MALLOC(res, sizeof(*res) * (size_t)((max_dim - min_dim) * rep));
for (int i = min_dim; i < max_dim; i++) // cudaSetDevice(0);
__HANDLE__ handle;
__CREATE_HANDLE(&handle);
__MALLOC(res, sizeof(*res) * (size_t)(
(m_max - m_min + 1) *
(n_max - n_min + 1) *
(k_max - k_min + 1) *
rep));
for (int em = m_min; em <= m_max; em++)
{ {
size_t dim = 1 << i; for (int en = n_min; en <= n_max; en++)
int ind = (i - min_dim) * rep; {
run_test(dim, rep, &res[ind]); for (int ek = k_min; ek <= k_max; ek++)
{
run_test(1 << em, 1 << en , 1 << ek, rep, &res[0], handle);
}
}
} }
if(__DESTROY_HANDLE(handle) != __PREFIX(Success)) return EXIT_FAILURE;
exit(0);
// store the results // store the results
/*
FILE * f; FILE * f;
char name[128]; char name[128];
sprintf(name, "runtimes"); sprintf(name, "runtimes");
...@@ -267,5 +280,6 @@ main ...@@ -267,5 +280,6 @@ main
fprintf(f, "\n"); fprintf(f, "\n");
} }
fclose(f); fclose(f);
*/
return 0; return 0;
} }
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment