Commit 1cc7cf31, authored by Henning Fehrmann, committed by Henning Fehrmann
Browse files

add synchronization

parent 9a07a73b
......@@ -22,24 +22,23 @@
#include <math.h>
#include <omp.h>
#define __MALLOC(P, size) P = malloc(size); \
#define __MALLOC(P, size) P = malloc(size); \
if (P == NULL) \
{\
fprintf(stderr, "Allocation failed at line %d in %s\n", __LINE__, __FILE__); \
exit(EXIT_FAILURE); \
}\
size_t m = 10000;
size_t n = 10000;
size_t k = 10000;
void
multiplication
(
__HANDLE__ handle,
const __COMPLEX8__ *A,
const __COMPLEX8__ *B,
__COMPLEX8__ *C
__COMPLEX8__ *C,
size_t m,
size_t n,
size_t k
)
{
__BLAS_OPERATION__ transA = __NO_TRANSFORM__;
......@@ -47,10 +46,6 @@ multiplication
const __COMPLEX8__ alpha = {.x = 1.f, .y = 0.f};
const __COMPLEX8__ beta = {.x = 0.f, .y = 0.f};
int lda = n;
int ldb = n;
int ldc = k;
__CGMEM__
(
handle,
......@@ -61,29 +56,35 @@ multiplication
k,
&alpha,
A,
lda,
m,
B,
ldb,
n,
&beta,
C,
ldc
m
);
int result;
// cublasIcamax(handle,m * n, C, 1, &result);
cudaDeviceSynchronize();
}
void
prepare_matrices
(
__COMPLEX8__ * hA,
__COMPLEX8__ * hB
__COMPLEX8__ * hB,
size_t m,
size_t n,
size_t k
)
{
float fact = 1.f/(float)n/(float)x/(float)y/20.f;
#pragma omp parallel for
for (size_t i = 0; i < n; i++)
for (size_t i = 0; i < m; i++)
{
for (size_t j = 0; j < m; j++)
for (size_t j = 0; j < k; j++)
{
size_t ind = j + m * i;
size_t ind = j + k * i;
hA[ind].x = (float)xorshf96()*fact;
hA[ind].y = (float)xorshf96()*fact;
}
......@@ -104,7 +105,10 @@ prepare_matrices
void
print_result
(
__COMPLEX8__ * hC
__COMPLEX8__ * hC,
size_t m,
size_t n,
size_t k
)
{
printf("-------- %zu %zu\n", m, k);
......@@ -124,23 +128,22 @@ print_result
int
run_test
(
size_t dim,
size_t m,
size_t n,
size_t k,
unsigned rep,
float * res
float * res,
__HANDLE__ handle
)
{
m = dim;
n = dim;
k = dim;
struct runtime * timer;
__MALLOC(timer, sizeof(*timer));
__COMPLEX8__ *A;
__COMPLEX8__ *B;
__COMPLEX8__ *C;
__ASSERT(__PREFIX(Malloc)((void **)&A, sizeof(*A) * (size_t)(m * n)));
__ASSERT(__PREFIX(Malloc)((void **)&A, sizeof(*A) * (size_t)(m * k)));
__ASSERT(__PREFIX(Malloc)((void **)&B, sizeof(*B) * (size_t)(n * k)));
__ASSERT(__PREFIX(Malloc)((void **)&C, sizeof(*C) * (size_t)(m * k)));
__ASSERT(__PREFIX(Malloc)((void **)&C, sizeof(*C) * (size_t)(m * n)));
if (C == NULL)
{
fprintf(stderr, "C not allocated\n");
......@@ -148,57 +151,49 @@ run_test
}
__COMPLEX8__ *hA;
__MALLOC( hA, sizeof(*hA) * (size_t)(m * n));
__MALLOC( hA, sizeof(*hA) * (size_t)(m * k));
__COMPLEX8__ *hB;
__MALLOC( hB, sizeof(*hB) * (size_t)(k * n));
__COMPLEX8__ *hC;
__MALLOC( hC, sizeof(*hC) * (size_t)(m * k));
__MALLOC( hC, sizeof(*hC) * (size_t)(n * m));
// timer_start(timer, "Prepare matrices");
prepare_matrices(hA, hB);
// timer_stop(timer);
//timer_start(timer, "Memcopy");
__ASSERT(__PREFIX(Memcpy)(A, hA, sizeof(*A) * (size_t)(m * n), __PREFIX(MemcpyHostToDevice)));
__ASSERT(__PREFIX(Memcpy)(B, hB, sizeof(*B) * (size_t)(k * n), __PREFIX(MemcpyHostToDevice)));
// timer_stop(timer);
// cudaSetDevice(0);
__HANDLE__ handle;
//timer_start(timer, "Create Handle");
//if(rocblas_create_handle(&handle) != rocblas_status_success) return EXIT_FAILURE;
__CREATE_HANDLE(&handle);
//timer_stop(timer);
prepare_matrices(hA, hB, m, n, k);
for (unsigned r = 0; r < rep; r++)
{
__ASSERT(__PREFIX(Memcpy)(A, hA, sizeof(*A) * (size_t)(m * k), __PREFIX(MemcpyHostToDevice)));
__ASSERT(__PREFIX(Memcpy)(B, hB, sizeof(*B) * (size_t)(k * n), __PREFIX(MemcpyHostToDevice)));
float res_r = 0.f;
char mes[128];
sprintf(mes, "dim %zu run %d a" ,dim, r);
sprintf(mes, "m %zu n %zu k %zu run %d", m, n, k, r);
timer_start(timer, mes);
multiplication
(
handle,
A,
B,
C
);
res_r += timer_stop(timer);
sprintf(mes, "dim %zu run %d b" ,dim, r);
/*
timer_start(timer, mes);
multiplication
(
handle,
B,
A,
C
C,
m,
n,
k
);
res_r += timer_stop(timer);
*/
res[r] = res_r/1.f;
}
printf("dimensions: %zu %zu %zu\t -- ", n, m , k);
printf("required size: %f GB\n",
(
......@@ -207,12 +202,11 @@ run_test
+ k * m * sizeof(*C)
)/1.e+9);
__ASSERT(__PREFIX(Memcpy)(hC, C, sizeof(*hC) * (size_t)(k * m), __PREFIX(MemcpyDeviceToHost)));
//print_result(hC);
//__ASSERT(__PREFIX(Memcpy)(hC, C, sizeof(*hC) * (size_t)(k * m), __PREFIX(MemcpyDeviceToHost)));
//print_result(hC, 1 << em, 1 << en, 1 << ek);
// timer_start(timer, "Destroy Handle");
//if(rocblas_destroy_handle(handle) != rocblas_status_success) return EXIT_FAILURE;
if(__DESTROY_HANDLE(handle) != __PREFIX(Success)) return EXIT_FAILURE;
// timer_stop(timer);
__PREFIX(Free)(A);
......@@ -230,19 +224,39 @@ main
(
)
{
int rep = 512;
int min_dim = 1;
int max_dim = 14;
int rep = 10;
size_t m_min = 8; // 13
size_t m_max = 11; // 16
size_t n_min = 11; // 11
size_t n_max = 19; // 19
size_t k_min = 5; // 7
size_t k_max = 11; // 11
float * res;
__MALLOC(res, sizeof(*res) * (size_t)((max_dim - min_dim) * rep));
for (int i = min_dim; i < max_dim; i++)
// cudaSetDevice(0);
__HANDLE__ handle;
__CREATE_HANDLE(&handle);
__MALLOC(res, sizeof(*res) * (size_t)(
(m_max - m_min + 1) *
(n_max - n_min + 1) *
(k_max - k_min + 1) *
rep));
for (int em = m_min; em <= m_max; em++)
{
size_t dim = 1 << i;
int ind = (i - min_dim) * rep;
run_test(dim, rep, &res[ind]);
for (int en = n_min; en <= n_max; en++)
{
for (int ek = k_min; ek <= k_max; ek++)
{
run_test(1 << em, 1 << en , 1 << ek, rep, &res[0], handle);
}
}
}
if(__DESTROY_HANDLE(handle) != __PREFIX(Success)) return EXIT_FAILURE;
exit(0);
// store the results
/*
FILE * f;
char name[128];
sprintf(name, "runtimes");
......@@ -267,5 +281,6 @@ main
fprintf(f, "\n");
}
fclose(f);
*/
return 0;
}
......@@ -37,12 +37,28 @@
exit(EXIT_FAILURE); \
}\
/*
 * prepare_data - fill a host buffer of complex values with scaled
 * pseudo-random numbers.
 *
 * hA: buffer to fill; it is written at indices 0 .. s-1.
 * s:  number of elements to write.
 *
 * For every element, the .x member and then the .y member each receive a
 * separate xorshf96() draw, converted to float and multiplied by the scale
 * factor twice. The scale factor is computed once, before the loop, from
 * the names x and y, which are defined outside this block.
 *
 * NOTE(review): xorshf96() is called from inside an OpenMP parallel loop;
 * whether it is safe to call concurrently cannot be determined from this
 * block alone - confirm against its definition.
 */
void
prepare_data
(
    __COMPLEX8__ * hA,
    size_t s
)
{
    const float scale = 1.f/(float)x/(float)y/20.f;

#pragma omp parallel for
    for (size_t idx = 0; idx < s; idx++)
    {
        __COMPLEX8__ * const elem = hA + idx;

        /* Draw order matters for reproducibility: .x first, then .y. */
        const float first = (float)xorshf96() * scale * scale;
        const float second = (float)xorshf96() * scale * scale;

        elem->x = first;
        elem->y = second;
    }
}
int
run_test
(
size_t T,
size_t N,
unsigned rep,
int nofftws
unsigned rep
)
{
......@@ -51,19 +67,13 @@ run_test
// Create HIP device buffer
__COMPLEX8__ *A;
__COMPLEX8__ *hB;
__MALLOC(hB, sizeof(*hB) * N);
__MALLOC(hB, sizeof(*hB) * N * T);
__ASSERT(__PREFIX(Malloc)((void**)&A, sizeof(*A) * N));
__ASSERT(__PREFIX(Malloc)((void**)&A, sizeof(*A) * N * T));
// Initialize data
__COMPLEX8__ * hA;
__MALLOC(hA, sizeof(*hA) * N);
float fact = 1.f/(float)x/(float)y/20.f;
for (size_t i = 0; i < N; i++)
{
hA[i].x = (float)xorshf96()*fact * fact;
hA[i].y = (float)xorshf96()*fact * fact;
}
__MALLOC(hA, sizeof(*hA) * N * T);
// Copy data to device
__ASSERT(__PREFIX(Memcpy)(A, hA, sizeof(*hA) * N, __PREFIX(MemcpyHostToDevice)));
......@@ -72,8 +82,8 @@ run_test
size_t length = N;
char mes[128];
sprintf(mes, "dim: %zu\tPlan generation." ,N);
timer_start(timer, mes);
//sprintf(mes, "dim: %zu\tPlan generation." ,N);
//timer_start(timer, mes);
#ifdef ROC
rocfft_plan_create
......@@ -88,34 +98,45 @@ run_test
NULL
);
#elif CUDA
cufftPlan1d( &plan, N, CUFFT_C2C, 1);
int batch = T; // --- Number of batched executions
int rank = 1; // --- 1D FFTs
int na[] = { N }; // --- Size of the Fourier transform
int istride = 1, ostride = 1; // --- Distance between two successive input/output elements
int idist = N, odist = N; // --- Distance between batches
int inembed[] = { 0 }; // --- Input size with pitch (ignored for 1D transforms)
int onembed[] = { 0 }; // --- Output size with pitch (ignored for 1D transforms)
cufftPlanMany
(
&plan,
rank,
na,
inembed,
istride,
idist,
onembed,
ostride,
odist,
CUFFT_C2C,
batch
);
#endif
timer_stop(timer);
for (int r = 0 ; r < 2; r++)
prepare_data(hA, N * T);
for (int r = 0 ; r < 10; r++)
{
// Execute plan
sprintf(mes, "dim: %zu\tExecute plan round %d." ,N , r);
sprintf(mes, "T = %zu n = %zu\t round %d." ,T, N , r);
timer_start(timer, mes);
#ifdef ROC
rocfft_execute(plan, (void**) &A, NULL, NULL);
#elif CUDA
cufftExecC2C(plan, A, A, CUFFT_FORWARD);
#endif
__PREFIX(DeviceSynchronize)();
timer_stop(timer);
// Wait for execution to finish
sprintf(mes, "dim: %zu\tSynchronize round %d." ,N , r);
timer_start(timer, mes);
#ifdef ROC
hipDeviceSynchronize();
#endif
timer_stop(timer);
}
// Destroy plan
sprintf(mes, "dim: %zu\tDestroy plan." ,N);
timer_start(timer, mes);
__DESTROY_PLAN(plan);
timer_stop(timer);
__ASSERT(__PREFIX(Memcpy)(hB, A, sizeof(*A) * N, __PREFIX(MemcpyDeviceToHost)));
......@@ -132,22 +153,26 @@ main
)
{
int rep = 1;
int min_dim = 8;
int max_dim = 28;
int nofftws = 128;
int t_min = 8;
int t_max = 11;
int n_min = 11;
int n_max = 19;
float * res = malloc(sizeof(*res) * (size_t)((max_dim - min_dim) * rep));
float * res = malloc(sizeof(*res) * (size_t)((n_max - n_min + 1) * rep));
if (res == NULL)
{
fprintf(stderr, "Couldn't allocate res\n");
exit(1);
}
for (int i = min_dim; i < max_dim; i++)
for (int et = t_min; et <= t_max; et ++)
{
size_t dim = 1 << i;
//int ind = (i - min_dim) * rep;
run_test(dim, rep, nofftws);
int t = 1 << et;
for (int en = n_min; en <= n_max; en++)
{
size_t n = 1 << en;
run_test(t, n, rep);
}
}
free(res);
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment