diff --git a/example/main.cpp b/example/main.cpp index 2492a256bacc13c7fea65554af36af3dfaf79061..2dd88ea8a22852bfff6fe45a0d81588cd0810a39 100644 --- a/example/main.cpp +++ b/example/main.cpp @@ -66,23 +66,23 @@ #define eps_avg 10.0 -#define MAX( _a, _b) ((_a)>(_b)?(_a) : (_b)) +#define MAX( _a, _b) ((_a)>(_b)?(_a) : (_b)) typedef enum { - clFFT_OUT_OF_PLACE, - clFFT_IN_PLACE, + clFFT_OUT_OF_PLACE, + clFFT_IN_PLACE, }clFFT_TestType; typedef struct { - double real; - double imag; + double real; + double imag; }clFFT_ComplexDouble; typedef struct { - double *real; - double *imag; + double *real; + double *imag; }clFFT_SplitComplexDouble; cl_device_id device_id; @@ -100,9 +100,9 @@ double subtractTimes( uint64_t endTime, uint64_t startTime ) mach_timebase_info_data_t info; kern_return_t err = mach_timebase_info( &info ); - //Convert the timebase into seconds + //Convert the timebase into seconds if( err == 0 ) - conversion = 1e-9 * (double) info.numer / (double) info.denom; + conversion = 1e-9 * (double) info.numer / (double) info.denom; } return conversion * (double) difference; @@ -111,583 +111,583 @@ double subtractTimes( uint64_t endTime, uint64_t startTime ) #ifdef __APPLE__ void computeReferenceF(clFFT_SplitComplex *out, clFFT_Dim3 n, - unsigned int batchSize, clFFT_Dimension dim, clFFT_Direction dir) + unsigned int batchSize, clFFT_Dimension dim, clFFT_Direction dir) { - FFTSetup plan_vdsp; - DSPSplitComplex out_vdsp; - FFTDirection dir_vdsp = dir == clFFT_Forward ? FFT_FORWARD : FFT_INVERSE; - - unsigned int i, j, k; - unsigned int stride; - unsigned int log2Nx = (unsigned int) log2(n.x); - unsigned int log2Ny = (unsigned int) log2(n.y); - unsigned int log2Nz = (unsigned int) log2(n.z); - unsigned int log2N; - - log2N = log2Nx; - log2N = log2N > log2Ny ? log2N : log2Ny; - log2N = log2N > log2Nz ? log2N : log2Nz; - - plan_vdsp = vDSP_create_fftsetup(log2N, 2); - - switch(dim) - { - case clFFT_1D: - - for(i = 0; i < batchSize; i++) - { - stride = i * n.x; - out_vdsp.realp = out->real + stride; - out_vdsp.imagp = out->imag + stride; - - vDSP_fft_zip(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp); - } - break; - - case clFFT_2D: - - for(i = 0; i < batchSize; i++) - { - for(j = 0; j < n.y; j++) - { - stride = j * n.x + i * n.x * n.y; - out_vdsp.realp = out->real + stride; - out_vdsp.imagp = out->imag + stride; - - vDSP_fft_zip(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp); - } - } - for(i = 0; i < batchSize; i++) - { - for(j = 0; j < n.x; j++) - { - stride = j + i * n.x * n.y; - out_vdsp.realp = out->real + stride; - out_vdsp.imagp = out->imag + stride; - - vDSP_fft_zip(plan_vdsp, &out_vdsp, n.x, log2Ny, dir_vdsp); - } - } - break; - - case clFFT_3D: - - for(i = 0; i < batchSize; i++) - { - for(j = 0; j < n.z; j++) - { - for(k = 0; k < n.y; k++) - { - stride = k * n.x + j * n.x * n.y + i * n.x * n.y * n.z; - out_vdsp.realp = out->real + stride; - out_vdsp.imagp = out->imag + stride; - - vDSP_fft_zip(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp); - } - } - } - for(i = 0; i < batchSize; i++) - { - for(j = 0; j < n.z; j++) - { - for(k = 0; k < n.x; k++) - { - stride = k + j * n.x * n.y + i * n.x * n.y * n.z; - out_vdsp.realp = out->real + stride; - out_vdsp.imagp = out->imag + stride; - - vDSP_fft_zip(plan_vdsp, &out_vdsp, n.x, log2Ny, dir_vdsp); - } - } - } - for(i = 0; i < batchSize; i++) - { - for(j = 0; j < n.y; j++) - { - for(k = 0; k < n.x; k++) - { - stride = k + j * n.x + i * n.x * n.y * n.z; - out_vdsp.realp = out->real + stride; - out_vdsp.imagp = out->imag + stride; - - vDSP_fft_zip(plan_vdsp, &out_vdsp, n.x*n.y, log2Nz, dir_vdsp); - } - } - } - break; - } - - vDSP_destroy_fftsetup(plan_vdsp); + FFTSetup plan_vdsp; + DSPSplitComplex out_vdsp; + FFTDirection dir_vdsp = dir == clFFT_Forward ? FFT_FORWARD : FFT_INVERSE; + + unsigned int i, j, k; + unsigned int stride; + unsigned int log2Nx = (unsigned int) log2(n.x); + unsigned int log2Ny = (unsigned int) log2(n.y); + unsigned int log2Nz = (unsigned int) log2(n.z); + unsigned int log2N; + + log2N = log2Nx; + log2N = log2N > log2Ny ? log2N : log2Ny; + log2N = log2N > log2Nz ? log2N : log2Nz; + + plan_vdsp = vDSP_create_fftsetup(log2N, 2); + + switch(dim) + { + case clFFT_1D: + + for(i = 0; i < batchSize; i++) + { + stride = i * n.x; + out_vdsp.realp = out->real + stride; + out_vdsp.imagp = out->imag + stride; + + vDSP_fft_zip(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp); + } + break; + + case clFFT_2D: + + for(i = 0; i < batchSize; i++) + { + for(j = 0; j < n.y; j++) + { + stride = j * n.x + i * n.x * n.y; + out_vdsp.realp = out->real + stride; + out_vdsp.imagp = out->imag + stride; + + vDSP_fft_zip(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp); + } + } + for(i = 0; i < batchSize; i++) + { + for(j = 0; j < n.x; j++) + { + stride = j + i * n.x * n.y; + out_vdsp.realp = out->real + stride; + out_vdsp.imagp = out->imag + stride; + + vDSP_fft_zip(plan_vdsp, &out_vdsp, n.x, log2Ny, dir_vdsp); + } + } + break; + + case clFFT_3D: + + for(i = 0; i < batchSize; i++) + { + for(j = 0; j < n.z; j++) + { + for(k = 0; k < n.y; k++) + { + stride = k * n.x + j * n.x * n.y + i * n.x * n.y * n.z; + out_vdsp.realp = out->real + stride; + out_vdsp.imagp = out->imag + stride; + + vDSP_fft_zip(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp); + } + } + } + for(i = 0; i < batchSize; i++) + { + for(j = 0; j < n.z; j++) + { + for(k = 0; k < n.x; k++) + { + stride = k + j * n.x * n.y + i * n.x * n.y * n.z; + out_vdsp.realp = out->real + stride; + out_vdsp.imagp = out->imag + stride; + + vDSP_fft_zip(plan_vdsp, &out_vdsp, n.x, log2Ny, dir_vdsp); + } + } + } + for(i = 0; i < batchSize; i++) + { + for(j = 0; j < n.y; j++) + { + for(k = 0; k < n.x; k++) + { + stride = k + j * n.x + i * n.x * n.y * n.z; + out_vdsp.realp = out->real + stride; + out_vdsp.imagp = out->imag + stride; + + vDSP_fft_zip(plan_vdsp, &out_vdsp, n.x*n.y, log2Nz, dir_vdsp); + } + } + } + break; + } + + vDSP_destroy_fftsetup(plan_vdsp); } #endif #ifdef __APPLE__ void computeReferenceD(clFFT_SplitComplexDouble *out, clFFT_Dim3 n, - unsigned int batchSize, clFFT_Dimension dim, clFFT_Direction dir) + unsigned int batchSize, clFFT_Dimension dim, clFFT_Direction dir) { - FFTSetupD plan_vdsp; - DSPDoubleSplitComplex out_vdsp; - FFTDirection dir_vdsp = dir == clFFT_Forward ? FFT_FORWARD : FFT_INVERSE; - - unsigned int i, j, k; - unsigned int stride; - unsigned int log2Nx = (int) log2(n.x); - unsigned int log2Ny = (int) log2(n.y); - unsigned int log2Nz = (int) log2(n.z); - unsigned int log2N; - - log2N = log2Nx; - log2N = log2N > log2Ny ? log2N : log2Ny; - log2N = log2N > log2Nz ? log2N : log2Nz; - - plan_vdsp = vDSP_create_fftsetupD(log2N, 2); - - switch(dim) - { - case clFFT_1D: - - for(i = 0; i < batchSize; i++) - { - stride = i * n.x; - out_vdsp.realp = out->real + stride; - out_vdsp.imagp = out->imag + stride; - - vDSP_fft_zipD(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp); - } - break; - - case clFFT_2D: - - for(i = 0; i < batchSize; i++) - { - for(j = 0; j < n.y; j++) - { - stride = j * n.x + i * n.x * n.y; - out_vdsp.realp = out->real + stride; - out_vdsp.imagp = out->imag + stride; - - vDSP_fft_zipD(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp); - } - } - for(i = 0; i < batchSize; i++) - { - for(j = 0; j < n.x; j++) - { - stride = j + i * n.x * n.y; - out_vdsp.realp = out->real + stride; - out_vdsp.imagp = out->imag + stride; - - vDSP_fft_zipD(plan_vdsp, &out_vdsp, n.x, log2Ny, dir_vdsp); - } - } - break; - - case clFFT_3D: - - for(i = 0; i < batchSize; i++) - { - for(j = 0; j < n.z; j++) - { - for(k = 0; k < n.y; k++) - { - stride = k * n.x + j * n.x * n.y + i * n.x * n.y * n.z; - out_vdsp.realp = out->real + stride; - out_vdsp.imagp = out->imag + stride; - - vDSP_fft_zipD(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp); - } - } - } - for(i = 0; i < batchSize; i++) - { - for(j = 0; j < n.z; j++) - { - for(k = 0; k < n.x; k++) - { - stride = k + j * n.x * n.y + i * n.x * n.y * n.z; - out_vdsp.realp = out->real + stride; - out_vdsp.imagp = out->imag + stride; - - vDSP_fft_zipD(plan_vdsp, &out_vdsp, n.x, log2Ny, dir_vdsp); - } - } - } - for(i = 0; i < batchSize; i++) - { - for(j = 0; j < n.y; j++) - { - for(k = 0; k < n.x; k++) - { - stride = k + j * n.x + i * n.x * n.y * n.z; - out_vdsp.realp = out->real + stride; - out_vdsp.imagp = out->imag + stride; - - vDSP_fft_zipD(plan_vdsp, &out_vdsp, n.x*n.y, log2Nz, dir_vdsp); - } - } - } - break; - } - - vDSP_destroy_fftsetupD(plan_vdsp); + FFTSetupD plan_vdsp; + DSPDoubleSplitComplex out_vdsp; + FFTDirection dir_vdsp = dir == clFFT_Forward ? FFT_FORWARD : FFT_INVERSE; + + unsigned int i, j, k; + unsigned int stride; + unsigned int log2Nx = (int) log2(n.x); + unsigned int log2Ny = (int) log2(n.y); + unsigned int log2Nz = (int) log2(n.z); + unsigned int log2N; + + log2N = log2Nx; + log2N = log2N > log2Ny ? log2N : log2Ny; + log2N = log2N > log2Nz ? log2N : log2Nz; + + plan_vdsp = vDSP_create_fftsetupD(log2N, 2); + + switch(dim) + { + case clFFT_1D: + + for(i = 0; i < batchSize; i++) + { + stride = i * n.x; + out_vdsp.realp = out->real + stride; + out_vdsp.imagp = out->imag + stride; + + vDSP_fft_zipD(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp); + } + break; + + case clFFT_2D: + + for(i = 0; i < batchSize; i++) + { + for(j = 0; j < n.y; j++) + { + stride = j * n.x + i * n.x * n.y; + out_vdsp.realp = out->real + stride; + out_vdsp.imagp = out->imag + stride; + + vDSP_fft_zipD(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp); + } + } + for(i = 0; i < batchSize; i++) + { + for(j = 0; j < n.x; j++) + { + stride = j + i * n.x * n.y; + out_vdsp.realp = out->real + stride; + out_vdsp.imagp = out->imag + stride; + + vDSP_fft_zipD(plan_vdsp, &out_vdsp, n.x, log2Ny, dir_vdsp); + } + } + break; + + case clFFT_3D: + + for(i = 0; i < batchSize; i++) + { + for(j = 0; j < n.z; j++) + { + for(k = 0; k < n.y; k++) + { + stride = k * n.x + j * n.x * n.y + i * n.x * n.y * n.z; + out_vdsp.realp = out->real + stride; + out_vdsp.imagp = out->imag + stride; + + vDSP_fft_zipD(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp); + } + } + } + for(i = 0; i < batchSize; i++) + { + for(j = 0; j < n.z; j++) + { + for(k = 0; k < n.x; k++) + { + stride = k + j * n.x * n.y + i * n.x * n.y * n.z; + out_vdsp.realp = out->real + stride; + out_vdsp.imagp = out->imag + stride; + + vDSP_fft_zipD(plan_vdsp, &out_vdsp, n.x, log2Ny, dir_vdsp); + } + } + } + for(i = 0; i < batchSize; i++) + { + for(j = 0; j < n.y; j++) + { + for(k = 0; k < n.x; k++) + { + stride = k + j * n.x + i * n.x * n.y * n.z; + out_vdsp.realp = out->real + stride; + out_vdsp.imagp = out->imag + stride; + + vDSP_fft_zipD(plan_vdsp, &out_vdsp, n.x*n.y, log2Nz, dir_vdsp); + } + } + } + break; + } + + vDSP_destroy_fftsetupD(plan_vdsp); } #endif double complexNormSq(clFFT_ComplexDouble a) { - return (a.real * a.real + a.imag * a.imag); + return (a.real * a.real + a.imag * a.imag); } double computeL2Error(clFFT_SplitComplex *data, clFFT_SplitComplexDouble *data_ref, int n, int batchSize, double *max_diff, double *min_diff) { - int i, j; - double avg_norm = 0.0; - *max_diff = 0.0; - *min_diff = 0x1.0p1000; - - for(j = 0; j < batchSize; j++) - { - double norm_ref = 0.0; - double norm = 0.0; - for(i = 0; i < n; i++) - { - int index = j * n + i; - clFFT_ComplexDouble diff = (clFFT_ComplexDouble) { data_ref->real[index] - data->real[index], data_ref->imag[index] - data->imag[index] }; - double norm_tmp = complexNormSq(diff); - norm += norm_tmp; - norm_ref += (data_ref->real[index] * data_ref->real[index] + data_ref->imag[index] * data_ref->imag[index]); - } - double curr_norm = sqrt( norm / norm_ref ) / FLT_EPSILON; - avg_norm += curr_norm; - *max_diff = *max_diff < curr_norm ? curr_norm : *max_diff; - *min_diff = *min_diff > curr_norm ? curr_norm : *min_diff; - } - - return avg_norm / batchSize; + int i, j; + double avg_norm = 0.0; + *max_diff = 0.0; + *min_diff = 0x1.0p1000; + + for(j = 0; j < batchSize; j++) + { + double norm_ref = 0.0; + double norm = 0.0; + for(i = 0; i < n; i++) + { + int index = j * n + i; + clFFT_ComplexDouble diff = (clFFT_ComplexDouble) { data_ref->real[index] - data->real[index], data_ref->imag[index] - data->imag[index] }; + double norm_tmp = complexNormSq(diff); + norm += norm_tmp; + norm_ref += (data_ref->real[index] * data_ref->real[index] + data_ref->imag[index] * data_ref->imag[index]); + } + double curr_norm = sqrt( norm / norm_ref ) / FLT_EPSILON; + avg_norm += curr_norm; + *max_diff = *max_diff < curr_norm ? curr_norm : *max_diff; + *min_diff = *min_diff > curr_norm ? curr_norm : *min_diff; + } + + return avg_norm / batchSize; } void convertInterleavedToSplit(clFFT_SplitComplex *result_split, clFFT_Complex *data_cl, int length) { - int i; - for(i = 0; i < length; i++) { - result_split->real[i] = data_cl[i].real; - result_split->imag[i] = data_cl[i].imag; - } + int i; + for(i = 0; i < length; i++) { + result_split->real[i] = data_cl[i].real; + result_split->imag[i] = data_cl[i].imag; + } } int runTest(clFFT_Dim3 n, int batchSize, clFFT_Direction dir, clFFT_Dimension dim, - clFFT_DataFormat dataFormat, int numIter, clFFT_TestType testType) + clFFT_DataFormat dataFormat, int numIter, clFFT_TestType testType) { - cl_int err = CL_SUCCESS; - int iter; + cl_int err = CL_SUCCESS; + int iter; #ifdef __APPLE__ - double t; - uint64_t t0, t1; + double t; + uint64_t t0, t1; - int mx = log2(n.x); - int my = log2(n.y); - int mz = log2(n.z); + int mx = log2(n.x); + int my = log2(n.y); + int mz = log2(n.z); - double gflops = 5e-9 * ((double)mx + (double)my + (double)mz) * (double)n.x * (double)n.y * (double)n.z * (double)batchSize * (double)numIter; + double gflops = 5e-9 * ((double)mx + (double)my + (double)mz) * (double)n.x * (double)n.y * (double)n.z * (double)batchSize * (double)numIter; #endif - int length = n.x * n.y * n.z * batchSize; - - clFFT_SplitComplex data_i_split = (clFFT_SplitComplex) { NULL, NULL }; - clFFT_SplitComplex data_cl_split = (clFFT_SplitComplex) { NULL, NULL }; - clFFT_Complex *data_i = NULL; - clFFT_Complex *data_cl = NULL; - clFFT_SplitComplexDouble data_iref = (clFFT_SplitComplexDouble) { NULL, NULL }; - clFFT_SplitComplexDouble data_oref = (clFFT_SplitComplexDouble) { NULL, NULL }; - - clFFT_Plan plan = NULL; - cl_mem data_in = NULL; - cl_mem data_out = NULL; - cl_mem data_in_real = NULL; - cl_mem data_in_imag = NULL; - cl_mem data_out_real = NULL; - cl_mem data_out_imag = NULL; - - if(dataFormat == clFFT_SplitComplexFormat) { - data_i_split.real = (float *) malloc(sizeof(float) * length); - data_i_split.imag = (float *) malloc(sizeof(float) * length); - data_cl_split.real = (float *) malloc(sizeof(float) * length); - data_cl_split.imag = (float *) malloc(sizeof(float) * length); - if(!data_i_split.real || !data_i_split.imag || !data_cl_split.real || !data_cl_split.imag) - { - err = -1; - log_error("Out-of-Resources\n"); - goto cleanup; - } - } - else { - data_i = (clFFT_Complex *) malloc(sizeof(clFFT_Complex)*length); - data_cl = (clFFT_Complex *) malloc(sizeof(clFFT_Complex)*length); - if(!data_i || !data_cl) - { - err = -2; - log_error("Out-of-Resouces\n"); - goto cleanup; - } - } - - data_iref.real = (double *) malloc(sizeof(double) * length); - data_iref.imag = (double *) malloc(sizeof(double) * length); - data_oref.real = (double *) malloc(sizeof(double) * length); - data_oref.imag = (double *) malloc(sizeof(double) * length); - if(!data_iref.real || !data_iref.imag || !data_oref.real || !data_oref.imag) - { - err = -3; - log_error("Out-of-Resources\n"); - goto cleanup; - } - - int i; - if(dataFormat == clFFT_SplitComplexFormat) { - for(i = 0; i < length; i++) - { - data_i_split.real[i] = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f; - data_i_split.imag[i] = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f; - data_cl_split.real[i] = 0.0f; - data_cl_split.imag[i] = 0.0f; - data_iref.real[i] = data_i_split.real[i]; - data_iref.imag[i] = data_i_split.imag[i]; - data_oref.real[i] = data_iref.real[i]; - data_oref.imag[i] = data_iref.imag[i]; - } - } - else { - for(i = 0; i < length; i++) - { - data_i[i].real = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f; - data_i[i].imag = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f; - data_cl[i].real = 0.0f; - data_cl[i].imag = 0.0f; - data_iref.real[i] = data_i[i].real; - data_iref.imag[i] = data_i[i].imag; - data_oref.real[i] = data_iref.real[i]; - data_oref.imag[i] = data_iref.imag[i]; - } - } - - plan = clFFT_CreatePlan( context, n, dim, dataFormat, &err ); - if(!plan || err) - { - log_error("clFFT_CreatePlan failed\n"); - goto cleanup; - } - - //clFFT_DumpPlan(plan, stdout); - - if(dataFormat == clFFT_SplitComplexFormat) - { - data_in_real = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_i_split.real, &err); - if(!data_in_real || err) - { - log_error("clCreateBuffer failed\n"); - goto cleanup; - } - - data_in_imag = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_i_split.imag, &err); - if(!data_in_imag || err) - { - log_error("clCreateBuffer failed\n"); - goto cleanup; - } - - if(testType == clFFT_OUT_OF_PLACE) - { - data_out_real = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_cl_split.real, &err); - if(!data_out_real || err) - { - log_error("clCreateBuffer failed\n"); - goto cleanup; - } - - data_out_imag = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_cl_split.imag, &err); - if(!data_out_imag || err) - { - log_error("clCreateBuffer failed\n"); - goto cleanup; - } - } - else - { - data_out_real = data_in_real; - data_out_imag = data_in_imag; - } - } - else - { - data_in = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float)*2, data_i, &err); - if(!data_in) - { - log_error("clCreateBuffer failed\n"); - goto cleanup; - } - if(testType == clFFT_OUT_OF_PLACE) - { - data_out = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float)*2, data_cl, &err); - if(!data_out) - { - log_error("clCreateBuffer failed\n"); - goto cleanup; - } - } - else - data_out = data_in; - } - - - err = CL_SUCCESS; + int length = n.x * n.y * n.z * batchSize; + + clFFT_SplitComplex data_i_split = (clFFT_SplitComplex) { NULL, NULL }; + clFFT_SplitComplex data_cl_split = (clFFT_SplitComplex) { NULL, NULL }; + clFFT_Complex *data_i = NULL; + clFFT_Complex *data_cl = NULL; + clFFT_SplitComplexDouble data_iref = (clFFT_SplitComplexDouble) { NULL, NULL }; + clFFT_SplitComplexDouble data_oref = (clFFT_SplitComplexDouble) { NULL, NULL }; + + clFFT_Plan plan = NULL; + cl_mem data_in = NULL; + cl_mem data_out = NULL; + cl_mem data_in_real = NULL; + cl_mem data_in_imag = NULL; + cl_mem data_out_real = NULL; + cl_mem data_out_imag = NULL; + + if(dataFormat == clFFT_SplitComplexFormat) { + data_i_split.real = (float *) malloc(sizeof(float) * length); + data_i_split.imag = (float *) malloc(sizeof(float) * length); + data_cl_split.real = (float *) malloc(sizeof(float) * length); + data_cl_split.imag = (float *) malloc(sizeof(float) * length); + if(!data_i_split.real || !data_i_split.imag || !data_cl_split.real || !data_cl_split.imag) + { + err = -1; + log_error("Out-of-Resources\n"); + goto cleanup; + } + } + else { + data_i = (clFFT_Complex *) malloc(sizeof(clFFT_Complex)*length); + data_cl = (clFFT_Complex *) malloc(sizeof(clFFT_Complex)*length); + if(!data_i || !data_cl) + { + err = -2; + log_error("Out-of-Resouces\n"); + goto cleanup; + } + } + + data_iref.real = (double *) malloc(sizeof(double) * length); + data_iref.imag = (double *) malloc(sizeof(double) * length); + data_oref.real = (double *) malloc(sizeof(double) * length); + data_oref.imag = (double *) malloc(sizeof(double) * length); + if(!data_iref.real || !data_iref.imag || !data_oref.real || !data_oref.imag) + { + err = -3; + log_error("Out-of-Resources\n"); + goto cleanup; + } + + int i; + if(dataFormat == clFFT_SplitComplexFormat) { + for(i = 0; i < length; i++) + { + data_i_split.real[i] = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f; + data_i_split.imag[i] = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f; + data_cl_split.real[i] = 0.0f; + data_cl_split.imag[i] = 0.0f; + data_iref.real[i] = data_i_split.real[i]; + data_iref.imag[i] = data_i_split.imag[i]; + data_oref.real[i] = data_iref.real[i]; + data_oref.imag[i] = data_iref.imag[i]; + } + } + else { + for(i = 0; i < length; i++) + { + data_i[i].real = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f; + data_i[i].imag = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f; + data_cl[i].real = 0.0f; + data_cl[i].imag = 0.0f; + data_iref.real[i] = data_i[i].real; + data_iref.imag[i] = data_i[i].imag; + data_oref.real[i] = data_iref.real[i]; + data_oref.imag[i] = data_iref.imag[i]; + } + } + + plan = clFFT_CreatePlan( context, n, dim, dataFormat, &err ); + if(!plan || err) + { + log_error("clFFT_CreatePlan failed\n"); + goto cleanup; + } + + //clFFT_DumpPlan(plan, stdout); + + if(dataFormat == clFFT_SplitComplexFormat) + { + data_in_real = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_i_split.real, &err); + if(!data_in_real || err) + { + log_error("clCreateBuffer failed\n"); + goto cleanup; + } + + data_in_imag = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_i_split.imag, &err); + if(!data_in_imag || err) + { + log_error("clCreateBuffer failed\n"); + goto cleanup; + } + + if(testType == clFFT_OUT_OF_PLACE) + { + data_out_real = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_cl_split.real, &err); + if(!data_out_real || err) + { + log_error("clCreateBuffer failed\n"); + goto cleanup; + } + + data_out_imag = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_cl_split.imag, &err); + if(!data_out_imag || err) + { + log_error("clCreateBuffer failed\n"); + goto cleanup; + } + } + else + { + data_out_real = data_in_real; + data_out_imag = data_in_imag; + } + } + else + { + data_in = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float)*2, data_i, &err); + if(!data_in) + { + log_error("clCreateBuffer failed\n"); + goto cleanup; + } + if(testType == clFFT_OUT_OF_PLACE) + { + data_out = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float)*2, data_cl, &err); + if(!data_out) + { + log_error("clCreateBuffer failed\n"); + goto cleanup; + } + } + else + data_out = data_in; + } + + + err = CL_SUCCESS; #ifdef __APPLE__ - t0 = mach_absolute_time(); + t0 = mach_absolute_time(); #endif - if(dataFormat == clFFT_SplitComplexFormat) - { - for(iter = 0; iter < numIter; iter++) - err |= clFFT_ExecutePlannar(queue, plan, batchSize, dir, data_in_real, data_in_imag, data_out_real, data_out_imag, 0, NULL, NULL); - } - else - { - for(iter = 0; iter < numIter; iter++) - err |= clFFT_ExecuteInterleaved(queue, plan, batchSize, dir, data_in, data_out, 0, NULL, NULL); - } - - err |= clFinish(queue); - - if(err) - { - log_error("clFFT_Execute\n"); - goto cleanup; - } + if(dataFormat == clFFT_SplitComplexFormat) + { + for(iter = 0; iter < numIter; iter++) + err |= clFFT_ExecutePlannar(queue, plan, batchSize, dir, data_in_real, data_in_imag, data_out_real, data_out_imag, 0, NULL, NULL); + } + else + { + for(iter = 0; iter < numIter; iter++) + err |= clFFT_ExecuteInterleaved(queue, plan, batchSize, dir, data_in, data_out, 0, NULL, NULL); + } + + err |= clFinish(queue); + + if(err) + { + log_error("clFFT_Execute\n"); + goto cleanup; + } #ifdef __APPLE__ - t1 = mach_absolute_time(); - t = subtractTimes(t1, t0); - char temp[100]; - sprintf(temp, "GFlops achieved for n = (%d, %d, %d), batchsize = %d", n.x, n.y, n.z, batchSize); - log_perf(gflops / (float) t, 1, "GFlops/s", "%s", temp); + t1 = mach_absolute_time(); + t = subtractTimes(t1, t0); + char temp[100]; + sprintf(temp, "GFlops achieved for n = (%d, %d, %d), batchsize = %d", n.x, n.y, n.z, batchSize); + log_perf(gflops / (float) t, 1, "GFlops/s", "%s", temp); #endif - if(dataFormat == clFFT_SplitComplexFormat) - { - err |= clEnqueueReadBuffer(queue, data_out_real, CL_TRUE, 0, length*sizeof(float), data_cl_split.real, 0, NULL, NULL); - err |= clEnqueueReadBuffer(queue, data_out_imag, CL_TRUE, 0, length*sizeof(float), data_cl_split.imag, 0, NULL, NULL); - } - else - { - err |= clEnqueueReadBuffer(queue, data_out, CL_TRUE, 0, length*sizeof(float)*2, data_cl, 0, NULL, NULL); - } - - if(err) - { - log_error("clEnqueueReadBuffer failed\n"); + if(dataFormat == clFFT_SplitComplexFormat) + { + err |= clEnqueueReadBuffer(queue, data_out_real, CL_TRUE, 0, length*sizeof(float), data_cl_split.real, 0, NULL, NULL); + err |= clEnqueueReadBuffer(queue, data_out_imag, CL_TRUE, 0, length*sizeof(float), data_cl_split.imag, 0, NULL, NULL); + } + else + { + err |= clEnqueueReadBuffer(queue, data_out, CL_TRUE, 0, length*sizeof(float)*2, data_cl, 0, NULL, NULL); + } + + if(err) + { + log_error("clEnqueueReadBuffer failed\n"); goto cleanup; - } + } #ifdef __APPLE__ - computeReferenceD(&data_oref, n, batchSize, dim, dir); - - double diff_avg, diff_max, diff_min; - if(dataFormat == clFFT_SplitComplexFormat) { - diff_avg = computeL2Error(&data_cl_split, &data_oref, n.x*n.y*n.z, batchSize, &diff_max, &diff_min); - if(diff_avg > eps_avg) - log_error("Test failed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min); - else - log_info("Test passed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min); - } - else { - clFFT_SplitComplex result_split; - result_split.real = (float *) malloc(length*sizeof(float)); - result_split.imag = (float *) malloc(length*sizeof(float)); - convertInterleavedToSplit(&result_split, data_cl, length); - diff_avg = computeL2Error(&result_split, &data_oref, n.x*n.y*n.z, batchSize, &diff_max, &diff_min); - - if(diff_avg > eps_avg) - log_error("Test failed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min); - else - log_info("Test passed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min); - free(result_split.real); - free(result_split.imag); - } + computeReferenceD(&data_oref, n, batchSize, dim, dir); + + double diff_avg, diff_max, diff_min; + if(dataFormat == clFFT_SplitComplexFormat) { + diff_avg = computeL2Error(&data_cl_split, &data_oref, n.x*n.y*n.z, batchSize, &diff_max, &diff_min); + if(diff_avg > eps_avg) + log_error("Test failed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min); + else + log_info("Test passed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min); + } + else { + clFFT_SplitComplex result_split; + result_split.real = (float *) malloc(length*sizeof(float)); + result_split.imag = (float *) malloc(length*sizeof(float)); + convertInterleavedToSplit(&result_split, data_cl, length); + diff_avg = computeL2Error(&result_split, &data_oref, n.x*n.y*n.z, batchSize, &diff_max, &diff_min); + + if(diff_avg > eps_avg) + log_error("Test failed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min); + else + log_info("Test passed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min); + free(result_split.real); + free(result_split.imag); + } #endif cleanup: - clFFT_DestroyPlan(plan); - if(dataFormat == clFFT_SplitComplexFormat) - { - if(data_i_split.real) - free(data_i_split.real); - if(data_i_split.imag) - free(data_i_split.imag); - if(data_cl_split.real) - free(data_cl_split.real); - if(data_cl_split.imag) - free(data_cl_split.imag); - - if(data_in_real) - clReleaseMemObject(data_in_real); - if(data_in_imag) - clReleaseMemObject(data_in_imag); - if(data_out_real && testType == clFFT_OUT_OF_PLACE) - clReleaseMemObject(data_out_real); - if(data_out_imag && clFFT_OUT_OF_PLACE) - clReleaseMemObject(data_out_imag); - } - else - { - if(data_i) - free(data_i); - if(data_cl) - free(data_cl); - - if(data_in) - clReleaseMemObject(data_in); - if(data_out && testType == clFFT_OUT_OF_PLACE) - clReleaseMemObject(data_out); - } - - if(data_iref.real) - free(data_iref.real); - if(data_iref.imag) - free(data_iref.imag); - if(data_oref.real) - free(data_oref.real); - if(data_oref.imag) - free(data_oref.imag); - - return err; + clFFT_DestroyPlan(plan); + if(dataFormat == clFFT_SplitComplexFormat) + { + if(data_i_split.real) + free(data_i_split.real); + if(data_i_split.imag) + free(data_i_split.imag); + if(data_cl_split.real) + free(data_cl_split.real); + if(data_cl_split.imag) + free(data_cl_split.imag); + + if(data_in_real) + clReleaseMemObject(data_in_real); + if(data_in_imag) + clReleaseMemObject(data_in_imag); + if(data_out_real && testType == clFFT_OUT_OF_PLACE) + clReleaseMemObject(data_out_real); + if(data_out_imag && clFFT_OUT_OF_PLACE) + clReleaseMemObject(data_out_imag); + } + else + { + if(data_i) + free(data_i); + if(data_cl) + free(data_cl); + + if(data_in) + clReleaseMemObject(data_in); + if(data_out && testType == clFFT_OUT_OF_PLACE) + clReleaseMemObject(data_out); + } + + if(data_iref.real) + free(data_iref.real); + if(data_iref.imag) + free(data_iref.imag); + if(data_oref.real) + free(data_oref.real); + if(data_oref.imag) + free(data_oref.imag); + + return err; } bool ifLineCommented(const char *line) { - const char *Line = line; - while(*Line != '\0') - if((*Line == '/') && (*(Line + 1) == '/')) - return true; - else - Line++; - return false; + const char *Line = line; + while(*Line != '\0') + if((*Line == '/') && (*(Line + 1) == '/')) + return true; + else + Line++; + return false; } cl_device_type getGlobalDeviceType() { - char *force_cpu = getenv( "CL_DEVICE_TYPE" ); - if( force_cpu != NULL ) - { - if( strcmp( force_cpu, "gpu" ) == 0 || strcmp( force_cpu, "CL_DEVICE_TYPE_GPU" ) == 0 ) - return CL_DEVICE_TYPE_GPU; - else if( strcmp( force_cpu, "cpu" ) == 0 || strcmp( force_cpu, "CL_DEVICE_TYPE_CPU" ) == 0 ) - return CL_DEVICE_TYPE_CPU; - else if( strcmp( force_cpu, "accelerator" ) == 0 || strcmp( force_cpu, "CL_DEVICE_TYPE_ACCELERATOR" ) == 0 ) - return CL_DEVICE_TYPE_ACCELERATOR; - else if( strcmp( force_cpu, "CL_DEVICE_TYPE_DEFAULT" ) == 0 ) - return CL_DEVICE_TYPE_DEFAULT; - } - // default - return CL_DEVICE_TYPE_GPU; + char *force_cpu = getenv( "CL_DEVICE_TYPE" ); + if( force_cpu != NULL ) + { + if( strcmp( force_cpu, "gpu" ) == 0 || strcmp( force_cpu, "CL_DEVICE_TYPE_GPU" ) == 0 ) + return CL_DEVICE_TYPE_GPU; + else if( strcmp( force_cpu, "cpu" ) == 0 || strcmp( force_cpu, "CL_DEVICE_TYPE_CPU" ) == 0 ) + return CL_DEVICE_TYPE_CPU; + else if( strcmp( force_cpu, "accelerator" ) == 0 || strcmp( force_cpu, "CL_DEVICE_TYPE_ACCELERATOR" ) == 0 ) + return CL_DEVICE_TYPE_ACCELERATOR; + else if( strcmp( force_cpu, "CL_DEVICE_TYPE_DEFAULT" ) == 0 ) + return CL_DEVICE_TYPE_DEFAULT; + } + // default + return CL_DEVICE_TYPE_GPU; } void @@ -699,231 +699,231 @@ notify_callback(const char *errinfo, const void *private_info, size_t cb, void * int checkMemRequirements(clFFT_Dim3 n, int batchSize, clFFT_TestType testType, cl_ulong gMemSize) { - cl_ulong memReq = (testType == clFFT_OUT_OF_PLACE) ? 3 : 2; - memReq *= n.x*n.y*n.z*sizeof(clFFT_Complex)*batchSize; - memReq = memReq/1024/1024; - if(memReq >= gMemSize) - return -1; - return 0; + cl_ulong memReq = (testType == clFFT_OUT_OF_PLACE) ? 3 : 2; + memReq *= n.x*n.y*n.z*sizeof(clFFT_Complex)*batchSize; + memReq = memReq/1024/1024; + if(memReq >= gMemSize) + return -1; + return 0; } int main (int argc, char * const argv[]) { - test_start(); - - cl_ulong gMemSize; - clFFT_Direction dir = clFFT_Forward; - int numIter = 1; - clFFT_Dim3 n = { 1024, 1, 1 }; - int batchSize = 1; - clFFT_DataFormat dataFormat = clFFT_SplitComplexFormat; - clFFT_Dimension dim = clFFT_1D; - clFFT_TestType testType = clFFT_OUT_OF_PLACE; - cl_device_id device_ids[16]; - - FILE *paramFile; - - cl_int err; - unsigned int num_devices; - - cl_device_type device_type = getGlobalDeviceType(); - if(device_type != CL_DEVICE_TYPE_GPU) - { - log_info("Test only supported on DEVICE_TYPE_GPU\n"); - test_finish(); - exit(0); - } - - err = clGetDeviceIDs(NULL, device_type, sizeof(device_ids), device_ids, &num_devices); - if(err) - { - printf("ERROR: clGetDeviceIDs failed with error: %d\n", err); - test_finish(); - return -1; - } - - device_id = NULL; - unsigned int i = 0; - - if (argc == 3) { - cl_bool available; - err = clGetDeviceInfo(device_ids[atoi(argv[2])], CL_DEVICE_AVAILABLE, sizeof(cl_bool), &available, NULL); - if(err) { - printf("ERROR: Cannot check device availability of device # %d\n", atoi(argv[2])); + test_start(); + + cl_ulong gMemSize; + clFFT_Direction dir = clFFT_Forward; + int numIter = 1; + clFFT_Dim3 n = { 1024, 1, 1 }; + int batchSize = 1; + clFFT_DataFormat dataFormat = clFFT_SplitComplexFormat; + clFFT_Dimension dim = clFFT_1D; + clFFT_TestType testType = clFFT_OUT_OF_PLACE; + cl_device_id device_ids[16]; + + FILE *paramFile; + + cl_int err; + unsigned int num_devices; + + cl_device_type device_type = getGlobalDeviceType(); + if(device_type != CL_DEVICE_TYPE_GPU) + { + log_info("Test only supported on DEVICE_TYPE_GPU\n"); + test_finish(); + exit(0); + } + + err = clGetDeviceIDs(NULL, device_type, sizeof(device_ids), device_ids, &num_devices); + if(err) + { + printf("ERROR: clGetDeviceIDs failed with error: %d\n", err); + test_finish(); + return -1; + } + + device_id = NULL; + unsigned int i = 0; + + if (argc == 3) { + cl_bool available; + err = clGetDeviceInfo(device_ids[atoi(argv[2])], CL_DEVICE_AVAILABLE, sizeof(cl_bool), &available, NULL); + if(err) { + printf("ERROR: Cannot check device availability of device # %d\n", atoi(argv[2])); return -1; - } - - if(available) { - device_id = device_ids[atoi(argv[2])]; - } - else { - char name[200]; - err = clGetDeviceInfo(device_ids[i], CL_DEVICE_NAME, sizeof(name), name, NULL); - if(err == CL_SUCCESS) { - printf("ERROR: Device %s not available for compute\n", name); - } - else { - printf("ERROR: Device # %d not available for compute\n", atoi(argv[2])); - } - return -1; + } + + if(available) { + device_id = device_ids[atoi(argv[2])]; + } + else { + char name[200]; + err = clGetDeviceInfo(device_ids[i], CL_DEVICE_NAME, sizeof(name), name, NULL); + if(err == CL_SUCCESS) { + printf("ERROR: Device %s not available for compute\n", name); } - } - else { - for(i = 0; i < num_devices; i++) - { - cl_bool available; - err = clGetDeviceInfo(device_ids[i], CL_DEVICE_AVAILABLE, sizeof(cl_bool), &available, NULL); - if(err) { - printf("ERROR: Cannot check device availability of device # %d. Continuing with next available device...\n", i); - continue; - } - - if(available) { - device_id = device_ids[i]; - break; - } - else { - char name[200]; - err = clGetDeviceInfo(device_ids[i], CL_DEVICE_NAME, sizeof(name), name, NULL); - if(err == CL_SUCCESS) { - printf("INFO: Device %s not available for compute\n", name); - } - else { - printf("INFO: Device # %d not available for compute\n", i); - } - } - } - } - - if(!device_id) { - log_error("None of the devices available for compute ... aborting test\n"); - test_finish(); - return -1; - } - else { - char name[200]; - err = clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(name), name, NULL); - if(err == CL_SUCCESS) { - printf("INFO: Using device %s...\n", name); - } - else { - printf("INFO: Using device # %d...\n", i); - } - } - - context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); - if(!context || err) - { - log_error("clCreateContext failed\n"); - test_finish(); - return -1; - } + else { + printf("ERROR: Device # %d not available for compute\n", atoi(argv[2])); + } + return -1; + } + } + else { + for(i = 0; i < num_devices; i++) + { + cl_bool available; + err = clGetDeviceInfo(device_ids[i], CL_DEVICE_AVAILABLE, sizeof(cl_bool), &available, NULL); + if(err) { + printf("ERROR: Cannot check device availability of device # %d. Continuing with next available device...\n", i); + continue; + } + + if(available) { + device_id = device_ids[i]; + break; + } + else { + char name[200]; + err = clGetDeviceInfo(device_ids[i], CL_DEVICE_NAME, sizeof(name), name, NULL); + if(err == CL_SUCCESS) { + printf("INFO: Device %s not available for compute\n", name); + } + else { + printf("INFO: Device # %d not available for compute\n", i); + } + } + } + } + + if(!device_id) { + log_error("None of the devices available for compute ... aborting test\n"); + test_finish(); + return -1; + } + else { + char name[200]; + err = clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(name), name, NULL); + if(err == CL_SUCCESS) { + printf("INFO: Using device %s...\n", name); + } + else { + printf("INFO: Using device # %d...\n", i); + } + } + + context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); + if(!context || err) + { + log_error("clCreateContext failed\n"); + test_finish(); + return -1; + } queue = clCreateCommandQueue(context, device_id, 0, &err); if(!queue || err) - { + { log_error("clCreateCommandQueue() failed.\n"); - clReleaseContext(context); + clReleaseContext(context); test_finish(); return -1; } - err = clGetDeviceInfo(device_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &gMemSize, NULL); - if(err) - { - log_error("Failed to get global mem size\n"); - clReleaseContext(context); - clReleaseCommandQueue(queue); - test_finish(); - return -2; - } - - gMemSize /= (1024*1024); - - char delim[] = " \n"; - char tmpStr[100]; - char line[200]; - char *param, *val; - int total_errors = 0; - if(argc == 1) { - log_error("Need file name with list of parameters to run the test\n"); - test_finish(); - return -1; - } - - if(argc >= 2) { // arguments are supplied in a file with arguments for a single run are all on the same line - paramFile = fopen(argv[1], "r"); - if(!paramFile) { - log_error("Cannot open the parameter file\n"); - clReleaseContext(context); - clReleaseCommandQueue(queue); - test_finish(); - return -3; - } - while(fgets(line, 199, paramFile)) { - if(!strcmp(line, "") || !strcmp(line, "\n") || ifLineCommented(line)) - continue; - param = strtok(line, delim); - while(param) { - val = strtok(NULL, delim); - if(!strcmp(param, "-n")) { - sscanf(val, "%d", &n.x); - val = strtok(NULL, delim); - sscanf(val, "%d", &n.y); - val = strtok(NULL, delim); - sscanf(val, "%d", &n.z); - } - else if(!strcmp(param, "-batchsize")) - sscanf(val, "%d", &batchSize); - else if(!strcmp(param, "-dir")) { - sscanf(val, "%s", tmpStr); - if(!strcmp(tmpStr, "forward")) - dir = clFFT_Forward; - else if(!strcmp(tmpStr, "inverse")) - dir = clFFT_Inverse; - } - else if(!strcmp(param, "-dim")) { - sscanf(val, "%s", tmpStr); - if(!strcmp(tmpStr, "1D")) - dim = clFFT_1D; - else if(!strcmp(tmpStr, "2D")) - dim = clFFT_2D; - else if(!strcmp(tmpStr, "3D")) - dim = clFFT_3D; - } - else if(!strcmp(param, "-format")) { - sscanf(val, "%s", tmpStr); - if(!strcmp(tmpStr, "plannar")) - dataFormat = clFFT_SplitComplexFormat; - else if(!strcmp(tmpStr, "interleaved")) - dataFormat = clFFT_InterleavedComplexFormat; - } - else if(!strcmp(param, "-numiter")) - sscanf(val, "%d", &numIter); - else if(!strcmp(param, "-testtype")) { - sscanf(val, "%s", tmpStr); - if(!strcmp(tmpStr, "out-of-place")) - testType = clFFT_OUT_OF_PLACE; - else if(!strcmp(tmpStr, "in-place")) - testType = clFFT_IN_PLACE; - } - param = strtok(NULL, delim); - } - - if(checkMemRequirements(n, batchSize, testType, gMemSize)) { - log_info("This test cannot run because memory requirements canot be met by the available device\n"); - continue; - } - - err = runTest(n, batchSize, dir, dim, dataFormat, numIter, testType); - if (err) - total_errors++; - } - } - - clReleaseContext(context); - clReleaseCommandQueue(queue); - - test_finish(); - return total_errors; + err = clGetDeviceInfo(device_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &gMemSize, NULL); + if(err) + { + log_error("Failed to get global mem size\n"); + clReleaseContext(context); + clReleaseCommandQueue(queue); + test_finish(); + return -2; + } + + gMemSize /= (1024*1024); + + char delim[] = " \n"; + char tmpStr[100]; + char line[200]; + char *param, *val; + int total_errors = 0; + if(argc == 1) { + log_error("Need file name with list of parameters to run the test\n"); + test_finish(); + return -1; + } + + if(argc >= 2) { // arguments are supplied in a file with arguments for a single run are all on the same line + paramFile = fopen(argv[1], "r"); + if(!paramFile) { + log_error("Cannot open the parameter file\n"); + clReleaseContext(context); + clReleaseCommandQueue(queue); + test_finish(); + return -3; + } + while(fgets(line, 199, paramFile)) { + if(!strcmp(line, "") || !strcmp(line, "\n") || ifLineCommented(line)) + continue; + param = strtok(line, delim); + while(param) { + val = strtok(NULL, delim); + if(!strcmp(param, "-n")) { + sscanf(val, "%d", &n.x); + val = strtok(NULL, delim); + sscanf(val, "%d", &n.y); + val = strtok(NULL, delim); + sscanf(val, "%d", &n.z); + } + else if(!strcmp(param, "-batchsize")) + sscanf(val, "%d", &batchSize); + else if(!strcmp(param, "-dir")) { + sscanf(val, "%s", tmpStr); + if(!strcmp(tmpStr, "forward")) + dir = clFFT_Forward; + else if(!strcmp(tmpStr, "inverse")) + dir = clFFT_Inverse; + } + else if(!strcmp(param, "-dim")) { + sscanf(val, "%s", tmpStr); + if(!strcmp(tmpStr, "1D")) + dim = clFFT_1D; + else if(!strcmp(tmpStr, "2D")) + dim = clFFT_2D; + else if(!strcmp(tmpStr, "3D")) + dim = clFFT_3D; + } + else if(!strcmp(param, "-format")) { + sscanf(val, "%s", tmpStr); + if(!strcmp(tmpStr, "plannar")) + dataFormat = clFFT_SplitComplexFormat; + else if(!strcmp(tmpStr, "interleaved")) + dataFormat = clFFT_InterleavedComplexFormat; + } + else if(!strcmp(param, "-numiter")) + sscanf(val, "%d", &numIter); + else if(!strcmp(param, "-testtype")) { + sscanf(val, "%s", tmpStr); + if(!strcmp(tmpStr, "out-of-place")) + testType = clFFT_OUT_OF_PLACE; + else if(!strcmp(tmpStr, "in-place")) + testType = clFFT_IN_PLACE; + } + param = strtok(NULL, delim); + } + + if(checkMemRequirements(n, batchSize, testType, gMemSize)) { + log_info("This test cannot run because memory requirements canot be met by the available device\n"); + continue; + } + + err = runTest(n, batchSize, dir, dim, dataFormat, numIter, testType); + if (err) + total_errors++; + } + } + + clReleaseContext(context); + clReleaseCommandQueue(queue); + + test_finish(); + return total_errors; } diff --git a/include/clFFT.h b/include/clFFT.h index 3b3e12134d9d2b90e0ddd6f24c74eec8a01350f0..5f59869850369f7925cdc7bfb37b2dd51529609e 100644 --- a/include/clFFT.h +++ b/include/clFFT.h @@ -63,44 +63,44 @@ extern "C" { // XForm type typedef enum { - clFFT_Forward = -1, - clFFT_Inverse = 1 + clFFT_Forward = -1, + clFFT_Inverse = 1 }clFFT_Direction; // XForm dimension typedef enum { - clFFT_1D = 0, - clFFT_2D = 1, - clFFT_3D = 3 + clFFT_1D = 0, + clFFT_2D = 1, + clFFT_3D = 3 }clFFT_Dimension; // XForm Data type typedef enum { - clFFT_SplitComplexFormat = 0, - clFFT_InterleavedComplexFormat = 1 + clFFT_SplitComplexFormat = 0, + clFFT_InterleavedComplexFormat = 1 }clFFT_DataFormat; typedef struct { - unsigned int x; - unsigned int y; - unsigned int z; + unsigned int x; + unsigned int y; + unsigned int z; }clFFT_Dim3; typedef struct { - float *real; - float *imag; + float *real; + float *imag; } clFFT_SplitComplex; typedef struct { - float real; - float imag; + float real; + float imag; }clFFT_Complex; typedef void* clFFT_Plan; @@ -110,19 +110,19 @@ clFFT_Plan clFFT_CreatePlan( cl_context context, clFFT_Dim3 n, clFFT_Dimension d void clFFT_DestroyPlan( clFFT_Plan plan ); cl_int clFFT_ExecuteInterleaved( cl_command_queue queue, clFFT_Plan plan, cl_int batchSize, clFFT_Direction dir, - cl_mem data_in, cl_mem data_out, - cl_int num_events, cl_event *event_list, cl_event *event ); + cl_mem data_in, cl_mem data_out, + cl_int num_events, cl_event *event_list, cl_event *event ); cl_int clFFT_ExecutePlannar( cl_command_queue queue, clFFT_Plan plan, cl_int batchSize, clFFT_Direction dir, - cl_mem data_in_real, cl_mem data_in_imag, cl_mem data_out_real, cl_mem data_out_imag, - cl_int num_events, cl_event *event_list, cl_event *event ); + cl_mem data_in_real, cl_mem data_in_imag, cl_mem data_out_real, cl_mem data_out_imag, + cl_int num_events, cl_event *event_list, cl_event *event ); cl_int clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array, - size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir); + size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir); cl_int clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real, cl_mem array_imag, - size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir); + size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir); void clFFT_DumpPlan( clFFT_Plan plan, FILE *file); diff --git a/src/fft_base_kernels.h b/src/fft_base_kernels.h index 186b545f346c0ad1c8edb40832e87e48192837d1..fe70c8b86a0fe59ab51bddab94e530f5f4334650 100644 --- a/src/fft_base_kernels.h +++ b/src/fft_base_kernels.h @@ -55,222 +55,222 @@ using namespace std; static string baseKernels = string( "#ifndef M_PI\n" - "#define M_PI 0x1.921fb54442d18p+1\n" - "#endif\n" - "#define complexMul(a,b) ((float2)(mad(-(a).y, (b).y, (a).x * (b).x), mad((a).y, (b).x, (a).x * (b).y)))\n" - "#define conj(a) ((float2)((a).x, -(a).y))\n" - "#define conjTransp(a) ((float2)(-(a).y, (a).x))\n" - "\n" - "#define fftKernel2(a,dir) \\\n" - "{ \\\n" - " float2 c = (a)[0]; \\\n" - " (a)[0] = c + (a)[1]; \\\n" - " (a)[1] = c - (a)[1]; \\\n" - "}\n" - "\n" - "#define fftKernel2S(d1,d2,dir) \\\n" - "{ \\\n" - " float2 c = (d1); \\\n" - " (d1) = c + (d2); \\\n" - " (d2) = c - (d2); \\\n" - "}\n" - "\n" - "#define fftKernel4(a,dir) \\\n" - "{ \\\n" - " fftKernel2S((a)[0], (a)[2], dir); \\\n" - " fftKernel2S((a)[1], (a)[3], dir); \\\n" - " fftKernel2S((a)[0], (a)[1], dir); \\\n" - " (a)[3] = (float2)(dir)*(conjTransp((a)[3])); \\\n" - " fftKernel2S((a)[2], (a)[3], dir); \\\n" - " float2 c = (a)[1]; \\\n" - " (a)[1] = (a)[2]; \\\n" - " (a)[2] = c; \\\n" - "}\n" - "\n" - "#define fftKernel4s(a0,a1,a2,a3,dir) \\\n" - "{ \\\n" - " fftKernel2S((a0), (a2), dir); \\\n" - " fftKernel2S((a1), (a3), dir); \\\n" - " fftKernel2S((a0), (a1), dir); \\\n" - " (a3) = (float2)(dir)*(conjTransp((a3))); \\\n" - " fftKernel2S((a2), (a3), dir); \\\n" - " float2 c = (a1); \\\n" - " (a1) = (a2); \\\n" - " (a2) = c; \\\n" - "}\n" - "\n" - "#define bitreverse8(a) \\\n" - "{ \\\n" - " float2 c; \\\n" - " c = (a)[1]; \\\n" - " (a)[1] = (a)[4]; \\\n" - " (a)[4] = c; \\\n" - " c = (a)[3]; \\\n" - " (a)[3] = (a)[6]; \\\n" - " (a)[6] = c; \\\n" - "}\n" - "\n" - "#define fftKernel8(a,dir) \\\n" - "{ \\\n" - " const float2 w1 = (float2)(0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f); \\\n" - " const float2 w3 = (float2)(-0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f); \\\n" - " float2 c; \\\n" - " fftKernel2S((a)[0], (a)[4], dir); \\\n" - " fftKernel2S((a)[1], (a)[5], dir); \\\n" - " fftKernel2S((a)[2], (a)[6], dir); \\\n" - " fftKernel2S((a)[3], (a)[7], dir); \\\n" - " (a)[5] = complexMul(w1, (a)[5]); \\\n" - " (a)[6] = (float2)(dir)*(conjTransp((a)[6])); \\\n" - " (a)[7] = complexMul(w3, (a)[7]); \\\n" - " fftKernel2S((a)[0], (a)[2], dir); \\\n" - " fftKernel2S((a)[1], (a)[3], dir); \\\n" - " fftKernel2S((a)[4], (a)[6], dir); \\\n" - " fftKernel2S((a)[5], (a)[7], dir); \\\n" - " (a)[3] = (float2)(dir)*(conjTransp((a)[3])); \\\n" - " (a)[7] = (float2)(dir)*(conjTransp((a)[7])); \\\n" - " fftKernel2S((a)[0], (a)[1], dir); \\\n" - " fftKernel2S((a)[2], (a)[3], dir); \\\n" - " fftKernel2S((a)[4], (a)[5], dir); \\\n" - " fftKernel2S((a)[6], (a)[7], dir); \\\n" - " bitreverse8((a)); \\\n" - "}\n" - "\n" - "#define bitreverse4x4(a) \\\n" - "{ \\\n" - " float2 c; \\\n" - " c = (a)[1]; (a)[1] = (a)[4]; (a)[4] = c; \\\n" - " c = (a)[2]; (a)[2] = (a)[8]; (a)[8] = c; \\\n" - " c = (a)[3]; (a)[3] = (a)[12]; (a)[12] = c; \\\n" - " c = (a)[6]; (a)[6] = (a)[9]; (a)[9] = c; \\\n" - " c = (a)[7]; (a)[7] = (a)[13]; (a)[13] = c; \\\n" - " c = (a)[11]; (a)[11] = (a)[14]; (a)[14] = c; \\\n" - "}\n" - "\n" - "#define fftKernel16(a,dir) \\\n" - "{ \\\n" - " const float w0 = 0x1.d906bcp-1f; \\\n" - " const float w1 = 0x1.87de2ap-2f; \\\n" - " const float w2 = 0x1.6a09e6p-1f; \\\n" - " fftKernel4s((a)[0], (a)[4], (a)[8], (a)[12], dir); \\\n" - " fftKernel4s((a)[1], (a)[5], (a)[9], (a)[13], dir); \\\n" - " fftKernel4s((a)[2], (a)[6], (a)[10], (a)[14], dir); \\\n" - " fftKernel4s((a)[3], (a)[7], (a)[11], (a)[15], dir); \\\n" - " (a)[5] = complexMul((a)[5], (float2)(w0, dir*w1)); \\\n" - " (a)[6] = complexMul((a)[6], (float2)(w2, dir*w2)); \\\n" - " (a)[7] = complexMul((a)[7], (float2)(w1, dir*w0)); \\\n" - " (a)[9] = complexMul((a)[9], (float2)(w2, dir*w2)); \\\n" - " (a)[10] = (float2)(dir)*(conjTransp((a)[10])); \\\n" - " (a)[11] = complexMul((a)[11], (float2)(-w2, dir*w2)); \\\n" - " (a)[13] = complexMul((a)[13], (float2)(w1, dir*w0)); \\\n" - " (a)[14] = complexMul((a)[14], (float2)(-w2, dir*w2)); \\\n" - " (a)[15] = complexMul((a)[15], (float2)(-w0, dir*-w1)); \\\n" - " fftKernel4((a), dir); \\\n" - " fftKernel4((a) + 4, dir); \\\n" - " fftKernel4((a) + 8, dir); \\\n" - " fftKernel4((a) + 12, dir); \\\n" - " bitreverse4x4((a)); \\\n" - "}\n" - "\n" - "#define bitreverse32(a) \\\n" - "{ \\\n" - " float2 c1, c2; \\\n" - " c1 = (a)[2]; (a)[2] = (a)[1]; c2 = (a)[4]; (a)[4] = c1; c1 = (a)[8]; (a)[8] = c2; c2 = (a)[16]; (a)[16] = c1; (a)[1] = c2; \\\n" - " c1 = (a)[6]; (a)[6] = (a)[3]; c2 = (a)[12]; (a)[12] = c1; c1 = (a)[24]; (a)[24] = c2; c2 = (a)[17]; (a)[17] = c1; (a)[3] = c2; \\\n" - " c1 = (a)[10]; (a)[10] = (a)[5]; c2 = (a)[20]; (a)[20] = c1; c1 = (a)[9]; (a)[9] = c2; c2 = (a)[18]; (a)[18] = c1; (a)[5] = c2; \\\n" - " c1 = (a)[14]; (a)[14] = (a)[7]; c2 = (a)[28]; (a)[28] = c1; c1 = (a)[25]; (a)[25] = c2; c2 = (a)[19]; (a)[19] = c1; (a)[7] = c2; \\\n" - " c1 = (a)[22]; (a)[22] = (a)[11]; c2 = (a)[13]; (a)[13] = c1; c1 = (a)[26]; (a)[26] = c2; c2 = (a)[21]; (a)[21] = c1; (a)[11] = c2; \\\n" - " c1 = (a)[30]; (a)[30] = (a)[15]; c2 = (a)[29]; (a)[29] = c1; c1 = (a)[27]; (a)[27] = c2; c2 = (a)[23]; (a)[23] = c1; (a)[15] = c2; \\\n" - "}\n" - "\n" - "#define fftKernel32(a,dir) \\\n" - "{ \\\n" - " fftKernel2S((a)[0], (a)[16], dir); \\\n" - " fftKernel2S((a)[1], (a)[17], dir); \\\n" - " fftKernel2S((a)[2], (a)[18], dir); \\\n" - " fftKernel2S((a)[3], (a)[19], dir); \\\n" - " fftKernel2S((a)[4], (a)[20], dir); \\\n" - " fftKernel2S((a)[5], (a)[21], dir); \\\n" - " fftKernel2S((a)[6], (a)[22], dir); \\\n" - " fftKernel2S((a)[7], (a)[23], dir); \\\n" - " fftKernel2S((a)[8], (a)[24], dir); \\\n" - " fftKernel2S((a)[9], (a)[25], dir); \\\n" - " fftKernel2S((a)[10], (a)[26], dir); \\\n" - " fftKernel2S((a)[11], (a)[27], dir); \\\n" - " fftKernel2S((a)[12], (a)[28], dir); \\\n" - " fftKernel2S((a)[13], (a)[29], dir); \\\n" - " fftKernel2S((a)[14], (a)[30], dir); \\\n" - " fftKernel2S((a)[15], (a)[31], dir); \\\n" - " (a)[17] = complexMul((a)[17], (float2)(0x1.f6297cp-1f, dir*0x1.8f8b84p-3f)); \\\n" - " (a)[18] = complexMul((a)[18], (float2)(0x1.d906bcp-1f, dir*0x1.87de2ap-2f)); \\\n" - " (a)[19] = complexMul((a)[19], (float2)(0x1.a9b662p-1f, dir*0x1.1c73b4p-1f)); \\\n" - " (a)[20] = complexMul((a)[20], (float2)(0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f)); \\\n" - " (a)[21] = complexMul((a)[21], (float2)(0x1.1c73b4p-1f, dir*0x1.a9b662p-1f)); \\\n" - " (a)[22] = complexMul((a)[22], (float2)(0x1.87de2ap-2f, dir*0x1.d906bcp-1f)); \\\n" - " (a)[23] = complexMul((a)[23], (float2)(0x1.8f8b84p-3f, dir*0x1.f6297cp-1f)); \\\n" - " (a)[24] = complexMul((a)[24], (float2)(0x0p+0f, dir*0x1p+0f)); \\\n" - " (a)[25] = complexMul((a)[25], (float2)(-0x1.8f8b84p-3f, dir*0x1.f6297cp-1f)); \\\n" - " (a)[26] = complexMul((a)[26], (float2)(-0x1.87de2ap-2f, dir*0x1.d906bcp-1f)); \\\n" - " (a)[27] = complexMul((a)[27], (float2)(-0x1.1c73b4p-1f, dir*0x1.a9b662p-1f)); \\\n" - " (a)[28] = complexMul((a)[28], (float2)(-0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f)); \\\n" - " (a)[29] = complexMul((a)[29], (float2)(-0x1.a9b662p-1f, dir*0x1.1c73b4p-1f)); \\\n" - " (a)[30] = complexMul((a)[30], (float2)(-0x1.d906bcp-1f, dir*0x1.87de2ap-2f)); \\\n" - " (a)[31] = complexMul((a)[31], (float2)(-0x1.f6297cp-1f, dir*0x1.8f8b84p-3f)); \\\n" - " fftKernel16((a), dir); \\\n" - " fftKernel16((a) + 16, dir); \\\n" - " bitreverse32((a)); \\\n" - "}\n\n" - ); + "#define M_PI 0x1.921fb54442d18p+1\n" + "#endif\n" + "#define complexMul(a,b) ((float2)(mad(-(a).y, (b).y, (a).x * (b).x), mad((a).y, (b).x, (a).x * (b).y)))\n" + "#define conj(a) ((float2)((a).x, -(a).y))\n" + "#define conjTransp(a) ((float2)(-(a).y, (a).x))\n" + "\n" + "#define fftKernel2(a,dir) \\\n" + "{ \\\n" + " float2 c = (a)[0]; \\\n" + " (a)[0] = c + (a)[1]; \\\n" + " (a)[1] = c - (a)[1]; \\\n" + "}\n" + "\n" + "#define fftKernel2S(d1,d2,dir) \\\n" + "{ \\\n" + " float2 c = (d1); \\\n" + " (d1) = c + (d2); \\\n" + " (d2) = c - (d2); \\\n" + "}\n" + "\n" + "#define fftKernel4(a,dir) \\\n" + "{ \\\n" + " fftKernel2S((a)[0], (a)[2], dir); \\\n" + " fftKernel2S((a)[1], (a)[3], dir); \\\n" + " fftKernel2S((a)[0], (a)[1], dir); \\\n" + " (a)[3] = (float2)(dir)*(conjTransp((a)[3])); \\\n" + " fftKernel2S((a)[2], (a)[3], dir); \\\n" + " float2 c = (a)[1]; \\\n" + " (a)[1] = (a)[2]; \\\n" + " (a)[2] = c; \\\n" + "}\n" + "\n" + "#define fftKernel4s(a0,a1,a2,a3,dir) \\\n" + "{ \\\n" + " fftKernel2S((a0), (a2), dir); \\\n" + " fftKernel2S((a1), (a3), dir); \\\n" + " fftKernel2S((a0), (a1), dir); \\\n" + " (a3) = (float2)(dir)*(conjTransp((a3))); \\\n" + " fftKernel2S((a2), (a3), dir); \\\n" + " float2 c = (a1); \\\n" + " (a1) = (a2); \\\n" + " (a2) = c; \\\n" + "}\n" + "\n" + "#define bitreverse8(a) \\\n" + "{ \\\n" + " float2 c; \\\n" + " c = (a)[1]; \\\n" + " (a)[1] = (a)[4]; \\\n" + " (a)[4] = c; \\\n" + " c = (a)[3]; \\\n" + " (a)[3] = (a)[6]; \\\n" + " (a)[6] = c; \\\n" + "}\n" + "\n" + "#define fftKernel8(a,dir) \\\n" + "{ \\\n" + " const float2 w1 = (float2)(0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f); \\\n" + " const float2 w3 = (float2)(-0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f); \\\n" + " float2 c; \\\n" + " fftKernel2S((a)[0], (a)[4], dir); \\\n" + " fftKernel2S((a)[1], (a)[5], dir); \\\n" + " fftKernel2S((a)[2], (a)[6], dir); \\\n" + " fftKernel2S((a)[3], (a)[7], dir); \\\n" + " (a)[5] = complexMul(w1, (a)[5]); \\\n" + " (a)[6] = (float2)(dir)*(conjTransp((a)[6])); \\\n" + " (a)[7] = complexMul(w3, (a)[7]); \\\n" + " fftKernel2S((a)[0], (a)[2], dir); \\\n" + " fftKernel2S((a)[1], (a)[3], dir); \\\n" + " fftKernel2S((a)[4], (a)[6], dir); \\\n" + " fftKernel2S((a)[5], (a)[7], dir); \\\n" + " (a)[3] = (float2)(dir)*(conjTransp((a)[3])); \\\n" + " (a)[7] = (float2)(dir)*(conjTransp((a)[7])); \\\n" + " fftKernel2S((a)[0], (a)[1], dir); \\\n" + " fftKernel2S((a)[2], (a)[3], dir); \\\n" + " fftKernel2S((a)[4], (a)[5], dir); \\\n" + " fftKernel2S((a)[6], (a)[7], dir); \\\n" + " bitreverse8((a)); \\\n" + "}\n" + "\n" + "#define bitreverse4x4(a) \\\n" + "{ \\\n" + " float2 c; \\\n" + " c = (a)[1]; (a)[1] = (a)[4]; (a)[4] = c; \\\n" + " c = (a)[2]; (a)[2] = (a)[8]; (a)[8] = c; \\\n" + " c = (a)[3]; (a)[3] = (a)[12]; (a)[12] = c; \\\n" + " c = (a)[6]; (a)[6] = (a)[9]; (a)[9] = c; \\\n" + " c = (a)[7]; (a)[7] = (a)[13]; (a)[13] = c; \\\n" + " c = (a)[11]; (a)[11] = (a)[14]; (a)[14] = c; \\\n" + "}\n" + "\n" + "#define fftKernel16(a,dir) \\\n" + "{ \\\n" + " const float w0 = 0x1.d906bcp-1f; \\\n" + " const float w1 = 0x1.87de2ap-2f; \\\n" + " const float w2 = 0x1.6a09e6p-1f; \\\n" + " fftKernel4s((a)[0], (a)[4], (a)[8], (a)[12], dir); \\\n" + " fftKernel4s((a)[1], (a)[5], (a)[9], (a)[13], dir); \\\n" + " fftKernel4s((a)[2], (a)[6], (a)[10], (a)[14], dir); \\\n" + " fftKernel4s((a)[3], (a)[7], (a)[11], (a)[15], dir); \\\n" + " (a)[5] = complexMul((a)[5], (float2)(w0, dir*w1)); \\\n" + " (a)[6] = complexMul((a)[6], (float2)(w2, dir*w2)); \\\n" + " (a)[7] = complexMul((a)[7], (float2)(w1, dir*w0)); \\\n" + " (a)[9] = complexMul((a)[9], (float2)(w2, dir*w2)); \\\n" + " (a)[10] = (float2)(dir)*(conjTransp((a)[10])); \\\n" + " (a)[11] = complexMul((a)[11], (float2)(-w2, dir*w2)); \\\n" + " (a)[13] = complexMul((a)[13], (float2)(w1, dir*w0)); \\\n" + " (a)[14] = complexMul((a)[14], (float2)(-w2, dir*w2)); \\\n" + " (a)[15] = complexMul((a)[15], (float2)(-w0, dir*-w1)); \\\n" + " fftKernel4((a), dir); \\\n" + " fftKernel4((a) + 4, dir); \\\n" + " fftKernel4((a) + 8, dir); \\\n" + " fftKernel4((a) + 12, dir); \\\n" + " bitreverse4x4((a)); \\\n" + "}\n" + "\n" + "#define bitreverse32(a) \\\n" + "{ \\\n" + " float2 c1, c2; \\\n" + " c1 = (a)[2]; (a)[2] = (a)[1]; c2 = (a)[4]; (a)[4] = c1; c1 = (a)[8]; (a)[8] = c2; c2 = (a)[16]; (a)[16] = c1; (a)[1] = c2; \\\n" + " c1 = (a)[6]; (a)[6] = (a)[3]; c2 = (a)[12]; (a)[12] = c1; c1 = (a)[24]; (a)[24] = c2; c2 = (a)[17]; (a)[17] = c1; (a)[3] = c2; \\\n" + " c1 = (a)[10]; (a)[10] = (a)[5]; c2 = (a)[20]; (a)[20] = c1; c1 = (a)[9]; (a)[9] = c2; c2 = (a)[18]; (a)[18] = c1; (a)[5] = c2; \\\n" + " c1 = (a)[14]; (a)[14] = (a)[7]; c2 = (a)[28]; (a)[28] = c1; c1 = (a)[25]; (a)[25] = c2; c2 = (a)[19]; (a)[19] = c1; (a)[7] = c2; \\\n" + " c1 = (a)[22]; (a)[22] = (a)[11]; c2 = (a)[13]; (a)[13] = c1; c1 = (a)[26]; (a)[26] = c2; c2 = (a)[21]; (a)[21] = c1; (a)[11] = c2; \\\n" + " c1 = (a)[30]; (a)[30] = (a)[15]; c2 = (a)[29]; (a)[29] = c1; c1 = (a)[27]; (a)[27] = c2; c2 = (a)[23]; (a)[23] = c1; (a)[15] = c2; \\\n" + "}\n" + "\n" + "#define fftKernel32(a,dir) \\\n" + "{ \\\n" + " fftKernel2S((a)[0], (a)[16], dir); \\\n" + " fftKernel2S((a)[1], (a)[17], dir); \\\n" + " fftKernel2S((a)[2], (a)[18], dir); \\\n" + " fftKernel2S((a)[3], (a)[19], dir); \\\n" + " fftKernel2S((a)[4], (a)[20], dir); \\\n" + " fftKernel2S((a)[5], (a)[21], dir); \\\n" + " fftKernel2S((a)[6], (a)[22], dir); \\\n" + " fftKernel2S((a)[7], (a)[23], dir); \\\n" + " fftKernel2S((a)[8], (a)[24], dir); \\\n" + " fftKernel2S((a)[9], (a)[25], dir); \\\n" + " fftKernel2S((a)[10], (a)[26], dir); \\\n" + " fftKernel2S((a)[11], (a)[27], dir); \\\n" + " fftKernel2S((a)[12], (a)[28], dir); \\\n" + " fftKernel2S((a)[13], (a)[29], dir); \\\n" + " fftKernel2S((a)[14], (a)[30], dir); \\\n" + " fftKernel2S((a)[15], (a)[31], dir); \\\n" + " (a)[17] = complexMul((a)[17], (float2)(0x1.f6297cp-1f, dir*0x1.8f8b84p-3f)); \\\n" + " (a)[18] = complexMul((a)[18], (float2)(0x1.d906bcp-1f, dir*0x1.87de2ap-2f)); \\\n" + " (a)[19] = complexMul((a)[19], (float2)(0x1.a9b662p-1f, dir*0x1.1c73b4p-1f)); \\\n" + " (a)[20] = complexMul((a)[20], (float2)(0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f)); \\\n" + " (a)[21] = complexMul((a)[21], (float2)(0x1.1c73b4p-1f, dir*0x1.a9b662p-1f)); \\\n" + " (a)[22] = complexMul((a)[22], (float2)(0x1.87de2ap-2f, dir*0x1.d906bcp-1f)); \\\n" + " (a)[23] = complexMul((a)[23], (float2)(0x1.8f8b84p-3f, dir*0x1.f6297cp-1f)); \\\n" + " (a)[24] = complexMul((a)[24], (float2)(0x0p+0f, dir*0x1p+0f)); \\\n" + " (a)[25] = complexMul((a)[25], (float2)(-0x1.8f8b84p-3f, dir*0x1.f6297cp-1f)); \\\n" + " (a)[26] = complexMul((a)[26], (float2)(-0x1.87de2ap-2f, dir*0x1.d906bcp-1f)); \\\n" + " (a)[27] = complexMul((a)[27], (float2)(-0x1.1c73b4p-1f, dir*0x1.a9b662p-1f)); \\\n" + " (a)[28] = complexMul((a)[28], (float2)(-0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f)); \\\n" + " (a)[29] = complexMul((a)[29], (float2)(-0x1.a9b662p-1f, dir*0x1.1c73b4p-1f)); \\\n" + " (a)[30] = complexMul((a)[30], (float2)(-0x1.d906bcp-1f, dir*0x1.87de2ap-2f)); \\\n" + " (a)[31] = complexMul((a)[31], (float2)(-0x1.f6297cp-1f, dir*0x1.8f8b84p-3f)); \\\n" + " fftKernel16((a), dir); \\\n" + " fftKernel16((a) + 16, dir); \\\n" + " bitreverse32((a)); \\\n" + "}\n\n" + ); static string twistKernelInterleaved = string( - "__kernel void \\\n" - "clFFT_1DTwistInterleaved(__global float2 *in, unsigned int startRow, unsigned int numCols, unsigned int N, unsigned int numRowsToProcess, int dir) \\\n" - "{ \\\n" - " float2 a, w; \\\n" - " float ang; \\\n" - " unsigned int j; \\\n" - " unsigned int i = get_global_id(0); \\\n" - " unsigned int startIndex = i; \\\n" - " \\\n" - " if(i < numCols) \\\n" - " { \\\n" - " for(j = 0; j < numRowsToProcess; j++) \\\n" - " { \\\n" - " a = in[startIndex]; \\\n" - " ang = 2.0f * M_PI * dir * i * (startRow + j) / N; \\\n" - " w = (float2)(native_cos(ang), native_sin(ang)); \\\n" - " a = complexMul(a, w); \\\n" - " in[startIndex] = a; \\\n" - " startIndex += numCols; \\\n" - " } \\\n" - " } \\\n" - "} \\\n" - ); + "__kernel void \\\n" + "clFFT_1DTwistInterleaved(__global float2 *in, unsigned int startRow, unsigned int numCols, unsigned int N, unsigned int numRowsToProcess, int dir) \\\n" + "{ \\\n" + " float2 a, w; \\\n" + " float ang; \\\n" + " unsigned int j; \\\n" + " unsigned int i = get_global_id(0); \\\n" + " unsigned int startIndex = i; \\\n" + " \\\n" + " if(i < numCols) \\\n" + " { \\\n" + " for(j = 0; j < numRowsToProcess; j++) \\\n" + " { \\\n" + " a = in[startIndex]; \\\n" + " ang = 2.0f * M_PI * dir * i * (startRow + j) / N; \\\n" + " w = (float2)(native_cos(ang), native_sin(ang)); \\\n" + " a = complexMul(a, w); \\\n" + " in[startIndex] = a; \\\n" + " startIndex += numCols; \\\n" + " } \\\n" + " } \\\n" + "} \\\n" + ); static string twistKernelPlannar = string( - "__kernel void \\\n" - "clFFT_1DTwistSplit(__global float *in_real, __global float *in_imag , unsigned int startRow, unsigned int numCols, unsigned int N, unsigned int numRowsToProcess, int dir) \\\n" - "{ \\\n" - " float2 a, w; \\\n" - " float ang; \\\n" - " unsigned int j; \\\n" - " unsigned int i = get_global_id(0); \\\n" - " unsigned int startIndex = i; \\\n" - " \\\n" - " if(i < numCols) \\\n" - " { \\\n" - " for(j = 0; j < numRowsToProcess; j++) \\\n" - " { \\\n" - " a = (float2)(in_real[startIndex], in_imag[startIndex]); \\\n" - " ang = 2.0f * M_PI * dir * i * (startRow + j) / N; \\\n" - " w = (float2)(native_cos(ang), native_sin(ang)); \\\n" - " a = complexMul(a, w); \\\n" - " in_real[startIndex] = a.x; \\\n" - " in_imag[startIndex] = a.y; \\\n" - " startIndex += numCols; \\\n" - " } \\\n" - " } \\\n" - "} \\\n" - ); + "__kernel void \\\n" + "clFFT_1DTwistSplit(__global float *in_real, __global float *in_imag , unsigned int startRow, unsigned int numCols, unsigned int N, unsigned int numRowsToProcess, int dir) \\\n" + "{ \\\n" + " float2 a, w; \\\n" + " float ang; \\\n" + " unsigned int j; \\\n" + " unsigned int i = get_global_id(0); \\\n" + " unsigned int startIndex = i; \\\n" + " \\\n" + " if(i < numCols) \\\n" + " { \\\n" + " for(j = 0; j < numRowsToProcess; j++) \\\n" + " { \\\n" + " a = (float2)(in_real[startIndex], in_imag[startIndex]); \\\n" + " ang = 2.0f * M_PI * dir * i * (startRow + j) / N; \\\n" + " w = (float2)(native_cos(ang), native_sin(ang)); \\\n" + " a = complexMul(a, w); \\\n" + " in_real[startIndex] = a.x; \\\n" + " in_imag[startIndex] = a.y; \\\n" + " startIndex += numCols; \\\n" + " } \\\n" + " } \\\n" + "} \\\n" + ); diff --git a/src/fft_execute.cpp b/src/fft_execute.cpp index ad43d4f9631c599eaa376175607e58a35cbcdb03..bdeb7a3b3624c297a7a053433511dbe142e57180 100644 --- a/src/fft_execute.cpp +++ b/src/fft_execute.cpp @@ -58,348 +58,348 @@ static cl_int allocateTemporaryBufferInterleaved(cl_fft_plan *plan, cl_uint batchSize) { - cl_int err = CL_SUCCESS; - if(plan->temp_buffer_needed && plan->last_batch_size != batchSize) - { - plan->last_batch_size = batchSize; - size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * 2 * sizeof(cl_float); - - if(plan->tempmemobj) - clReleaseMemObject(plan->tempmemobj); - - plan->tempmemobj = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &err); - } - return err; + cl_int err = CL_SUCCESS; + if(plan->temp_buffer_needed && plan->last_batch_size != batchSize) + { + plan->last_batch_size = batchSize; + size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * 2 * sizeof(cl_float); + + if(plan->tempmemobj) + clReleaseMemObject(plan->tempmemobj); + + plan->tempmemobj = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &err); + } + return err; } static cl_int allocateTemporaryBufferPlannar(cl_fft_plan *plan, cl_uint batchSize) { - cl_int err = CL_SUCCESS; - cl_int terr; - if(plan->temp_buffer_needed && plan->last_batch_size != batchSize) - { - plan->last_batch_size = batchSize; - size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * sizeof(cl_float); - - if(plan->tempmemobj_real) - clReleaseMemObject(plan->tempmemobj_real); - - if(plan->tempmemobj_imag) - clReleaseMemObject(plan->tempmemobj_imag); - - plan->tempmemobj_real = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &err); - plan->tempmemobj_imag = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &terr); - err |= terr; - } - return err; + cl_int err = CL_SUCCESS; + cl_int terr; + if(plan->temp_buffer_needed && plan->last_batch_size != batchSize) + { + plan->last_batch_size = batchSize; + size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * sizeof(cl_float); + + if(plan->tempmemobj_real) + clReleaseMemObject(plan->tempmemobj_real); + + if(plan->tempmemobj_imag) + clReleaseMemObject(plan->tempmemobj_imag); + + plan->tempmemobj_real = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &err); + plan->tempmemobj_imag = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &terr); + err |= terr; + } + return err; } void getKernelWorkDimensions(cl_fft_plan *plan, cl_fft_kernel_info *kernelInfo, cl_int *batchSize, size_t *gWorkItems, size_t *lWorkItems) { - *lWorkItems = kernelInfo->num_workitems_per_workgroup; - int numWorkGroups = kernelInfo->num_workgroups; + *lWorkItems = kernelInfo->num_workitems_per_workgroup; + int numWorkGroups = kernelInfo->num_workgroups; int numXFormsPerWG = kernelInfo->num_xforms_per_workgroup; - switch(kernelInfo->dir) - { - case cl_fft_kernel_x: + switch(kernelInfo->dir) + { + case cl_fft_kernel_x: *batchSize *= (plan->n.y * plan->n.z); numWorkGroups = (*batchSize % numXFormsPerWG) ? (*batchSize/numXFormsPerWG + 1) : (*batchSize/numXFormsPerWG); numWorkGroups *= kernelInfo->num_workgroups; - break; - case cl_fft_kernel_y: - *batchSize *= plan->n.z; - numWorkGroups *= *batchSize; - break; - case cl_fft_kernel_z: - numWorkGroups *= *batchSize; - break; - } - - *gWorkItems = numWorkGroups * *lWorkItems; + break; + case cl_fft_kernel_y: + *batchSize *= plan->n.z; + numWorkGroups *= *batchSize; + break; + case cl_fft_kernel_z: + numWorkGroups *= *batchSize; + break; + } + + *gWorkItems = numWorkGroups * *lWorkItems; } cl_int clFFT_ExecuteInterleaved( cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir, - cl_mem data_in, cl_mem data_out, - cl_int num_events, cl_event *event_list, cl_event *event ) + cl_mem data_in, cl_mem data_out, + cl_int num_events, cl_event *event_list, cl_event *event ) { - int s; - cl_fft_plan *plan = (cl_fft_plan *) Plan; - if(plan->format != clFFT_InterleavedComplexFormat) - return CL_INVALID_VALUE; - - cl_int err; - size_t gWorkItems, lWorkItems; - int inPlaceDone = -1; - - cl_int isInPlace = data_in == data_out ? 1 : 0; - - if((err = allocateTemporaryBufferInterleaved(plan, batchSize)) != CL_SUCCESS) - return err; - - cl_mem memObj[3]; - memObj[0] = data_in; - memObj[1] = data_out; - memObj[2] = plan->tempmemobj; - cl_fft_kernel_info *kernelInfo = plan->kernel_info; - int numKernels = plan->num_kernels; - - int numKernelsOdd = numKernels & 1; - int currRead = 0; - int currWrite = 1; - - // at least one external dram shuffle (transpose) required - if(plan->temp_buffer_needed) - { - // in-place transform - if(isInPlace) - { - inPlaceDone = 0; - currRead = 1; - currWrite = 2; - } - else - { - currWrite = (numKernels & 1) ? 1 : 2; - } - - while(kernelInfo) - { - if( isInPlace && numKernelsOdd && !inPlaceDone && kernelInfo->in_place_possible) - { - currWrite = currRead; - inPlaceDone = 1; - } - - s = batchSize; - getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems); - err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj[currRead]); - err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj[currWrite]); - err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_int), &dir); - err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_int), &s); - - err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL); - if(err) - return err; - - currRead = (currWrite == 1) ? 1 : 2; - currWrite = (currWrite == 1) ? 2 : 1; - - kernelInfo = kernelInfo->next; - } - } - // no dram shuffle (transpose required) transform - // all kernels can execute in-place. - else { - - while(kernelInfo) - { - s = batchSize; - getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems); - err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj[currRead]); - err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj[currWrite]); - err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_int), &dir); - err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_int), &s); - - err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL); - if(err) - return err; - - currRead = 1; - currWrite = 1; - - kernelInfo = kernelInfo->next; - } - } - - return err; + int s; + cl_fft_plan *plan = (cl_fft_plan *) Plan; + if(plan->format != clFFT_InterleavedComplexFormat) + return CL_INVALID_VALUE; + + cl_int err; + size_t gWorkItems, lWorkItems; + int inPlaceDone = -1; + + cl_int isInPlace = data_in == data_out ? 1 : 0; + + if((err = allocateTemporaryBufferInterleaved(plan, batchSize)) != CL_SUCCESS) + return err; + + cl_mem memObj[3]; + memObj[0] = data_in; + memObj[1] = data_out; + memObj[2] = plan->tempmemobj; + cl_fft_kernel_info *kernelInfo = plan->kernel_info; + int numKernels = plan->num_kernels; + + int numKernelsOdd = numKernels & 1; + int currRead = 0; + int currWrite = 1; + + // at least one external dram shuffle (transpose) required + if(plan->temp_buffer_needed) + { + // in-place transform + if(isInPlace) + { + inPlaceDone = 0; + currRead = 1; + currWrite = 2; + } + else + { + currWrite = (numKernels & 1) ? 1 : 2; + } + + while(kernelInfo) + { + if( isInPlace && numKernelsOdd && !inPlaceDone && kernelInfo->in_place_possible) + { + currWrite = currRead; + inPlaceDone = 1; + } + + s = batchSize; + getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems); + err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj[currRead]); + err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj[currWrite]); + err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_int), &dir); + err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_int), &s); + + err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL); + if(err) + return err; + + currRead = (currWrite == 1) ? 1 : 2; + currWrite = (currWrite == 1) ? 2 : 1; + + kernelInfo = kernelInfo->next; + } + } + // no dram shuffle (transpose required) transform + // all kernels can execute in-place. + else { + + while(kernelInfo) + { + s = batchSize; + getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems); + err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj[currRead]); + err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj[currWrite]); + err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_int), &dir); + err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_int), &s); + + err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL); + if(err) + return err; + + currRead = 1; + currWrite = 1; + + kernelInfo = kernelInfo->next; + } + } + + return err; } cl_int clFFT_ExecutePlannar( cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir, - cl_mem data_in_real, cl_mem data_in_imag, cl_mem data_out_real, cl_mem data_out_imag, - cl_int num_events, cl_event *event_list, cl_event *event) + cl_mem data_in_real, cl_mem data_in_imag, cl_mem data_out_real, cl_mem data_out_imag, + cl_int num_events, cl_event *event_list, cl_event *event) { - int s; - cl_fft_plan *plan = (cl_fft_plan *) Plan; - - if(plan->format != clFFT_SplitComplexFormat) - return CL_INVALID_VALUE; - - cl_int err; - size_t gWorkItems, lWorkItems; - int inPlaceDone = -1; - - cl_int isInPlace = ((data_in_real == data_out_real) && (data_in_imag == data_out_imag)) ? 1 : 0; - - if((err = allocateTemporaryBufferPlannar(plan, batchSize)) != CL_SUCCESS) - return err; - - cl_mem memObj_real[3]; - cl_mem memObj_imag[3]; - memObj_real[0] = data_in_real; - memObj_real[1] = data_out_real; - memObj_real[2] = plan->tempmemobj_real; - memObj_imag[0] = data_in_imag; - memObj_imag[1] = data_out_imag; - memObj_imag[2] = plan->tempmemobj_imag; - - cl_fft_kernel_info *kernelInfo = plan->kernel_info; - int numKernels = plan->num_kernels; - - int numKernelsOdd = numKernels & 1; - int currRead = 0; - int currWrite = 1; - - // at least one external dram shuffle (transpose) required - if(plan->temp_buffer_needed) - { - // in-place transform - if(isInPlace) - { - inPlaceDone = 0; - currRead = 1; - currWrite = 2; - } - else - { - currWrite = (numKernels & 1) ? 1 : 2; - } - - while(kernelInfo) - { - if( isInPlace && numKernelsOdd && !inPlaceDone && kernelInfo->in_place_possible) - { - currWrite = currRead; - inPlaceDone = 1; - } - - s = batchSize; - getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems); - err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj_real[currRead]); - err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj_imag[currRead]); - err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_mem), &memObj_real[currWrite]); - err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_mem), &memObj_imag[currWrite]); - err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_int), &dir); - err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_int), &s); - - err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL); - if(err) - return err; - - currRead = (currWrite == 1) ? 1 : 2; - currWrite = (currWrite == 1) ? 2 : 1; - - kernelInfo = kernelInfo->next; - } - } - // no dram shuffle (transpose required) transform - else { - - while(kernelInfo) - { - s = batchSize; - getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems); - err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj_real[currRead]); - err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj_imag[currRead]); - err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_mem), &memObj_real[currWrite]); - err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_mem), &memObj_imag[currWrite]); - err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_int), &dir); - err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_int), &s); - - err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL); - if(err) - return err; - - currRead = 1; - currWrite = 1; - - kernelInfo = kernelInfo->next; - } - } - - return err; + int s; + cl_fft_plan *plan = (cl_fft_plan *) Plan; + + if(plan->format != clFFT_SplitComplexFormat) + return CL_INVALID_VALUE; + + cl_int err; + size_t gWorkItems, lWorkItems; + int inPlaceDone = -1; + + cl_int isInPlace = ((data_in_real == data_out_real) && (data_in_imag == data_out_imag)) ? 1 : 0; + + if((err = allocateTemporaryBufferPlannar(plan, batchSize)) != CL_SUCCESS) + return err; + + cl_mem memObj_real[3]; + cl_mem memObj_imag[3]; + memObj_real[0] = data_in_real; + memObj_real[1] = data_out_real; + memObj_real[2] = plan->tempmemobj_real; + memObj_imag[0] = data_in_imag; + memObj_imag[1] = data_out_imag; + memObj_imag[2] = plan->tempmemobj_imag; + + cl_fft_kernel_info *kernelInfo = plan->kernel_info; + int numKernels = plan->num_kernels; + + int numKernelsOdd = numKernels & 1; + int currRead = 0; + int currWrite = 1; + + // at least one external dram shuffle (transpose) required + if(plan->temp_buffer_needed) + { + // in-place transform + if(isInPlace) + { + inPlaceDone = 0; + currRead = 1; + currWrite = 2; + } + else + { + currWrite = (numKernels & 1) ? 1 : 2; + } + + while(kernelInfo) + { + if( isInPlace && numKernelsOdd && !inPlaceDone && kernelInfo->in_place_possible) + { + currWrite = currRead; + inPlaceDone = 1; + } + + s = batchSize; + getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems); + err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj_real[currRead]); + err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj_imag[currRead]); + err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_mem), &memObj_real[currWrite]); + err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_mem), &memObj_imag[currWrite]); + err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_int), &dir); + err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_int), &s); + + err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL); + if(err) + return err; + + currRead = (currWrite == 1) ? 1 : 2; + currWrite = (currWrite == 1) ? 2 : 1; + + kernelInfo = kernelInfo->next; + } + } + // no dram shuffle (transpose required) transform + else { + + while(kernelInfo) + { + s = batchSize; + getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems); + err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj_real[currRead]); + err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj_imag[currRead]); + err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_mem), &memObj_real[currWrite]); + err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_mem), &memObj_imag[currWrite]); + err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_int), &dir); + err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_int), &s); + + err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL); + if(err) + return err; + + currRead = 1; + currWrite = 1; + + kernelInfo = kernelInfo->next; + } + } + + return err; } cl_int clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array, - size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir) + size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir) { - cl_fft_plan *plan = (cl_fft_plan *) Plan; - - unsigned int N = numRows*numCols; - unsigned int nCols = numCols; - unsigned int sRow = startRow; - unsigned int rToProcess = rowsToProcess; - int d = dir; - int err = 0; - - cl_device_id device_id; - err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, NULL); - if(err) - return err; - - size_t gSize; - err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, NULL); - if(err) - return err; - - gSize = min(128, gSize); - size_t numGlobalThreads[1] = { max(numCols / gSize, 1)*gSize }; - size_t numLocalThreads[1] = { gSize }; - - err |= clSetKernelArg(plan->twist_kernel, 0, sizeof(cl_mem), &array); - err |= clSetKernelArg(plan->twist_kernel, 1, sizeof(unsigned int), &sRow); - err |= clSetKernelArg(plan->twist_kernel, 2, sizeof(unsigned int), &nCols); - err |= clSetKernelArg(plan->twist_kernel, 3, sizeof(unsigned int), &N); - err |= clSetKernelArg(plan->twist_kernel, 4, sizeof(unsigned int), &rToProcess); - err |= clSetKernelArg(plan->twist_kernel, 5, sizeof(int), &d); - - err |= clEnqueueNDRangeKernel(queue, plan->twist_kernel, 1, NULL, numGlobalThreads, numLocalThreads, 0, NULL, NULL); - - return err; + cl_fft_plan *plan = (cl_fft_plan *) Plan; + + unsigned int N = numRows*numCols; + unsigned int nCols = numCols; + unsigned int sRow = startRow; + unsigned int rToProcess = rowsToProcess; + int d = dir; + int err = 0; + + cl_device_id device_id; + err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, NULL); + if(err) + return err; + + size_t gSize; + err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, NULL); + if(err) + return err; + + gSize = min(128, gSize); + size_t numGlobalThreads[1] = { max(numCols / gSize, 1)*gSize }; + size_t numLocalThreads[1] = { gSize }; + + err |= clSetKernelArg(plan->twist_kernel, 0, sizeof(cl_mem), &array); + err |= clSetKernelArg(plan->twist_kernel, 1, sizeof(unsigned int), &sRow); + err |= clSetKernelArg(plan->twist_kernel, 2, sizeof(unsigned int), &nCols); + err |= clSetKernelArg(plan->twist_kernel, 3, sizeof(unsigned int), &N); + err |= clSetKernelArg(plan->twist_kernel, 4, sizeof(unsigned int), &rToProcess); + err |= clSetKernelArg(plan->twist_kernel, 5, sizeof(int), &d); + + err |= clEnqueueNDRangeKernel(queue, plan->twist_kernel, 1, NULL, numGlobalThreads, numLocalThreads, 0, NULL, NULL); + + return err; } cl_int clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real, cl_mem array_imag, - size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir) + size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir) { - cl_fft_plan *plan = (cl_fft_plan *) Plan; - - unsigned int N = numRows*numCols; - unsigned int nCols = numCols; - unsigned int sRow = startRow; - unsigned int rToProcess = rowsToProcess; - int d = dir; - int err = 0; - - cl_device_id device_id; - err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, NULL); - if(err) - return err; - - size_t gSize; - err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, NULL); - if(err) - return err; - - gSize = min(128, gSize); - size_t numGlobalThreads[1] = { max(numCols / gSize, 1)*gSize }; - size_t numLocalThreads[1] = { gSize }; - - err |= clSetKernelArg(plan->twist_kernel, 0, sizeof(cl_mem), &array_real); - err |= clSetKernelArg(plan->twist_kernel, 1, sizeof(cl_mem), &array_imag); - err |= clSetKernelArg(plan->twist_kernel, 2, sizeof(unsigned int), &sRow); - err |= clSetKernelArg(plan->twist_kernel, 3, sizeof(unsigned int), &nCols); - err |= clSetKernelArg(plan->twist_kernel, 4, sizeof(unsigned int), &N); - err |= clSetKernelArg(plan->twist_kernel, 5, sizeof(unsigned int), &rToProcess); - err |= clSetKernelArg(plan->twist_kernel, 6, sizeof(int), &d); - - err |= clEnqueueNDRangeKernel(queue, plan->twist_kernel, 1, NULL, numGlobalThreads, numLocalThreads, 0, NULL, NULL); - - return err; + cl_fft_plan *plan = (cl_fft_plan *) Plan; + + unsigned int N = numRows*numCols; + unsigned int nCols = numCols; + unsigned int sRow = startRow; + unsigned int rToProcess = rowsToProcess; + int d = dir; + int err = 0; + + cl_device_id device_id; + err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, NULL); + if(err) + return err; + + size_t gSize; + err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, NULL); + if(err) + return err; + + gSize = min(128, gSize); + size_t numGlobalThreads[1] = { max(numCols / gSize, 1)*gSize }; + size_t numLocalThreads[1] = { gSize }; + + err |= clSetKernelArg(plan->twist_kernel, 0, sizeof(cl_mem), &array_real); + err |= clSetKernelArg(plan->twist_kernel, 1, sizeof(cl_mem), &array_imag); + err |= clSetKernelArg(plan->twist_kernel, 2, sizeof(unsigned int), &sRow); + err |= clSetKernelArg(plan->twist_kernel, 3, sizeof(unsigned int), &nCols); + err |= clSetKernelArg(plan->twist_kernel, 4, sizeof(unsigned int), &N); + err |= clSetKernelArg(plan->twist_kernel, 5, sizeof(unsigned int), &rToProcess); + err |= clSetKernelArg(plan->twist_kernel, 6, sizeof(int), &d); + + err |= clEnqueueNDRangeKernel(queue, plan->twist_kernel, 1, NULL, numGlobalThreads, numLocalThreads, 0, NULL, NULL); + + return err; } diff --git a/src/fft_internal.h b/src/fft_internal.h index 71d72da6a7b59461fcc311f6cebb7d0b8b83dd44..22c3969aa982e6ca1398df43fb1b7bc387098023 100644 --- a/src/fft_internal.h +++ b/src/fft_internal.h @@ -58,104 +58,104 @@ using namespace std; typedef enum kernel_dir_t { - cl_fft_kernel_x, - cl_fft_kernel_y, - cl_fft_kernel_z + cl_fft_kernel_x, + cl_fft_kernel_y, + cl_fft_kernel_z }cl_fft_kernel_dir; typedef struct kernel_info_t { - cl_kernel kernel; - char *kernel_name; - size_t lmem_size; - size_t num_workgroups; + cl_kernel kernel; + char *kernel_name; + size_t lmem_size; + size_t num_workgroups; size_t num_xforms_per_workgroup; - size_t num_workitems_per_workgroup; - cl_fft_kernel_dir dir; - int in_place_possible; - kernel_info_t *next; + size_t num_workitems_per_workgroup; + cl_fft_kernel_dir dir; + int in_place_possible; + kernel_info_t *next; }cl_fft_kernel_info; typedef struct { - // context in which fft resources are created and kernels are executed - cl_context context; - - // size of signal - clFFT_Dim3 n; - - // dimension of transform ... must be either 1D, 2D or 3D - clFFT_Dimension dim; - - // data format ... must be either interleaved or plannar - clFFT_DataFormat format; - - // string containing kernel source. Generated at runtime based on - // n, dim, format and other parameters - string *kernel_string; - - // CL program containing source and kernel this particular - // n, dim, data format - cl_program program; - - // linked list of kernels which needs to be executed for this fft - cl_fft_kernel_info *kernel_info; - - // number of kernels - int num_kernels; - - // twist kernel for virtualizing fft of very large sizes that do not - // fit in GPU global memory - cl_kernel twist_kernel; - - // flag indicating if temporary intermediate buffer is needed or not. - // this depends on fft kernels being executed and if transform is - // in-place or out-of-place. e.g. Local memory fft (say 1D 1024 ... - // one that does not require global transpose do not need temporary buffer) - // 2D 1024x1024 out-of-place fft however do require intermediate buffer. - // If temp buffer is needed, its allocation is lazy i.e. its not allocated - // until its needed - cl_int temp_buffer_needed; - - // Batch size is runtime parameter and size of temporary buffer (if needed) - // depends on batch size. Allocation of temporary buffer is lazy i.e. its - // only created when needed. Once its created at first call of clFFT_Executexxx - // it is not allocated next time if next time clFFT_Executexxx is called with - // batch size different than the first call. last_batch_size caches the last - // batch size with which this plan is used so that we dont keep allocating/deallocating - // temp buffer if same batch size is used again and again. - size_t last_batch_size; - - // temporary buffer for interleaved plan - cl_mem tempmemobj; - - // temporary buffer for planner plan. Only one of tempmemobj or - // (tempmemobj_real, tempmemobj_imag) pair is valid (allocated) depending - // data format of plan (plannar or interleaved) - cl_mem tempmemobj_real, tempmemobj_imag; - - // Maximum size of signal for which local memory transposed based - // fft is sufficient i.e. no global mem transpose (communication) - // is needed - size_t max_localmem_fft_size; - - // Maximum work items per work group allowed. This, along with max_radix below controls - // maximum local memory being used by fft kernels of this plan. Set to 256 by default - size_t max_work_item_per_workgroup; - - // Maximum base radix for local memory fft ... this controls the maximum register - // space used by work items. Currently defaults to 16 - size_t max_radix; - - // Device depended parameter that tells how many work-items need to be read consecutive - // values to make sure global memory access by work-items of a work-group result in - // coalesced memory access to utilize full bandwidth e.g. on NVidia tesla, this is 16 - size_t min_mem_coalesce_width; - - // Number of local memory banks. This is used to geneate kernel with local memory - // transposes with appropriate padding to avoid bank conflicts to local memory - // e.g. on NVidia it is 16. - size_t num_local_mem_banks; + // context in which fft resources are created and kernels are executed + cl_context context; + + // size of signal + clFFT_Dim3 n; + + // dimension of transform ... must be either 1D, 2D or 3D + clFFT_Dimension dim; + + // data format ... must be either interleaved or plannar + clFFT_DataFormat format; + + // string containing kernel source. Generated at runtime based on + // n, dim, format and other parameters + string *kernel_string; + + // CL program containing source and kernel this particular + // n, dim, data format + cl_program program; + + // linked list of kernels which needs to be executed for this fft + cl_fft_kernel_info *kernel_info; + + // number of kernels + int num_kernels; + + // twist kernel for virtualizing fft of very large sizes that do not + // fit in GPU global memory + cl_kernel twist_kernel; + + // flag indicating if temporary intermediate buffer is needed or not. + // this depends on fft kernels being executed and if transform is + // in-place or out-of-place. e.g. Local memory fft (say 1D 1024 ... + // one that does not require global transpose do not need temporary buffer) + // 2D 1024x1024 out-of-place fft however do require intermediate buffer. + // If temp buffer is needed, its allocation is lazy i.e. its not allocated + // until its needed + cl_int temp_buffer_needed; + + // Batch size is runtime parameter and size of temporary buffer (if needed) + // depends on batch size. Allocation of temporary buffer is lazy i.e. its + // only created when needed. Once its created at first call of clFFT_Executexxx + // it is not allocated next time if next time clFFT_Executexxx is called with + // batch size different than the first call. last_batch_size caches the last + // batch size with which this plan is used so that we dont keep allocating/deallocating + // temp buffer if same batch size is used again and again. + size_t last_batch_size; + + // temporary buffer for interleaved plan + cl_mem tempmemobj; + + // temporary buffer for planner plan. Only one of tempmemobj or + // (tempmemobj_real, tempmemobj_imag) pair is valid (allocated) depending + // data format of plan (plannar or interleaved) + cl_mem tempmemobj_real, tempmemobj_imag; + + // Maximum size of signal for which local memory transposed based + // fft is sufficient i.e. no global mem transpose (communication) + // is needed + size_t max_localmem_fft_size; + + // Maximum work items per work group allowed. This, along with max_radix below controls + // maximum local memory being used by fft kernels of this plan. Set to 256 by default + size_t max_work_item_per_workgroup; + + // Maximum base radix for local memory fft ... this controls the maximum register + // space used by work items. Currently defaults to 16 + size_t max_radix; + + // Device depended parameter that tells how many work-items need to be read consecutive + // values to make sure global memory access by work-items of a work-group result in + // coalesced memory access to utilize full bandwidth e.g. on NVidia tesla, this is 16 + size_t min_mem_coalesce_width; + + // Number of local memory banks. This is used to geneate kernel with local memory + // transposes with appropriate padding to avoid bank conflicts to local memory + // e.g. on NVidia it is 16. + size_t num_local_mem_banks; }cl_fft_plan; void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir); diff --git a/src/fft_kernelstring.cpp b/src/fft_kernelstring.cpp index 53bf94afb5cfa96ffbf70a46c6ee6cccd8c73906..52a667c6a7f27124bd17c8856050d76718f04046 100644 --- a/src/fft_kernelstring.cpp +++ b/src/fft_kernelstring.cpp @@ -65,9 +65,9 @@ using namespace std; static string num2str(int num) { - char temp[200]; - sprintf(temp, "%d", num); - return string(temp); + char temp[200]; + sprintf(temp, "%d", num); + return string(temp); } // For any n, this function decomposes n into factors for loacal memory tranpose @@ -116,83 +116,83 @@ getRadixArray(unsigned int n, unsigned int *radixArray, unsigned int *numRadices return; } - switch(n) - { - case 2: - *numRadices = 1; - radixArray[0] = 2; - break; - - case 4: - *numRadices = 1; - radixArray[0] = 4; - break; - - case 8: - *numRadices = 1; - radixArray[0] = 8; - break; - - case 16: - *numRadices = 2; - radixArray[0] = 8; radixArray[1] = 2; - break; - - case 32: - *numRadices = 2; - radixArray[0] = 8; radixArray[1] = 4; - break; - - case 64: - *numRadices = 2; - radixArray[0] = 8; radixArray[1] = 8; - break; - - case 128: - *numRadices = 3; - radixArray[0] = 8; radixArray[1] = 4; radixArray[2] = 4; - break; - - case 256: - *numRadices = 4; - radixArray[0] = 4; radixArray[1] = 4; radixArray[2] = 4; radixArray[3] = 4; - break; - - case 512: - *numRadices = 3; - radixArray[0] = 8; radixArray[1] = 8; radixArray[2] = 8; - break; - - case 1024: - *numRadices = 3; - radixArray[0] = 16; radixArray[1] = 16; radixArray[2] = 4; - break; - case 2048: - *numRadices = 4; - radixArray[0] = 8; radixArray[1] = 8; radixArray[2] = 8; radixArray[3] = 4; - break; - default: - *numRadices = 0; - return; - } + switch(n) + { + case 2: + *numRadices = 1; + radixArray[0] = 2; + break; + + case 4: + *numRadices = 1; + radixArray[0] = 4; + break; + + case 8: + *numRadices = 1; + radixArray[0] = 8; + break; + + case 16: + *numRadices = 2; + radixArray[0] = 8; radixArray[1] = 2; + break; + + case 32: + *numRadices = 2; + radixArray[0] = 8; radixArray[1] = 4; + break; + + case 64: + *numRadices = 2; + radixArray[0] = 8; radixArray[1] = 8; + break; + + case 128: + *numRadices = 3; + radixArray[0] = 8; radixArray[1] = 4; radixArray[2] = 4; + break; + + case 256: + *numRadices = 4; + radixArray[0] = 4; radixArray[1] = 4; radixArray[2] = 4; radixArray[3] = 4; + break; + + case 512: + *numRadices = 3; + radixArray[0] = 8; radixArray[1] = 8; radixArray[2] = 8; + break; + + case 1024: + *numRadices = 3; + radixArray[0] = 16; radixArray[1] = 16; radixArray[2] = 4; + break; + case 2048: + *numRadices = 4; + radixArray[0] = 8; radixArray[1] = 8; radixArray[2] = 8; radixArray[3] = 4; + break; + default: + *numRadices = 0; + return; + } } static void insertHeader(string &kernelString, string &kernelName, clFFT_DataFormat dataFormat) { - if(dataFormat == clFFT_SplitComplexFormat) - kernelString += string("__kernel void ") + kernelName + string("(__global float *in_real, __global float *in_imag, __global float *out_real, __global float *out_imag, int dir, int S)\n"); - else - kernelString += string("__kernel void ") + kernelName + string("(__global float2 *in, __global float2 *out, int dir, int S)\n"); + if(dataFormat == clFFT_SplitComplexFormat) + kernelString += string("__kernel void ") + kernelName + string("(__global float *in_real, __global float *in_imag, __global float *out_real, __global float *out_imag, int dir, int S)\n"); + else + kernelString += string("__kernel void ") + kernelName + string("(__global float2 *in, __global float2 *out, int dir, int S)\n"); } static void insertVariables(string &kStream, int maxRadix) { - kStream += string(" int i, j, r, indexIn, indexOut, index, tid, bNum, xNum, k, l;\n"); + kStream += string(" int i, j, r, indexIn, indexOut, index, tid, bNum, xNum, k, l;\n"); kStream += string(" int s, ii, jj, offset;\n"); - kStream += string(" float2 w;\n"); - kStream += string(" float ang, angf, ang1;\n"); + kStream += string(" float2 w;\n"); + kStream += string(" float ang, angf, ang1;\n"); kStream += string(" __local float *lMemStore, *lMemLoad;\n"); kStream += string(" float2 a[") + num2str(maxRadix) + string("];\n"); kStream += string(" int lId = get_local_id( 0 );\n"); @@ -202,82 +202,82 @@ insertVariables(string &kStream, int maxRadix) static void formattedLoad(string &kernelString, int aIndex, int gIndex, clFFT_DataFormat dataFormat) { - if(dataFormat == clFFT_InterleavedComplexFormat) - kernelString += string(" a[") + num2str(aIndex) + string("] = in[") + num2str(gIndex) + string("];\n"); - else - { - kernelString += string(" a[") + num2str(aIndex) + string("].x = in_real[") + num2str(gIndex) + string("];\n"); - kernelString += string(" a[") + num2str(aIndex) + string("].y = in_imag[") + num2str(gIndex) + string("];\n"); - } + if(dataFormat == clFFT_InterleavedComplexFormat) + kernelString += string(" a[") + num2str(aIndex) + string("] = in[") + num2str(gIndex) + string("];\n"); + else + { + kernelString += string(" a[") + num2str(aIndex) + string("].x = in_real[") + num2str(gIndex) + string("];\n"); + kernelString += string(" a[") + num2str(aIndex) + string("].y = in_imag[") + num2str(gIndex) + string("];\n"); + } } static void formattedStore(string &kernelString, int aIndex, int gIndex, clFFT_DataFormat dataFormat) { - if(dataFormat == clFFT_InterleavedComplexFormat) - kernelString += string(" out[") + num2str(gIndex) + string("] = a[") + num2str(aIndex) + string("];\n"); - else - { - kernelString += string(" out_real[") + num2str(gIndex) + string("] = a[") + num2str(aIndex) + string("].x;\n"); - kernelString += string(" out_imag[") + num2str(gIndex) + string("] = a[") + num2str(aIndex) + string("].y;\n"); - } + if(dataFormat == clFFT_InterleavedComplexFormat) + kernelString += string(" out[") + num2str(gIndex) + string("] = a[") + num2str(aIndex) + string("];\n"); + else + { + kernelString += string(" out_real[") + num2str(gIndex) + string("] = a[") + num2str(aIndex) + string("].x;\n"); + kernelString += string(" out_imag[") + num2str(gIndex) + string("] = a[") + num2str(aIndex) + string("].y;\n"); + } } static int insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXForm, int numXFormsPerWG, int R0, int mem_coalesce_width, clFFT_DataFormat dataFormat) { - int log2NumWorkItemsPerXForm = (int) log2(numWorkItemsPerXForm); - int groupSize = numWorkItemsPerXForm * numXFormsPerWG; - int i, j; - int lMemSize = 0; + int log2NumWorkItemsPerXForm = (int) log2(numWorkItemsPerXForm); + int groupSize = numWorkItemsPerXForm * numXFormsPerWG; + int i, j; + int lMemSize = 0; - if(numXFormsPerWG > 1) - kernelString += string(" s = S & ") + num2str(numXFormsPerWG - 1) + string(";\n"); + if(numXFormsPerWG > 1) + kernelString += string(" s = S & ") + num2str(numXFormsPerWG - 1) + string(";\n"); if(numWorkItemsPerXForm >= mem_coalesce_width) { - if(numXFormsPerWG > 1) - { + if(numXFormsPerWG > 1) + { kernelString += string(" ii = lId & ") + num2str(numWorkItemsPerXForm-1) + string(";\n"); kernelString += string(" jj = lId >> ") + num2str(log2NumWorkItemsPerXForm) + string(";\n"); kernelString += string(" if( !s || (groupId < get_num_groups(0)-1) || (jj < s) ) {\n"); - kernelString += string(" offset = mad24( mad24(groupId, ") + num2str(numXFormsPerWG) + string(", jj), ") + num2str(N) + string(", ii );\n"); - if(dataFormat == clFFT_InterleavedComplexFormat) - { - kernelString += string(" in += offset;\n"); - kernelString += string(" out += offset;\n"); - } - else - { - kernelString += string(" in_real += offset;\n"); - kernelString += string(" in_imag += offset;\n"); - kernelString += string(" out_real += offset;\n"); - kernelString += string(" out_imag += offset;\n"); - } - for(i = 0; i < R0; i++) - formattedLoad(kernelString, i, i*numWorkItemsPerXForm, dataFormat); - kernelString += string(" }\n"); - } - else - { - kernelString += string(" ii = lId;\n"); - kernelString += string(" jj = 0;\n"); - kernelString += string(" offset = mad24(groupId, ") + num2str(N) + string(", ii);\n"); - if(dataFormat == clFFT_InterleavedComplexFormat) - { - kernelString += string(" in += offset;\n"); - kernelString += string(" out += offset;\n"); - } - else - { - kernelString += string(" in_real += offset;\n"); - kernelString += string(" in_imag += offset;\n"); - kernelString += string(" out_real += offset;\n"); - kernelString += string(" out_imag += offset;\n"); - } - for(i = 0; i < R0; i++) - formattedLoad(kernelString, i, i*numWorkItemsPerXForm, dataFormat); - } + kernelString += string(" offset = mad24( mad24(groupId, ") + num2str(numXFormsPerWG) + string(", jj), ") + num2str(N) + string(", ii );\n"); + if(dataFormat == clFFT_InterleavedComplexFormat) + { + kernelString += string(" in += offset;\n"); + kernelString += string(" out += offset;\n"); + } + else + { + kernelString += string(" in_real += offset;\n"); + kernelString += string(" in_imag += offset;\n"); + kernelString += string(" out_real += offset;\n"); + kernelString += string(" out_imag += offset;\n"); + } + for(i = 0; i < R0; i++) + formattedLoad(kernelString, i, i*numWorkItemsPerXForm, dataFormat); + kernelString += string(" }\n"); + } + else + { + kernelString += string(" ii = lId;\n"); + kernelString += string(" jj = 0;\n"); + kernelString += string(" offset = mad24(groupId, ") + num2str(N) + string(", ii);\n"); + if(dataFormat == clFFT_InterleavedComplexFormat) + { + kernelString += string(" in += offset;\n"); + kernelString += string(" out += offset;\n"); + } + else + { + kernelString += string(" in_real += offset;\n"); + kernelString += string(" in_imag += offset;\n"); + kernelString += string(" out_real += offset;\n"); + kernelString += string(" out_imag += offset;\n"); + } + for(i = 0; i < R0; i++) + formattedLoad(kernelString, i, i*numWorkItemsPerXForm, dataFormat); + } } else if( N >= mem_coalesce_width ) { @@ -289,120 +289,120 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF kernelString += string(" lMemStore = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); kernelString += string(" offset = mad24( groupId, ") + num2str(numXFormsPerWG) + string(", jj);\n"); kernelString += string(" offset = mad24( offset, ") + num2str(N) + string(", ii );\n"); - if(dataFormat == clFFT_InterleavedComplexFormat) - { - kernelString += string(" in += offset;\n"); - kernelString += string(" out += offset;\n"); - } - else - { - kernelString += string(" in_real += offset;\n"); - kernelString += string(" in_imag += offset;\n"); - kernelString += string(" out_real += offset;\n"); - kernelString += string(" out_imag += offset;\n"); - } - - kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n"); + if(dataFormat == clFFT_InterleavedComplexFormat) + { + kernelString += string(" in += offset;\n"); + kernelString += string(" out += offset;\n"); + } + else + { + kernelString += string(" in_real += offset;\n"); + kernelString += string(" in_imag += offset;\n"); + kernelString += string(" out_real += offset;\n"); + kernelString += string(" out_imag += offset;\n"); + } + + kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n"); for(i = 0; i < numOuterIter; i++ ) { kernelString += string(" if( jj < s ) {\n"); - for(j = 0; j < numInnerIter; j++ ) - formattedLoad(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * N, dataFormat); - kernelString += string(" }\n"); - if(i != numOuterIter - 1) - kernelString += string(" jj += ") + num2str(groupSize / mem_coalesce_width) + string(";\n"); + for(j = 0; j < numInnerIter; j++ ) + formattedLoad(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * N, dataFormat); + kernelString += string(" }\n"); + if(i != numOuterIter - 1) + kernelString += string(" jj += ") + num2str(groupSize / mem_coalesce_width) + string(";\n"); } - kernelString += string("}\n "); - kernelString += string("else {\n"); + kernelString += string("}\n "); + kernelString += string("else {\n"); for(i = 0; i < numOuterIter; i++ ) { - for(j = 0; j < numInnerIter; j++ ) - formattedLoad(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * N, dataFormat); + for(j = 0; j < numInnerIter; j++ ) + formattedLoad(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * N, dataFormat); } - kernelString += string("}\n"); + kernelString += string("}\n"); - kernelString += string(" ii = lId & ") + num2str(numWorkItemsPerXForm - 1) + string(";\n"); - kernelString += string(" jj = lId >> ") + num2str(log2NumWorkItemsPerXForm) + string(";\n"); + kernelString += string(" ii = lId & ") + num2str(numWorkItemsPerXForm - 1) + string(";\n"); + kernelString += string(" jj = lId >> ") + num2str(log2NumWorkItemsPerXForm) + string(";\n"); kernelString += string(" lMemLoad = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii);\n"); for( i = 0; i < numOuterIter; i++ ) - { - for( j = 0; j < numInnerIter; j++ ) - { - kernelString += string(" lMemStore[") + num2str(j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * (N + numWorkItemsPerXForm )) + string("] = a[") + - num2str(i * numInnerIter + j) + string("].x;\n"); - } - } + { + for( j = 0; j < numInnerIter; j++ ) + { + kernelString += string(" lMemStore[") + num2str(j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * (N + numWorkItemsPerXForm )) + string("] = a[") + + num2str(i * numInnerIter + j) + string("].x;\n"); + } + } + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + for( i = 0; i < R0; i++ ) + kernelString += string(" a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n"); + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + for( i = 0; i < numOuterIter; i++ ) + { + for( j = 0; j < numInnerIter; j++ ) + { + kernelString += string(" lMemStore[") + num2str(j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * (N + numWorkItemsPerXForm )) + string("] = a[") + + num2str(i * numInnerIter + j) + string("].y;\n"); + } + } kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); for( i = 0; i < R0; i++ ) - kernelString += string(" a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n"); - kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); - - for( i = 0; i < numOuterIter; i++ ) - { - for( j = 0; j < numInnerIter; j++ ) - { - kernelString += string(" lMemStore[") + num2str(j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * (N + numWorkItemsPerXForm )) + string("] = a[") + - num2str(i * numInnerIter + j) + string("].y;\n"); - } - } - kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); - - for( i = 0; i < R0; i++ ) - kernelString += string(" a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n"); - kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); - - lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG; + kernelString += string(" a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n"); + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG; } else { kernelString += string(" offset = mad24( groupId, ") + num2str(N * numXFormsPerWG) + string(", lId );\n"); - if(dataFormat == clFFT_InterleavedComplexFormat) - { - kernelString += string(" in += offset;\n"); - kernelString += string(" out += offset;\n"); - } - else - { - kernelString += string(" in_real += offset;\n"); - kernelString += string(" in_imag += offset;\n"); - kernelString += string(" out_real += offset;\n"); - kernelString += string(" out_imag += offset;\n"); - } + if(dataFormat == clFFT_InterleavedComplexFormat) + { + kernelString += string(" in += offset;\n"); + kernelString += string(" out += offset;\n"); + } + else + { + kernelString += string(" in_real += offset;\n"); + kernelString += string(" in_imag += offset;\n"); + kernelString += string(" out_real += offset;\n"); + kernelString += string(" out_imag += offset;\n"); + } kernelString += string(" ii = lId & ") + num2str(N-1) + string(";\n"); kernelString += string(" jj = lId >> ") + num2str((int)log2(N)) + string(";\n"); kernelString += string(" lMemStore = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); - kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n"); + kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n"); for( i = 0; i < R0; i++ ) { kernelString += string(" if(jj < s )\n"); - formattedLoad(kernelString, i, i*groupSize, dataFormat); - if(i != R0 - 1) - kernelString += string(" jj += ") + num2str(groupSize / N) + string(";\n"); + formattedLoad(kernelString, i, i*groupSize, dataFormat); + if(i != R0 - 1) + kernelString += string(" jj += ") + num2str(groupSize / N) + string(";\n"); } - kernelString += string("}\n"); - kernelString += string("else {\n"); + kernelString += string("}\n"); + kernelString += string("else {\n"); for( i = 0; i < R0; i++ ) { - formattedLoad(kernelString, i, i*groupSize, dataFormat); + formattedLoad(kernelString, i, i*groupSize, dataFormat); } - kernelString += string("}\n"); + kernelString += string("}\n"); - if(numWorkItemsPerXForm > 1) - { + if(numWorkItemsPerXForm > 1) + { kernelString += string(" ii = lId & ") + num2str(numWorkItemsPerXForm - 1) + string(";\n"); kernelString += string(" jj = lId >> ") + num2str(log2NumWorkItemsPerXForm) + string(";\n"); kernelString += string(" lMemLoad = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); - } - else - { + } + else + { kernelString += string(" ii = 0;\n"); kernelString += string(" jj = lId;\n"); kernelString += string(" lMemLoad = sMem + mul24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(");\n"); - } + } for( i = 0; i < R0; i++ ) @@ -411,7 +411,7 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF for( i = 0; i < R0; i++ ) kernelString += string(" a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n"); - kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); for( i = 0; i < R0; i++ ) kernelString += string(" lMemStore[") + num2str(i * ( groupSize / N ) * ( N + numWorkItemsPerXForm )) + string("] = a[") + num2str(i) + string("].y;\n"); @@ -419,39 +419,39 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF for( i = 0; i < R0; i++ ) kernelString += string(" a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n"); - kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); - lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG; + lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG; } - return lMemSize; + return lMemSize; } static int insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr, int numWorkItemsPerXForm, int numXFormsPerWG, int mem_coalesce_width, clFFT_DataFormat dataFormat) { - int groupSize = numWorkItemsPerXForm * numXFormsPerWG; - int i, j, k, ind; - int lMemSize = 0; - int numIter = maxRadix / Nr; - string indent = string(""); + int groupSize = numWorkItemsPerXForm * numXFormsPerWG; + int i, j, k, ind; + int lMemSize = 0; + int numIter = maxRadix / Nr; + string indent = string(""); if( numWorkItemsPerXForm >= mem_coalesce_width ) { - if(numXFormsPerWG > 1) - { + if(numXFormsPerWG > 1) + { kernelString += string(" if( !s || (groupId < get_num_groups(0)-1) || (jj < s) ) {\n"); - indent = string(" "); - } - for(i = 0; i < maxRadix; i++) - { - j = i % numIter; - k = i / numIter; - ind = j * Nr + k; - formattedStore(kernelString, ind, i*numWorkItemsPerXForm, dataFormat); - } - if(numXFormsPerWG > 1) - kernelString += string(" }\n"); + indent = string(" "); + } + for(i = 0; i < maxRadix; i++) + { + j = i % numIter; + k = i / numIter; + ind = j * Nr + k; + formattedStore(kernelString, ind, i*numWorkItemsPerXForm, dataFormat); + } + if(numXFormsPerWG > 1) + kernelString += string(" }\n"); } else if( N >= mem_coalesce_width ) { @@ -464,69 +464,69 @@ insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr kernelString += string(" lMemStore = sMem + mad24( jj,") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); for( i = 0; i < maxRadix; i++ ) - { - j = i % numIter; - k = i / numIter; - ind = j * Nr + k; + { + j = i % numIter; + k = i / numIter; + ind = j * Nr + k; kernelString += string(" lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].x;\n"); - } + } kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); for( i = 0; i < numOuterIter; i++ ) - for( j = 0; j < numInnerIter; j++ ) - kernelString += string(" a[") + num2str(i*numInnerIter + j) + string("].x = lMemStore[") + num2str(j*mem_coalesce_width + i*( groupSize / mem_coalesce_width )*(N + numWorkItemsPerXForm)) + string("];\n"); + for( j = 0; j < numInnerIter; j++ ) + kernelString += string(" a[") + num2str(i*numInnerIter + j) + string("].x = lMemStore[") + num2str(j*mem_coalesce_width + i*( groupSize / mem_coalesce_width )*(N + numWorkItemsPerXForm)) + string("];\n"); kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); for( i = 0; i < maxRadix; i++ ) - { - j = i % numIter; - k = i / numIter; - ind = j * Nr + k; + { + j = i % numIter; + k = i / numIter; + ind = j * Nr + k; kernelString += string(" lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].y;\n"); - } + } kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); for( i = 0; i < numOuterIter; i++ ) - for( j = 0; j < numInnerIter; j++ ) - kernelString += string(" a[") + num2str(i*numInnerIter + j) + string("].y = lMemStore[") + num2str(j*mem_coalesce_width + i*( groupSize / mem_coalesce_width )*(N + numWorkItemsPerXForm)) + string("];\n"); + for( j = 0; j < numInnerIter; j++ ) + kernelString += string(" a[") + num2str(i*numInnerIter + j) + string("].y = lMemStore[") + num2str(j*mem_coalesce_width + i*( groupSize / mem_coalesce_width )*(N + numWorkItemsPerXForm)) + string("];\n"); kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); - kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n"); - for(i = 0; i < numOuterIter; i++ ) + kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n"); + for(i = 0; i < numOuterIter; i++ ) { kernelString += string(" if( jj < s ) {\n"); - for(j = 0; j < numInnerIter; j++ ) - formattedStore(kernelString, i*numInnerIter + j, j*mem_coalesce_width + i*(groupSize/mem_coalesce_width)*N, dataFormat); - kernelString += string(" }\n"); - if(i != numOuterIter - 1) - kernelString += string(" jj += ") + num2str(groupSize / mem_coalesce_width) + string(";\n"); + for(j = 0; j < numInnerIter; j++ ) + formattedStore(kernelString, i*numInnerIter + j, j*mem_coalesce_width + i*(groupSize/mem_coalesce_width)*N, dataFormat); + kernelString += string(" }\n"); + if(i != numOuterIter - 1) + kernelString += string(" jj += ") + num2str(groupSize / mem_coalesce_width) + string(";\n"); } - kernelString += string("}\n"); - kernelString += string("else {\n"); - for(i = 0; i < numOuterIter; i++ ) + kernelString += string("}\n"); + kernelString += string("else {\n"); + for(i = 0; i < numOuterIter; i++ ) { - for(j = 0; j < numInnerIter; j++ ) - formattedStore(kernelString, i*numInnerIter + j, j*mem_coalesce_width + i*(groupSize/mem_coalesce_width)*N, dataFormat); + for(j = 0; j < numInnerIter; j++ ) + formattedStore(kernelString, i*numInnerIter + j, j*mem_coalesce_width + i*(groupSize/mem_coalesce_width)*N, dataFormat); } - kernelString += string("}\n"); + kernelString += string("}\n"); - lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG; - } + lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG; + } else { kernelString += string(" lMemLoad = sMem + mad24( jj,") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); - kernelString += string(" ii = lId & ") + num2str(N - 1) + string(";\n"); + kernelString += string(" ii = lId & ") + num2str(N - 1) + string(";\n"); kernelString += string(" jj = lId >> ") + num2str((int) log2(N)) + string(";\n"); kernelString += string(" lMemStore = sMem + mad24( jj,") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); for( i = 0; i < maxRadix; i++ ) - { - j = i % numIter; - k = i / numIter; - ind = j * Nr + k; + { + j = i % numIter; + k = i / numIter; + ind = j * Nr + k; kernelString += string(" lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].x;\n"); - } + } kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); for( i = 0; i < maxRadix; i++ ) @@ -534,188 +534,188 @@ insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); for( i = 0; i < maxRadix; i++ ) - { - j = i % numIter; - k = i / numIter; - ind = j * Nr + k; + { + j = i % numIter; + k = i / numIter; + ind = j * Nr + k; kernelString += string(" lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].y;\n"); - } + } kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); for( i = 0; i < maxRadix; i++ ) kernelString += string(" a[") + num2str(i) + string("].y = lMemStore[") + num2str(i*( groupSize / N )*( N + numWorkItemsPerXForm )) + string("];\n"); kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); - kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n"); - for( i = 0; i < maxRadix; i++ ) + kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n"); + for( i = 0; i < maxRadix; i++ ) { kernelString += string(" if(jj < s ) {\n"); - formattedStore(kernelString, i, i*groupSize, dataFormat); - kernelString += string(" }\n"); - if( i != maxRadix - 1) - kernelString += string(" jj +=") + num2str(groupSize / N) + string(";\n"); + formattedStore(kernelString, i, i*groupSize, dataFormat); + kernelString += string(" }\n"); + if( i != maxRadix - 1) + kernelString += string(" jj +=") + num2str(groupSize / N) + string(";\n"); } - kernelString += string("}\n"); - kernelString += string("else {\n"); - for( i = 0; i < maxRadix; i++ ) + kernelString += string("}\n"); + kernelString += string("else {\n"); + for( i = 0; i < maxRadix; i++ ) { - formattedStore(kernelString, i, i*groupSize, dataFormat); + formattedStore(kernelString, i, i*groupSize, dataFormat); } - kernelString += string("}\n"); + kernelString += string("}\n"); - lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG; + lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG; } - return lMemSize; + return lMemSize; } static void insertfftKernel(string &kernelString, int Nr, int numIter) { - int i; - for(i = 0; i < numIter; i++) - { - kernelString += string(" fftKernel") + num2str(Nr) + string("(a+") + num2str(i*Nr) + string(", dir);\n"); - } + int i; + for(i = 0; i < numIter; i++) + { + kernelString += string(" fftKernel") + num2str(Nr) + string("(a+") + num2str(i*Nr) + string(", dir);\n"); + } } static void insertTwiddleKernel(string &kernelString, int Nr, int numIter, int Nprev, int len, int numWorkItemsPerXForm) { - int z, k; - int logNPrev = log2(Nprev); - - for(z = 0; z < numIter; z++) - { - if(z == 0) - { - if(Nprev > 1) - kernelString += string(" angf = (float) (ii >> ") + num2str(logNPrev) + string(");\n"); - else - kernelString += string(" angf = (float) ii;\n"); - } - else - { - if(Nprev > 1) - kernelString += string(" angf = (float) ((") + num2str(z*numWorkItemsPerXForm) + string(" + ii) >>") + num2str(logNPrev) + string(");\n"); - else - kernelString += string(" angf = (float) (") + num2str(z*numWorkItemsPerXForm) + string(" + ii);\n"); - } - - for(k = 1; k < Nr; k++) { - int ind = z*Nr + k; - //float fac = (float) (2.0 * M_PI * (double) k / (double) len); - kernelString += string(" ang = dir * ( 2.0f * M_PI * ") + num2str(k) + string(".0f / ") + num2str(len) + string(".0f )") + string(" * angf;\n"); - kernelString += string(" w = (float2)(native_cos(ang), native_sin(ang));\n"); - kernelString += string(" a[") + num2str(ind) + string("] = complexMul(a[") + num2str(ind) + string("], w);\n"); - } - } + int z, k; + int logNPrev = log2(Nprev); + + for(z = 0; z < numIter; z++) + { + if(z == 0) + { + if(Nprev > 1) + kernelString += string(" angf = (float) (ii >> ") + num2str(logNPrev) + string(");\n"); + else + kernelString += string(" angf = (float) ii;\n"); + } + else + { + if(Nprev > 1) + kernelString += string(" angf = (float) ((") + num2str(z*numWorkItemsPerXForm) + string(" + ii) >>") + num2str(logNPrev) + string(");\n"); + else + kernelString += string(" angf = (float) (") + num2str(z*numWorkItemsPerXForm) + string(" + ii);\n"); + } + + for(k = 1; k < Nr; k++) { + int ind = z*Nr + k; + //float fac = (float) (2.0 * M_PI * (double) k / (double) len); + kernelString += string(" ang = dir * ( 2.0f * M_PI * ") + num2str(k) + string(".0f / ") + num2str(len) + string(".0f )") + string(" * angf;\n"); + kernelString += string(" w = (float2)(native_cos(ang), native_sin(ang));\n"); + kernelString += string(" a[") + num2str(ind) + string("] = complexMul(a[") + num2str(ind) + string("], w);\n"); + } + } } static int getPadding(int numWorkItemsPerXForm, int Nprev, int numWorkItemsReq, int numXFormsPerWG, int Nr, int numBanks, int *offset, int *midPad) { - if((numWorkItemsPerXForm <= Nprev) || (Nprev >= numBanks)) - *offset = 0; - else { - int numRowsReq = ((numWorkItemsPerXForm < numBanks) ? numWorkItemsPerXForm : numBanks) / Nprev; - int numColsReq = 1; - if(numRowsReq > Nr) - numColsReq = numRowsReq / Nr; - numColsReq = Nprev * numColsReq; - *offset = numColsReq; - } - - if(numWorkItemsPerXForm >= numBanks || numXFormsPerWG == 1) - *midPad = 0; - else { - int bankNum = ( (numWorkItemsReq + *offset) * Nr ) & (numBanks - 1); - if( bankNum >= numWorkItemsPerXForm ) - *midPad = 0; - else - *midPad = numWorkItemsPerXForm - bankNum; - } - - int lMemSize = ( numWorkItemsReq + *offset) * Nr * numXFormsPerWG + *midPad * (numXFormsPerWG - 1); - return lMemSize; + if((numWorkItemsPerXForm <= Nprev) || (Nprev >= numBanks)) + *offset = 0; + else { + int numRowsReq = ((numWorkItemsPerXForm < numBanks) ? numWorkItemsPerXForm : numBanks) / Nprev; + int numColsReq = 1; + if(numRowsReq > Nr) + numColsReq = numRowsReq / Nr; + numColsReq = Nprev * numColsReq; + *offset = numColsReq; + } + + if(numWorkItemsPerXForm >= numBanks || numXFormsPerWG == 1) + *midPad = 0; + else { + int bankNum = ( (numWorkItemsReq + *offset) * Nr ) & (numBanks - 1); + if( bankNum >= numWorkItemsPerXForm ) + *midPad = 0; + else + *midPad = numWorkItemsPerXForm - bankNum; + } + + int lMemSize = ( numWorkItemsReq + *offset) * Nr * numXFormsPerWG + *midPad * (numXFormsPerWG - 1); + return lMemSize; } static void insertLocalStores(string &kernelString, int numIter, int Nr, int numWorkItemsPerXForm, int numWorkItemsReq, int offset, string &comp) { - int z, k; - - for(z = 0; z < numIter; z++) { - for(k = 0; k < Nr; k++) { - int index = k*(numWorkItemsReq + offset) + z*numWorkItemsPerXForm; - kernelString += string(" lMemStore[") + num2str(index) + string("] = a[") + num2str(z*Nr + k) + string("].") + comp + string(";\n"); - } - } - kernelString += string(" barrier(CLK_LOCAL_MEM_FENCE);\n"); + int z, k; + + for(z = 0; z < numIter; z++) { + for(k = 0; k < Nr; k++) { + int index = k*(numWorkItemsReq + offset) + z*numWorkItemsPerXForm; + kernelString += string(" lMemStore[") + num2str(index) + string("] = a[") + num2str(z*Nr + k) + string("].") + comp + string(";\n"); + } + } + kernelString += string(" barrier(CLK_LOCAL_MEM_FENCE);\n"); } static void insertLocalLoads(string &kernelString, int n, int Nr, int Nrn, int Nprev, int Ncurr, int numWorkItemsPerXForm, int numWorkItemsReq, int offset, string &comp) { - int numWorkItemsReqN = n / Nrn; - int interBlockHNum = max( Nprev / numWorkItemsPerXForm, 1 ); - int interBlockHStride = numWorkItemsPerXForm; - int vertWidth = max(numWorkItemsPerXForm / Nprev, 1); - vertWidth = min( vertWidth, Nr); - int vertNum = Nr / vertWidth; - int vertStride = ( n / Nr + offset ) * vertWidth; - int iter = max( numWorkItemsReqN / numWorkItemsPerXForm, 1); - int intraBlockHStride = (numWorkItemsPerXForm / (Nprev*Nr)) > 1 ? (numWorkItemsPerXForm / (Nprev*Nr)) : 1; - intraBlockHStride *= Nprev; - - int stride = numWorkItemsReq / Nrn; - int i; - for(i = 0; i < iter; i++) { - int ii = i / (interBlockHNum * vertNum); - int zz = i % (interBlockHNum * vertNum); - int jj = zz % interBlockHNum; - int kk = zz / interBlockHNum; - int z; - for(z = 0; z < Nrn; z++) { - int st = kk * vertStride + jj * interBlockHStride + ii * intraBlockHStride + z * stride; - kernelString += string(" a[") + num2str(i*Nrn + z) + string("].") + comp + string(" = lMemLoad[") + num2str(st) + string("];\n"); - } - } - kernelString += string(" barrier(CLK_LOCAL_MEM_FENCE);\n"); + int numWorkItemsReqN = n / Nrn; + int interBlockHNum = max( Nprev / numWorkItemsPerXForm, 1 ); + int interBlockHStride = numWorkItemsPerXForm; + int vertWidth = max(numWorkItemsPerXForm / Nprev, 1); + vertWidth = min( vertWidth, Nr); + int vertNum = Nr / vertWidth; + int vertStride = ( n / Nr + offset ) * vertWidth; + int iter = max( numWorkItemsReqN / numWorkItemsPerXForm, 1); + int intraBlockHStride = (numWorkItemsPerXForm / (Nprev*Nr)) > 1 ? (numWorkItemsPerXForm / (Nprev*Nr)) : 1; + intraBlockHStride *= Nprev; + + int stride = numWorkItemsReq / Nrn; + int i; + for(i = 0; i < iter; i++) { + int ii = i / (interBlockHNum * vertNum); + int zz = i % (interBlockHNum * vertNum); + int jj = zz % interBlockHNum; + int kk = zz / interBlockHNum; + int z; + for(z = 0; z < Nrn; z++) { + int st = kk * vertStride + jj * interBlockHStride + ii * intraBlockHStride + z * stride; + kernelString += string(" a[") + num2str(i*Nrn + z) + string("].") + comp + string(" = lMemLoad[") + num2str(st) + string("];\n"); + } + } + kernelString += string(" barrier(CLK_LOCAL_MEM_FENCE);\n"); } static void insertLocalLoadIndexArithmatic(string &kernelString, int Nprev, int Nr, int numWorkItemsReq, int numWorkItemsPerXForm, int numXFormsPerWG, int offset, int midPad) { - int Ncurr = Nprev * Nr; - int logNcurr = log2(Ncurr); - int logNprev = log2(Nprev); - int incr = (numWorkItemsReq + offset) * Nr + midPad; - - if(Ncurr < numWorkItemsPerXForm) - { - if(Nprev == 1) - kernelString += string(" j = ii & ") + num2str(Ncurr - 1) + string(";\n"); - else - kernelString += string(" j = (ii & ") + num2str(Ncurr - 1) + string(") >> ") + num2str(logNprev) + string(";\n"); - - if(Nprev == 1) - kernelString += string(" i = ii >> ") + num2str(logNcurr) + string(";\n"); - else - kernelString += string(" i = mad24(ii >> ") + num2str(logNcurr) + string(", ") + num2str(Nprev) + string(", ii & ") + num2str(Nprev - 1) + string(");\n"); - } - else - { - if(Nprev == 1) - kernelString += string(" j = ii;\n"); - else - kernelString += string(" j = ii >> ") + num2str(logNprev) + string(";\n"); - if(Nprev == 1) - kernelString += string(" i = 0;\n"); - else - kernelString += string(" i = ii & ") + num2str(Nprev - 1) + string(";\n"); - } + int Ncurr = Nprev * Nr; + int logNcurr = log2(Ncurr); + int logNprev = log2(Nprev); + int incr = (numWorkItemsReq + offset) * Nr + midPad; + + if(Ncurr < numWorkItemsPerXForm) + { + if(Nprev == 1) + kernelString += string(" j = ii & ") + num2str(Ncurr - 1) + string(";\n"); + else + kernelString += string(" j = (ii & ") + num2str(Ncurr - 1) + string(") >> ") + num2str(logNprev) + string(";\n"); + + if(Nprev == 1) + kernelString += string(" i = ii >> ") + num2str(logNcurr) + string(";\n"); + else + kernelString += string(" i = mad24(ii >> ") + num2str(logNcurr) + string(", ") + num2str(Nprev) + string(", ii & ") + num2str(Nprev - 1) + string(");\n"); + } + else + { + if(Nprev == 1) + kernelString += string(" j = ii;\n"); + else + kernelString += string(" j = ii >> ") + num2str(logNprev) + string(";\n"); + if(Nprev == 1) + kernelString += string(" i = 0;\n"); + else + kernelString += string(" i = ii & ") + num2str(Nprev - 1) + string(";\n"); + } if(numXFormsPerWG > 1) kernelString += string(" i = mad24(jj, ") + num2str(incr) + string(", i);\n"); @@ -726,126 +726,126 @@ insertLocalLoadIndexArithmatic(string &kernelString, int Nprev, int Nr, int numW static void insertLocalStoreIndexArithmatic(string &kernelString, int numWorkItemsReq, int numXFormsPerWG, int Nr, int offset, int midPad) { - if(numXFormsPerWG == 1) { - kernelString += string(" lMemStore = sMem + ii;\n"); - } - else { - kernelString += string(" lMemStore = sMem + mad24(jj, ") + num2str((numWorkItemsReq + offset)*Nr + midPad) + string(", ii);\n"); - } + if(numXFormsPerWG == 1) { + kernelString += string(" lMemStore = sMem + ii;\n"); + } + else { + kernelString += string(" lMemStore = sMem + mad24(jj, ") + num2str((numWorkItemsReq + offset)*Nr + midPad) + string(", ii);\n"); + } } static void createLocalMemfftKernelString(cl_fft_plan *plan) { - unsigned int radixArray[10]; - unsigned int numRadix; + unsigned int radixArray[10]; + unsigned int numRadix; + + unsigned int n = plan->n.x; - unsigned int n = plan->n.x; + assert(n <= plan->max_work_item_per_workgroup * plan->max_radix && "signal lenght too big for local mem fft\n"); - assert(n <= plan->max_work_item_per_workgroup * plan->max_radix && "signal lenght too big for local mem fft\n"); + getRadixArray(n, radixArray, &numRadix, 0); + assert(numRadix > 0 && "no radix array supplied\n"); - getRadixArray(n, radixArray, &numRadix, 0); - assert(numRadix > 0 && "no radix array supplied\n"); + if(n/radixArray[0] > plan->max_work_item_per_workgroup) + getRadixArray(n, radixArray, &numRadix, plan->max_radix); - if(n/radixArray[0] > plan->max_work_item_per_workgroup) - getRadixArray(n, radixArray, &numRadix, plan->max_radix); + assert(radixArray[0] <= plan->max_radix && "max radix choosen is greater than allowed\n"); + assert(n/radixArray[0] <= plan->max_work_item_per_workgroup && "required work items per xform greater than maximum work items allowed per work group for local mem fft\n"); - assert(radixArray[0] <= plan->max_radix && "max radix choosen is greater than allowed\n"); - assert(n/radixArray[0] <= plan->max_work_item_per_workgroup && "required work items per xform greater than maximum work items allowed per work group for local mem fft\n"); + unsigned int tmpLen = 1; + unsigned int i; + for(i = 0; i < numRadix; i++) + { + assert( radixArray[i] && !( (radixArray[i] - 1) & radixArray[i] ) ); + tmpLen *= radixArray[i]; + } + assert(tmpLen == n && "product of radices choosen doesnt match the length of signal\n"); - unsigned int tmpLen = 1; - unsigned int i; - for(i = 0; i < numRadix; i++) - { - assert( radixArray[i] && !( (radixArray[i] - 1) & radixArray[i] ) ); - tmpLen *= radixArray[i]; - } - assert(tmpLen == n && "product of radices choosen doesnt match the length of signal\n"); + int offset, midPad; + string localString(""), kernelName(""); - int offset, midPad; - string localString(""), kernelName(""); + clFFT_DataFormat dataFormat = plan->format; + string *kernelString = plan->kernel_string; - clFFT_DataFormat dataFormat = plan->format; - string *kernelString = plan->kernel_string; + cl_fft_kernel_info **kInfo = &plan->kernel_info; + int kCount = 0; - cl_fft_kernel_info **kInfo = &plan->kernel_info; - int kCount = 0; + while(*kInfo) + { + kInfo = &(*kInfo)->next; + kCount++; + } - while(*kInfo) - { - kInfo = &(*kInfo)->next; - kCount++; - } + kernelName = string("fft") + num2str(kCount); + + *kInfo = (cl_fft_kernel_info *) malloc(sizeof(cl_fft_kernel_info)); + (*kInfo)->kernel = 0; + (*kInfo)->lmem_size = 0; + (*kInfo)->num_workgroups = 0; + (*kInfo)->num_workitems_per_workgroup = 0; + (*kInfo)->dir = cl_fft_kernel_x; + (*kInfo)->in_place_possible = 1; + (*kInfo)->next = NULL; + (*kInfo)->kernel_name = (char *) malloc(sizeof(char)*(kernelName.size()+1)); + strcpy((*kInfo)->kernel_name, kernelName.c_str()); + + unsigned int numWorkItemsPerXForm = n / radixArray[0]; + unsigned int numWorkItemsPerWG = numWorkItemsPerXForm <= 64 ? 64 : numWorkItemsPerXForm; + assert(numWorkItemsPerWG <= plan->max_work_item_per_workgroup); + int numXFormsPerWG = numWorkItemsPerWG / numWorkItemsPerXForm; + (*kInfo)->num_workgroups = 1; + (*kInfo)->num_xforms_per_workgroup = numXFormsPerWG; + (*kInfo)->num_workitems_per_workgroup = numWorkItemsPerWG; - kernelName = string("fft") + num2str(kCount); + unsigned int *N = radixArray; + unsigned int maxRadix = N[0]; + unsigned int lMemSize = 0; - *kInfo = (cl_fft_kernel_info *) malloc(sizeof(cl_fft_kernel_info)); - (*kInfo)->kernel = 0; - (*kInfo)->lmem_size = 0; - (*kInfo)->num_workgroups = 0; - (*kInfo)->num_workitems_per_workgroup = 0; - (*kInfo)->dir = cl_fft_kernel_x; - (*kInfo)->in_place_possible = 1; - (*kInfo)->next = NULL; - (*kInfo)->kernel_name = (char *) malloc(sizeof(char)*(kernelName.size()+1)); - strcpy((*kInfo)->kernel_name, kernelName.c_str()); + insertVariables(localString, maxRadix); - unsigned int numWorkItemsPerXForm = n / radixArray[0]; - unsigned int numWorkItemsPerWG = numWorkItemsPerXForm <= 64 ? 64 : numWorkItemsPerXForm; - assert(numWorkItemsPerWG <= plan->max_work_item_per_workgroup); - int numXFormsPerWG = numWorkItemsPerWG / numWorkItemsPerXForm; - (*kInfo)->num_workgroups = 1; - (*kInfo)->num_xforms_per_workgroup = numXFormsPerWG; - (*kInfo)->num_workitems_per_workgroup = numWorkItemsPerWG; - - unsigned int *N = radixArray; - unsigned int maxRadix = N[0]; - unsigned int lMemSize = 0; - - insertVariables(localString, maxRadix); - - lMemSize = insertGlobalLoadsAndTranspose(localString, n, numWorkItemsPerXForm, numXFormsPerWG, maxRadix, plan->min_mem_coalesce_width, dataFormat); - (*kInfo)->lmem_size = (lMemSize > (*kInfo)->lmem_size) ? lMemSize : (*kInfo)->lmem_size; - - string xcomp = string("x"); - string ycomp = string("y"); - - unsigned int Nprev = 1; - unsigned int len = n; - unsigned int r; - for(r = 0; r < numRadix; r++) - { - int numIter = N[0] / N[r]; - int numWorkItemsReq = n / N[r]; - int Ncurr = Nprev * N[r]; - insertfftKernel(localString, N[r], numIter); - - if(r < (numRadix - 1)) { - insertTwiddleKernel(localString, N[r], numIter, Nprev, len, numWorkItemsPerXForm); - lMemSize = getPadding(numWorkItemsPerXForm, Nprev, numWorkItemsReq, numXFormsPerWG, N[r], plan->num_local_mem_banks, &offset, &midPad); - (*kInfo)->lmem_size = (lMemSize > (*kInfo)->lmem_size) ? lMemSize : (*kInfo)->lmem_size; - insertLocalStoreIndexArithmatic(localString, numWorkItemsReq, numXFormsPerWG, N[r], offset, midPad); - insertLocalLoadIndexArithmatic(localString, Nprev, N[r], numWorkItemsReq, numWorkItemsPerXForm, numXFormsPerWG, offset, midPad); - insertLocalStores(localString, numIter, N[r], numWorkItemsPerXForm, numWorkItemsReq, offset, xcomp); - insertLocalLoads(localString, n, N[r], N[r+1], Nprev, Ncurr, numWorkItemsPerXForm, numWorkItemsReq, offset, xcomp); - insertLocalStores(localString, numIter, N[r], numWorkItemsPerXForm, numWorkItemsReq, offset, ycomp); - insertLocalLoads(localString, n, N[r], N[r+1], Nprev, Ncurr, numWorkItemsPerXForm, numWorkItemsReq, offset, ycomp); - Nprev = Ncurr; - len = len / N[r]; - } - } - - lMemSize = insertGlobalStoresAndTranspose(localString, n, maxRadix, N[numRadix - 1], numWorkItemsPerXForm, numXFormsPerWG, plan->min_mem_coalesce_width, dataFormat); - (*kInfo)->lmem_size = (lMemSize > (*kInfo)->lmem_size) ? lMemSize : (*kInfo)->lmem_size; - - insertHeader(*kernelString, kernelName, dataFormat); - *kernelString += string("{\n"); - if((*kInfo)->lmem_size) + lMemSize = insertGlobalLoadsAndTranspose(localString, n, numWorkItemsPerXForm, numXFormsPerWG, maxRadix, plan->min_mem_coalesce_width, dataFormat); + (*kInfo)->lmem_size = (lMemSize > (*kInfo)->lmem_size) ? lMemSize : (*kInfo)->lmem_size; + + string xcomp = string("x"); + string ycomp = string("y"); + + unsigned int Nprev = 1; + unsigned int len = n; + unsigned int r; + for(r = 0; r < numRadix; r++) + { + int numIter = N[0] / N[r]; + int numWorkItemsReq = n / N[r]; + int Ncurr = Nprev * N[r]; + insertfftKernel(localString, N[r], numIter); + + if(r < (numRadix - 1)) { + insertTwiddleKernel(localString, N[r], numIter, Nprev, len, numWorkItemsPerXForm); + lMemSize = getPadding(numWorkItemsPerXForm, Nprev, numWorkItemsReq, numXFormsPerWG, N[r], plan->num_local_mem_banks, &offset, &midPad); + (*kInfo)->lmem_size = (lMemSize > (*kInfo)->lmem_size) ? lMemSize : (*kInfo)->lmem_size; + insertLocalStoreIndexArithmatic(localString, numWorkItemsReq, numXFormsPerWG, N[r], offset, midPad); + insertLocalLoadIndexArithmatic(localString, Nprev, N[r], numWorkItemsReq, numWorkItemsPerXForm, numXFormsPerWG, offset, midPad); + insertLocalStores(localString, numIter, N[r], numWorkItemsPerXForm, numWorkItemsReq, offset, xcomp); + insertLocalLoads(localString, n, N[r], N[r+1], Nprev, Ncurr, numWorkItemsPerXForm, numWorkItemsReq, offset, xcomp); + insertLocalStores(localString, numIter, N[r], numWorkItemsPerXForm, numWorkItemsReq, offset, ycomp); + insertLocalLoads(localString, n, N[r], N[r+1], Nprev, Ncurr, numWorkItemsPerXForm, numWorkItemsReq, offset, ycomp); + Nprev = Ncurr; + len = len / N[r]; + } + } + + lMemSize = insertGlobalStoresAndTranspose(localString, n, maxRadix, N[numRadix - 1], numWorkItemsPerXForm, numXFormsPerWG, plan->min_mem_coalesce_width, dataFormat); + (*kInfo)->lmem_size = (lMemSize > (*kInfo)->lmem_size) ? lMemSize : (*kInfo)->lmem_size; + + insertHeader(*kernelString, kernelName, dataFormat); + *kernelString += string("{\n"); + if((*kInfo)->lmem_size) *kernelString += string(" __local float sMem[") + num2str((*kInfo)->lmem_size) + string("];\n"); - *kernelString += localString; - *kernelString += string("}\n"); + *kernelString += localString; + *kernelString += string("}\n"); } // For n larger than what can be computed using local memory fft, global transposes @@ -872,266 +872,266 @@ createLocalMemfftKernelString(cl_fft_plan *plan) void getGlobalRadixInfo(int n, int *radix, int *R1, int *R2, int *numRadices) { - int baseRadix = min(n, 128); - - int numR = 0; - int N = n; - while(N > baseRadix) - { - N /= baseRadix; - numR++; - } - - for(int i = 0; i < numR; i++) - radix[i] = baseRadix; - - radix[numR] = N; - numR++; - *numRadices = numR; - - for(int i = 0; i < numR; i++) - { - int B = radix[i]; - if(B <= 8) - { - R1[i] = B; - R2[i] = 1; - continue; - } - - int r1 = 2; - int r2 = B / r1; - while(r2 > r1) - { - r1 *=2; - r2 = B / r1; - } - R1[i] = r1; - R2[i] = r2; - } + int baseRadix = min(n, 128); + + int numR = 0; + int N = n; + while(N > baseRadix) + { + N /= baseRadix; + numR++; + } + + for(int i = 0; i < numR; i++) + radix[i] = baseRadix; + + radix[numR] = N; + numR++; + *numRadices = numR; + + for(int i = 0; i < numR; i++) + { + int B = radix[i]; + if(B <= 8) + { + R1[i] = B; + R2[i] = 1; + continue; + } + + int r1 = 2; + int r2 = B / r1; + while(r2 > r1) + { + r1 *=2; + r2 = B / r1; + } + R1[i] = r1; + R2[i] = r2; + } } static void createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir dir, int vertBS) { - int i, j, k, t; - int radixArr[10] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + int i, j, k, t; + int radixArr[10] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; int R1Arr[10] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; int R2Arr[10] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; - int radix, R1, R2; - int numRadices; - - int maxThreadsPerBlock = plan->max_work_item_per_workgroup; - int maxArrayLen = plan->max_radix; - int batchSize = plan->min_mem_coalesce_width; - clFFT_DataFormat dataFormat = plan->format; - int vertical = (dir == cl_fft_kernel_x) ? 0 : 1; - - getGlobalRadixInfo(n, radixArr, R1Arr, R2Arr, &numRadices); - - int numPasses = numRadices; - - string localString(""), kernelName(""); - string *kernelString = plan->kernel_string; - cl_fft_kernel_info **kInfo = &plan->kernel_info; - int kCount = 0; - - while(*kInfo) - { - kInfo = &(*kInfo)->next; - kCount++; - } - - int N = n; - int m = (int)log2(n); - int Rinit = vertical ? BS : 1; - batchSize = vertical ? min(BS, batchSize) : batchSize; - int passNum; - - for(passNum = 0; passNum < numPasses; passNum++) - { - - localString.clear(); - kernelName.clear(); - - radix = radixArr[passNum]; - R1 = R1Arr[passNum]; - R2 = R2Arr[passNum]; - - int strideI = Rinit; - for(i = 0; i < numPasses; i++) - if(i != passNum) - strideI *= radixArr[i]; - - int strideO = Rinit; - for(i = 0; i < passNum; i++) - strideO *= radixArr[i]; - - int threadsPerXForm = R2; - batchSize = R2 == 1 ? plan->max_work_item_per_workgroup : batchSize; - batchSize = min(batchSize, strideI); - int threadsPerBlock = batchSize * threadsPerXForm; - threadsPerBlock = min(threadsPerBlock, maxThreadsPerBlock); - batchSize = threadsPerBlock / threadsPerXForm; - assert(R2 <= R1); - assert(R1*R2 == radix); - assert(R1 <= maxArrayLen); - assert(threadsPerBlock <= maxThreadsPerBlock); - - int numIter = R1 / R2; - int gInInc = threadsPerBlock / batchSize; - - - int lgStrideO = log2(strideO); - int numBlocksPerXForm = strideI / batchSize; - int numBlocks = numBlocksPerXForm; - if(!vertical) - numBlocks *= BS; - else - numBlocks *= vertBS; - - kernelName = string("fft") + num2str(kCount); - *kInfo = (cl_fft_kernel_info *) malloc(sizeof(cl_fft_kernel_info)); - (*kInfo)->kernel = 0; - if(R2 == 1) - (*kInfo)->lmem_size = 0; - else - { - if(strideO == 1) - (*kInfo)->lmem_size = (radix + 1)*batchSize; - else - (*kInfo)->lmem_size = threadsPerBlock*R1; - } - (*kInfo)->num_workgroups = numBlocks; + int radix, R1, R2; + int numRadices; + + int maxThreadsPerBlock = plan->max_work_item_per_workgroup; + int maxArrayLen = plan->max_radix; + int batchSize = plan->min_mem_coalesce_width; + clFFT_DataFormat dataFormat = plan->format; + int vertical = (dir == cl_fft_kernel_x) ? 0 : 1; + + getGlobalRadixInfo(n, radixArr, R1Arr, R2Arr, &numRadices); + + int numPasses = numRadices; + + string localString(""), kernelName(""); + string *kernelString = plan->kernel_string; + cl_fft_kernel_info **kInfo = &plan->kernel_info; + int kCount = 0; + + while(*kInfo) + { + kInfo = &(*kInfo)->next; + kCount++; + } + + int N = n; + int m = (int)log2(n); + int Rinit = vertical ? BS : 1; + batchSize = vertical ? min(BS, batchSize) : batchSize; + int passNum; + + for(passNum = 0; passNum < numPasses; passNum++) + { + + localString.clear(); + kernelName.clear(); + + radix = radixArr[passNum]; + R1 = R1Arr[passNum]; + R2 = R2Arr[passNum]; + + int strideI = Rinit; + for(i = 0; i < numPasses; i++) + if(i != passNum) + strideI *= radixArr[i]; + + int strideO = Rinit; + for(i = 0; i < passNum; i++) + strideO *= radixArr[i]; + + int threadsPerXForm = R2; + batchSize = R2 == 1 ? plan->max_work_item_per_workgroup : batchSize; + batchSize = min(batchSize, strideI); + int threadsPerBlock = batchSize * threadsPerXForm; + threadsPerBlock = min(threadsPerBlock, maxThreadsPerBlock); + batchSize = threadsPerBlock / threadsPerXForm; + assert(R2 <= R1); + assert(R1*R2 == radix); + assert(R1 <= maxArrayLen); + assert(threadsPerBlock <= maxThreadsPerBlock); + + int numIter = R1 / R2; + int gInInc = threadsPerBlock / batchSize; + + + int lgStrideO = log2(strideO); + int numBlocksPerXForm = strideI / batchSize; + int numBlocks = numBlocksPerXForm; + if(!vertical) + numBlocks *= BS; + else + numBlocks *= vertBS; + + kernelName = string("fft") + num2str(kCount); + *kInfo = (cl_fft_kernel_info *) malloc(sizeof(cl_fft_kernel_info)); + (*kInfo)->kernel = 0; + if(R2 == 1) + (*kInfo)->lmem_size = 0; + else + { + if(strideO == 1) + (*kInfo)->lmem_size = (radix + 1)*batchSize; + else + (*kInfo)->lmem_size = threadsPerBlock*R1; + } + (*kInfo)->num_workgroups = numBlocks; (*kInfo)->num_xforms_per_workgroup = 1; - (*kInfo)->num_workitems_per_workgroup = threadsPerBlock; - (*kInfo)->dir = dir; - if( (passNum == (numPasses - 1)) && (numPasses & 1) ) - (*kInfo)->in_place_possible = 1; - else - (*kInfo)->in_place_possible = 0; - (*kInfo)->next = NULL; - (*kInfo)->kernel_name = (char *) malloc(sizeof(char)*(kernelName.size()+1)); - strcpy((*kInfo)->kernel_name, kernelName.c_str()); - - insertVariables(localString, R1); - - if(vertical) - { - localString += string("xNum = groupId >> ") + num2str((int)log2(numBlocksPerXForm)) + string(";\n"); - localString += string("groupId = groupId & ") + num2str(numBlocksPerXForm - 1) + string(";\n"); - localString += string("indexIn = mad24(groupId, ") + num2str(batchSize) + string(", xNum << ") + num2str((int)log2(n*BS)) + string(");\n"); - localString += string("tid = mul24(groupId, ") + num2str(batchSize) + string(");\n"); - localString += string("i = tid >> ") + num2str(lgStrideO) + string(";\n"); - localString += string("j = tid & ") + num2str(strideO - 1) + string(";\n"); - int stride = radix*Rinit; - for(i = 0; i < passNum; i++) - stride *= radixArr[i]; - localString += string("indexOut = mad24(i, ") + num2str(stride) + string(", j + ") + string("(xNum << ") + num2str((int) log2(n*BS)) + string("));\n"); - localString += string("bNum = groupId;\n"); - } - else - { - int lgNumBlocksPerXForm = log2(numBlocksPerXForm); - localString += string("bNum = groupId & ") + num2str(numBlocksPerXForm - 1) + string(";\n"); - localString += string("xNum = groupId >> ") + num2str(lgNumBlocksPerXForm) + string(";\n"); - localString += string("indexIn = mul24(bNum, ") + num2str(batchSize) + string(");\n"); - localString += string("tid = indexIn;\n"); - localString += string("i = tid >> ") + num2str(lgStrideO) + string(";\n"); - localString += string("j = tid & ") + num2str(strideO - 1) + string(";\n"); - int stride = radix*Rinit; - for(i = 0; i < passNum; i++) - stride *= radixArr[i]; - localString += string("indexOut = mad24(i, ") + num2str(stride) + string(", j);\n"); - localString += string("indexIn += (xNum << ") + num2str(m) + string(");\n"); - localString += string("indexOut += (xNum << ") + num2str(m) + string(");\n"); - } - - // Load Data - int lgBatchSize = log2(batchSize); - localString += string("tid = lId;\n"); - localString += string("i = tid & ") + num2str(batchSize - 1) + string(";\n"); - localString += string("j = tid >> ") + num2str(lgBatchSize) + string(";\n"); - localString += string("indexIn += mad24(j, ") + num2str(strideI) + string(", i);\n"); - - if(dataFormat == clFFT_SplitComplexFormat) - { - localString += string("in_real += indexIn;\n"); - localString += string("in_imag += indexIn;\n"); - for(j = 0; j < R1; j++) - localString += string("a[") + num2str(j) + string("].x = in_real[") + num2str(j*gInInc*strideI) + string("];\n"); - for(j = 0; j < R1; j++) - localString += string("a[") + num2str(j) + string("].y = in_imag[") + num2str(j*gInInc*strideI) + string("];\n"); - } - else - { - localString += string("in += indexIn;\n"); - for(j = 0; j < R1; j++) - localString += string("a[") + num2str(j) + string("] = in[") + num2str(j*gInInc*strideI) + string("];\n"); - } - - localString += string("fftKernel") + num2str(R1) + string("(a, dir);\n"); - - if(R2 > 1) - { - // twiddle - for(k = 1; k < R1; k++) - { - localString += string("ang = dir*(2.0f*M_PI*") + num2str(k) + string("/") + num2str(radix) + string(")*j;\n"); - localString += string("w = (float2)(native_cos(ang), native_sin(ang));\n"); - localString += string("a[") + num2str(k) + string("] = complexMul(a[") + num2str(k) + string("], w);\n"); - } - - // shuffle - numIter = R1 / R2; - localString += string("indexIn = mad24(j, ") + num2str(threadsPerBlock*numIter) + string(", i);\n"); - localString += string("lMemStore = sMem + tid;\n"); - localString += string("lMemLoad = sMem + indexIn;\n"); - for(k = 0; k < R1; k++) - localString += string("lMemStore[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].x;\n"); - localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); - for(k = 0; k < numIter; k++) - for(t = 0; t < R2; t++) - localString += string("a[") + num2str(k*R2+t) + string("].x = lMemLoad[") + num2str(t*batchSize + k*threadsPerBlock) + string("];\n"); - localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); - for(k = 0; k < R1; k++) - localString += string("lMemStore[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].y;\n"); - localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); - for(k = 0; k < numIter; k++) - for(t = 0; t < R2; t++) - localString += string("a[") + num2str(k*R2+t) + string("].y = lMemLoad[") + num2str(t*batchSize + k*threadsPerBlock) + string("];\n"); - localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); - - for(j = 0; j < numIter; j++) - localString += string("fftKernel") + num2str(R2) + string("(a + ") + num2str(j*R2) + string(", dir);\n"); - } - - // twiddle - if(passNum < (numPasses - 1)) - { - localString += string("l = ((bNum << ") + num2str(lgBatchSize) + string(") + i) >> ") + num2str(lgStrideO) + string(";\n"); - localString += string("k = j << ") + num2str((int)log2(R1/R2)) + string(";\n"); - localString += string("ang1 = dir*(2.0f*M_PI/") + num2str(N) + string(")*l;\n"); - for(t = 0; t < R1; t++) - { - localString += string("ang = ang1*(k + ") + num2str((t%R2)*R1 + (t/R2)) + string(");\n"); - localString += string("w = (float2)(native_cos(ang), native_sin(ang));\n"); - localString += string("a[") + num2str(t) + string("] = complexMul(a[") + num2str(t) + string("], w);\n"); - } - } - - // Store Data - if(strideO == 1) - { - - localString += string("lMemStore = sMem + mad24(i, ") + num2str(radix + 1) + string(", j << ") + num2str((int)log2(R1/R2)) + string(");\n"); - localString += string("lMemLoad = sMem + mad24(tid >> ") + num2str((int)log2(radix)) + string(", ") + num2str(radix+1) + string(", tid & ") + num2str(radix-1) + string(");\n"); - - for(int i = 0; i < R1/R2; i++) - for(int j = 0; j < R2; j++) - localString += string("lMemStore[ ") + num2str(i + j*R1) + string("] = a[") + num2str(i*R2+j) + string("].x;\n"); - localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); - if(threadsPerBlock >= radix) + (*kInfo)->num_workitems_per_workgroup = threadsPerBlock; + (*kInfo)->dir = dir; + if( (passNum == (numPasses - 1)) && (numPasses & 1) ) + (*kInfo)->in_place_possible = 1; + else + (*kInfo)->in_place_possible = 0; + (*kInfo)->next = NULL; + (*kInfo)->kernel_name = (char *) malloc(sizeof(char)*(kernelName.size()+1)); + strcpy((*kInfo)->kernel_name, kernelName.c_str()); + + insertVariables(localString, R1); + + if(vertical) + { + localString += string("xNum = groupId >> ") + num2str((int)log2(numBlocksPerXForm)) + string(";\n"); + localString += string("groupId = groupId & ") + num2str(numBlocksPerXForm - 1) + string(";\n"); + localString += string("indexIn = mad24(groupId, ") + num2str(batchSize) + string(", xNum << ") + num2str((int)log2(n*BS)) + string(");\n"); + localString += string("tid = mul24(groupId, ") + num2str(batchSize) + string(");\n"); + localString += string("i = tid >> ") + num2str(lgStrideO) + string(";\n"); + localString += string("j = tid & ") + num2str(strideO - 1) + string(";\n"); + int stride = radix*Rinit; + for(i = 0; i < passNum; i++) + stride *= radixArr[i]; + localString += string("indexOut = mad24(i, ") + num2str(stride) + string(", j + ") + string("(xNum << ") + num2str((int) log2(n*BS)) + string("));\n"); + localString += string("bNum = groupId;\n"); + } + else + { + int lgNumBlocksPerXForm = log2(numBlocksPerXForm); + localString += string("bNum = groupId & ") + num2str(numBlocksPerXForm - 1) + string(";\n"); + localString += string("xNum = groupId >> ") + num2str(lgNumBlocksPerXForm) + string(";\n"); + localString += string("indexIn = mul24(bNum, ") + num2str(batchSize) + string(");\n"); + localString += string("tid = indexIn;\n"); + localString += string("i = tid >> ") + num2str(lgStrideO) + string(";\n"); + localString += string("j = tid & ") + num2str(strideO - 1) + string(";\n"); + int stride = radix*Rinit; + for(i = 0; i < passNum; i++) + stride *= radixArr[i]; + localString += string("indexOut = mad24(i, ") + num2str(stride) + string(", j);\n"); + localString += string("indexIn += (xNum << ") + num2str(m) + string(");\n"); + localString += string("indexOut += (xNum << ") + num2str(m) + string(");\n"); + } + + // Load Data + int lgBatchSize = log2(batchSize); + localString += string("tid = lId;\n"); + localString += string("i = tid & ") + num2str(batchSize - 1) + string(";\n"); + localString += string("j = tid >> ") + num2str(lgBatchSize) + string(";\n"); + localString += string("indexIn += mad24(j, ") + num2str(strideI) + string(", i);\n"); + + if(dataFormat == clFFT_SplitComplexFormat) + { + localString += string("in_real += indexIn;\n"); + localString += string("in_imag += indexIn;\n"); + for(j = 0; j < R1; j++) + localString += string("a[") + num2str(j) + string("].x = in_real[") + num2str(j*gInInc*strideI) + string("];\n"); + for(j = 0; j < R1; j++) + localString += string("a[") + num2str(j) + string("].y = in_imag[") + num2str(j*gInInc*strideI) + string("];\n"); + } + else + { + localString += string("in += indexIn;\n"); + for(j = 0; j < R1; j++) + localString += string("a[") + num2str(j) + string("] = in[") + num2str(j*gInInc*strideI) + string("];\n"); + } + + localString += string("fftKernel") + num2str(R1) + string("(a, dir);\n"); + + if(R2 > 1) + { + // twiddle + for(k = 1; k < R1; k++) + { + localString += string("ang = dir*(2.0f*M_PI*") + num2str(k) + string("/") + num2str(radix) + string(")*j;\n"); + localString += string("w = (float2)(native_cos(ang), native_sin(ang));\n"); + localString += string("a[") + num2str(k) + string("] = complexMul(a[") + num2str(k) + string("], w);\n"); + } + + // shuffle + numIter = R1 / R2; + localString += string("indexIn = mad24(j, ") + num2str(threadsPerBlock*numIter) + string(", i);\n"); + localString += string("lMemStore = sMem + tid;\n"); + localString += string("lMemLoad = sMem + indexIn;\n"); + for(k = 0; k < R1; k++) + localString += string("lMemStore[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].x;\n"); + localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); + for(k = 0; k < numIter; k++) + for(t = 0; t < R2; t++) + localString += string("a[") + num2str(k*R2+t) + string("].x = lMemLoad[") + num2str(t*batchSize + k*threadsPerBlock) + string("];\n"); + localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); + for(k = 0; k < R1; k++) + localString += string("lMemStore[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].y;\n"); + localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); + for(k = 0; k < numIter; k++) + for(t = 0; t < R2; t++) + localString += string("a[") + num2str(k*R2+t) + string("].y = lMemLoad[") + num2str(t*batchSize + k*threadsPerBlock) + string("];\n"); + localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); + + for(j = 0; j < numIter; j++) + localString += string("fftKernel") + num2str(R2) + string("(a + ") + num2str(j*R2) + string(", dir);\n"); + } + + // twiddle + if(passNum < (numPasses - 1)) + { + localString += string("l = ((bNum << ") + num2str(lgBatchSize) + string(") + i) >> ") + num2str(lgStrideO) + string(";\n"); + localString += string("k = j << ") + num2str((int)log2(R1/R2)) + string(";\n"); + localString += string("ang1 = dir*(2.0f*M_PI/") + num2str(N) + string(")*l;\n"); + for(t = 0; t < R1; t++) + { + localString += string("ang = ang1*(k + ") + num2str((t%R2)*R1 + (t/R2)) + string(");\n"); + localString += string("w = (float2)(native_cos(ang), native_sin(ang));\n"); + localString += string("a[") + num2str(t) + string("] = complexMul(a[") + num2str(t) + string("], w);\n"); + } + } + + // Store Data + if(strideO == 1) + { + + localString += string("lMemStore = sMem + mad24(i, ") + num2str(radix + 1) + string(", j << ") + num2str((int)log2(R1/R2)) + string(");\n"); + localString += string("lMemLoad = sMem + mad24(tid >> ") + num2str((int)log2(radix)) + string(", ") + num2str(radix+1) + string(", tid & ") + num2str(radix-1) + string(");\n"); + + for(int i = 0; i < R1/R2; i++) + for(int j = 0; j < R2; j++) + localString += string("lMemStore[ ") + num2str(i + j*R1) + string("] = a[") + num2str(i*R2+j) + string("].x;\n"); + localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); + if(threadsPerBlock >= radix) { for(int i = 0; i < R1; i++) localString += string("a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i*(radix+1)*(threadsPerBlock/radix)) + string("];\n"); @@ -1144,13 +1144,13 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir for(int j = 0; j < innerIter; j++) localString += string("a[") + num2str(i*innerIter+j) + string("].x = lMemLoad[") + num2str(j*threadsPerBlock + i*(radix+1)) + string("];\n"); } - localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); + localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); - for(int i = 0; i < R1/R2; i++) - for(int j = 0; j < R2; j++) - localString += string("lMemStore[ ") + num2str(i + j*R1) + string("] = a[") + num2str(i*R2+j) + string("].y;\n"); - localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); - if(threadsPerBlock >= radix) + for(int i = 0; i < R1/R2; i++) + for(int j = 0; j < R2; j++) + localString += string("lMemStore[ ") + num2str(i + j*R1) + string("] = a[") + num2str(i*R2+j) + string("].y;\n"); + localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); + if(threadsPerBlock >= radix) { for(int i = 0; i < R1; i++) localString += string("a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i*(radix+1)*(threadsPerBlock/radix)) + string("];\n"); @@ -1163,53 +1163,53 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir for(int j = 0; j < innerIter; j++) localString += string("a[") + num2str(i*innerIter+j) + string("].y = lMemLoad[") + num2str(j*threadsPerBlock + i*(radix+1)) + string("];\n"); } - localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); - - localString += string("indexOut += tid;\n"); - if(dataFormat == clFFT_SplitComplexFormat) { - localString += string("out_real += indexOut;\n"); - localString += string("out_imag += indexOut;\n"); - for(k = 0; k < R1; k++) - localString += string("out_real[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].x;\n"); - for(k = 0; k < R1; k++) - localString += string("out_imag[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].y;\n"); - } - else { - localString += string("out += indexOut;\n"); - for(k = 0; k < R1; k++) - localString += string("out[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("];\n"); - } - - } - else - { - localString += string("indexOut += mad24(j, ") + num2str(numIter*strideO) + string(", i);\n"); - if(dataFormat == clFFT_SplitComplexFormat) { - localString += string("out_real += indexOut;\n"); - localString += string("out_imag += indexOut;\n"); - for(k = 0; k < R1; k++) - localString += string("out_real[") + num2str(((k%R2)*R1 + (k/R2))*strideO) + string("] = a[") + num2str(k) + string("].x;\n"); - for(k = 0; k < R1; k++) - localString += string("out_imag[") + num2str(((k%R2)*R1 + (k/R2))*strideO) + string("] = a[") + num2str(k) + string("].y;\n"); - } - else { - localString += string("out += indexOut;\n"); - for(k = 0; k < R1; k++) - localString += string("out[") + num2str(((k%R2)*R1 + (k/R2))*strideO) + string("] = a[") + num2str(k) + string("];\n"); - } - } - - insertHeader(*kernelString, kernelName, dataFormat); - *kernelString += string("{\n"); - if((*kInfo)->lmem_size) - *kernelString += string(" __local float sMem[") + num2str((*kInfo)->lmem_size) + string("];\n"); - *kernelString += localString; - *kernelString += string("}\n"); - - N /= radix; - kInfo = &(*kInfo)->next; - kCount++; - } + localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); + + localString += string("indexOut += tid;\n"); + if(dataFormat == clFFT_SplitComplexFormat) { + localString += string("out_real += indexOut;\n"); + localString += string("out_imag += indexOut;\n"); + for(k = 0; k < R1; k++) + localString += string("out_real[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].x;\n"); + for(k = 0; k < R1; k++) + localString += string("out_imag[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].y;\n"); + } + else { + localString += string("out += indexOut;\n"); + for(k = 0; k < R1; k++) + localString += string("out[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("];\n"); + } + + } + else + { + localString += string("indexOut += mad24(j, ") + num2str(numIter*strideO) + string(", i);\n"); + if(dataFormat == clFFT_SplitComplexFormat) { + localString += string("out_real += indexOut;\n"); + localString += string("out_imag += indexOut;\n"); + for(k = 0; k < R1; k++) + localString += string("out_real[") + num2str(((k%R2)*R1 + (k/R2))*strideO) + string("] = a[") + num2str(k) + string("].x;\n"); + for(k = 0; k < R1; k++) + localString += string("out_imag[") + num2str(((k%R2)*R1 + (k/R2))*strideO) + string("] = a[") + num2str(k) + string("].y;\n"); + } + else { + localString += string("out += indexOut;\n"); + for(k = 0; k < R1; k++) + localString += string("out[") + num2str(((k%R2)*R1 + (k/R2))*strideO) + string("] = a[") + num2str(k) + string("];\n"); + } + } + + insertHeader(*kernelString, kernelName, dataFormat); + *kernelString += string("{\n"); + if((*kInfo)->lmem_size) + *kernelString += string(" __local float sMem[") + num2str((*kInfo)->lmem_size) + string("];\n"); + *kernelString += localString; + *kernelString += string("}\n"); + + N /= radix; + kInfo = &(*kInfo)->next; + kCount++; + } } void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir) @@ -1217,41 +1217,41 @@ void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir) unsigned int radixArray[10]; unsigned int numRadix; - switch(dir) - { - case cl_fft_kernel_x: - if(plan->n.x > plan->max_localmem_fft_size) - { - createGlobalFFTKernelString(plan, plan->n.x, 1, cl_fft_kernel_x, 1); - } - else if(plan->n.x > 1) - { - getRadixArray(plan->n.x, radixArray, &numRadix, 0); - if(plan->n.x / radixArray[0] <= plan->max_work_item_per_workgroup) - { - createLocalMemfftKernelString(plan); - } - else - { - getRadixArray(plan->n.x, radixArray, &numRadix, plan->max_radix); - if(plan->n.x / radixArray[0] <= plan->max_work_item_per_workgroup) - createLocalMemfftKernelString(plan); - else - createGlobalFFTKernelString(plan, plan->n.x, 1, cl_fft_kernel_x, 1); - } - } - break; - - case cl_fft_kernel_y: - if(plan->n.y > 1) - createGlobalFFTKernelString(plan, plan->n.y, plan->n.x, cl_fft_kernel_y, 1); - break; - - case cl_fft_kernel_z: - if(plan->n.z > 1) - createGlobalFFTKernelString(plan, plan->n.z, plan->n.x*plan->n.y, cl_fft_kernel_z, 1); - default: - return; - } + switch(dir) + { + case cl_fft_kernel_x: + if(plan->n.x > plan->max_localmem_fft_size) + { + createGlobalFFTKernelString(plan, plan->n.x, 1, cl_fft_kernel_x, 1); + } + else if(plan->n.x > 1) + { + getRadixArray(plan->n.x, radixArray, &numRadix, 0); + if(plan->n.x / radixArray[0] <= plan->max_work_item_per_workgroup) + { + createLocalMemfftKernelString(plan); + } + else + { + getRadixArray(plan->n.x, radixArray, &numRadix, plan->max_radix); + if(plan->n.x / radixArray[0] <= plan->max_work_item_per_workgroup) + createLocalMemfftKernelString(plan); + else + createGlobalFFTKernelString(plan, plan->n.x, 1, cl_fft_kernel_x, 1); + } + } + break; + + case cl_fft_kernel_y: + if(plan->n.y > 1) + createGlobalFFTKernelString(plan, plan->n.y, plan->n.x, cl_fft_kernel_y, 1); + break; + + case cl_fft_kernel_z: + if(plan->n.z > 1) + createGlobalFFTKernelString(plan, plan->n.z, plan->n.x*plan->n.y, cl_fft_kernel_z, 1); + default: + return; + } } diff --git a/src/fft_setup.cpp b/src/fft_setup.cpp index 593184d84dbdf89f13bbae168bb831dec9bb18e0..49657827964f5c60f7ab7ac7a51687a1fcd6841c 100644 --- a/src/fft_setup.cpp +++ b/src/fft_setup.cpp @@ -64,56 +64,56 @@ extern void getKernelWorkDimensions(cl_fft_plan *plan, cl_fft_kernel_info *kerne static void getBlockConfigAndKernelString(cl_fft_plan *plan) { - plan->temp_buffer_needed = 0; - *plan->kernel_string += baseKernels; - - if(plan->format == clFFT_SplitComplexFormat) - *plan->kernel_string += twistKernelPlannar; - else - *plan->kernel_string += twistKernelInterleaved; - - switch(plan->dim) - { - case clFFT_1D: - FFT1D(plan, cl_fft_kernel_x); - break; - - case clFFT_2D: - FFT1D(plan, cl_fft_kernel_x); - FFT1D(plan, cl_fft_kernel_y); - break; - - case clFFT_3D: - FFT1D(plan, cl_fft_kernel_x); - FFT1D(plan, cl_fft_kernel_y); - FFT1D(plan, cl_fft_kernel_z); - break; - - default: - return; - } - - plan->temp_buffer_needed = 0; - cl_fft_kernel_info *kInfo = plan->kernel_info; - while(kInfo) - { - plan->temp_buffer_needed |= !kInfo->in_place_possible; - kInfo = kInfo->next; - } + plan->temp_buffer_needed = 0; + *plan->kernel_string += baseKernels; + + if(plan->format == clFFT_SplitComplexFormat) + *plan->kernel_string += twistKernelPlannar; + else + *plan->kernel_string += twistKernelInterleaved; + + switch(plan->dim) + { + case clFFT_1D: + FFT1D(plan, cl_fft_kernel_x); + break; + + case clFFT_2D: + FFT1D(plan, cl_fft_kernel_x); + FFT1D(plan, cl_fft_kernel_y); + break; + + case clFFT_3D: + FFT1D(plan, cl_fft_kernel_x); + FFT1D(plan, cl_fft_kernel_y); + FFT1D(plan, cl_fft_kernel_z); + break; + + default: + return; + } + + plan->temp_buffer_needed = 0; + cl_fft_kernel_info *kInfo = plan->kernel_info; + while(kInfo) + { + plan->temp_buffer_needed |= !kInfo->in_place_possible; + kInfo = kInfo->next; + } } static void deleteKernelInfo(cl_fft_kernel_info *kInfo) { - if(kInfo) - { - if(kInfo->kernel_name) - free(kInfo->kernel_name); - if(kInfo->kernel) - clReleaseKernel(kInfo->kernel); - free(kInfo); - } + if(kInfo) + { + if(kInfo->kernel_name) + free(kInfo->kernel_name); + if(kInfo->kernel) + clReleaseKernel(kInfo->kernel); + free(kInfo); + } } static void @@ -121,71 +121,71 @@ destroy_plan(cl_fft_plan *Plan) { cl_fft_kernel_info *kernel_info = Plan->kernel_info; - while(kernel_info) - { - cl_fft_kernel_info *tmp = kernel_info->next; - deleteKernelInfo(kernel_info); - kernel_info = tmp; - } - - Plan->kernel_info = NULL; - - if(Plan->kernel_string) - { - delete Plan->kernel_string; - Plan->kernel_string = NULL; - } - if(Plan->twist_kernel) - { - clReleaseKernel(Plan->twist_kernel); - Plan->twist_kernel = NULL; - } - if(Plan->program) - { - clReleaseProgram(Plan->program); - Plan->program = NULL; - } - if(Plan->tempmemobj) - { - clReleaseMemObject(Plan->tempmemobj); - Plan->tempmemobj = NULL; - } - if(Plan->tempmemobj_real) - { - clReleaseMemObject(Plan->tempmemobj_real); - Plan->tempmemobj_real = NULL; - } - if(Plan->tempmemobj_imag) - { - clReleaseMemObject(Plan->tempmemobj_imag); - Plan->tempmemobj_imag = NULL; - } + while(kernel_info) + { + cl_fft_kernel_info *tmp = kernel_info->next; + deleteKernelInfo(kernel_info); + kernel_info = tmp; + } + + Plan->kernel_info = NULL; + + if(Plan->kernel_string) + { + delete Plan->kernel_string; + Plan->kernel_string = NULL; + } + if(Plan->twist_kernel) + { + clReleaseKernel(Plan->twist_kernel); + Plan->twist_kernel = NULL; + } + if(Plan->program) + { + clReleaseProgram(Plan->program); + Plan->program = NULL; + } + if(Plan->tempmemobj) + { + clReleaseMemObject(Plan->tempmemobj); + Plan->tempmemobj = NULL; + } + if(Plan->tempmemobj_real) + { + clReleaseMemObject(Plan->tempmemobj_real); + Plan->tempmemobj_real = NULL; + } + if(Plan->tempmemobj_imag) + { + clReleaseMemObject(Plan->tempmemobj_imag); + Plan->tempmemobj_imag = NULL; + } } static int createKernelList(cl_fft_plan *plan) { - cl_program program = plan->program; - cl_fft_kernel_info *kernel_info = plan->kernel_info; - - cl_int err; - while(kernel_info) - { - kernel_info->kernel = clCreateKernel(program, kernel_info->kernel_name, &err); - if(!kernel_info->kernel || err != CL_SUCCESS) - return err; - kernel_info = kernel_info->next; - } - - if(plan->format == clFFT_SplitComplexFormat) - plan->twist_kernel = clCreateKernel(program, "clFFT_1DTwistSplit", &err); - else - plan->twist_kernel = clCreateKernel(program, "clFFT_1DTwistInterleaved", &err); - - if(!plan->twist_kernel || err) - return err; - - return CL_SUCCESS; + cl_program program = plan->program; + cl_fft_kernel_info *kernel_info = plan->kernel_info; + + cl_int err; + while(kernel_info) + { + kernel_info->kernel = clCreateKernel(program, kernel_info->kernel_name, &err); + if(!kernel_info->kernel || err != CL_SUCCESS) + return err; + kernel_info = kernel_info->next; + } + + if(plan->format == clFFT_SplitComplexFormat) + plan->twist_kernel = clCreateKernel(program, "clFFT_1DTwistSplit", &err); + else + plan->twist_kernel = clCreateKernel(program, "clFFT_1DTwistInterleaved", &err); + + if(!plan->twist_kernel || err) + return err; + + return CL_SUCCESS; } int getMaxKernelWorkGroupSize(cl_fft_plan *plan, unsigned int *max_wg_size, unsigned int num_devices, cl_device_id *devices) @@ -198,24 +198,24 @@ int getMaxKernelWorkGroupSize(cl_fft_plan *plan, unsigned int *max_wg_size, unsi unsigned int i; for(i = 0; i < num_devices; i++) { - cl_fft_kernel_info *kInfo = plan->kernel_info; - while(kInfo) - { - err = clGetKernelWorkGroupInfo(kInfo->kernel, devices[i], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wg_size, NULL); - if(err != CL_SUCCESS) - return -1; + cl_fft_kernel_info *kInfo = plan->kernel_info; + while(kInfo) + { + err = clGetKernelWorkGroupInfo(kInfo->kernel, devices[i], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wg_size, NULL); + if(err != CL_SUCCESS) + return -1; - if(wg_size < kInfo->num_workitems_per_workgroup) - reg_needed |= 1; + if(wg_size < kInfo->num_workitems_per_workgroup) + reg_needed |= 1; - if(*max_wg_size > wg_size) - *max_wg_size = wg_size; + if(*max_wg_size > wg_size) + *max_wg_size = wg_size; - kInfo = kInfo->next; - } - } + kInfo = kInfo->next; + } + } - return reg_needed; + return reg_needed; } #define ERR_MACRO(err) { \ @@ -224,179 +224,179 @@ int getMaxKernelWorkGroupSize(cl_fft_plan *plan, unsigned int *max_wg_size, unsi if(error_code) \ *error_code = err; \ clFFT_DestroyPlan((clFFT_Plan) plan); \ - return (clFFT_Plan) NULL; \ + return (clFFT_Plan) NULL; \ } \ - } + } clFFT_Plan clFFT_CreatePlan(cl_context context, clFFT_Dim3 n, clFFT_Dimension dim, clFFT_DataFormat dataFormat, cl_int *error_code ) { - int i; - cl_int err; - int isPow2 = 1; - cl_fft_plan *plan = NULL; - ostringstream kString; - int num_devices; - int gpu_found = 0; - cl_device_id devices[16]; - size_t ret_size; - cl_device_type device_type; + int i; + cl_int err; + int isPow2 = 1; + cl_fft_plan *plan = NULL; + ostringstream kString; + int num_devices; + int gpu_found = 0; + cl_device_id devices[16]; + size_t ret_size; + cl_device_type device_type; if(!context) - ERR_MACRO(CL_INVALID_VALUE); - - isPow2 |= n.x && !( (n.x - 1) & n.x ); - isPow2 |= n.y && !( (n.y - 1) & n.y ); - isPow2 |= n.z && !( (n.z - 1) & n.z ); - - if(!isPow2) - ERR_MACRO(CL_INVALID_VALUE); - - if( (dim == clFFT_1D && (n.y != 1 || n.z != 1)) || (dim == clFFT_2D && n.z != 1) ) - ERR_MACRO(CL_INVALID_VALUE); - - plan = (cl_fft_plan *) malloc(sizeof(cl_fft_plan)); - if(!plan) - ERR_MACRO(CL_OUT_OF_RESOURCES); - - plan->context = context; - clRetainContext(context); - plan->n = n; - plan->dim = dim; - plan->format = dataFormat; - plan->kernel_info = 0; - plan->num_kernels = 0; - plan->twist_kernel = 0; - plan->program = 0; - plan->temp_buffer_needed = 0; - plan->last_batch_size = 0; - plan->tempmemobj = 0; - plan->tempmemobj_real = 0; - plan->tempmemobj_imag = 0; - plan->max_localmem_fft_size = 2048; - plan->max_work_item_per_workgroup = 256; - plan->max_radix = 16; - plan->min_mem_coalesce_width = 16; - plan->num_local_mem_banks = 16; + ERR_MACRO(CL_INVALID_VALUE); + + isPow2 |= n.x && !( (n.x - 1) & n.x ); + isPow2 |= n.y && !( (n.y - 1) & n.y ); + isPow2 |= n.z && !( (n.z - 1) & n.z ); + + if(!isPow2) + ERR_MACRO(CL_INVALID_VALUE); + + if( (dim == clFFT_1D && (n.y != 1 || n.z != 1)) || (dim == clFFT_2D && n.z != 1) ) + ERR_MACRO(CL_INVALID_VALUE); + + plan = (cl_fft_plan *) malloc(sizeof(cl_fft_plan)); + if(!plan) + ERR_MACRO(CL_OUT_OF_RESOURCES); + + plan->context = context; + clRetainContext(context); + plan->n = n; + plan->dim = dim; + plan->format = dataFormat; + plan->kernel_info = 0; + plan->num_kernels = 0; + plan->twist_kernel = 0; + plan->program = 0; + plan->temp_buffer_needed = 0; + plan->last_batch_size = 0; + plan->tempmemobj = 0; + plan->tempmemobj_real = 0; + plan->tempmemobj_imag = 0; + plan->max_localmem_fft_size = 2048; + plan->max_work_item_per_workgroup = 256; + plan->max_radix = 16; + plan->min_mem_coalesce_width = 16; + plan->num_local_mem_banks = 16; patch_kernel_source: - plan->kernel_string = new string(""); - if(!plan->kernel_string) + plan->kernel_string = new string(""); + if(!plan->kernel_string) ERR_MACRO(CL_OUT_OF_RESOURCES); - getBlockConfigAndKernelString(plan); + getBlockConfigAndKernelString(plan); - const char *source_str = plan->kernel_string->c_str(); - plan->program = clCreateProgramWithSource(context, 1, (const char**) &source_str, NULL, &err); + const char *source_str = plan->kernel_string->c_str(); + plan->program = clCreateProgramWithSource(context, 1, (const char**) &source_str, NULL, &err); ERR_MACRO(err); - err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(devices), devices, &ret_size); - ERR_MACRO(err); + err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(devices), devices, &ret_size); + ERR_MACRO(err); - num_devices = ret_size / sizeof(cl_device_id); + num_devices = ret_size / sizeof(cl_device_id); - for(i = 0; i < num_devices; i++) - { - err = clGetDeviceInfo(devices[i], CL_DEVICE_TYPE, sizeof(device_type), &device_type, NULL); - ERR_MACRO(err); + for(i = 0; i < num_devices; i++) + { + err = clGetDeviceInfo(devices[i], CL_DEVICE_TYPE, sizeof(device_type), &device_type, NULL); + ERR_MACRO(err); - if(device_type == CL_DEVICE_TYPE_GPU) - { - gpu_found = 1; - err = clBuildProgram(plan->program, 1, &devices[i], "-cl-mad-enable", NULL, NULL); - if (err != CL_SUCCESS) - { - char *build_log; - char devicename[200]; - size_t log_size; + if(device_type == CL_DEVICE_TYPE_GPU) + { + gpu_found = 1; + err = clBuildProgram(plan->program, 1, &devices[i], "-cl-mad-enable", NULL, NULL); + if (err != CL_SUCCESS) + { + char *build_log; + char devicename[200]; + size_t log_size; - err = clGetProgramBuildInfo(plan->program, devices[i], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); - ERR_MACRO(err); + err = clGetProgramBuildInfo(plan->program, devices[i], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); + ERR_MACRO(err); - build_log = (char *) malloc(log_size + 1); + build_log = (char *) malloc(log_size + 1); - err = clGetProgramBuildInfo(plan->program, devices[i], CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL); - ERR_MACRO(err); + err = clGetProgramBuildInfo(plan->program, devices[i], CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL); + ERR_MACRO(err); - err = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(devicename), devicename, NULL); - ERR_MACRO(err); + err = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(devicename), devicename, NULL); + ERR_MACRO(err); - fprintf(stdout, "FFT program build log on device %s\n", devicename); - fprintf(stdout, "%s\n", build_log); - free(build_log); + fprintf(stdout, "FFT program build log on device %s\n", devicename); + fprintf(stdout, "%s\n", build_log); + free(build_log); - ERR_MACRO(err); - } - } - } + ERR_MACRO(err); + } + } + } - if(!gpu_found) - ERR_MACRO(CL_INVALID_CONTEXT); + if(!gpu_found) + ERR_MACRO(CL_INVALID_CONTEXT); - err = createKernelList(plan); + err = createKernelList(plan); ERR_MACRO(err); // we created program and kernels based on "some max work group size (default 256)" ... this work group size // may be larger than what kernel may execute with ... if thats the case we need to regenerate the kernel source // setting this as limit i.e max group size and rebuild. - unsigned int max_kernel_wg_size; - int patching_req = getMaxKernelWorkGroupSize(plan, &max_kernel_wg_size, num_devices, devices); - if(patching_req == -1) - { - ERR_MACRO(err); - } - - if(patching_req) - { - destroy_plan(plan); - plan->max_work_item_per_workgroup = max_kernel_wg_size; - goto patch_kernel_source; - } - - cl_fft_kernel_info *kInfo = plan->kernel_info; - while(kInfo) - { - plan->num_kernels++; - kInfo = kInfo->next; - } - - if(error_code) - *error_code = CL_SUCCESS; - - return (clFFT_Plan) plan; + unsigned int max_kernel_wg_size; + int patching_req = getMaxKernelWorkGroupSize(plan, &max_kernel_wg_size, num_devices, devices); + if(patching_req == -1) + { + ERR_MACRO(err); + } + + if(patching_req) + { + destroy_plan(plan); + plan->max_work_item_per_workgroup = max_kernel_wg_size; + goto patch_kernel_source; + } + + cl_fft_kernel_info *kInfo = plan->kernel_info; + while(kInfo) + { + plan->num_kernels++; + kInfo = kInfo->next; + } + + if(error_code) + *error_code = CL_SUCCESS; + + return (clFFT_Plan) plan; } void clFFT_DestroyPlan(clFFT_Plan plan) { cl_fft_plan *Plan = (cl_fft_plan *) plan; - if(Plan) - { - destroy_plan(Plan); - clReleaseContext(Plan->context); - free(Plan); - } + if(Plan) + { + destroy_plan(Plan); + clReleaseContext(Plan->context); + free(Plan); + } } void clFFT_DumpPlan( clFFT_Plan Plan, FILE *file) { - size_t gDim, lDim; - FILE *out; - if(!file) - out = stdout; - else - out = file; - - cl_fft_plan *plan = (cl_fft_plan *) Plan; - cl_fft_kernel_info *kInfo = plan->kernel_info; - - while(kInfo) - { - cl_int s = 1; - getKernelWorkDimensions(plan, kInfo, &s, &gDim, &lDim); - fprintf(out, "Run kernel %s with global dim = {%zd*BatchSize}, local dim={%zd}\n", kInfo->kernel_name, gDim, lDim); - kInfo = kInfo->next; - } - fprintf(out, "%s\n", plan->kernel_string->c_str()); + size_t gDim, lDim; + FILE *out; + if(!file) + out = stdout; + else + out = file; + + cl_fft_plan *plan = (cl_fft_plan *) Plan; + cl_fft_kernel_info *kInfo = plan->kernel_info; + + while(kInfo) + { + cl_int s = 1; + getKernelWorkDimensions(plan, kInfo, &s, &gDim, &lDim); + fprintf(out, "Run kernel %s with global dim = {%zd*BatchSize}, local dim={%zd}\n", kInfo->kernel_name, gDim, lDim); + kInfo = kInfo->next; + } + fprintf(out, "%s\n", plan->kernel_string->c_str()); }