diff --git a/example/main.cpp b/example/main.cpp index 69bcc53712b1eef749e7f0c1f16b31c7abe707f2..2492a256bacc13c7fea65554af36af3dfaf79061 100644 --- a/example/main.cpp +++ b/example/main.cpp @@ -94,58 +94,58 @@ double subtractTimes( uint64_t endTime, uint64_t startTime ) { uint64_t difference = endTime - startTime; static double conversion = 0.0; - + if( conversion == 0.0 ) { mach_timebase_info_data_t info; kern_return_t err = mach_timebase_info( &info ); - + //Convert the timebase into seconds if( err == 0 ) conversion = 1e-9 * (double) info.numer / (double) info.denom; } - + return conversion * (double) difference; } #endif #ifdef __APPLE__ -void computeReferenceF(clFFT_SplitComplex *out, clFFT_Dim3 n, +void computeReferenceF(clFFT_SplitComplex *out, clFFT_Dim3 n, unsigned int batchSize, clFFT_Dimension dim, clFFT_Direction dir) { FFTSetup plan_vdsp; DSPSplitComplex out_vdsp; FFTDirection dir_vdsp = dir == clFFT_Forward ? FFT_FORWARD : FFT_INVERSE; - + unsigned int i, j, k; unsigned int stride; unsigned int log2Nx = (unsigned int) log2(n.x); unsigned int log2Ny = (unsigned int) log2(n.y); unsigned int log2Nz = (unsigned int) log2(n.z); unsigned int log2N; - + log2N = log2Nx; log2N = log2N > log2Ny ? log2N : log2Ny; log2N = log2N > log2Nz ? log2N : log2Nz; - + plan_vdsp = vDSP_create_fftsetup(log2N, 2); - + switch(dim) { case clFFT_1D: - + for(i = 0; i < batchSize; i++) { stride = i * n.x; out_vdsp.realp = out->real + stride; out_vdsp.imagp = out->imag + stride; - + vDSP_fft_zip(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp); } break; - + case clFFT_2D: - + for(i = 0; i < batchSize; i++) { for(j = 0; j < n.y; j++) @@ -153,7 +153,7 @@ void computeReferenceF(clFFT_SplitComplex *out, clFFT_Dim3 n, stride = j * n.x + i * n.x * n.y; out_vdsp.realp = out->real + stride; out_vdsp.imagp = out->imag + stride; - + vDSP_fft_zip(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp); } } @@ -164,14 +164,14 @@ void computeReferenceF(clFFT_SplitComplex *out, clFFT_Dim3 n, stride = j + i * n.x * n.y; out_vdsp.realp = out->real + stride; out_vdsp.imagp = out->imag + stride; - + vDSP_fft_zip(plan_vdsp, &out_vdsp, n.x, log2Ny, dir_vdsp); } } break; - + case clFFT_3D: - + for(i = 0; i < batchSize; i++) { for(j = 0; j < n.z; j++) @@ -181,7 +181,7 @@ void computeReferenceF(clFFT_SplitComplex *out, clFFT_Dim3 n, stride = k * n.x + j * n.x * n.y + i * n.x * n.y * n.z; out_vdsp.realp = out->real + stride; out_vdsp.imagp = out->imag + stride; - + vDSP_fft_zip(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp); } } @@ -195,7 +195,7 @@ void computeReferenceF(clFFT_SplitComplex *out, clFFT_Dim3 n, stride = k + j * n.x * n.y + i * n.x * n.y * n.z; out_vdsp.realp = out->real + stride; out_vdsp.imagp = out->imag + stride; - + vDSP_fft_zip(plan_vdsp, &out_vdsp, n.x, log2Ny, dir_vdsp); } } @@ -209,55 +209,55 @@ void computeReferenceF(clFFT_SplitComplex *out, clFFT_Dim3 n, stride = k + j * n.x + i * n.x * n.y * n.z; out_vdsp.realp = out->real + stride; out_vdsp.imagp = out->imag + stride; - + vDSP_fft_zip(plan_vdsp, &out_vdsp, n.x*n.y, log2Nz, dir_vdsp); } } } break; } - + vDSP_destroy_fftsetup(plan_vdsp); } #endif #ifdef __APPLE__ -void computeReferenceD(clFFT_SplitComplexDouble *out, clFFT_Dim3 n, +void computeReferenceD(clFFT_SplitComplexDouble *out, clFFT_Dim3 n, unsigned int batchSize, clFFT_Dimension dim, clFFT_Direction dir) { FFTSetupD plan_vdsp; DSPDoubleSplitComplex out_vdsp; FFTDirection dir_vdsp = dir == clFFT_Forward ? FFT_FORWARD : FFT_INVERSE; - + unsigned int i, j, k; unsigned int stride; unsigned int log2Nx = (int) log2(n.x); unsigned int log2Ny = (int) log2(n.y); unsigned int log2Nz = (int) log2(n.z); unsigned int log2N; - + log2N = log2Nx; log2N = log2N > log2Ny ? log2N : log2Ny; log2N = log2N > log2Nz ? log2N : log2Nz; - + plan_vdsp = vDSP_create_fftsetupD(log2N, 2); - + switch(dim) { case clFFT_1D: - + for(i = 0; i < batchSize; i++) { stride = i * n.x; out_vdsp.realp = out->real + stride; out_vdsp.imagp = out->imag + stride; - + vDSP_fft_zipD(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp); } break; - + case clFFT_2D: - + for(i = 0; i < batchSize; i++) { for(j = 0; j < n.y; j++) @@ -265,7 +265,7 @@ void computeReferenceD(clFFT_SplitComplexDouble *out, clFFT_Dim3 n, stride = j * n.x + i * n.x * n.y; out_vdsp.realp = out->real + stride; out_vdsp.imagp = out->imag + stride; - + vDSP_fft_zipD(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp); } } @@ -276,14 +276,14 @@ void computeReferenceD(clFFT_SplitComplexDouble *out, clFFT_Dim3 n, stride = j + i * n.x * n.y; out_vdsp.realp = out->real + stride; out_vdsp.imagp = out->imag + stride; - + vDSP_fft_zipD(plan_vdsp, &out_vdsp, n.x, log2Ny, dir_vdsp); } } break; - + case clFFT_3D: - + for(i = 0; i < batchSize; i++) { for(j = 0; j < n.z; j++) @@ -293,7 +293,7 @@ void computeReferenceD(clFFT_SplitComplexDouble *out, clFFT_Dim3 n, stride = k * n.x + j * n.x * n.y + i * n.x * n.y * n.z; out_vdsp.realp = out->real + stride; out_vdsp.imagp = out->imag + stride; - + vDSP_fft_zipD(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp); } } @@ -307,7 +307,7 @@ void computeReferenceD(clFFT_SplitComplexDouble *out, clFFT_Dim3 n, stride = k + j * n.x * n.y + i * n.x * n.y * n.z; out_vdsp.realp = out->real + stride; out_vdsp.imagp = out->imag + stride; - + vDSP_fft_zipD(plan_vdsp, &out_vdsp, n.x, log2Ny, dir_vdsp); } } @@ -321,14 +321,14 @@ void computeReferenceD(clFFT_SplitComplexDouble *out, clFFT_Dim3 n, stride = k + j * n.x + i * n.x * n.y * n.z; out_vdsp.realp = out->real + stride; out_vdsp.imagp = out->imag + stride; - + vDSP_fft_zipD(plan_vdsp, &out_vdsp, n.x*n.y, log2Nz, dir_vdsp); } } } break; } - + vDSP_destroy_fftsetupD(plan_vdsp); } #endif @@ -344,12 +344,12 @@ double computeL2Error(clFFT_SplitComplex *data, clFFT_SplitComplexDouble *data_r double avg_norm = 0.0; *max_diff = 0.0; *min_diff = 0x1.0p1000; - + for(j = 0; j < batchSize; j++) { double norm_ref = 0.0; double norm = 0.0; - for(i = 0; i < n; i++) + for(i = 0; i < n; i++) { int index = j * n + i; clFFT_ComplexDouble diff = (clFFT_ComplexDouble) { data_ref->real[index] - data->real[index], data_ref->imag[index] - data->imag[index] }; @@ -362,7 +362,7 @@ double computeL2Error(clFFT_SplitComplex *data, clFFT_SplitComplexDouble *data_r *max_diff = *max_diff < curr_norm ? curr_norm : *max_diff; *min_diff = *min_diff > curr_norm ? curr_norm : *min_diff; } - + return avg_norm / batchSize; } @@ -375,9 +375,9 @@ void convertInterleavedToSplit(clFFT_SplitComplex *result_split, clFFT_Complex * } } -int runTest(clFFT_Dim3 n, int batchSize, clFFT_Direction dir, clFFT_Dimension dim, +int runTest(clFFT_Dim3 n, int batchSize, clFFT_Direction dir, clFFT_Dimension dim, clFFT_DataFormat dataFormat, int numIter, clFFT_TestType testType) -{ +{ cl_int err = CL_SUCCESS; int iter; @@ -393,14 +393,14 @@ int runTest(clFFT_Dim3 n, int batchSize, clFFT_Direction dir, clFFT_Dimension di #endif int length = n.x * n.y * n.z * batchSize; - + clFFT_SplitComplex data_i_split = (clFFT_SplitComplex) { NULL, NULL }; clFFT_SplitComplex data_cl_split = (clFFT_SplitComplex) { NULL, NULL }; clFFT_Complex *data_i = NULL; clFFT_Complex *data_cl = NULL; - clFFT_SplitComplexDouble data_iref = (clFFT_SplitComplexDouble) { NULL, NULL }; + clFFT_SplitComplexDouble data_iref = (clFFT_SplitComplexDouble) { NULL, NULL }; clFFT_SplitComplexDouble data_oref = (clFFT_SplitComplexDouble) { NULL, NULL }; - + clFFT_Plan plan = NULL; cl_mem data_in = NULL; cl_mem data_out = NULL; @@ -408,7 +408,7 @@ int runTest(clFFT_Dim3 n, int batchSize, clFFT_Direction dir, clFFT_Dimension di cl_mem data_in_imag = NULL; cl_mem data_out_real = NULL; cl_mem data_out_imag = NULL; - + if(dataFormat == clFFT_SplitComplexFormat) { data_i_split.real = (float *) malloc(sizeof(float) * length); data_i_split.imag = (float *) malloc(sizeof(float) * length); @@ -431,11 +431,11 @@ int runTest(clFFT_Dim3 n, int batchSize, clFFT_Direction dir, clFFT_Dimension di goto cleanup; } } - + data_iref.real = (double *) malloc(sizeof(double) * length); data_iref.imag = (double *) malloc(sizeof(double) * length); data_oref.real = (double *) malloc(sizeof(double) * length); - data_oref.imag = (double *) malloc(sizeof(double) * length); + data_oref.imag = (double *) malloc(sizeof(double) * length); if(!data_iref.real || !data_iref.imag || !data_oref.real || !data_oref.imag) { err = -3; @@ -450,11 +450,11 @@ int runTest(clFFT_Dim3 n, int batchSize, clFFT_Direction dir, clFFT_Dimension di data_i_split.real[i] = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f; data_i_split.imag[i] = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f; data_cl_split.real[i] = 0.0f; - data_cl_split.imag[i] = 0.0f; + data_cl_split.imag[i] = 0.0f; data_iref.real[i] = data_i_split.real[i]; data_iref.imag[i] = data_i_split.imag[i]; data_oref.real[i] = data_iref.real[i]; - data_oref.imag[i] = data_iref.imag[i]; + data_oref.imag[i] = data_iref.imag[i]; } } else { @@ -463,54 +463,54 @@ int runTest(clFFT_Dim3 n, int batchSize, clFFT_Direction dir, clFFT_Dimension di data_i[i].real = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f; data_i[i].imag = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f; data_cl[i].real = 0.0f; - data_cl[i].imag = 0.0f; + data_cl[i].imag = 0.0f; data_iref.real[i] = data_i[i].real; data_iref.imag[i] = data_i[i].imag; data_oref.real[i] = data_iref.real[i]; - data_oref.imag[i] = data_iref.imag[i]; - } + data_oref.imag[i] = data_iref.imag[i]; + } } - + plan = clFFT_CreatePlan( context, n, dim, dataFormat, &err ); - if(!plan || err) + if(!plan || err) { log_error("clFFT_CreatePlan failed\n"); goto cleanup; } - + //clFFT_DumpPlan(plan, stdout); - + if(dataFormat == clFFT_SplitComplexFormat) { data_in_real = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_i_split.real, &err); - if(!data_in_real || err) + if(!data_in_real || err) { log_error("clCreateBuffer failed\n"); goto cleanup; } - + data_in_imag = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_i_split.imag, &err); - if(!data_in_imag || err) + if(!data_in_imag || err) { log_error("clCreateBuffer failed\n"); goto cleanup; } - + if(testType == clFFT_OUT_OF_PLACE) { data_out_real = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_cl_split.real, &err); - if(!data_out_real || err) + if(!data_out_real || err) { log_error("clCreateBuffer failed\n"); goto cleanup; } - + data_out_imag = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_cl_split.imag, &err); - if(!data_out_imag || err) + if(!data_out_imag || err) { log_error("clCreateBuffer failed\n"); goto cleanup; - } + } } else { @@ -521,7 +521,7 @@ int runTest(clFFT_Dim3 n, int batchSize, clFFT_Direction dir, clFFT_Dimension di else { data_in = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float)*2, data_i, &err); - if(!data_in) + if(!data_in) { log_error("clCreateBuffer failed\n"); goto cleanup; @@ -529,17 +529,17 @@ int runTest(clFFT_Dim3 n, int batchSize, clFFT_Direction dir, clFFT_Dimension di if(testType == clFFT_OUT_OF_PLACE) { data_out = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float)*2, data_cl, &err); - if(!data_out) + if(!data_out) { log_error("clCreateBuffer failed\n"); goto cleanup; - } + } } else data_out = data_in; } - - + + err = CL_SUCCESS; #ifdef __APPLE__ @@ -552,20 +552,20 @@ int runTest(clFFT_Dim3 n, int batchSize, clFFT_Direction dir, clFFT_Dimension di } else { - for(iter = 0; iter < numIter; iter++) + for(iter = 0; iter < numIter; iter++) err |= clFFT_ExecuteInterleaved(queue, plan, batchSize, dir, data_in, data_out, 0, NULL, NULL); } - + err |= clFinish(queue); - - if(err) + + if(err) { log_error("clFFT_Execute\n"); - goto cleanup; + goto cleanup; } #ifdef __APPLE__ - t1 = mach_absolute_time(); + t1 = mach_absolute_time(); t = subtractTimes(t1, t0); char temp[100]; sprintf(temp, "GFlops achieved for n = (%d, %d, %d), batchsize = %d", n.x, n.y, n.z, batchSize); @@ -573,7 +573,7 @@ int runTest(clFFT_Dim3 n, int batchSize, clFFT_Direction dir, clFFT_Dimension di #endif if(dataFormat == clFFT_SplitComplexFormat) - { + { err |= clEnqueueReadBuffer(queue, data_out_real, CL_TRUE, 0, length*sizeof(float), data_cl_split.real, 0, NULL, NULL); err |= clEnqueueReadBuffer(queue, data_out_imag, CL_TRUE, 0, length*sizeof(float), data_cl_split.imag, 0, NULL, NULL); } @@ -581,23 +581,23 @@ int runTest(clFFT_Dim3 n, int batchSize, clFFT_Direction dir, clFFT_Dimension di { err |= clEnqueueReadBuffer(queue, data_out, CL_TRUE, 0, length*sizeof(float)*2, data_cl, 0, NULL, NULL); } - - if(err) + + if(err) { log_error("clEnqueueReadBuffer failed\n"); goto cleanup; - } + } #ifdef __APPLE__ computeReferenceD(&data_oref, n, batchSize, dim, dir); - + double diff_avg, diff_max, diff_min; if(dataFormat == clFFT_SplitComplexFormat) { diff_avg = computeL2Error(&data_cl_split, &data_oref, n.x*n.y*n.z, batchSize, &diff_max, &diff_min); if(diff_avg > eps_avg) log_error("Test failed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min); else - log_info("Test passed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min); + log_info("Test passed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min); } else { clFFT_SplitComplex result_split; @@ -605,19 +605,19 @@ int runTest(clFFT_Dim3 n, int batchSize, clFFT_Direction dir, clFFT_Dimension di result_split.imag = (float *) malloc(length*sizeof(float)); convertInterleavedToSplit(&result_split, data_cl, length); diff_avg = computeL2Error(&result_split, &data_oref, n.x*n.y*n.z, batchSize, &diff_max, &diff_min); - + if(diff_avg > eps_avg) log_error("Test failed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min); else - log_info("Test passed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min); + log_info("Test passed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min); free(result_split.real); free(result_split.imag); } #endif cleanup: - clFFT_DestroyPlan(plan); - if(dataFormat == clFFT_SplitComplexFormat) + clFFT_DestroyPlan(plan); + if(dataFormat == clFFT_SplitComplexFormat) { if(data_i_split.real) free(data_i_split.real); @@ -627,7 +627,7 @@ cleanup: free(data_cl_split.real); if(data_cl_split.imag) free(data_cl_split.imag); - + if(data_in_real) clReleaseMemObject(data_in_real); if(data_in_imag) @@ -637,28 +637,28 @@ cleanup: if(data_out_imag && clFFT_OUT_OF_PLACE) clReleaseMemObject(data_out_imag); } - else + else { if(data_i) free(data_i); if(data_cl) free(data_cl); - + if(data_in) clReleaseMemObject(data_in); if(data_out && testType == clFFT_OUT_OF_PLACE) clReleaseMemObject(data_out); } - + if(data_iref.real) free(data_iref.real); if(data_iref.imag) - free(data_iref.imag); + free(data_iref.imag); if(data_oref.real) free(data_oref.real); if(data_oref.imag) free(data_oref.imag); - + return err; } @@ -690,7 +690,7 @@ cl_device_type getGlobalDeviceType() return CL_DEVICE_TYPE_GPU; } -void +void notify_callback(const char *errinfo, const void *private_info, size_t cb, void *user_data) { printf( "ERROR: %s\n", errinfo ); @@ -708,9 +708,9 @@ checkMemRequirements(clFFT_Dim3 n, int batchSize, clFFT_TestType testType, cl_ul } int main (int argc, char * const argv[]) { - + test_start(); - + cl_ulong gMemSize; clFFT_Direction dir = clFFT_Forward; int numIter = 1; @@ -720,28 +720,28 @@ int main (int argc, char * const argv[]) { clFFT_Dimension dim = clFFT_1D; clFFT_TestType testType = clFFT_OUT_OF_PLACE; cl_device_id device_ids[16]; - + FILE *paramFile; - + cl_int err; unsigned int num_devices; - - cl_device_type device_type = getGlobalDeviceType(); - if(device_type != CL_DEVICE_TYPE_GPU) + + cl_device_type device_type = getGlobalDeviceType(); + if(device_type != CL_DEVICE_TYPE_GPU) { log_info("Test only supported on DEVICE_TYPE_GPU\n"); test_finish(); exit(0); } - + err = clGetDeviceIDs(NULL, device_type, sizeof(device_ids), device_ids, &num_devices); - if(err) - { + if(err) + { printf("ERROR: clGetDeviceIDs failed with error: %d\n", err); test_finish(); return -1; } - + device_id = NULL; unsigned int i = 0; @@ -794,7 +794,7 @@ int main (int argc, char * const argv[]) { } } } - + if(!device_id) { log_error("None of the devices available for compute ... aborting test\n"); test_finish(); @@ -812,13 +812,13 @@ int main (int argc, char * const argv[]) { } context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); - if(!context || err) + if(!context || err) { log_error("clCreateContext failed\n"); test_finish(); return -1; } - + queue = clCreateCommandQueue(context, device_id, 0, &err); if(!queue || err) { @@ -826,8 +826,8 @@ int main (int argc, char * const argv[]) { clReleaseContext(context); test_finish(); return -1; - } - + } + err = clGetDeviceInfo(device_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &gMemSize, NULL); if(err) { @@ -837,26 +837,26 @@ int main (int argc, char * const argv[]) { test_finish(); return -2; } - + gMemSize /= (1024*1024); - + char delim[] = " \n"; char tmpStr[100]; char line[200]; - char *param, *val; + char *param, *val; int total_errors = 0; if(argc == 1) { log_error("Need file name with list of parameters to run the test\n"); test_finish(); return -1; } - + if(argc >= 2) { // arguments are supplied in a file with arguments for a single run are all on the same line paramFile = fopen(argv[1], "r"); if(!paramFile) { log_error("Cannot open the parameter file\n"); clReleaseContext(context); - clReleaseCommandQueue(queue); + clReleaseCommandQueue(queue); test_finish(); return -3; } @@ -871,9 +871,9 @@ int main (int argc, char * const argv[]) { val = strtok(NULL, delim); sscanf(val, "%d", &n.y); val = strtok(NULL, delim); - sscanf(val, "%d", &n.z); + sscanf(val, "%d", &n.z); } - else if(!strcmp(param, "-batchsize")) + else if(!strcmp(param, "-batchsize")) sscanf(val, "%d", &batchSize); else if(!strcmp(param, "-dir")) { sscanf(val, "%s", tmpStr); @@ -887,16 +887,16 @@ int main (int argc, char * const argv[]) { if(!strcmp(tmpStr, "1D")) dim = clFFT_1D; else if(!strcmp(tmpStr, "2D")) - dim = clFFT_2D; + dim = clFFT_2D; else if(!strcmp(tmpStr, "3D")) - dim = clFFT_3D; + dim = clFFT_3D; } else if(!strcmp(param, "-format")) { sscanf(val, "%s", tmpStr); if(!strcmp(tmpStr, "plannar")) dataFormat = clFFT_SplitComplexFormat; else if(!strcmp(tmpStr, "interleaved")) - dataFormat = clFFT_InterleavedComplexFormat; + dataFormat = clFFT_InterleavedComplexFormat; } else if(!strcmp(param, "-numiter")) sscanf(val, "%d", &numIter); @@ -905,25 +905,25 @@ int main (int argc, char * const argv[]) { if(!strcmp(tmpStr, "out-of-place")) testType = clFFT_OUT_OF_PLACE; else if(!strcmp(tmpStr, "in-place")) - testType = clFFT_IN_PLACE; + testType = clFFT_IN_PLACE; } param = strtok(NULL, delim); } - + if(checkMemRequirements(n, batchSize, testType, gMemSize)) { log_info("This test cannot run because memory requirements canot be met by the available device\n"); continue; } - + err = runTest(n, batchSize, dir, dim, dataFormat, numIter, testType); if (err) total_errors++; } } - + clReleaseContext(context); clReleaseCommandQueue(queue); - + test_finish(); - return total_errors; + return total_errors; } diff --git a/include/clFFT.h b/include/clFFT.h index 8dde2a4a91a84fea04256f306ca1362329ed0c61..3b3e12134d9d2b90e0ddd6f24c74eec8a01350f0 100644 --- a/include/clFFT.h +++ b/include/clFFT.h @@ -61,11 +61,11 @@ extern "C" { #endif // XForm type -typedef enum +typedef enum { clFFT_Forward = -1, clFFT_Inverse = 1 - + }clFFT_Direction; // XForm dimension @@ -74,7 +74,7 @@ typedef enum clFFT_1D = 0, clFFT_2D = 1, clFFT_3D = 3 - + }clFFT_Dimension; // XForm Data type @@ -89,8 +89,8 @@ typedef struct unsigned int x; unsigned int y; unsigned int z; -}clFFT_Dim3; - +}clFFT_Dim3; + typedef struct { float *real; @@ -103,31 +103,31 @@ typedef struct float imag; }clFFT_Complex; -typedef void* clFFT_Plan; +typedef void* clFFT_Plan; clFFT_Plan clFFT_CreatePlan( cl_context context, clFFT_Dim3 n, clFFT_Dimension dim, clFFT_DataFormat dataFormat, cl_int *error_code ); void clFFT_DestroyPlan( clFFT_Plan plan ); -cl_int clFFT_ExecuteInterleaved( cl_command_queue queue, clFFT_Plan plan, cl_int batchSize, clFFT_Direction dir, +cl_int clFFT_ExecuteInterleaved( cl_command_queue queue, clFFT_Plan plan, cl_int batchSize, clFFT_Direction dir, cl_mem data_in, cl_mem data_out, cl_int num_events, cl_event *event_list, cl_event *event ); -cl_int clFFT_ExecutePlannar( cl_command_queue queue, clFFT_Plan plan, cl_int batchSize, clFFT_Direction dir, +cl_int clFFT_ExecutePlannar( cl_command_queue queue, clFFT_Plan plan, cl_int batchSize, clFFT_Direction dir, cl_mem data_in_real, cl_mem data_in_imag, cl_mem data_out_real, cl_mem data_out_imag, cl_int num_events, cl_event *event_list, cl_event *event ); -cl_int clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array, +cl_int clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array, size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir); - -cl_int clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real, cl_mem array_imag, + +cl_int clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real, cl_mem array_imag, size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir); - -void clFFT_DumpPlan( clFFT_Plan plan, FILE *file); + +void clFFT_DumpPlan( clFFT_Plan plan, FILE *file); #ifdef __cplusplus } #endif -#endif +#endif diff --git a/src/fft_base_kernels.h b/src/fft_base_kernels.h index 101795697f55e125e4fbafa8757aef5de033fad6..186b545f346c0ad1c8edb40832e87e48192837d1 100644 --- a/src/fft_base_kernels.h +++ b/src/fft_base_kernels.h @@ -59,7 +59,7 @@ static string baseKernels = string( "#endif\n" "#define complexMul(a,b) ((float2)(mad(-(a).y, (b).y, (a).x * (b).x), mad((a).y, (b).x, (a).x * (b).y)))\n" "#define conj(a) ((float2)((a).x, -(a).y))\n" - "#define conjTransp(a) ((float2)(-(a).y, (a).x))\n" + "#define conjTransp(a) ((float2)(-(a).y, (a).x))\n" "\n" "#define fftKernel2(a,dir) \\\n" "{ \\\n" @@ -67,14 +67,14 @@ static string baseKernels = string( " (a)[0] = c + (a)[1]; \\\n" " (a)[1] = c - (a)[1]; \\\n" "}\n" - "\n" + "\n" "#define fftKernel2S(d1,d2,dir) \\\n" "{ \\\n" " float2 c = (d1); \\\n" " (d1) = c + (d2); \\\n" " (d2) = c - (d2); \\\n" "}\n" - "\n" + "\n" "#define fftKernel4(a,dir) \\\n" "{ \\\n" " fftKernel2S((a)[0], (a)[2], dir); \\\n" @@ -86,7 +86,7 @@ static string baseKernels = string( " (a)[1] = (a)[2]; \\\n" " (a)[2] = c; \\\n" "}\n" - "\n" + "\n" "#define fftKernel4s(a0,a1,a2,a3,dir) \\\n" "{ \\\n" " fftKernel2S((a0), (a2), dir); \\\n" @@ -96,9 +96,9 @@ static string baseKernels = string( " fftKernel2S((a2), (a3), dir); \\\n" " float2 c = (a1); \\\n" " (a1) = (a2); \\\n" - " (a2) = c; \\\n" + " (a2) = c; \\\n" "}\n" - "\n" + "\n" "#define bitreverse8(a) \\\n" "{ \\\n" " float2 c; \\\n" @@ -109,7 +109,7 @@ static string baseKernels = string( " (a)[3] = (a)[6]; \\\n" " (a)[6] = c; \\\n" "}\n" - "\n" + "\n" "#define fftKernel8(a,dir) \\\n" "{ \\\n" " const float2 w1 = (float2)(0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f); \\\n" @@ -134,7 +134,7 @@ static string baseKernels = string( " fftKernel2S((a)[6], (a)[7], dir); \\\n" " bitreverse8((a)); \\\n" "}\n" - "\n" + "\n" "#define bitreverse4x4(a) \\\n" "{ \\\n" " float2 c; \\\n" @@ -145,7 +145,7 @@ static string baseKernels = string( " c = (a)[7]; (a)[7] = (a)[13]; (a)[13] = c; \\\n" " c = (a)[11]; (a)[11] = (a)[14]; (a)[14] = c; \\\n" "}\n" - "\n" + "\n" "#define fftKernel16(a,dir) \\\n" "{ \\\n" " const float w0 = 0x1.d906bcp-1f; \\\n" @@ -170,7 +170,7 @@ static string baseKernels = string( " fftKernel4((a) + 12, dir); \\\n" " bitreverse4x4((a)); \\\n" "}\n" - "\n" + "\n" "#define bitreverse32(a) \\\n" "{ \\\n" " float2 c1, c2; \\\n" @@ -181,7 +181,7 @@ static string baseKernels = string( " c1 = (a)[22]; (a)[22] = (a)[11]; c2 = (a)[13]; (a)[13] = c1; c1 = (a)[26]; (a)[26] = c2; c2 = (a)[21]; (a)[21] = c1; (a)[11] = c2; \\\n" " c1 = (a)[30]; (a)[30] = (a)[15]; c2 = (a)[29]; (a)[29] = c1; c1 = (a)[27]; (a)[27] = c2; c2 = (a)[23]; (a)[23] = c1; (a)[15] = c2; \\\n" "}\n" - "\n" + "\n" "#define fftKernel32(a,dir) \\\n" "{ \\\n" " fftKernel2S((a)[0], (a)[16], dir); \\\n" @@ -270,7 +270,7 @@ static string twistKernelPlannar = string( " } \\\n" " } \\\n" "} \\\n" - ); + ); diff --git a/src/fft_execute.cpp b/src/fft_execute.cpp index b500cd3f36131cf33bcb1778a94cb34ed143513f..ad43d4f9631c599eaa376175607e58a35cbcdb03 100644 --- a/src/fft_execute.cpp +++ b/src/fft_execute.cpp @@ -59,17 +59,17 @@ static cl_int allocateTemporaryBufferInterleaved(cl_fft_plan *plan, cl_uint batchSize) { cl_int err = CL_SUCCESS; - if(plan->temp_buffer_needed && plan->last_batch_size != batchSize) + if(plan->temp_buffer_needed && plan->last_batch_size != batchSize) { - plan->last_batch_size = batchSize; + plan->last_batch_size = batchSize; size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * 2 * sizeof(cl_float); - + if(plan->tempmemobj) clReleaseMemObject(plan->tempmemobj); - + plan->tempmemobj = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &err); } - return err; + return err; } static cl_int @@ -77,21 +77,21 @@ allocateTemporaryBufferPlannar(cl_fft_plan *plan, cl_uint batchSize) { cl_int err = CL_SUCCESS; cl_int terr; - if(plan->temp_buffer_needed && plan->last_batch_size != batchSize) + if(plan->temp_buffer_needed && plan->last_batch_size != batchSize) { - plan->last_batch_size = batchSize; + plan->last_batch_size = batchSize; size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * sizeof(cl_float); - + if(plan->tempmemobj_real) clReleaseMemObject(plan->tempmemobj_real); if(plan->tempmemobj_imag) - clReleaseMemObject(plan->tempmemobj_imag); - + clReleaseMemObject(plan->tempmemobj_imag); + plan->tempmemobj_real = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &err); plan->tempmemobj_imag = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &terr); err |= terr; - } + } return err; } @@ -101,7 +101,7 @@ getKernelWorkDimensions(cl_fft_plan *plan, cl_fft_kernel_info *kernelInfo, cl_in *lWorkItems = kernelInfo->num_workitems_per_workgroup; int numWorkGroups = kernelInfo->num_workgroups; int numXFormsPerWG = kernelInfo->num_xforms_per_workgroup; - + switch(kernelInfo->dir) { case cl_fft_kernel_x: @@ -117,45 +117,45 @@ getKernelWorkDimensions(cl_fft_plan *plan, cl_fft_kernel_info *kernelInfo, cl_in numWorkGroups *= *batchSize; break; } - + *gWorkItems = numWorkGroups * *lWorkItems; } -cl_int -clFFT_ExecuteInterleaved( cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir, - cl_mem data_in, cl_mem data_out, +cl_int +clFFT_ExecuteInterleaved( cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir, + cl_mem data_in, cl_mem data_out, cl_int num_events, cl_event *event_list, cl_event *event ) -{ +{ int s; cl_fft_plan *plan = (cl_fft_plan *) Plan; if(plan->format != clFFT_InterleavedComplexFormat) return CL_INVALID_VALUE; - + cl_int err; size_t gWorkItems, lWorkItems; int inPlaceDone = -1; - + cl_int isInPlace = data_in == data_out ? 1 : 0; - + if((err = allocateTemporaryBufferInterleaved(plan, batchSize)) != CL_SUCCESS) - return err; - + return err; + cl_mem memObj[3]; memObj[0] = data_in; memObj[1] = data_out; memObj[2] = plan->tempmemobj; cl_fft_kernel_info *kernelInfo = plan->kernel_info; int numKernels = plan->num_kernels; - + int numKernelsOdd = numKernels & 1; int currRead = 0; int currWrite = 1; - + // at least one external dram shuffle (transpose) required - if(plan->temp_buffer_needed) + if(plan->temp_buffer_needed) { // in-place transform - if(isInPlace) + if(isInPlace) { inPlaceDone = 0; currRead = 1; @@ -165,36 +165,36 @@ clFFT_ExecuteInterleaved( cl_command_queue queue, clFFT_Plan Plan, cl_int batchS { currWrite = (numKernels & 1) ? 1 : 2; } - - while(kernelInfo) + + while(kernelInfo) { - if( isInPlace && numKernelsOdd && !inPlaceDone && kernelInfo->in_place_possible) + if( isInPlace && numKernelsOdd && !inPlaceDone && kernelInfo->in_place_possible) { currWrite = currRead; inPlaceDone = 1; } - + s = batchSize; getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems); err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj[currRead]); err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj[currWrite]); err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_int), &dir); err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_int), &s); - + err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL); if(err) return err; - + currRead = (currWrite == 1) ? 1 : 2; - currWrite = (currWrite == 1) ? 2 : 1; - + currWrite = (currWrite == 1) ? 2 : 1; + kernelInfo = kernelInfo->next; - } + } } // no dram shuffle (transpose required) transform // all kernels can execute in-place. else { - + while(kernelInfo) { s = batchSize; @@ -203,41 +203,41 @@ clFFT_ExecuteInterleaved( cl_command_queue queue, clFFT_Plan Plan, cl_int batchS err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj[currWrite]); err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_int), &dir); err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_int), &s); - + err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL); if(err) - return err; - + return err; + currRead = 1; currWrite = 1; - + kernelInfo = kernelInfo->next; } } - + return err; } -cl_int -clFFT_ExecutePlannar( cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir, +cl_int +clFFT_ExecutePlannar( cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir, cl_mem data_in_real, cl_mem data_in_imag, cl_mem data_out_real, cl_mem data_out_imag, cl_int num_events, cl_event *event_list, cl_event *event) -{ +{ int s; cl_fft_plan *plan = (cl_fft_plan *) Plan; - + if(plan->format != clFFT_SplitComplexFormat) return CL_INVALID_VALUE; - + cl_int err; size_t gWorkItems, lWorkItems; int inPlaceDone = -1; - + cl_int isInPlace = ((data_in_real == data_out_real) && (data_in_imag == data_out_imag)) ? 1 : 0; - + if((err = allocateTemporaryBufferPlannar(plan, batchSize)) != CL_SUCCESS) - return err; - + return err; + cl_mem memObj_real[3]; cl_mem memObj_imag[3]; memObj_real[0] = data_in_real; @@ -246,19 +246,19 @@ clFFT_ExecutePlannar( cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, memObj_imag[0] = data_in_imag; memObj_imag[1] = data_out_imag; memObj_imag[2] = plan->tempmemobj_imag; - + cl_fft_kernel_info *kernelInfo = plan->kernel_info; int numKernels = plan->num_kernels; - + int numKernelsOdd = numKernels & 1; int currRead = 0; int currWrite = 1; - + // at least one external dram shuffle (transpose) required - if(plan->temp_buffer_needed) + if(plan->temp_buffer_needed) { // in-place transform - if(isInPlace) + if(isInPlace) { inPlaceDone = 0; currRead = 1; @@ -268,15 +268,15 @@ clFFT_ExecutePlannar( cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, { currWrite = (numKernels & 1) ? 1 : 2; } - - while(kernelInfo) + + while(kernelInfo) { - if( isInPlace && numKernelsOdd && !inPlaceDone && kernelInfo->in_place_possible) + if( isInPlace && numKernelsOdd && !inPlaceDone && kernelInfo->in_place_possible) { currWrite = currRead; inPlaceDone = 1; } - + s = batchSize; getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems); err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj_real[currRead]); @@ -285,20 +285,20 @@ clFFT_ExecutePlannar( cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_mem), &memObj_imag[currWrite]); err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_int), &dir); err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_int), &s); - + err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL); if(err) - return err; - + return err; + currRead = (currWrite == 1) ? 1 : 2; - currWrite = (currWrite == 1) ? 2 : 1; - + currWrite = (currWrite == 1) ? 2 : 1; + kernelInfo = kernelInfo->next; - } + } } // no dram shuffle (transpose required) transform else { - + while(kernelInfo) { s = batchSize; @@ -309,87 +309,87 @@ clFFT_ExecutePlannar( cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_mem), &memObj_imag[currWrite]); err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_int), &dir); err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_int), &s); - + err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL); if(err) - return err; - + return err; + currRead = 1; currWrite = 1; - + kernelInfo = kernelInfo->next; } } - + return err; } -cl_int -clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array, +cl_int +clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array, size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir) { cl_fft_plan *plan = (cl_fft_plan *) Plan; - + unsigned int N = numRows*numCols; unsigned int nCols = numCols; unsigned int sRow = startRow; unsigned int rToProcess = rowsToProcess; int d = dir; int err = 0; - + cl_device_id device_id; err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, NULL); if(err) return err; - + size_t gSize; err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, NULL); if(err) return err; - + gSize = min(128, gSize); size_t numGlobalThreads[1] = { max(numCols / gSize, 1)*gSize }; size_t numLocalThreads[1] = { gSize }; - + err |= clSetKernelArg(plan->twist_kernel, 0, sizeof(cl_mem), &array); err |= clSetKernelArg(plan->twist_kernel, 1, sizeof(unsigned int), &sRow); err |= clSetKernelArg(plan->twist_kernel, 2, sizeof(unsigned int), &nCols); err |= clSetKernelArg(plan->twist_kernel, 3, sizeof(unsigned int), &N); err |= clSetKernelArg(plan->twist_kernel, 4, sizeof(unsigned int), &rToProcess); err |= clSetKernelArg(plan->twist_kernel, 5, sizeof(int), &d); - - err |= clEnqueueNDRangeKernel(queue, plan->twist_kernel, 1, NULL, numGlobalThreads, numLocalThreads, 0, NULL, NULL); - - return err; + + err |= clEnqueueNDRangeKernel(queue, plan->twist_kernel, 1, NULL, numGlobalThreads, numLocalThreads, 0, NULL, NULL); + + return err; } -cl_int -clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real, cl_mem array_imag, +cl_int +clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real, cl_mem array_imag, size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir) { cl_fft_plan *plan = (cl_fft_plan *) Plan; - + unsigned int N = numRows*numCols; unsigned int nCols = numCols; unsigned int sRow = startRow; unsigned int rToProcess = rowsToProcess; int d = dir; int err = 0; - + cl_device_id device_id; err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, NULL); if(err) return err; - + size_t gSize; err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, NULL); if(err) return err; - + gSize = min(128, gSize); size_t numGlobalThreads[1] = { max(numCols / gSize, 1)*gSize }; size_t numLocalThreads[1] = { gSize }; - + err |= clSetKernelArg(plan->twist_kernel, 0, sizeof(cl_mem), &array_real); err |= clSetKernelArg(plan->twist_kernel, 1, sizeof(cl_mem), &array_imag); err |= clSetKernelArg(plan->twist_kernel, 2, sizeof(unsigned int), &sRow); @@ -397,9 +397,9 @@ clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real, err |= clSetKernelArg(plan->twist_kernel, 4, sizeof(unsigned int), &N); err |= clSetKernelArg(plan->twist_kernel, 5, sizeof(unsigned int), &rToProcess); err |= clSetKernelArg(plan->twist_kernel, 6, sizeof(int), &d); - - err |= clEnqueueNDRangeKernel(queue, plan->twist_kernel, 1, NULL, numGlobalThreads, numLocalThreads, 0, NULL, NULL); - - return err; + + err |= clEnqueueNDRangeKernel(queue, plan->twist_kernel, 1, NULL, numGlobalThreads, numLocalThreads, 0, NULL, NULL); + + return err; } diff --git a/src/fft_internal.h b/src/fft_internal.h index a45b69c98af037b2228aa29b4a046b1c4f1cf86f..71d72da6a7b59461fcc311f6cebb7d0b8b83dd44 100644 --- a/src/fft_internal.h +++ b/src/fft_internal.h @@ -76,83 +76,83 @@ typedef struct kernel_info_t kernel_info_t *next; }cl_fft_kernel_info; -typedef struct +typedef struct { // context in which fft resources are created and kernels are executed cl_context context; - + // size of signal clFFT_Dim3 n; - + // dimension of transform ... must be either 1D, 2D or 3D clFFT_Dimension dim; - + // data format ... must be either interleaved or plannar clFFT_DataFormat format; - + // string containing kernel source. Generated at runtime based on // n, dim, format and other parameters string *kernel_string; - - // CL program containing source and kernel this particular + + // CL program containing source and kernel this particular // n, dim, data format cl_program program; - + // linked list of kernels which needs to be executed for this fft cl_fft_kernel_info *kernel_info; - + // number of kernels int num_kernels; - + // twist kernel for virtualizing fft of very large sizes that do not // fit in GPU global memory cl_kernel twist_kernel; - + // flag indicating if temporary intermediate buffer is needed or not. - // this depends on fft kernels being executed and if transform is - // in-place or out-of-place. e.g. Local memory fft (say 1D 1024 ... + // this depends on fft kernels being executed and if transform is + // in-place or out-of-place. e.g. Local memory fft (say 1D 1024 ... // one that does not require global transpose do not need temporary buffer) // 2D 1024x1024 out-of-place fft however do require intermediate buffer. // If temp buffer is needed, its allocation is lazy i.e. its not allocated // until its needed cl_int temp_buffer_needed; - + // Batch size is runtime parameter and size of temporary buffer (if needed) // depends on batch size. Allocation of temporary buffer is lazy i.e. its // only created when needed. Once its created at first call of clFFT_Executexxx - // it is not allocated next time if next time clFFT_Executexxx is called with + // it is not allocated next time if next time clFFT_Executexxx is called with // batch size different than the first call. last_batch_size caches the last // batch size with which this plan is used so that we dont keep allocating/deallocating // temp buffer if same batch size is used again and again. size_t last_batch_size; - + // temporary buffer for interleaved plan cl_mem tempmemobj; - - // temporary buffer for planner plan. Only one of tempmemobj or - // (tempmemobj_real, tempmemobj_imag) pair is valid (allocated) depending + + // temporary buffer for planner plan. Only one of tempmemobj or + // (tempmemobj_real, tempmemobj_imag) pair is valid (allocated) depending // data format of plan (plannar or interleaved) cl_mem tempmemobj_real, tempmemobj_imag; - + // Maximum size of signal for which local memory transposed based // fft is sufficient i.e. no global mem transpose (communication) // is needed size_t max_localmem_fft_size; - - // Maximum work items per work group allowed. This, along with max_radix below controls + + // Maximum work items per work group allowed. This, along with max_radix below controls // maximum local memory being used by fft kernels of this plan. Set to 256 by default size_t max_work_item_per_workgroup; - - // Maximum base radix for local memory fft ... this controls the maximum register + + // Maximum base radix for local memory fft ... this controls the maximum register // space used by work items. Currently defaults to 16 size_t max_radix; - + // Device depended parameter that tells how many work-items need to be read consecutive - // values to make sure global memory access by work-items of a work-group result in + // values to make sure global memory access by work-items of a work-group result in // coalesced memory access to utilize full bandwidth e.g. on NVidia tesla, this is 16 size_t min_mem_coalesce_width; - - // Number of local memory banks. This is used to geneate kernel with local memory + + // Number of local memory banks. This is used to geneate kernel with local memory // transposes with appropriate padding to avoid bank conflicts to local memory // e.g. on NVidia it is 16. size_t num_local_mem_banks; @@ -160,4 +160,4 @@ typedef struct void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir); -#endif +#endif diff --git a/src/fft_kernelstring.cpp b/src/fft_kernelstring.cpp index ad9892decc71ef8124724584b70020e3ab6986f6..53bf94afb5cfa96ffbf70a46c6ee6cccd8c73906 100644 --- a/src/fft_kernelstring.cpp +++ b/src/fft_kernelstring.cpp @@ -62,7 +62,7 @@ using namespace std; #define max(A,B) ((A) > (B) ? (A) : (B)) #define min(A,B) ((A) < (B) ? (A) : (B)) -static string +static string num2str(int num) { char temp[200]; @@ -70,36 +70,36 @@ num2str(int num) return string(temp); } -// For any n, this function decomposes n into factors for loacal memory tranpose +// For any n, this function decomposes n into factors for loacal memory tranpose // based fft. Factors (radices) are sorted such that the first one (radixArray[0]) // is the largest. This base radix determines the number of registers used by each // work item and product of remaining radices determine the size of work group needed. // To make things concrete with and example, suppose n = 1024. It is decomposed into -// 1024 = 16 x 16 x 4. Hence kernel uses float2 a[16], for local in-register fft and +// 1024 = 16 x 16 x 4. Hence kernel uses float2 a[16], for local in-register fft and // needs 16 x 4 = 64 work items per work group. So kernel first performance 64 length -// 16 ffts (64 work items working in parallel) following by transpose using local +// 16 ffts (64 work items working in parallel) following by transpose using local // memory followed by again 64 length 16 ffts followed by transpose using local memory -// followed by 256 length 4 ffts. For the last step since with size of work group is +// followed by 256 length 4 ffts. For the last step since with size of work group is // 64 and each work item can array for 16 values, 64 work items can compute 256 length -// 4 ffts by each work item computing 4 length 4 ffts. +// 4 ffts by each work item computing 4 length 4 ffts. // Similarly for n = 2048 = 8 x 8 x 8 x 4, each work group has 8 x 8 x 4 = 256 work // iterms which each computes 256 (in-parallel) length 8 ffts in-register, followed // by transpose using local memory, followed by 256 length 8 in-register ffts, followed // by transpose using local memory, followed by 256 length 8 in-register ffts, followed // by transpose using local memory, followed by 512 length 4 in-register ffts. Again, // for the last step, each work item computes two length 4 in-register ffts and thus -// 256 work items are needed to compute all 512 ffts. -// For n = 32 = 8 x 4, 4 work items first compute 4 in-register +// 256 work items are needed to compute all 512 ffts. +// For n = 32 = 8 x 4, 4 work items first compute 4 in-register // lenth 8 ffts, followed by transpose using local memory followed by 8 in-register // length 4 ffts, where each work item computes two length 4 ffts thus 4 work items -// can compute 8 length 4 ffts. However if work group size of say 64 is choosen, -// each work group can compute 64/ 4 = 16 size 32 ffts (batched transform). +// can compute 8 length 4 ffts. However if work group size of say 64 is choosen, +// each work group can compute 64/ 4 = 16 size 32 ffts (batched transform). // Users can play with these parameters to figure what gives best performance on // their particular device i.e. some device have less register space thus using -// smaller base radix can avoid spilling ... some has small local memory thus +// smaller base radix can avoid spilling ... some has small local memory thus // using smaller work group size may be required etc -static void +static void getRadixArray(unsigned int n, unsigned int *radixArray, unsigned int *numRadices, unsigned int maxRadix) { if(maxRadix > 1) @@ -116,57 +116,57 @@ getRadixArray(unsigned int n, unsigned int *radixArray, unsigned int *numRadices return; } - switch(n) + switch(n) { case 2: *numRadices = 1; radixArray[0] = 2; break; - + case 4: *numRadices = 1; radixArray[0] = 4; break; - + case 8: *numRadices = 1; radixArray[0] = 8; break; - + case 16: *numRadices = 2; - radixArray[0] = 8; radixArray[1] = 2; + radixArray[0] = 8; radixArray[1] = 2; break; - + case 32: *numRadices = 2; radixArray[0] = 8; radixArray[1] = 4; break; - + case 64: *numRadices = 2; radixArray[0] = 8; radixArray[1] = 8; break; - + case 128: *numRadices = 3; radixArray[0] = 8; radixArray[1] = 4; radixArray[2] = 4; break; - + case 256: *numRadices = 4; radixArray[0] = 4; radixArray[1] = 4; radixArray[2] = 4; radixArray[3] = 4; break; - + case 512: *numRadices = 3; radixArray[0] = 8; radixArray[1] = 8; radixArray[2] = 8; - break; - + break; + case 1024: *numRadices = 3; radixArray[0] = 16; radixArray[1] = 16; radixArray[2] = 4; - break; + break; case 2048: *numRadices = 4; radixArray[0] = 8; radixArray[1] = 8; radixArray[2] = 8; radixArray[3] = 4; @@ -180,13 +180,13 @@ getRadixArray(unsigned int n, unsigned int *radixArray, unsigned int *numRadices static void insertHeader(string &kernelString, string &kernelName, clFFT_DataFormat dataFormat) { - if(dataFormat == clFFT_SplitComplexFormat) + if(dataFormat == clFFT_SplitComplexFormat) kernelString += string("__kernel void ") + kernelName + string("(__global float *in_real, __global float *in_imag, __global float *out_real, __global float *out_imag, int dir, int S)\n"); - else + else kernelString += string("__kernel void ") + kernelName + string("(__global float2 *in, __global float2 *out, int dir, int S)\n"); } -static void +static void insertVariables(string &kStream, int maxRadix) { kStream += string(" int i, j, r, indexIn, indexOut, index, tid, bNum, xNum, k, l;\n"); @@ -230,12 +230,12 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF int groupSize = numWorkItemsPerXForm * numXFormsPerWG; int i, j; int lMemSize = 0; - + if(numXFormsPerWG > 1) kernelString += string(" s = S & ") + num2str(numXFormsPerWG - 1) + string(";\n"); - + if(numWorkItemsPerXForm >= mem_coalesce_width) - { + { if(numXFormsPerWG > 1) { kernelString += string(" ii = lId & ") + num2str(numWorkItemsPerXForm-1) + string(";\n"); @@ -283,7 +283,7 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF { int numInnerIter = N / mem_coalesce_width; int numOuterIter = numXFormsPerWG / ( groupSize / mem_coalesce_width ); - + kernelString += string(" ii = lId & ") + num2str(mem_coalesce_width - 1) + string(";\n"); kernelString += string(" jj = lId >> ") + num2str((int)log2(mem_coalesce_width)) + string(";\n"); kernelString += string(" lMemStore = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); @@ -301,60 +301,60 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF kernelString += string(" out_real += offset;\n"); kernelString += string(" out_imag += offset;\n"); } - + kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n"); for(i = 0; i < numOuterIter; i++ ) { kernelString += string(" if( jj < s ) {\n"); - for(j = 0; j < numInnerIter; j++ ) + for(j = 0; j < numInnerIter; j++ ) formattedLoad(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * N, dataFormat); - kernelString += string(" }\n"); + kernelString += string(" }\n"); if(i != numOuterIter - 1) - kernelString += string(" jj += ") + num2str(groupSize / mem_coalesce_width) + string(";\n"); + kernelString += string(" jj += ") + num2str(groupSize / mem_coalesce_width) + string(";\n"); } kernelString += string("}\n "); kernelString += string("else {\n"); for(i = 0; i < numOuterIter; i++ ) { - for(j = 0; j < numInnerIter; j++ ) - formattedLoad(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * N, dataFormat); - } + for(j = 0; j < numInnerIter; j++ ) + formattedLoad(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * N, dataFormat); + } kernelString += string("}\n"); - + kernelString += string(" ii = lId & ") + num2str(numWorkItemsPerXForm - 1) + string(";\n"); kernelString += string(" jj = lId >> ") + num2str(log2NumWorkItemsPerXForm) + string(";\n"); - kernelString += string(" lMemLoad = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii);\n"); - + kernelString += string(" lMemLoad = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii);\n"); + for( i = 0; i < numOuterIter; i++ ) { for( j = 0; j < numInnerIter; j++ ) - { - kernelString += string(" lMemStore[") + num2str(j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * (N + numWorkItemsPerXForm )) + string("] = a[") + + { + kernelString += string(" lMemStore[") + num2str(j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * (N + numWorkItemsPerXForm )) + string("] = a[") + num2str(i * numInnerIter + j) + string("].x;\n"); } - } + } kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); - + for( i = 0; i < R0; i++ ) - kernelString += string(" a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n"); - kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + kernelString += string(" a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n"); + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); for( i = 0; i < numOuterIter; i++ ) { for( j = 0; j < numInnerIter; j++ ) - { - kernelString += string(" lMemStore[") + num2str(j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * (N + numWorkItemsPerXForm )) + string("] = a[") + + { + kernelString += string(" lMemStore[") + num2str(j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * (N + numWorkItemsPerXForm )) + string("] = a[") + num2str(i * numInnerIter + j) + string("].y;\n"); } - } + } kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); - + for( i = 0; i < R0; i++ ) - kernelString += string(" a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n"); - kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); - + kernelString += string(" a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n"); + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG; - } + } else { kernelString += string(" offset = mad24( groupId, ") + num2str(N * numXFormsPerWG) + string(", lId );\n"); @@ -370,11 +370,11 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF kernelString += string(" out_real += offset;\n"); kernelString += string(" out_imag += offset;\n"); } - + kernelString += string(" ii = lId & ") + num2str(N-1) + string(";\n"); kernelString += string(" jj = lId >> ") + num2str((int)log2(N)) + string(";\n"); kernelString += string(" lMemStore = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); - + kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n"); for( i = 0; i < R0; i++ ) { @@ -388,42 +388,42 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF for( i = 0; i < R0; i++ ) { formattedLoad(kernelString, i, i*groupSize, dataFormat); - } + } kernelString += string("}\n"); - + if(numWorkItemsPerXForm > 1) { kernelString += string(" ii = lId & ") + num2str(numWorkItemsPerXForm - 1) + string(";\n"); kernelString += string(" jj = lId >> ") + num2str(log2NumWorkItemsPerXForm) + string(";\n"); - kernelString += string(" lMemLoad = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); + kernelString += string(" lMemLoad = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); } - else + else { kernelString += string(" ii = 0;\n"); kernelString += string(" jj = lId;\n"); - kernelString += string(" lMemLoad = sMem + mul24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(");\n"); + kernelString += string(" lMemLoad = sMem + mul24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(");\n"); } - + for( i = 0; i < R0; i++ ) - kernelString += string(" lMemStore[") + num2str(i * ( groupSize / N ) * ( N + numWorkItemsPerXForm )) + string("] = a[") + num2str(i) + string("].x;\n"); - kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); - + kernelString += string(" lMemStore[") + num2str(i * ( groupSize / N ) * ( N + numWorkItemsPerXForm )) + string("] = a[") + num2str(i) + string("].x;\n"); + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + for( i = 0; i < R0; i++ ) kernelString += string(" a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n"); kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); - + for( i = 0; i < R0; i++ ) - kernelString += string(" lMemStore[") + num2str(i * ( groupSize / N ) * ( N + numWorkItemsPerXForm )) + string("] = a[") + num2str(i) + string("].y;\n"); - kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); - + kernelString += string(" lMemStore[") + num2str(i * ( groupSize / N ) * ( N + numWorkItemsPerXForm )) + string("] = a[") + num2str(i) + string("].y;\n"); + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + for( i = 0; i < R0; i++ ) kernelString += string(" a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n"); kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); - + lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG; } - + return lMemSize; } @@ -435,15 +435,15 @@ insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr int lMemSize = 0; int numIter = maxRadix / Nr; string indent = string(""); - + if( numWorkItemsPerXForm >= mem_coalesce_width ) - { + { if(numXFormsPerWG > 1) { kernelString += string(" if( !s || (groupId < get_num_groups(0)-1) || (jj < s) ) {\n"); indent = string(" "); - } - for(i = 0; i < maxRadix; i++) + } + for(i = 0; i < maxRadix; i++) { j = i % numIter; k = i / numIter; @@ -457,95 +457,95 @@ insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr { int numInnerIter = N / mem_coalesce_width; int numOuterIter = numXFormsPerWG / ( groupSize / mem_coalesce_width ); - - kernelString += string(" lMemLoad = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); + + kernelString += string(" lMemLoad = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); kernelString += string(" ii = lId & ") + num2str(mem_coalesce_width - 1) + string(";\n"); kernelString += string(" jj = lId >> ") + num2str((int)log2(mem_coalesce_width)) + string(";\n"); kernelString += string(" lMemStore = sMem + mad24( jj,") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); - + for( i = 0; i < maxRadix; i++ ) { j = i % numIter; k = i / numIter; ind = j * Nr + k; - kernelString += string(" lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].x;\n"); - } - kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); - + kernelString += string(" lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].x;\n"); + } + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + for( i = 0; i < numOuterIter; i++ ) for( j = 0; j < numInnerIter; j++ ) kernelString += string(" a[") + num2str(i*numInnerIter + j) + string("].x = lMemStore[") + num2str(j*mem_coalesce_width + i*( groupSize / mem_coalesce_width )*(N + numWorkItemsPerXForm)) + string("];\n"); kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); - + for( i = 0; i < maxRadix; i++ ) { j = i % numIter; k = i / numIter; ind = j * Nr + k; - kernelString += string(" lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].y;\n"); - } - kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); - + kernelString += string(" lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].y;\n"); + } + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + for( i = 0; i < numOuterIter; i++ ) for( j = 0; j < numInnerIter; j++ ) kernelString += string(" a[") + num2str(i*numInnerIter + j) + string("].y = lMemStore[") + num2str(j*mem_coalesce_width + i*( groupSize / mem_coalesce_width )*(N + numWorkItemsPerXForm)) + string("];\n"); - kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); - + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n"); for(i = 0; i < numOuterIter; i++ ) { kernelString += string(" if( jj < s ) {\n"); - for(j = 0; j < numInnerIter; j++ ) - formattedStore(kernelString, i*numInnerIter + j, j*mem_coalesce_width + i*(groupSize/mem_coalesce_width)*N, dataFormat); - kernelString += string(" }\n"); + for(j = 0; j < numInnerIter; j++ ) + formattedStore(kernelString, i*numInnerIter + j, j*mem_coalesce_width + i*(groupSize/mem_coalesce_width)*N, dataFormat); + kernelString += string(" }\n"); if(i != numOuterIter - 1) - kernelString += string(" jj += ") + num2str(groupSize / mem_coalesce_width) + string(";\n"); + kernelString += string(" jj += ") + num2str(groupSize / mem_coalesce_width) + string(";\n"); } kernelString += string("}\n"); kernelString += string("else {\n"); for(i = 0; i < numOuterIter; i++ ) { - for(j = 0; j < numInnerIter; j++ ) - formattedStore(kernelString, i*numInnerIter + j, j*mem_coalesce_width + i*(groupSize/mem_coalesce_width)*N, dataFormat); - } + for(j = 0; j < numInnerIter; j++ ) + formattedStore(kernelString, i*numInnerIter + j, j*mem_coalesce_width + i*(groupSize/mem_coalesce_width)*N, dataFormat); + } kernelString += string("}\n"); - + lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG; - } + } else - { - kernelString += string(" lMemLoad = sMem + mad24( jj,") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); - + { + kernelString += string(" lMemLoad = sMem + mad24( jj,") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); + kernelString += string(" ii = lId & ") + num2str(N - 1) + string(";\n"); kernelString += string(" jj = lId >> ") + num2str((int) log2(N)) + string(";\n"); kernelString += string(" lMemStore = sMem + mad24( jj,") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); - + for( i = 0; i < maxRadix; i++ ) { j = i % numIter; k = i / numIter; ind = j * Nr + k; kernelString += string(" lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].x;\n"); - } + } kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); - + for( i = 0; i < maxRadix; i++ ) - kernelString += string(" a[") + num2str(i) + string("].x = lMemStore[") + num2str(i*( groupSize / N )*( N + numWorkItemsPerXForm )) + string("];\n"); - kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); - + kernelString += string(" a[") + num2str(i) + string("].x = lMemStore[") + num2str(i*( groupSize / N )*( N + numWorkItemsPerXForm )) + string("];\n"); + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + for( i = 0; i < maxRadix; i++ ) { j = i % numIter; k = i / numIter; ind = j * Nr + k; kernelString += string(" lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].y;\n"); - } + } kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); - + for( i = 0; i < maxRadix; i++ ) - kernelString += string(" a[") + num2str(i) + string("].y = lMemStore[") + num2str(i*( groupSize / N )*( N + numWorkItemsPerXForm )) + string("];\n"); - kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); - + kernelString += string(" a[") + num2str(i) + string("].y = lMemStore[") + num2str(i*( groupSize / N )*( N + numWorkItemsPerXForm )) + string("];\n"); + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n"); for( i = 0; i < maxRadix; i++ ) { @@ -554,26 +554,26 @@ insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr kernelString += string(" }\n"); if( i != maxRadix - 1) kernelString += string(" jj +=") + num2str(groupSize / N) + string(";\n"); - } + } kernelString += string("}\n"); kernelString += string("else {\n"); for( i = 0; i < maxRadix; i++ ) { formattedStore(kernelString, i, i*groupSize, dataFormat); - } + } kernelString += string("}\n"); - + lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG; } - + return lMemSize; } -static void +static void insertfftKernel(string &kernelString, int Nr, int numIter) { int i; - for(i = 0; i < numIter; i++) + for(i = 0; i < numIter; i++) { kernelString += string(" fftKernel") + num2str(Nr) + string("(a+") + num2str(i*Nr) + string(", dir);\n"); } @@ -584,8 +584,8 @@ insertTwiddleKernel(string &kernelString, int Nr, int numIter, int Nprev, int le { int z, k; int logNPrev = log2(Nprev); - - for(z = 0; z < numIter; z++) + + for(z = 0; z < numIter; z++) { if(z == 0) { @@ -593,15 +593,15 @@ insertTwiddleKernel(string &kernelString, int Nr, int numIter, int Nprev, int le kernelString += string(" angf = (float) (ii >> ") + num2str(logNPrev) + string(");\n"); else kernelString += string(" angf = (float) ii;\n"); - } + } else { if(Nprev > 1) - kernelString += string(" angf = (float) ((") + num2str(z*numWorkItemsPerXForm) + string(" + ii) >>") + num2str(logNPrev) + string(");\n"); + kernelString += string(" angf = (float) ((") + num2str(z*numWorkItemsPerXForm) + string(" + ii) >>") + num2str(logNPrev) + string(");\n"); else kernelString += string(" angf = (float) (") + num2str(z*numWorkItemsPerXForm) + string(" + ii);\n"); - } - + } + for(k = 1; k < Nr; k++) { int ind = z*Nr + k; //float fac = (float) (2.0 * M_PI * (double) k / (double) len); @@ -625,7 +625,7 @@ getPadding(int numWorkItemsPerXForm, int Nprev, int numWorkItemsReq, int numXFor numColsReq = Nprev * numColsReq; *offset = numColsReq; } - + if(numWorkItemsPerXForm >= numBanks || numXFormsPerWG == 1) *midPad = 0; else { @@ -635,13 +635,13 @@ getPadding(int numWorkItemsPerXForm, int Nprev, int numWorkItemsReq, int numXFor else *midPad = numWorkItemsPerXForm - bankNum; } - + int lMemSize = ( numWorkItemsReq + *offset) * Nr * numXFormsPerWG + *midPad * (numXFormsPerWG - 1); return lMemSize; } -static void +static void insertLocalStores(string &kernelString, int numIter, int Nr, int numWorkItemsPerXForm, int numWorkItemsReq, int offset, string &comp) { int z, k; @@ -655,21 +655,21 @@ insertLocalStores(string &kernelString, int numIter, int Nr, int numWorkItemsPer kernelString += string(" barrier(CLK_LOCAL_MEM_FENCE);\n"); } -static void +static void insertLocalLoads(string &kernelString, int n, int Nr, int Nrn, int Nprev, int Ncurr, int numWorkItemsPerXForm, int numWorkItemsReq, int offset, string &comp) { - int numWorkItemsReqN = n / Nrn; - int interBlockHNum = max( Nprev / numWorkItemsPerXForm, 1 ); - int interBlockHStride = numWorkItemsPerXForm; - int vertWidth = max(numWorkItemsPerXForm / Nprev, 1); - vertWidth = min( vertWidth, Nr); - int vertNum = Nr / vertWidth; - int vertStride = ( n / Nr + offset ) * vertWidth; + int numWorkItemsReqN = n / Nrn; + int interBlockHNum = max( Nprev / numWorkItemsPerXForm, 1 ); + int interBlockHStride = numWorkItemsPerXForm; + int vertWidth = max(numWorkItemsPerXForm / Nprev, 1); + vertWidth = min( vertWidth, Nr); + int vertNum = Nr / vertWidth; + int vertStride = ( n / Nr + offset ) * vertWidth; int iter = max( numWorkItemsReqN / numWorkItemsPerXForm, 1); int intraBlockHStride = (numWorkItemsPerXForm / (Nprev*Nr)) > 1 ? (numWorkItemsPerXForm / (Nprev*Nr)) : 1; intraBlockHStride *= Nprev; - - int stride = numWorkItemsReq / Nrn; + + int stride = numWorkItemsReq / Nrn; int i; for(i = 0; i < iter; i++) { int ii = i / (interBlockHNum * vertNum); @@ -687,50 +687,50 @@ insertLocalLoads(string &kernelString, int n, int Nr, int Nrn, int Nprev, int Nc static void insertLocalLoadIndexArithmatic(string &kernelString, int Nprev, int Nr, int numWorkItemsReq, int numWorkItemsPerXForm, int numXFormsPerWG, int offset, int midPad) -{ +{ int Ncurr = Nprev * Nr; int logNcurr = log2(Ncurr); int logNprev = log2(Nprev); int incr = (numWorkItemsReq + offset) * Nr + midPad; - - if(Ncurr < numWorkItemsPerXForm) + + if(Ncurr < numWorkItemsPerXForm) { if(Nprev == 1) kernelString += string(" j = ii & ") + num2str(Ncurr - 1) + string(";\n"); else kernelString += string(" j = (ii & ") + num2str(Ncurr - 1) + string(") >> ") + num2str(logNprev) + string(";\n"); - - if(Nprev == 1) + + if(Nprev == 1) kernelString += string(" i = ii >> ") + num2str(logNcurr) + string(";\n"); - else - kernelString += string(" i = mad24(ii >> ") + num2str(logNcurr) + string(", ") + num2str(Nprev) + string(", ii & ") + num2str(Nprev - 1) + string(");\n"); - } - else + else + kernelString += string(" i = mad24(ii >> ") + num2str(logNcurr) + string(", ") + num2str(Nprev) + string(", ii & ") + num2str(Nprev - 1) + string(");\n"); + } + else { if(Nprev == 1) kernelString += string(" j = ii;\n"); else kernelString += string(" j = ii >> ") + num2str(logNprev) + string(";\n"); - if(Nprev == 1) - kernelString += string(" i = 0;\n"); - else + if(Nprev == 1) + kernelString += string(" i = 0;\n"); + else kernelString += string(" i = ii & ") + num2str(Nprev - 1) + string(";\n"); } if(numXFormsPerWG > 1) - kernelString += string(" i = mad24(jj, ") + num2str(incr) + string(", i);\n"); + kernelString += string(" i = mad24(jj, ") + num2str(incr) + string(", i);\n"); - kernelString += string(" lMemLoad = sMem + mad24(j, ") + num2str(numWorkItemsReq + offset) + string(", i);\n"); + kernelString += string(" lMemLoad = sMem + mad24(j, ") + num2str(numWorkItemsReq + offset) + string(", i);\n"); } static void insertLocalStoreIndexArithmatic(string &kernelString, int numWorkItemsReq, int numXFormsPerWG, int Nr, int offset, int midPad) { if(numXFormsPerWG == 1) { - kernelString += string(" lMemStore = sMem + ii;\n"); + kernelString += string(" lMemStore = sMem + ii;\n"); } else { - kernelString += string(" lMemStore = sMem + mad24(jj, ") + num2str((numWorkItemsReq + offset)*Nr + midPad) + string(", ii);\n"); + kernelString += string(" lMemStore = sMem + mad24(jj, ") + num2str((numWorkItemsReq + offset)*Nr + midPad) + string(", ii);\n"); } } @@ -740,47 +740,47 @@ createLocalMemfftKernelString(cl_fft_plan *plan) { unsigned int radixArray[10]; unsigned int numRadix; - + unsigned int n = plan->n.x; - + assert(n <= plan->max_work_item_per_workgroup * plan->max_radix && "signal lenght too big for local mem fft\n"); - + getRadixArray(n, radixArray, &numRadix, 0); assert(numRadix > 0 && "no radix array supplied\n"); - + if(n/radixArray[0] > plan->max_work_item_per_workgroup) getRadixArray(n, radixArray, &numRadix, plan->max_radix); assert(radixArray[0] <= plan->max_radix && "max radix choosen is greater than allowed\n"); assert(n/radixArray[0] <= plan->max_work_item_per_workgroup && "required work items per xform greater than maximum work items allowed per work group for local mem fft\n"); - + unsigned int tmpLen = 1; unsigned int i; for(i = 0; i < numRadix; i++) - { + { assert( radixArray[i] && !( (radixArray[i] - 1) & radixArray[i] ) ); tmpLen *= radixArray[i]; } assert(tmpLen == n && "product of radices choosen doesnt match the length of signal\n"); - + int offset, midPad; string localString(""), kernelName(""); - + clFFT_DataFormat dataFormat = plan->format; string *kernelString = plan->kernel_string; - - + + cl_fft_kernel_info **kInfo = &plan->kernel_info; int kCount = 0; - + while(*kInfo) { kInfo = &(*kInfo)->next; kCount++; } - + kernelName = string("fft") + num2str(kCount); - + *kInfo = (cl_fft_kernel_info *) malloc(sizeof(cl_fft_kernel_info)); (*kInfo)->kernel = 0; (*kInfo)->lmem_size = 0; @@ -791,37 +791,37 @@ createLocalMemfftKernelString(cl_fft_plan *plan) (*kInfo)->next = NULL; (*kInfo)->kernel_name = (char *) malloc(sizeof(char)*(kernelName.size()+1)); strcpy((*kInfo)->kernel_name, kernelName.c_str()); - + unsigned int numWorkItemsPerXForm = n / radixArray[0]; - unsigned int numWorkItemsPerWG = numWorkItemsPerXForm <= 64 ? 64 : numWorkItemsPerXForm; + unsigned int numWorkItemsPerWG = numWorkItemsPerXForm <= 64 ? 64 : numWorkItemsPerXForm; assert(numWorkItemsPerWG <= plan->max_work_item_per_workgroup); int numXFormsPerWG = numWorkItemsPerWG / numWorkItemsPerXForm; (*kInfo)->num_workgroups = 1; (*kInfo)->num_xforms_per_workgroup = numXFormsPerWG; (*kInfo)->num_workitems_per_workgroup = numWorkItemsPerWG; - + unsigned int *N = radixArray; unsigned int maxRadix = N[0]; unsigned int lMemSize = 0; - + insertVariables(localString, maxRadix); - + lMemSize = insertGlobalLoadsAndTranspose(localString, n, numWorkItemsPerXForm, numXFormsPerWG, maxRadix, plan->min_mem_coalesce_width, dataFormat); (*kInfo)->lmem_size = (lMemSize > (*kInfo)->lmem_size) ? lMemSize : (*kInfo)->lmem_size; - + string xcomp = string("x"); string ycomp = string("y"); - + unsigned int Nprev = 1; unsigned int len = n; unsigned int r; - for(r = 0; r < numRadix; r++) + for(r = 0; r < numRadix; r++) { int numIter = N[0] / N[r]; int numWorkItemsReq = n / N[r]; int Ncurr = Nprev * N[r]; insertfftKernel(localString, N[r], numIter); - + if(r < (numRadix - 1)) { insertTwiddleKernel(localString, N[r], numIter, Nprev, len, numWorkItemsPerXForm); lMemSize = getPadding(numWorkItemsPerXForm, Nprev, numWorkItemsReq, numXFormsPerWG, N[r], plan->num_local_mem_banks, &offset, &midPad); @@ -836,10 +836,10 @@ createLocalMemfftKernelString(cl_fft_plan *plan) len = len / N[r]; } } - + lMemSize = insertGlobalStoresAndTranspose(localString, n, maxRadix, N[numRadix - 1], numWorkItemsPerXForm, numXFormsPerWG, plan->min_mem_coalesce_width, dataFormat); (*kInfo)->lmem_size = (lMemSize > (*kInfo)->lmem_size) ? lMemSize : (*kInfo)->lmem_size; - + insertHeader(*kernelString, kernelName, dataFormat); *kernelString += string("{\n"); if((*kInfo)->lmem_size) @@ -852,20 +852,20 @@ createLocalMemfftKernelString(cl_fft_plan *plan) // multiple kernel launces is needed. For these sizes, n can be decomposed using // much larger base radices i.e. say n = 262144 = 128 x 64 x 32. Thus three kernel // launches will be needed, first computing 64 x 32, length 128 ffts, second computing -// 128 x 32 length 64 ffts, and finally a kernel computing 128 x 64 length 32 ffts. -// Each of these base radices can futher be divided into factors so that each of these -// base ffts can be computed within one kernel launch using in-register ffts and local -// memory transposes i.e for the first kernel above which computes 64 x 32 ffts on length -// 128, 128 can be decomposed into 128 = 16 x 8 i.e. 8 work items can compute 8 length -// 16 ffts followed by transpose using local memory followed by each of these eight -// work items computing 2 length 8 ffts thus computing 16 length 8 ffts in total. This +// 128 x 32 length 64 ffts, and finally a kernel computing 128 x 64 length 32 ffts. +// Each of these base radices can futher be divided into factors so that each of these +// base ffts can be computed within one kernel launch using in-register ffts and local +// memory transposes i.e for the first kernel above which computes 64 x 32 ffts on length +// 128, 128 can be decomposed into 128 = 16 x 8 i.e. 8 work items can compute 8 length +// 16 ffts followed by transpose using local memory followed by each of these eight +// work items computing 2 length 8 ffts thus computing 16 length 8 ffts in total. This // means only 8 work items are needed for computing one length 128 fft. If we choose // work group size of say 64, we can compute 64/8 = 8 length 128 ffts within one -// work group. Since we need to compute 64 x 32 length 128 ffts in first kernel, this -// means we need to launch 64 x 32 / 8 = 256 work groups with 64 work items in each +// work group. Since we need to compute 64 x 32 length 128 ffts in first kernel, this +// means we need to launch 64 x 32 / 8 = 256 work groups with 64 work items in each // work group where each work group is computing 8 length 128 ffts where each length // 128 fft is computed by 8 work items. Same logic can be applied to other two kernels -// in this example. Users can play with difference base radices and difference +// in this example. Users can play with difference base radices and difference // decompositions of base radices to generates different kernels and see which gives // best performance. Following function is just fixed to use 128 as base radix @@ -873,22 +873,22 @@ void getGlobalRadixInfo(int n, int *radix, int *R1, int *R2, int *numRadices) { int baseRadix = min(n, 128); - + int numR = 0; int N = n; - while(N > baseRadix) + while(N > baseRadix) { N /= baseRadix; numR++; } - + for(int i = 0; i < numR; i++) radix[i] = baseRadix; - + radix[numR] = N; numR++; *numRadices = numR; - + for(int i = 0; i < numR; i++) { int B = radix[i]; @@ -898,8 +898,8 @@ getGlobalRadixInfo(int n, int *radix, int *R1, int *R2, int *numRadices) R2[i] = 1; continue; } - - int r1 = 2; + + int r1 = 2; int r2 = B / r1; while(r2 > r1) { @@ -908,65 +908,65 @@ getGlobalRadixInfo(int n, int *radix, int *R1, int *R2, int *numRadices) } R1[i] = r1; R2[i] = r2; - } + } } static void createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir dir, int vertBS) -{ +{ int i, j, k, t; int radixArr[10] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; int R1Arr[10] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; int R2Arr[10] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; int radix, R1, R2; int numRadices; - + int maxThreadsPerBlock = plan->max_work_item_per_workgroup; int maxArrayLen = plan->max_radix; - int batchSize = plan->min_mem_coalesce_width; + int batchSize = plan->min_mem_coalesce_width; clFFT_DataFormat dataFormat = plan->format; - int vertical = (dir == cl_fft_kernel_x) ? 0 : 1; - + int vertical = (dir == cl_fft_kernel_x) ? 0 : 1; + getGlobalRadixInfo(n, radixArr, R1Arr, R2Arr, &numRadices); - + int numPasses = numRadices; - + string localString(""), kernelName(""); string *kernelString = plan->kernel_string; - cl_fft_kernel_info **kInfo = &plan->kernel_info; + cl_fft_kernel_info **kInfo = &plan->kernel_info; int kCount = 0; - + while(*kInfo) { kInfo = &(*kInfo)->next; kCount++; } - + int N = n; int m = (int)log2(n); int Rinit = vertical ? BS : 1; batchSize = vertical ? min(BS, batchSize) : batchSize; int passNum; - - for(passNum = 0; passNum < numPasses; passNum++) + + for(passNum = 0; passNum < numPasses; passNum++) { - + localString.clear(); kernelName.clear(); - + radix = radixArr[passNum]; R1 = R1Arr[passNum]; R2 = R2Arr[passNum]; - + int strideI = Rinit; for(i = 0; i < numPasses; i++) if(i != passNum) strideI *= radixArr[i]; - + int strideO = Rinit; for(i = 0; i < passNum; i++) strideO *= radixArr[i]; - + int threadsPerXForm = R2; batchSize = R2 == 1 ? plan->max_work_item_per_workgroup : batchSize; batchSize = min(batchSize, strideI); @@ -977,11 +977,11 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir assert(R1*R2 == radix); assert(R1 <= maxArrayLen); assert(threadsPerBlock <= maxThreadsPerBlock); - + int numIter = R1 / R2; int gInInc = threadsPerBlock / batchSize; - - + + int lgStrideO = log2(strideO); int numBlocksPerXForm = strideI / batchSize; int numBlocks = numBlocksPerXForm; @@ -989,7 +989,7 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir numBlocks *= BS; else numBlocks *= vertBS; - + kernelName = string("fft") + num2str(kCount); *kInfo = (cl_fft_kernel_info *) malloc(sizeof(cl_fft_kernel_info)); (*kInfo)->kernel = 0; @@ -1013,10 +1013,10 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir (*kInfo)->next = NULL; (*kInfo)->kernel_name = (char *) malloc(sizeof(char)*(kernelName.size()+1)); strcpy((*kInfo)->kernel_name, kernelName.c_str()); - + insertVariables(localString, R1); - - if(vertical) + + if(vertical) { localString += string("xNum = groupId >> ") + num2str((int)log2(numBlocksPerXForm)) + string(";\n"); localString += string("groupId = groupId & ") + num2str(numBlocksPerXForm - 1) + string(";\n"); @@ -1030,7 +1030,7 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir localString += string("indexOut = mad24(i, ") + num2str(stride) + string(", j + ") + string("(xNum << ") + num2str((int) log2(n*BS)) + string("));\n"); localString += string("bNum = groupId;\n"); } - else + else { int lgNumBlocksPerXForm = log2(numBlocksPerXForm); localString += string("bNum = groupId & ") + num2str(numBlocksPerXForm - 1) + string(";\n"); @@ -1038,95 +1038,95 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir localString += string("indexIn = mul24(bNum, ") + num2str(batchSize) + string(");\n"); localString += string("tid = indexIn;\n"); localString += string("i = tid >> ") + num2str(lgStrideO) + string(";\n"); - localString += string("j = tid & ") + num2str(strideO - 1) + string(";\n"); + localString += string("j = tid & ") + num2str(strideO - 1) + string(";\n"); int stride = radix*Rinit; for(i = 0; i < passNum; i++) stride *= radixArr[i]; - localString += string("indexOut = mad24(i, ") + num2str(stride) + string(", j);\n"); + localString += string("indexOut = mad24(i, ") + num2str(stride) + string(", j);\n"); localString += string("indexIn += (xNum << ") + num2str(m) + string(");\n"); - localString += string("indexOut += (xNum << ") + num2str(m) + string(");\n"); + localString += string("indexOut += (xNum << ") + num2str(m) + string(");\n"); } - + // Load Data int lgBatchSize = log2(batchSize); localString += string("tid = lId;\n"); localString += string("i = tid & ") + num2str(batchSize - 1) + string(";\n"); - localString += string("j = tid >> ") + num2str(lgBatchSize) + string(";\n"); + localString += string("j = tid >> ") + num2str(lgBatchSize) + string(";\n"); localString += string("indexIn += mad24(j, ") + num2str(strideI) + string(", i);\n"); - if(dataFormat == clFFT_SplitComplexFormat) + if(dataFormat == clFFT_SplitComplexFormat) { localString += string("in_real += indexIn;\n"); - localString += string("in_imag += indexIn;\n"); + localString += string("in_imag += indexIn;\n"); for(j = 0; j < R1; j++) localString += string("a[") + num2str(j) + string("].x = in_real[") + num2str(j*gInInc*strideI) + string("];\n"); - for(j = 0; j < R1; j++) + for(j = 0; j < R1; j++) localString += string("a[") + num2str(j) + string("].y = in_imag[") + num2str(j*gInInc*strideI) + string("];\n"); } - else + else { localString += string("in += indexIn;\n"); for(j = 0; j < R1; j++) localString += string("a[") + num2str(j) + string("] = in[") + num2str(j*gInInc*strideI) + string("];\n"); } - - localString += string("fftKernel") + num2str(R1) + string("(a, dir);\n"); - + + localString += string("fftKernel") + num2str(R1) + string("(a, dir);\n"); + if(R2 > 1) { // twiddle - for(k = 1; k < R1; k++) + for(k = 1; k < R1; k++) { localString += string("ang = dir*(2.0f*M_PI*") + num2str(k) + string("/") + num2str(radix) + string(")*j;\n"); localString += string("w = (float2)(native_cos(ang), native_sin(ang));\n"); - localString += string("a[") + num2str(k) + string("] = complexMul(a[") + num2str(k) + string("], w);\n"); + localString += string("a[") + num2str(k) + string("] = complexMul(a[") + num2str(k) + string("], w);\n"); } - + // shuffle - numIter = R1 / R2; + numIter = R1 / R2; localString += string("indexIn = mad24(j, ") + num2str(threadsPerBlock*numIter) + string(", i);\n"); localString += string("lMemStore = sMem + tid;\n"); localString += string("lMemLoad = sMem + indexIn;\n"); - for(k = 0; k < R1; k++) + for(k = 0; k < R1; k++) localString += string("lMemStore[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].x;\n"); - localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); + localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); for(k = 0; k < numIter; k++) for(t = 0; t < R2; t++) localString += string("a[") + num2str(k*R2+t) + string("].x = lMemLoad[") + num2str(t*batchSize + k*threadsPerBlock) + string("];\n"); localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); - for(k = 0; k < R1; k++) + for(k = 0; k < R1; k++) localString += string("lMemStore[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].y;\n"); - localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); + localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); for(k = 0; k < numIter; k++) for(t = 0; t < R2; t++) localString += string("a[") + num2str(k*R2+t) + string("].y = lMemLoad[") + num2str(t*batchSize + k*threadsPerBlock) + string("];\n"); localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); - + for(j = 0; j < numIter; j++) localString += string("fftKernel") + num2str(R2) + string("(a + ") + num2str(j*R2) + string(", dir);\n"); } - + // twiddle - if(passNum < (numPasses - 1)) + if(passNum < (numPasses - 1)) { localString += string("l = ((bNum << ") + num2str(lgBatchSize) + string(") + i) >> ") + num2str(lgStrideO) + string(";\n"); - localString += string("k = j << ") + num2str((int)log2(R1/R2)) + string(";\n"); + localString += string("k = j << ") + num2str((int)log2(R1/R2)) + string(";\n"); localString += string("ang1 = dir*(2.0f*M_PI/") + num2str(N) + string(")*l;\n"); - for(t = 0; t < R1; t++) + for(t = 0; t < R1; t++) { localString += string("ang = ang1*(k + ") + num2str((t%R2)*R1 + (t/R2)) + string(");\n"); localString += string("w = (float2)(native_cos(ang), native_sin(ang));\n"); localString += string("a[") + num2str(t) + string("] = complexMul(a[") + num2str(t) + string("], w);\n"); } } - + // Store Data - if(strideO == 1) + if(strideO == 1) { - + localString += string("lMemStore = sMem + mad24(i, ") + num2str(radix + 1) + string(", j << ") + num2str((int)log2(R1/R2)) + string(");\n"); localString += string("lMemLoad = sMem + mad24(tid >> ") + num2str((int)log2(radix)) + string(", ") + num2str(radix+1) + string(", tid & ") + num2str(radix-1) + string(");\n"); - + for(int i = 0; i < R1/R2; i++) for(int j = 0; j < R2; j++) localString += string("lMemStore[ ") + num2str(i + j*R1) + string("] = a[") + num2str(i*R2+j) + string("].x;\n"); @@ -1145,7 +1145,7 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir localString += string("a[") + num2str(i*innerIter+j) + string("].x = lMemLoad[") + num2str(j*threadsPerBlock + i*(radix+1)) + string("];\n"); } localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); - + for(int i = 0; i < R1/R2; i++) for(int j = 0; j < R2; j++) localString += string("lMemStore[ ") + num2str(i + j*R1) + string("] = a[") + num2str(i*R2+j) + string("].y;\n"); @@ -1164,7 +1164,7 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir localString += string("a[") + num2str(i*innerIter+j) + string("].y = lMemLoad[") + num2str(j*threadsPerBlock + i*(radix+1)) + string("];\n"); } localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); - + localString += string("indexOut += tid;\n"); if(dataFormat == clFFT_SplitComplexFormat) { localString += string("out_real += indexOut;\n"); @@ -1177,16 +1177,16 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir else { localString += string("out += indexOut;\n"); for(k = 0; k < R1; k++) - localString += string("out[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("];\n"); + localString += string("out[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("];\n"); } - + } - else + else { localString += string("indexOut += mad24(j, ") + num2str(numIter*strideO) + string(", i);\n"); if(dataFormat == clFFT_SplitComplexFormat) { localString += string("out_real += indexOut;\n"); - localString += string("out_imag += indexOut;\n"); + localString += string("out_imag += indexOut;\n"); for(k = 0; k < R1; k++) localString += string("out_real[") + num2str(((k%R2)*R1 + (k/R2))*strideO) + string("] = a[") + num2str(k) + string("].x;\n"); for(k = 0; k < R1; k++) @@ -1198,14 +1198,14 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir localString += string("out[") + num2str(((k%R2)*R1 + (k/R2))*strideO) + string("] = a[") + num2str(k) + string("];\n"); } } - + insertHeader(*kernelString, kernelName, dataFormat); *kernelString += string("{\n"); if((*kInfo)->lmem_size) *kernelString += string(" __local float sMem[") + num2str((*kInfo)->lmem_size) + string("];\n"); *kernelString += localString; - *kernelString += string("}\n"); - + *kernelString += string("}\n"); + N /= radix; kInfo = &(*kInfo)->next; kCount++; @@ -1213,10 +1213,10 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir } void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir) -{ +{ unsigned int radixArray[10]; unsigned int numRadix; - + switch(dir) { case cl_fft_kernel_x: @@ -1241,12 +1241,12 @@ void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir) } } break; - + case cl_fft_kernel_y: if(plan->n.y > 1) createGlobalFFTKernelString(plan, plan->n.y, plan->n.x, cl_fft_kernel_y, 1); break; - + case cl_fft_kernel_z: if(plan->n.z > 1) createGlobalFFTKernelString(plan, plan->n.z, plan->n.x*plan->n.y, cl_fft_kernel_z, 1); diff --git a/src/fft_setup.cpp b/src/fft_setup.cpp index ab678a7ce0cc7d6765ed7f53d9653a9ccb179d3c..593184d84dbdf89f13bbae168bb831dec9bb18e0 100644 --- a/src/fft_setup.cpp +++ b/src/fft_setup.cpp @@ -61,38 +61,38 @@ using namespace std; extern void getKernelWorkDimensions(cl_fft_plan *plan, cl_fft_kernel_info *kernelInfo, cl_int *batchSize, size_t *gWorkItems, size_t *lWorkItems); -static void +static void getBlockConfigAndKernelString(cl_fft_plan *plan) { plan->temp_buffer_needed = 0; *plan->kernel_string += baseKernels; - + if(plan->format == clFFT_SplitComplexFormat) *plan->kernel_string += twistKernelPlannar; else *plan->kernel_string += twistKernelInterleaved; - - switch(plan->dim) + + switch(plan->dim) { case clFFT_1D: FFT1D(plan, cl_fft_kernel_x); break; - + case clFFT_2D: - FFT1D(plan, cl_fft_kernel_x); - FFT1D(plan, cl_fft_kernel_y); + FFT1D(plan, cl_fft_kernel_x); + FFT1D(plan, cl_fft_kernel_y); break; - + case clFFT_3D: - FFT1D(plan, cl_fft_kernel_x); - FFT1D(plan, cl_fft_kernel_y); - FFT1D(plan, cl_fft_kernel_z); + FFT1D(plan, cl_fft_kernel_x); + FFT1D(plan, cl_fft_kernel_y); + FFT1D(plan, cl_fft_kernel_z); break; - + default: return; } - + plan->temp_buffer_needed = 0; cl_fft_kernel_info *kInfo = plan->kernel_info; while(kInfo) @@ -102,7 +102,7 @@ getBlockConfigAndKernelString(cl_fft_plan *plan) } } - + static void deleteKernelInfo(cl_fft_kernel_info *kInfo) { @@ -113,7 +113,7 @@ deleteKernelInfo(cl_fft_kernel_info *kInfo) if(kInfo->kernel) clReleaseKernel(kInfo->kernel); free(kInfo); - } + } } static void @@ -127,14 +127,14 @@ destroy_plan(cl_fft_plan *Plan) deleteKernelInfo(kernel_info); kernel_info = tmp; } - + Plan->kernel_info = NULL; - + if(Plan->kernel_string) { delete Plan->kernel_string; Plan->kernel_string = NULL; - } + } if(Plan->twist_kernel) { clReleaseKernel(Plan->twist_kernel); @@ -145,7 +145,7 @@ destroy_plan(cl_fft_plan *Plan) clReleaseProgram(Plan->program); Plan->program = NULL; } - if(Plan->tempmemobj) + if(Plan->tempmemobj) { clReleaseMemObject(Plan->tempmemobj); Plan->tempmemobj = NULL; @@ -163,25 +163,25 @@ destroy_plan(cl_fft_plan *Plan) } static int -createKernelList(cl_fft_plan *plan) +createKernelList(cl_fft_plan *plan) { cl_program program = plan->program; cl_fft_kernel_info *kernel_info = plan->kernel_info; - + cl_int err; while(kernel_info) { kernel_info->kernel = clCreateKernel(program, kernel_info->kernel_name, &err); if(!kernel_info->kernel || err != CL_SUCCESS) return err; - kernel_info = kernel_info->next; + kernel_info = kernel_info->next; } - + if(plan->format == clFFT_SplitComplexFormat) plan->twist_kernel = clCreateKernel(program, "clFFT_1DTwistSplit", &err); else plan->twist_kernel = clCreateKernel(program, "clFFT_1DTwistInterleaved", &err); - + if(!plan->twist_kernel || err) return err; @@ -189,12 +189,12 @@ createKernelList(cl_fft_plan *plan) } int getMaxKernelWorkGroupSize(cl_fft_plan *plan, unsigned int *max_wg_size, unsigned int num_devices, cl_device_id *devices) -{ +{ int reg_needed = 0; *max_wg_size = INT_MAX; int err; size_t wg_size; - + unsigned int i; for(i = 0; i < num_devices; i++) { @@ -204,19 +204,19 @@ int getMaxKernelWorkGroupSize(cl_fft_plan *plan, unsigned int *max_wg_size, unsi err = clGetKernelWorkGroupInfo(kInfo->kernel, devices[i], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wg_size, NULL); if(err != CL_SUCCESS) return -1; - + if(wg_size < kInfo->num_workitems_per_workgroup) reg_needed |= 1; - + if(*max_wg_size > wg_size) *max_wg_size = wg_size; - + kInfo = kInfo->next; } } - + return reg_needed; -} +} #define ERR_MACRO(err) { \ if( err != CL_SUCCESS) \ @@ -241,24 +241,24 @@ clFFT_CreatePlan(cl_context context, clFFT_Dim3 n, clFFT_Dimension dim, clFFT_Da cl_device_id devices[16]; size_t ret_size; cl_device_type device_type; - + if(!context) ERR_MACRO(CL_INVALID_VALUE); - + isPow2 |= n.x && !( (n.x - 1) & n.x ); isPow2 |= n.y && !( (n.y - 1) & n.y ); isPow2 |= n.z && !( (n.z - 1) & n.z ); - + if(!isPow2) ERR_MACRO(CL_INVALID_VALUE); - + if( (dim == clFFT_1D && (n.y != 1 || n.z != 1)) || (dim == clFFT_2D && n.z != 1) ) ERR_MACRO(CL_INVALID_VALUE); plan = (cl_fft_plan *) malloc(sizeof(cl_fft_plan)); if(!plan) ERR_MACRO(CL_OUT_OF_RESOURCES); - + plan->context = context; clRetainContext(context); plan->n = n; @@ -277,8 +277,8 @@ clFFT_CreatePlan(cl_context context, clFFT_Dim3 n, clFFT_Dimension dim, clFFT_Da plan->max_work_item_per_workgroup = 256; plan->max_radix = 16; plan->min_mem_coalesce_width = 16; - plan->num_local_mem_banks = 16; - + plan->num_local_mem_banks = 16; + patch_kernel_source: plan->kernel_string = new string(""); @@ -286,97 +286,97 @@ patch_kernel_source: ERR_MACRO(CL_OUT_OF_RESOURCES); getBlockConfigAndKernelString(plan); - + const char *source_str = plan->kernel_string->c_str(); plan->program = clCreateProgramWithSource(context, 1, (const char**) &source_str, NULL, &err); ERR_MACRO(err); err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(devices), devices, &ret_size); ERR_MACRO(err); - + num_devices = ret_size / sizeof(cl_device_id); - + for(i = 0; i < num_devices; i++) { err = clGetDeviceInfo(devices[i], CL_DEVICE_TYPE, sizeof(device_type), &device_type, NULL); ERR_MACRO(err); - + if(device_type == CL_DEVICE_TYPE_GPU) - { + { gpu_found = 1; err = clBuildProgram(plan->program, 1, &devices[i], "-cl-mad-enable", NULL, NULL); if (err != CL_SUCCESS) { - char *build_log; + char *build_log; char devicename[200]; size_t log_size; - + err = clGetProgramBuildInfo(plan->program, devices[i], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); ERR_MACRO(err); - + build_log = (char *) malloc(log_size + 1); - + err = clGetProgramBuildInfo(plan->program, devices[i], CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL); ERR_MACRO(err); - + err = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(devicename), devicename, NULL); ERR_MACRO(err); - + fprintf(stdout, "FFT program build log on device %s\n", devicename); fprintf(stdout, "%s\n", build_log); free(build_log); - + ERR_MACRO(err); - } - } + } + } } - + if(!gpu_found) ERR_MACRO(CL_INVALID_CONTEXT); - - err = createKernelList(plan); + + err = createKernelList(plan); ERR_MACRO(err); - + // we created program and kernels based on "some max work group size (default 256)" ... this work group size - // may be larger than what kernel may execute with ... if thats the case we need to regenerate the kernel source - // setting this as limit i.e max group size and rebuild. - unsigned int max_kernel_wg_size; + // may be larger than what kernel may execute with ... if thats the case we need to regenerate the kernel source + // setting this as limit i.e max group size and rebuild. + unsigned int max_kernel_wg_size; int patching_req = getMaxKernelWorkGroupSize(plan, &max_kernel_wg_size, num_devices, devices); if(patching_req == -1) { ERR_MACRO(err); } - + if(patching_req) { destroy_plan(plan); plan->max_work_item_per_workgroup = max_kernel_wg_size; goto patch_kernel_source; } - + cl_fft_kernel_info *kInfo = plan->kernel_info; while(kInfo) { plan->num_kernels++; kInfo = kInfo->next; } - + if(error_code) *error_code = CL_SUCCESS; - + return (clFFT_Plan) plan; } -void +void clFFT_DestroyPlan(clFFT_Plan plan) { cl_fft_plan *Plan = (cl_fft_plan *) plan; - if(Plan) - { - destroy_plan(Plan); + if(Plan) + { + destroy_plan(Plan); clReleaseContext(Plan->context); free(Plan); - } + } } void clFFT_DumpPlan( clFFT_Plan Plan, FILE *file) @@ -385,12 +385,12 @@ void clFFT_DumpPlan( clFFT_Plan Plan, FILE *file) FILE *out; if(!file) out = stdout; - else + else out = file; - + cl_fft_plan *plan = (cl_fft_plan *) Plan; cl_fft_kernel_info *kInfo = plan->kernel_info; - + while(kInfo) { cl_int s = 1;