diff --git a/example/main.cpp b/example/main.cpp index 844673fe550056d83c0fa6ff7338bfc346a89b38..1cc290554f3cfe894b318a208da50734dfcad77f 100644 --- a/example/main.cpp +++ b/example/main.cpp @@ -1,555 +1,883 @@ -#include <string.h> -#include <math.h> -#include <stdio.h> -#include <stdlib.h> -#ifdef __APPLE__ - #include <OpenCL/cl.h> -#else - #include <CL/cl.h> -#endif -#include <clFFT.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <stdint.h> -#include <float.h> - -#define eps_avg 10.0 - -#define MAX( _a, _b) ((_a)>(_b)?(_a) : (_b)) - -typedef enum { - clFFT_OUT_OF_PLACE, - clFFT_IN_PLACE, -}clFFT_TestType; - -typedef struct -{ - double real; - double imag; -}clFFT_ComplexDouble; - -typedef struct -{ - double *real; - double *imag; -}clFFT_SplitComplexDouble; - -cl_device_id device_id; -cl_context context; -cl_command_queue queue; - -// ADDED -void log_error (char* s, ...) {printf ("ERROR: %s\n", s);} -// ADDED -void log_info (char* s, ...) {printf ("INFO: %s\n", s);} -int runTest(clFFT_Dim3 n, int batchSize, clFFT_Direction dir, clFFT_Dimension dim, - clFFT_DataFormat dataFormat, int numIter, clFFT_TestType testType) -{ - cl_int err = CL_SUCCESS; - int iter; - - int length = n.x * n.y * n.z * batchSize; - - clFFT_SplitComplex data_i_split = (clFFT_SplitComplex) { NULL, NULL }; - clFFT_SplitComplex data_cl_split = (clFFT_SplitComplex) { NULL, NULL }; - clFFT_Complex *data_i = NULL; - clFFT_Complex *data_cl = NULL; - clFFT_SplitComplexDouble data_iref = (clFFT_SplitComplexDouble) { NULL, NULL }; - clFFT_SplitComplexDouble data_oref = (clFFT_SplitComplexDouble) { NULL, NULL }; - - clFFT_Plan plan = NULL; - cl_mem data_in = NULL; - cl_mem data_out = NULL; - cl_mem data_in_real = NULL; - cl_mem data_in_imag = NULL; - cl_mem data_out_real = NULL; - cl_mem data_out_imag = NULL; - - if(dataFormat == clFFT_SplitComplexFormat) { - data_i_split.real = (float *) malloc(sizeof(float) * length); - data_i_split.imag = (float *) malloc(sizeof(float) * length); - data_cl_split.real = (float *) malloc(sizeof(float) * length); - data_cl_split.imag = (float *) malloc(sizeof(float) * length); - if(!data_i_split.real || !data_i_split.imag || !data_cl_split.real || !data_cl_split.imag) - { - err = -1; - log_error((char*)"Out-of-Resources\n"); - goto cleanup; - } - } - else { - data_i = (clFFT_Complex *) malloc(sizeof(clFFT_Complex)*length); - data_cl = (clFFT_Complex *) malloc(sizeof(clFFT_Complex)*length); - if(!data_i || !data_cl) - { - err = -2; - log_error((char*)"Out-of-Resouces\n"); - goto cleanup; - } - } - - data_iref.real = (double *) malloc(sizeof(double) * length); - data_iref.imag = (double *) malloc(sizeof(double) * length); - data_oref.real = (double *) malloc(sizeof(double) * length); - data_oref.imag = (double *) malloc(sizeof(double) * length); - if(!data_iref.real || !data_iref.imag || !data_oref.real || !data_oref.imag) - { - err = -3; - log_error((char*)"Out-of-Resources\n"); - goto cleanup; - } - - int i; - if(dataFormat == clFFT_SplitComplexFormat) { - for(i = 0; i < length; i++) - { - data_i_split.real[i] = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f; - data_i_split.imag[i] = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f; - data_cl_split.real[i] = 0.0f; - data_cl_split.imag[i] = 0.0f; - data_iref.real[i] = data_i_split.real[i]; - data_iref.imag[i] = data_i_split.imag[i]; - data_oref.real[i] = data_iref.real[i]; - data_oref.imag[i] = data_iref.imag[i]; - } - } - else { +// +// File: main.cpp +// +// Version: <1.0> +// +// Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple") +// in consideration of your agreement to the following terms, and your use, +// installation, modification or redistribution of this Apple software +// constitutes acceptance of these terms. If you do not agree with these +// terms, please do not use, install, modify or redistribute this Apple +// software. +// +// In consideration of your agreement to abide by the following terms, and +// subject to these terms, Apple grants you a personal, non - exclusive +// license, under Apple's copyrights in this original Apple software ( the +// "Apple Software" ), to use, reproduce, modify and redistribute the Apple +// Software, with or without modifications, in source and / or binary forms; +// provided that if you redistribute the Apple Software in its entirety and +// without modifications, you must retain this notice and the following text +// and disclaimers in all such redistributions of the Apple Software. Neither +// the name, trademarks, service marks or logos of Apple Inc. may be used to +// endorse or promote products derived from the Apple Software without specific +// prior written permission from Apple. Except as expressly stated in this +// notice, no other rights or licenses, express or implied, are granted by +// Apple herein, including but not limited to any patent rights that may be +// infringed by your derivative works or by other works in which the Apple +// Software may be incorporated. +// +// The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO +// WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED +// WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION +// ALONE OR IN COMBINATION WITH YOUR PRODUCTS. +// +// IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR +// CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION +// AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER +// UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR +// OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Copyright ( C ) 2008 Apple Inc. All Rights Reserved. +// +//////////////////////////////////////////////////////////////////////////////////////////////////// -// ADDED -FILE* f = fopen ("test_waveform.dat", "r"); - for(i = 0; i < length; i++) - { -// ADDED -// ADDED -fscanf (f, "%f\n", &data_i[i].real); -data_i[i].imag = 0; - data_cl[i].real = 0.0f; - data_cl[i].imag = 0.0f; - data_iref.real[i] = data_i[i].real; - data_iref.imag[i] = data_i[i].imag; - data_oref.real[i] = data_iref.real[i]; - data_oref.imag[i] = data_iref.imag[i]; - } - } - - plan = clFFT_CreatePlan( context, n, dim, dataFormat, &err ); - if(!plan || err) - { - log_error((char*)"clFFT_CreatePlan failed\n"); - goto cleanup; - } - - if(dataFormat == clFFT_SplitComplexFormat) - { - data_in_real = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_i_split.real, &err); - if(!data_in_real || err) - { - log_error((char*)"clCreateBuffer failed\n"); - goto cleanup; - } - - data_in_imag = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_i_split.imag, &err); - if(!data_in_imag || err) - { - log_error((char*)"clCreateBuffer failed\n"); - goto cleanup; - } - - if(testType == clFFT_OUT_OF_PLACE) - { - data_out_real = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_cl_split.real, &err); - if(!data_out_real || err) - { - log_error((char*)"clCreateBuffer failed\n"); - goto cleanup; - } - - data_out_imag = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_cl_split.imag, &err); - if(!data_out_imag || err) - { - log_error((char*)"clCreateBuffer failed\n"); - goto cleanup; - } - } - else - { - data_out_real = data_in_real; - data_out_imag = data_in_imag; - } - } - else - { - data_in = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float)*2, data_i, &err); - if(!data_in) - { - log_error((char*)"clCreateBuffer failed\n"); - goto cleanup; - } - if(testType == clFFT_OUT_OF_PLACE) - { - data_out = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float)*2, data_cl, &err); - if(!data_out) - { - log_error((char*)"clCreateBuffer failed\n"); - goto cleanup; - } - } - else - data_out = data_in; - } - - err = CL_SUCCESS; - if(dataFormat == clFFT_SplitComplexFormat) - { - for(iter = 0; iter < numIter; iter++) - err |= clFFT_ExecutePlannar(queue, plan, batchSize, dir, data_in_real, data_in_imag, data_out_real, data_out_imag, 0, NULL, NULL); - } - else - { - for(iter = 0; iter < numIter; iter++) - err |= clFFT_ExecuteInterleaved(queue, plan, batchSize, dir, data_in, data_out, 0, NULL, NULL); - } - - err |= clFinish(queue); - - if(err) - { - log_error((char*)"clFFT_Execute\n"); - goto cleanup; - } - if(dataFormat == clFFT_SplitComplexFormat) - { - err |= clEnqueueReadBuffer(queue, data_out_real, CL_TRUE, 0, length*sizeof(float), data_cl_split.real, 0, NULL, NULL); - err |= clEnqueueReadBuffer(queue, data_out_imag, CL_TRUE, 0, length*sizeof(float), data_cl_split.imag, 0, NULL, NULL); - for (int i = 0; i < length; i++) -// ADDED -printf ("%3d %7.3f %7.3f\n", i, data_cl_split.real[i], data_cl_split.imag[i]); - } - else - { - err |= clEnqueueReadBuffer(queue, data_out, CL_TRUE, 0, length*sizeof(float)*2, data_cl, 0, NULL, NULL); +#include <string.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <OpenCL/opencl.h> +#include "clFFT.h" +#include <mach/mach_time.h> +#include <Accelerate/Accelerate.h> +#include "procs.h" +#include <sys/types.h> +#include <sys/stat.h> +#include <stdint.h> +#include <float.h> - for (int i = 0; i < length; i++) -// ADDED -printf ("%3d %7.3f %7.3f\n", i, data_cl[i].real, data_cl[i].imag); - } - - if(err) - { - log_error((char*)"clEnqueueReadBuffer failed\n"); - goto cleanup; - } - -cleanup: - clFFT_DestroyPlan(plan); - if(dataFormat == clFFT_SplitComplexFormat) - { - if(data_i_split.real) - free(data_i_split.real); - if(data_i_split.imag) - free(data_i_split.imag); - if(data_cl_split.real) - free(data_cl_split.real); - if(data_cl_split.imag) - free(data_cl_split.imag); - - if(data_in_real) - clReleaseMemObject(data_in_real); - if(data_in_imag) - clReleaseMemObject(data_in_imag); - if(data_out_real && testType == clFFT_OUT_OF_PLACE) - clReleaseMemObject(data_out_real); - if(data_out_imag && clFFT_OUT_OF_PLACE) - clReleaseMemObject(data_out_imag); - } - else - { - if(data_i) - free(data_i); - if(data_cl) - free(data_cl); - - if(data_in) - clReleaseMemObject(data_in); - if(data_out && testType == clFFT_OUT_OF_PLACE) - clReleaseMemObject(data_out); - } - - if(data_iref.real) - free(data_iref.real); - if(data_iref.imag) - free(data_iref.imag); - if(data_oref.real) - free(data_oref.real); - if(data_oref.imag) - free(data_oref.imag); - - return err; -} - -bool ifLineCommented(const char *line) { - const char *Line = line; - while(*Line != '\0') - if((*Line == '/') && (*(Line + 1) == '/')) - return true; - else - Line++; - return false; -} - -cl_device_type getGlobalDeviceType() -{ - char *force_cpu = getenv( "CL_DEVICE_TYPE" ); - if( force_cpu != NULL ) - { - if( strcmp( force_cpu, "gpu" ) == 0 || strcmp( force_cpu, "CL_DEVICE_TYPE_GPU" ) == 0 ) - return CL_DEVICE_TYPE_GPU; - else if( strcmp( force_cpu, "cpu" ) == 0 || strcmp( force_cpu, "CL_DEVICE_TYPE_CPU" ) == 0 ) - return CL_DEVICE_TYPE_CPU; - else if( strcmp( force_cpu, "accelerator" ) == 0 || strcmp( force_cpu, "CL_DEVICE_TYPE_ACCELERATOR" ) == 0 ) - return CL_DEVICE_TYPE_ACCELERATOR; - else if( strcmp( force_cpu, "CL_DEVICE_TYPE_DEFAULT" ) == 0 ) - return CL_DEVICE_TYPE_DEFAULT; - } - // default - return CL_DEVICE_TYPE_GPU; -} - -void -notify_callback(const char *errinfo, const void *private_info, size_t cb, void *user_data) -{ - printf("ERROR: %s\n", errinfo); -} - -int -checkMemRequirements(clFFT_Dim3 n, int batchSize, clFFT_TestType testType, cl_ulong gMemSize) -{ - cl_ulong memReq = (testType == clFFT_OUT_OF_PLACE) ? 3 : 2; - memReq *= n.x*n.y*n.z*sizeof(clFFT_Complex)*batchSize; - memReq = memReq/1024/1024; - if(memReq >= gMemSize) - return -1; - return 0; -} - -int main (int argc, char * const argv[]) { - cl_ulong gMemSize; - clFFT_Direction dir = clFFT_Forward; - int numIter = 1; - clFFT_Dim3 n = { 1024, 1, 1 }; - int batchSize = 1; - clFFT_DataFormat dataFormat = clFFT_SplitComplexFormat; - clFFT_Dimension dim = clFFT_1D; - clFFT_TestType testType = clFFT_OUT_OF_PLACE; - cl_device_id device_ids[16]; - - FILE *paramFile; - - cl_int err; +#define eps_avg 10.0 - cl_platform_id cpPlatform; - err = clGetPlatformIDs(1, &cpPlatform, 0); +#define MAX( _a, _b) ((_a)>(_b)?(_a) : (_b)) - unsigned int num_devices; - - cl_device_type device_type = getGlobalDeviceType(); - if(device_type != CL_DEVICE_TYPE_GPU) - { - log_info((char*)"Test only supported on DEVICE_TYPE_GPU\n"); - exit(0); - } - - err = clGetDeviceIDs(cpPlatform, device_type, sizeof(device_ids), device_ids, &num_devices); - if(err) - { - log_error((char*)"clGetComputeDevice failed\n"); - return -1; - } - - device_id = NULL; - unsigned int i = 0; +typedef enum { + clFFT_OUT_OF_PLACE, + clFFT_IN_PLACE, +}clFFT_TestType; - if (argc == 3) { - cl_bool available; - err = clGetDeviceInfo(device_ids[atoi(argv[2])], CL_DEVICE_AVAILABLE, sizeof(cl_bool), &available, NULL); - if(err) - { - printf("ERROR: Cannot check device availability of device # %d\n", atoi(argv[2])); - } +typedef struct +{ + double real; + double imag; +}clFFT_ComplexDouble; - if(available) - { - device_id = device_ids[atoi(argv[2])]; - } - else - { - printf("INFO: Device # %d not available for compute\n", atoi(argv[2])); - return -1; - } +typedef struct +{ + double *real; + double *imag; +}clFFT_SplitComplexDouble; + +cl_device_id device_id; +cl_context context; +cl_command_queue queue; + +typedef unsigned long long ulong; + +double subtractTimes( uint64_t endTime, uint64_t startTime ) +{ + uint64_t difference = endTime - startTime; + static double conversion = 0.0; + + if( conversion == 0.0 ) + { + mach_timebase_info_data_t info; + kern_return_t err = mach_timebase_info( &info ); + + //Convert the timebase into seconds + if( err == 0 ) + conversion = 1e-9 * (double) info.numer / (double) info.denom; } - else { - for(i = 0; i < num_devices; i++) - { - cl_bool available; - err = clGetDeviceInfo(device_ids[i], CL_DEVICE_AVAILABLE, sizeof(cl_bool), &available, NULL); - if(err) - { - printf("ERROR: Cannot check device availability of device # %d\n", i); - } - - if(available) - { - device_id = device_ids[i]; - break; - } - else - { - char name[200]; - err = clGetDeviceInfo(device_ids[i], CL_DEVICE_NAME, sizeof(name), name, NULL); - if(err == CL_SUCCESS) - { - printf("INFO: Device %s not available for compute\n", name); - } - else - { - printf("INFO: Device # %d not available for compute\n", i); - } - } - } - if(!device_id) - { - log_error((char*)"None of the devices available for compute ... aborting test\n"); - //test_finish(); - return -1; - } - } + return conversion * (double) difference; +} + +void computeReferenceF(clFFT_SplitComplex *out, clFFT_Dim3 n, + unsigned int batchSize, clFFT_Dimension dim, clFFT_Direction dir) +{ + FFTSetup plan_vdsp; + DSPSplitComplex out_vdsp; + FFTDirection dir_vdsp = dir == clFFT_Forward ? FFT_FORWARD : FFT_INVERSE; + + unsigned int i, j, k; + unsigned int stride; + unsigned int log2Nx = (unsigned int) log2(n.x); + unsigned int log2Ny = (unsigned int) log2(n.y); + unsigned int log2Nz = (unsigned int) log2(n.z); + unsigned int log2N; + + log2N = log2Nx; + log2N = log2N > log2Ny ? log2N : log2Ny; + log2N = log2N > log2Nz ? log2N : log2Nz; + + plan_vdsp = vDSP_create_fftsetup(log2N, 2); + + switch(dim) + { + case clFFT_1D: + + for(i = 0; i < batchSize; i++) + { + stride = i * n.x; + out_vdsp.realp = out->real + stride; + out_vdsp.imagp = out->imag + stride; + + vDSP_fft_zip(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp); + } + break; + + case clFFT_2D: + + for(i = 0; i < batchSize; i++) + { + for(j = 0; j < n.y; j++) + { + stride = j * n.x + i * n.x * n.y; + out_vdsp.realp = out->real + stride; + out_vdsp.imagp = out->imag + stride; + + vDSP_fft_zip(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp); + } + } + for(i = 0; i < batchSize; i++) + { + for(j = 0; j < n.x; j++) + { + stride = j + i * n.x * n.y; + out_vdsp.realp = out->real + stride; + out_vdsp.imagp = out->imag + stride; + + vDSP_fft_zip(plan_vdsp, &out_vdsp, n.x, log2Ny, dir_vdsp); + } + } + break; + + case clFFT_3D: + + for(i = 0; i < batchSize; i++) + { + for(j = 0; j < n.z; j++) + { + for(k = 0; k < n.y; k++) + { + stride = k * n.x + j * n.x * n.y + i * n.x * n.y * n.z; + out_vdsp.realp = out->real + stride; + out_vdsp.imagp = out->imag + stride; + + vDSP_fft_zip(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp); + } + } + } + for(i = 0; i < batchSize; i++) + { + for(j = 0; j < n.z; j++) + { + for(k = 0; k < n.x; k++) + { + stride = k + j * n.x * n.y + i * n.x * n.y * n.z; + out_vdsp.realp = out->real + stride; + out_vdsp.imagp = out->imag + stride; + + vDSP_fft_zip(plan_vdsp, &out_vdsp, n.x, log2Ny, dir_vdsp); + } + } + } + for(i = 0; i < batchSize; i++) + { + for(j = 0; j < n.y; j++) + { + for(k = 0; k < n.x; k++) + { + stride = k + j * n.x + i * n.x * n.y * n.z; + out_vdsp.realp = out->real + stride; + out_vdsp.imagp = out->imag + stride; + + vDSP_fft_zip(plan_vdsp, &out_vdsp, n.x*n.y, log2Nz, dir_vdsp); + } + } + } + break; + } + + vDSP_destroy_fftsetup(plan_vdsp); +} + +void computeReferenceD(clFFT_SplitComplexDouble *out, clFFT_Dim3 n, + unsigned int batchSize, clFFT_Dimension dim, clFFT_Direction dir) +{ + FFTSetupD plan_vdsp; + DSPDoubleSplitComplex out_vdsp; + FFTDirection dir_vdsp = dir == clFFT_Forward ? FFT_FORWARD : FFT_INVERSE; + + unsigned int i, j, k; + unsigned int stride; + unsigned int log2Nx = (int) log2(n.x); + unsigned int log2Ny = (int) log2(n.y); + unsigned int log2Nz = (int) log2(n.z); + unsigned int log2N; + + log2N = log2Nx; + log2N = log2N > log2Ny ? log2N : log2Ny; + log2N = log2N > log2Nz ? log2N : log2Nz; + + plan_vdsp = vDSP_create_fftsetupD(log2N, 2); + + switch(dim) + { + case clFFT_1D: + + for(i = 0; i < batchSize; i++) + { + stride = i * n.x; + out_vdsp.realp = out->real + stride; + out_vdsp.imagp = out->imag + stride; + + vDSP_fft_zipD(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp); + } + break; + + case clFFT_2D: + + for(i = 0; i < batchSize; i++) + { + for(j = 0; j < n.y; j++) + { + stride = j * n.x + i * n.x * n.y; + out_vdsp.realp = out->real + stride; + out_vdsp.imagp = out->imag + stride; + + vDSP_fft_zipD(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp); + } + } + for(i = 0; i < batchSize; i++) + { + for(j = 0; j < n.x; j++) + { + stride = j + i * n.x * n.y; + out_vdsp.realp = out->real + stride; + out_vdsp.imagp = out->imag + stride; + + vDSP_fft_zipD(plan_vdsp, &out_vdsp, n.x, log2Ny, dir_vdsp); + } + } + break; + + case clFFT_3D: + + for(i = 0; i < batchSize; i++) + { + for(j = 0; j < n.z; j++) + { + for(k = 0; k < n.y; k++) + { + stride = k * n.x + j * n.x * n.y + i * n.x * n.y * n.z; + out_vdsp.realp = out->real + stride; + out_vdsp.imagp = out->imag + stride; + + vDSP_fft_zipD(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp); + } + } + } + for(i = 0; i < batchSize; i++) + { + for(j = 0; j < n.z; j++) + { + for(k = 0; k < n.x; k++) + { + stride = k + j * n.x * n.y + i * n.x * n.y * n.z; + out_vdsp.realp = out->real + stride; + out_vdsp.imagp = out->imag + stride; + + vDSP_fft_zipD(plan_vdsp, &out_vdsp, n.x, log2Ny, dir_vdsp); + } + } + } + for(i = 0; i < batchSize; i++) + { + for(j = 0; j < n.y; j++) + { + for(k = 0; k < n.x; k++) + { + stride = k + j * n.x + i * n.x * n.y * n.z; + out_vdsp.realp = out->real + stride; + out_vdsp.imagp = out->imag + stride; + + vDSP_fft_zipD(plan_vdsp, &out_vdsp, n.x*n.y, log2Nz, dir_vdsp); + } + } + } + break; + } + + vDSP_destroy_fftsetupD(plan_vdsp); +} + +double complexNormSq(clFFT_ComplexDouble a) +{ + return (a.real * a.real + a.imag * a.imag); +} + +double computeL2Error(clFFT_SplitComplex *data, clFFT_SplitComplexDouble *data_ref, int n, int batchSize, double *max_diff, double *min_diff) +{ + int i, j; + double avg_norm = 0.0; + *max_diff = 0.0; + *min_diff = 0x1.0p1000; + + for(j = 0; j < batchSize; j++) + { + double norm_ref = 0.0; + double norm = 0.0; + for(i = 0; i < n; i++) + { + int index = j * n + i; + clFFT_ComplexDouble diff = (clFFT_ComplexDouble) { data_ref->real[index] - data->real[index], data_ref->imag[index] - data->imag[index] }; + double norm_tmp = complexNormSq(diff); + norm += norm_tmp; + norm_ref += (data_ref->real[index] * data_ref->real[index] + data_ref->imag[index] * data_ref->imag[index]); + } + double curr_norm = sqrt( norm / norm_ref ) / FLT_EPSILON; + avg_norm += curr_norm; + *max_diff = *max_diff < curr_norm ? curr_norm : *max_diff; + *min_diff = *min_diff > curr_norm ? curr_norm : *min_diff; + } + + return avg_norm / batchSize; +} + +void convertInterleavedToSplit(clFFT_SplitComplex *result_split, clFFT_Complex *data_cl, int length) +{ + int i; + for(i = 0; i < length; i++) { + result_split->real[i] = data_cl[i].real; + result_split->imag[i] = data_cl[i].imag; + } +} + +int runTest(clFFT_Dim3 n, int batchSize, clFFT_Direction dir, clFFT_Dimension dim, + clFFT_DataFormat dataFormat, int numIter, clFFT_TestType testType) +{ + cl_int err = CL_SUCCESS; + int iter; + double t; + + uint64_t t0, t1; + int mx = log2(n.x); + int my = log2(n.y); + int mz = log2(n.z); + + int length = n.x * n.y * n.z * batchSize; + + double gflops = 5e-9 * ((double)mx + (double)my + (double)mz) * (double)n.x * (double)n.y * (double)n.z * (double)batchSize * (double)numIter; + + clFFT_SplitComplex data_i_split = (clFFT_SplitComplex) { NULL, NULL }; + clFFT_SplitComplex data_cl_split = (clFFT_SplitComplex) { NULL, NULL }; + clFFT_Complex *data_i = NULL; + clFFT_Complex *data_cl = NULL; + clFFT_SplitComplexDouble data_iref = (clFFT_SplitComplexDouble) { NULL, NULL }; + clFFT_SplitComplexDouble data_oref = (clFFT_SplitComplexDouble) { NULL, NULL }; + + clFFT_Plan plan = NULL; + cl_mem data_in = NULL; + cl_mem data_out = NULL; + cl_mem data_in_real = NULL; + cl_mem data_in_imag = NULL; + cl_mem data_out_real = NULL; + cl_mem data_out_imag = NULL; + + if(dataFormat == clFFT_SplitComplexFormat) { + data_i_split.real = (float *) malloc(sizeof(float) * length); + data_i_split.imag = (float *) malloc(sizeof(float) * length); + data_cl_split.real = (float *) malloc(sizeof(float) * length); + data_cl_split.imag = (float *) malloc(sizeof(float) * length); + if(!data_i_split.real || !data_i_split.imag || !data_cl_split.real || !data_cl_split.imag) + { + err = -1; + log_error("Out-of-Resources\n"); + goto cleanup; + } + } + else { + data_i = (clFFT_Complex *) malloc(sizeof(clFFT_Complex)*length); + data_cl = (clFFT_Complex *) malloc(sizeof(clFFT_Complex)*length); + if(!data_i || !data_cl) + { + err = -2; + log_error("Out-of-Resouces\n"); + goto cleanup; + } + } + + data_iref.real = (double *) malloc(sizeof(double) * length); + data_iref.imag = (double *) malloc(sizeof(double) * length); + data_oref.real = (double *) malloc(sizeof(double) * length); + data_oref.imag = (double *) malloc(sizeof(double) * length); + if(!data_iref.real || !data_iref.imag || !data_oref.real || !data_oref.imag) + { + err = -3; + log_error("Out-of-Resources\n"); + goto cleanup; + } + + int i; + if(dataFormat == clFFT_SplitComplexFormat) { + for(i = 0; i < length; i++) + { + data_i_split.real[i] = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f; + data_i_split.imag[i] = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f; + data_cl_split.real[i] = 0.0f; + data_cl_split.imag[i] = 0.0f; + data_iref.real[i] = data_i_split.real[i]; + data_iref.imag[i] = data_i_split.imag[i]; + data_oref.real[i] = data_iref.real[i]; + data_oref.imag[i] = data_iref.imag[i]; + } + } + else { + for(i = 0; i < length; i++) + { + data_i[i].real = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f; + data_i[i].imag = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f; + data_cl[i].real = 0.0f; + data_cl[i].imag = 0.0f; + data_iref.real[i] = data_i[i].real; + data_iref.imag[i] = data_i[i].imag; + data_oref.real[i] = data_iref.real[i]; + data_oref.imag[i] = data_iref.imag[i]; + } + } + + plan = clFFT_CreatePlan( context, n, dim, dataFormat, &err ); + if(!plan || err) + { + log_error("clFFT_CreatePlan failed\n"); + goto cleanup; + } + + //clFFT_DumpPlan(plan, stdout); + + if(dataFormat == clFFT_SplitComplexFormat) + { + data_in_real = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_i_split.real, &err); + if(!data_in_real || err) + { + log_error("clCreateBuffer failed\n"); + goto cleanup; + } + + data_in_imag = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_i_split.imag, &err); + if(!data_in_imag || err) + { + log_error("clCreateBuffer failed\n"); + goto cleanup; + } + + if(testType == clFFT_OUT_OF_PLACE) + { + data_out_real = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_cl_split.real, &err); + if(!data_out_real || err) + { + log_error("clCreateBuffer failed\n"); + goto cleanup; + } + + data_out_imag = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_cl_split.imag, &err); + if(!data_out_imag || err) + { + log_error("clCreateBuffer failed\n"); + goto cleanup; + } + } + else + { + data_out_real = data_in_real; + data_out_imag = data_in_imag; + } + } + else + { + data_in = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float)*2, data_i, &err); + if(!data_in) + { + log_error("clCreateBuffer failed\n"); + goto cleanup; + } + if(testType == clFFT_OUT_OF_PLACE) + { + data_out = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float)*2, data_cl, &err); + if(!data_out) + { + log_error("clCreateBuffer failed\n"); + goto cleanup; + } + } + else + data_out = data_in; + } + + + err = CL_SUCCESS; + + t0 = mach_absolute_time(); + if(dataFormat == clFFT_SplitComplexFormat) + { + for(iter = 0; iter < numIter; iter++) + err |= clFFT_ExecutePlannar(queue, plan, batchSize, dir, data_in_real, data_in_imag, data_out_real, data_out_imag, 0, NULL, NULL); + } + else + { + for(iter = 0; iter < numIter; iter++) + err |= clFFT_ExecuteInterleaved(queue, plan, batchSize, dir, data_in, data_out, 0, NULL, NULL); + } + + err |= clFinish(queue); + + if(err) + { + log_error("clFFT_Execute\n"); + goto cleanup; + } + + t1 = mach_absolute_time(); + t = subtractTimes(t1, t0); + char temp[100]; + sprintf(temp, "GFlops achieved for n = (%d, %d, %d), batchsize = %d", n.x, n.y, n.z, batchSize); + log_perf(gflops / (float) t, 1, "GFlops/s", "%s", temp); + + if(dataFormat == clFFT_SplitComplexFormat) + { + err |= clEnqueueReadBuffer(queue, data_out_real, CL_TRUE, 0, length*sizeof(float), data_cl_split.real, 0, NULL, NULL); + err |= clEnqueueReadBuffer(queue, data_out_imag, CL_TRUE, 0, length*sizeof(float), data_cl_split.imag, 0, NULL, NULL); + } + else + { + err |= clEnqueueReadBuffer(queue, data_out, CL_TRUE, 0, length*sizeof(float)*2, data_cl, 0, NULL, NULL); + } + + if(err) + { + log_error("clEnqueueReadBuffer failed\n"); + goto cleanup; + } + + computeReferenceD(&data_oref, n, batchSize, dim, dir); + + double diff_avg, diff_max, diff_min; + if(dataFormat == clFFT_SplitComplexFormat) { + diff_avg = computeL2Error(&data_cl_split, &data_oref, n.x*n.y*n.z, batchSize, &diff_max, &diff_min); + if(diff_avg > eps_avg) + log_error("Test failed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min); + else + log_info("Test passed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min); + } + else { + clFFT_SplitComplex result_split; + result_split.real = (float *) malloc(length*sizeof(float)); + result_split.imag = (float *) malloc(length*sizeof(float)); + convertInterleavedToSplit(&result_split, data_cl, length); + diff_avg = computeL2Error(&result_split, &data_oref, n.x*n.y*n.z, batchSize, &diff_max, &diff_min); + + if(diff_avg > eps_avg) + log_error("Test failed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min); + else + log_info("Test passed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min); + free(result_split.real); + free(result_split.imag); + } + +cleanup: + clFFT_DestroyPlan(plan); + if(dataFormat == clFFT_SplitComplexFormat) + { + if(data_i_split.real) + free(data_i_split.real); + if(data_i_split.imag) + free(data_i_split.imag); + if(data_cl_split.real) + free(data_cl_split.real); + if(data_cl_split.imag) + free(data_cl_split.imag); + + if(data_in_real) + clReleaseMemObject(data_in_real); + if(data_in_imag) + clReleaseMemObject(data_in_imag); + if(data_out_real && testType == clFFT_OUT_OF_PLACE) + clReleaseMemObject(data_out_real); + if(data_out_imag && clFFT_OUT_OF_PLACE) + clReleaseMemObject(data_out_imag); + } + else + { + if(data_i) + free(data_i); + if(data_cl) + free(data_cl); + + if(data_in) + clReleaseMemObject(data_in); + if(data_out && testType == clFFT_OUT_OF_PLACE) + clReleaseMemObject(data_out); + } + + if(data_iref.real) + free(data_iref.real); + if(data_iref.imag) + free(data_iref.imag); + if(data_oref.real) + free(data_oref.real); + if(data_oref.imag) + free(data_oref.imag); + + return err; +} + +bool ifLineCommented(const char *line) { + const char *Line = line; + while(*Line != '\0') + if((*Line == '/') && (*(Line + 1) == '/')) + return true; + else + Line++; + return false; +} + +cl_device_type getGlobalDeviceType() +{ + char *force_cpu = getenv( "CL_DEVICE_TYPE" ); + if( force_cpu != NULL ) + { + if( strcmp( force_cpu, "gpu" ) == 0 || strcmp( force_cpu, "CL_DEVICE_TYPE_GPU" ) == 0 ) + return CL_DEVICE_TYPE_GPU; + else if( strcmp( force_cpu, "cpu" ) == 0 || strcmp( force_cpu, "CL_DEVICE_TYPE_CPU" ) == 0 ) + return CL_DEVICE_TYPE_CPU; + else if( strcmp( force_cpu, "accelerator" ) == 0 || strcmp( force_cpu, "CL_DEVICE_TYPE_ACCELERATOR" ) == 0 ) + return CL_DEVICE_TYPE_ACCELERATOR; + else if( strcmp( force_cpu, "CL_DEVICE_TYPE_DEFAULT" ) == 0 ) + return CL_DEVICE_TYPE_DEFAULT; + } + // default + return CL_DEVICE_TYPE_GPU; +} + +void +notify_callback(const char *errinfo, const void *private_info, size_t cb, void *user_data) +{ + log_error( "%s\n", errinfo ); +} - char name[200]; - err = clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(name), name, NULL); - if(err == CL_SUCCESS) - { - printf("INFO: Using device %s...\n", name); - } - else - { - printf("INFO: Using device # %d...\n", i); - } - - context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); - if(!context || err) - { - log_error((char*)"clCreateContext failed\n"); - //test_finish(); - return -1; - } +int +checkMemRequirements(clFFT_Dim3 n, int batchSize, clFFT_TestType testType, cl_ulong gMemSize) +{ + cl_ulong memReq = (testType == clFFT_OUT_OF_PLACE) ? 3 : 2; + memReq *= n.x*n.y*n.z*sizeof(clFFT_Complex)*batchSize; + memReq = memReq/1024/1024; + if(memReq >= gMemSize) + return -1; + return 0; +} - queue = clCreateCommandQueue(context, device_id, 0, &err); - if(!queue || err) - { - log_error((char*)"clCreateCommandQueue() failed.\n"); - clReleaseContext(context); - //test_finish(); - return -1; - } - - err = clGetDeviceInfo(device_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &gMemSize, NULL); - if(err) - { - log_error((char*)"Failed to get global mem size\n"); - clReleaseContext(context); - clReleaseCommandQueue(queue); - //test_finish(); - return -2; - } - - gMemSize /= (1024*1024); - - char delim[] = " \n"; - char tmpStr[100]; - char line[200]; - char *param, *val; - int total_errors = 0; - if(argc == 1) { - log_error((char*)"Need file name with list of parameters to run the test\n"); - //test_finish(); - return -1; - } - - if(argc >= 2) { // arguments are supplied in a file with arguments for a single run are all on the same line - paramFile = fopen(argv[1], "r"); - if(!paramFile) { - log_error((char*)"Cannot open the parameter file\n"); - clReleaseContext(context); - clReleaseCommandQueue(queue); - //test_finish(); - return -3; - } - while(fgets(line, 199, paramFile)) { - if(!strcmp(line, "") || !strcmp(line, "\n") || ifLineCommented(line)) - continue; - param = strtok(line, delim); - while(param) { - val = strtok(NULL, delim); - if(!strcmp(param, "-n")) { - sscanf(val, "%d", &n.x); - val = strtok(NULL, delim); - sscanf(val, "%d", &n.y); - val = strtok(NULL, delim); - sscanf(val, "%d", &n.z); - } - else if(!strcmp(param, "-batchsize")) - sscanf(val, "%d", &batchSize); - else if(!strcmp(param, "-dir")) { - sscanf(val, "%s", tmpStr); - if(!strcmp(tmpStr, "forward")) - dir = clFFT_Forward; - else if(!strcmp(tmpStr, "inverse")) - dir = clFFT_Inverse; - } - else if(!strcmp(param, "-dim")) { - sscanf(val, "%s", tmpStr); - if(!strcmp(tmpStr, "1D")) - dim = clFFT_1D; - else if(!strcmp(tmpStr, "2D")) - dim = clFFT_2D; - else if(!strcmp(tmpStr, "3D")) - dim = clFFT_3D; - } - else if(!strcmp(param, "-format")) { - sscanf(val, "%s", tmpStr); - if(!strcmp(tmpStr, "plannar")) - dataFormat = clFFT_SplitComplexFormat; - else if(!strcmp(tmpStr, "interleaved")) - dataFormat = clFFT_InterleavedComplexFormat; - } - else if(!strcmp(param, "-numiter")) - sscanf(val, "%d", &numIter); - else if(!strcmp(param, "-testtype")) { - sscanf(val, "%s", tmpStr); - if(!strcmp(tmpStr, "out-of-place")) - testType = clFFT_OUT_OF_PLACE; - else if(!strcmp(tmpStr, "in-place")) - testType = clFFT_IN_PLACE; - } - param = strtok(NULL, delim); - } - - if(checkMemRequirements(n, batchSize, testType, gMemSize)) { - log_info((char*)"This test cannot run because memory requirements canot be met by the available device\n"); - continue; - } - - err = runTest(n, batchSize, dir, dim, dataFormat, numIter, testType); - if (err) - total_errors++; - } - } - - clReleaseContext(context); - clReleaseCommandQueue(queue); - - return total_errors; -} +int main (int argc, char * const argv[]) { + + test_start(); + + cl_ulong gMemSize; + clFFT_Direction dir = clFFT_Forward; + int numIter = 1; + clFFT_Dim3 n = { 1024, 1, 1 }; + int batchSize = 1; + clFFT_DataFormat dataFormat = clFFT_SplitComplexFormat; + clFFT_Dimension dim = clFFT_1D; + clFFT_TestType testType = clFFT_OUT_OF_PLACE; + cl_device_id device_ids[16]; + + FILE *paramFile; + + cl_int err; + unsigned int num_devices; + + cl_device_type device_type = getGlobalDeviceType(); + if(device_type != CL_DEVICE_TYPE_GPU) + { + log_info("Test only supported on DEVICE_TYPE_GPU\n"); + test_finish(); + exit(0); + } + + err = clGetDeviceIDs(NULL, device_type, sizeof(device_ids), device_ids, &num_devices); + if(err) + { + log_error("clGetComputeDevice failed\n"); + test_finish(); + return -1; + } + + device_id = NULL; +/* + unsigned int i; + for(i = 0; i < num_devices; i++) + { + cl_bool available; + err = clGetDeviceInfo(device_ids[i], CL_DEVICE_AVAILABLE, sizeof(cl_bool), &available, NULL); + if(err) + { + log_error("Cannot check device availability of device # %d\n", i); + } + + if(available) + { + device_id = device_ids[i]; + break; + } + else + { + char name[200]; + err = clGetDeviceInfo(device_ids[i], CL_DEVICE_NAME, sizeof(name), name, NULL); + if(err == CL_SUCCESS) + { + log_info("Device %s not available for compute\n", name); + } + else + { + log_info("Device # %d not available for compute\n", i); + } + } + } + + if(!device_id) + { + log_error("None of the devices available for compute ... aborting test\n"); + test_finish(); + return -1; + } +*/ +device_id = device_ids[1]; + context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); + if(!context || err) + { + log_error("clCreateContext failed\n"); + test_finish(); + return -1; + } + + queue = clCreateCommandQueue(context, device_id, 0, &err); + if(!queue || err) + { + log_error("clCreateCommandQueue() failed.\n"); + clReleaseContext(context); + test_finish(); + return -1; + } + + err = clGetDeviceInfo(device_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &gMemSize, NULL); + if(err) + { + log_error("Failed to get global mem size\n"); + clReleaseContext(context); + clReleaseCommandQueue(queue); + test_finish(); + return -2; + } + + gMemSize /= (1024*1024); + + char delim[] = " \n"; + char tmpStr[100]; + char line[200]; + char *param, *val; + int total_errors = 0; + if(argc == 1) { + log_error("Need file name with list of parameters to run the test\n"); + test_finish(); + return -1; + } + + if(argc == 2) { // arguments are supplied in a file with arguments for a single run are all on the same line + paramFile = fopen(argv[1], "r"); + if(!paramFile) { + log_error("Cannot open the parameter file\n"); + clReleaseContext(context); + clReleaseCommandQueue(queue); + test_finish(); + return -3; + } + while(fgets(line, 199, paramFile)) { + if(!strcmp(line, "") || !strcmp(line, "\n") || ifLineCommented(line)) + continue; + param = strtok(line, delim); + while(param) { + val = strtok(NULL, delim); + if(!strcmp(param, "-n")) { + sscanf(val, "%d", &n.x); + val = strtok(NULL, delim); + sscanf(val, "%d", &n.y); + val = strtok(NULL, delim); + sscanf(val, "%d", &n.z); + } + else if(!strcmp(param, "-batchsize")) + sscanf(val, "%d", &batchSize); + else if(!strcmp(param, "-dir")) { + sscanf(val, "%s", tmpStr); + if(!strcmp(tmpStr, "forward")) + dir = clFFT_Forward; + else if(!strcmp(tmpStr, "inverse")) + dir = clFFT_Inverse; + } + else if(!strcmp(param, "-dim")) { + sscanf(val, "%s", tmpStr); + if(!strcmp(tmpStr, "1D")) + dim = clFFT_1D; + else if(!strcmp(tmpStr, "2D")) + dim = clFFT_2D; + else if(!strcmp(tmpStr, "3D")) + dim = clFFT_3D; + } + else if(!strcmp(param, "-format")) { + sscanf(val, "%s", tmpStr); + if(!strcmp(tmpStr, "plannar")) + dataFormat = clFFT_SplitComplexFormat; + else if(!strcmp(tmpStr, "interleaved")) + dataFormat = clFFT_InterleavedComplexFormat; + } + else if(!strcmp(param, "-numiter")) + sscanf(val, "%d", &numIter); + else if(!strcmp(param, "-testtype")) { + sscanf(val, "%s", tmpStr); + if(!strcmp(tmpStr, "out-of-place")) + testType = clFFT_OUT_OF_PLACE; + else if(!strcmp(tmpStr, "in-place")) + testType = clFFT_IN_PLACE; + } + param = strtok(NULL, delim); + } + + if(checkMemRequirements(n, batchSize, testType, gMemSize)) { + log_info("This test cannot run because memory requirements canot be met by the available device\n"); + continue; + } + + err = runTest(n, batchSize, dir, dim, dataFormat, numIter, testType); + if (err) + total_errors++; + } + } + + clReleaseContext(context); + clReleaseCommandQueue(queue); + + test_finish(); + return total_errors; +} diff --git a/include/clFFT.h b/include/clFFT.h index 4476635912a7aba7d429d93a905ff97b56f48b3f..2c455939f2ac23b932f94186c1b7a1bd54270997 100644 --- a/include/clFFT.h +++ b/include/clFFT.h @@ -1,86 +1,129 @@ -#ifndef __CLFFT_H -#define __CLFFT_H -#ifdef __APPLE__ - #include <OpenCL/cl.h> -#else - #include <CL/cl.h> +// +// File: clFFT.h +// +// Version: <1.0> +// +// Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple") +// in consideration of your agreement to the following terms, and your use, +// installation, modification or redistribution of this Apple software +// constitutes acceptance of these terms. If you do not agree with these +// terms, please do not use, install, modify or redistribute this Apple +// software. +// +// In consideration of your agreement to abide by the following terms, and +// subject to these terms, Apple grants you a personal, non - exclusive +// license, under Apple's copyrights in this original Apple software ( the +// "Apple Software" ), to use, reproduce, modify and redistribute the Apple +// Software, with or without modifications, in source and / or binary forms; +// provided that if you redistribute the Apple Software in its entirety and +// without modifications, you must retain this notice and the following text +// and disclaimers in all such redistributions of the Apple Software. Neither +// the name, trademarks, service marks or logos of Apple Inc. may be used to +// endorse or promote products derived from the Apple Software without specific +// prior written permission from Apple. Except as expressly stated in this +// notice, no other rights or licenses, express or implied, are granted by +// Apple herein, including but not limited to any patent rights that may be +// infringed by your derivative works or by other works in which the Apple +// Software may be incorporated. +// +// The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO +// WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED +// WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION +// ALONE OR IN COMBINATION WITH YOUR PRODUCTS. +// +// IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR +// CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION +// AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER +// UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR +// OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Copyright ( C ) 2008 Apple Inc. All Rights Reserved. +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +#ifndef __CLFFT_H +#define __CLFFT_H + +#ifdef __cplusplus +extern "C" { #endif - -#ifdef __cplusplus -extern "C" { -#endif -#include <stdio.h> - -// XForm type -typedef enum -{ - clFFT_Forward = -1, - clFFT_Inverse = 1 - -}clFFT_Direction; - -// XForm dimension -typedef enum -{ - clFFT_1D = 0, - clFFT_2D = 1, - clFFT_3D = 3 - -}clFFT_Dimension; - -// XForm Data type +#include <OpenCL/opencl.h> +#include <stdio.h> + +// XForm type typedef enum -{ - clFFT_SplitComplexFormat = 0, - clFFT_InterleavedComplexFormat = 1 -}clFFT_DataFormat; - -typedef struct -{ - unsigned int x; - unsigned int y; - unsigned int z; -}clFFT_Dim3; - -typedef struct -{ - float *real; - float *imag; -} clFFT_SplitComplex; - -typedef struct -{ - float real; - float imag; -}clFFT_Complex; - -typedef void* clFFT_Plan; - -clFFT_Plan clFFT_CreatePlan( cl_context context, clFFT_Dim3 n, clFFT_Dimension dim, clFFT_DataFormat dataFormat, cl_int *error_code ); - -void clFFT_DestroyPlan( clFFT_Plan plan ); - -cl_int clFFT_ExecuteInterleaved( cl_command_queue queue, clFFT_Plan plan, cl_int batchSize, clFFT_Direction dir, - cl_mem data_in, cl_mem data_out, - cl_int num_events, cl_event *event_list, cl_event *event ); - -cl_int clFFT_ExecutePlannar( cl_command_queue queue, clFFT_Plan plan, cl_int batchSize, clFFT_Direction dir, - cl_mem data_in_real, cl_mem data_in_imag, cl_mem data_out_real, cl_mem data_out_imag, - cl_int num_events, cl_event *event_list, cl_event *event ); - -cl_int clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array, - size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir); - - -cl_int clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real, cl_mem array_imag, - size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir); - -void clFFT_DumpPlan( clFFT_Plan plan, FILE *file); - -#ifdef __cplusplus -} +{ + clFFT_Forward = -1, + clFFT_Inverse = 1 + +}clFFT_Direction; + +// XForm dimension +typedef enum +{ + clFFT_1D = 0, + clFFT_2D = 1, + clFFT_3D = 3 + +}clFFT_Dimension; + +// XForm Data type +typedef enum +{ + clFFT_SplitComplexFormat = 0, + clFFT_InterleavedComplexFormat = 1 +}clFFT_DataFormat; + +typedef struct +{ + unsigned int x; + unsigned int y; + unsigned int z; +}clFFT_Dim3; + +typedef struct +{ + float *real; + float *imag; +} clFFT_SplitComplex; + +typedef struct +{ + float real; + float imag; +}clFFT_Complex; + +typedef void* clFFT_Plan; + +clFFT_Plan clFFT_CreatePlan( cl_context context, clFFT_Dim3 n, clFFT_Dimension dim, clFFT_DataFormat dataFormat, cl_int *error_code ); + +void clFFT_DestroyPlan( clFFT_Plan plan ); + +cl_int clFFT_ExecuteInterleaved( cl_command_queue queue, clFFT_Plan plan, cl_int batchSize, clFFT_Direction dir, + cl_mem data_in, cl_mem data_out, + cl_int num_events, cl_event *event_list, cl_event *event ); + +cl_int clFFT_ExecutePlannar( cl_command_queue queue, clFFT_Plan plan, cl_int batchSize, clFFT_Direction dir, + cl_mem data_in_real, cl_mem data_in_imag, cl_mem data_out_real, cl_mem data_out_imag, + cl_int num_events, cl_event *event_list, cl_event *event ); + +cl_int clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array, + size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir); + + +cl_int clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real, cl_mem array_imag, + size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir); + +void clFFT_DumpPlan( clFFT_Plan plan, FILE *file); + +#ifdef __cplusplus +} +#endif + #endif - -#endif diff --git a/src/fft_base_kernels.h b/src/fft_base_kernels.h index 2481445d4e91120801d7566a3720d9dafbaa33f1..101795697f55e125e4fbafa8757aef5de033fad6 100644 --- a/src/fft_base_kernels.h +++ b/src/fft_base_kernels.h @@ -1,232 +1,277 @@ -#ifndef __CL_FFT_BASE_KERNELS_ -#define __CL_FFT_BASE_KERNELS_ -#include <string.h> +// +// File: fft_base_kernels.h +// +// Version: <1.0> +// +// Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple") +// in consideration of your agreement to the following terms, and your use, +// installation, modification or redistribution of this Apple software +// constitutes acceptance of these terms. If you do not agree with these +// terms, please do not use, install, modify or redistribute this Apple +// software. +// +// In consideration of your agreement to abide by the following terms, and +// subject to these terms, Apple grants you a personal, non - exclusive +// license, under Apple's copyrights in this original Apple software ( the +// "Apple Software" ), to use, reproduce, modify and redistribute the Apple +// Software, with or without modifications, in source and / or binary forms; +// provided that if you redistribute the Apple Software in its entirety and +// without modifications, you must retain this notice and the following text +// and disclaimers in all such redistributions of the Apple Software. Neither +// the name, trademarks, service marks or logos of Apple Inc. may be used to +// endorse or promote products derived from the Apple Software without specific +// prior written permission from Apple. Except as expressly stated in this +// notice, no other rights or licenses, express or implied, are granted by +// Apple herein, including but not limited to any patent rights that may be +// infringed by your derivative works or by other works in which the Apple +// Software may be incorporated. +// +// The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO +// WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED +// WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION +// ALONE OR IN COMBINATION WITH YOUR PRODUCTS. +// +// IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR +// CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION +// AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER +// UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR +// OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Copyright ( C ) 2008 Apple Inc. All Rights Reserved. +// +//////////////////////////////////////////////////////////////////////////////////////////////////// -using namespace std; -static string baseKernels = string -( - "#ifndef M_PI\n" - "#define M_PI 0x1.921fb54442d18p+1\n" - "#endif\n" - "#define complexMul(a,b) ((float2)(mad(-(a).y, (b).y, (a).x * (b).x), mad((a).y, (b).x, (a).x * (b).y)))\n" - "#define conj(a) ((float2)((a).x, -(a).y))\n" - "#define conjTransp(a) ((float2)(-(a).y, (a).x))\n" - "\n" - "#define fftKernel2(a,dir) \\\n" - "{ \\\n" - " float2 c = (a)[0]; \\\n" - " (a)[0] = c + (a)[1]; \\\n" - " (a)[1] = c - (a)[1]; \\\n" - "}\n" - "\n" - "#define fftKernel2S(d1,d2,dir) \\\n" - "{ \\\n" - " float2 c = (d1); \\\n" - " (d1) = c + (d2); \\\n" - " (d2) = c - (d2); \\\n" - "}\n" - "\n" - "#define fftKernel4(a,dir) \\\n" - "{ \\\n" - " fftKernel2S((a)[0], (a)[2], dir); \\\n" - " fftKernel2S((a)[1], (a)[3], dir); \\\n" - " fftKernel2S((a)[0], (a)[1], dir); \\\n" - " (a)[3] = (float2)(dir)*(conjTransp((a)[3])); \\\n" - " fftKernel2S((a)[2], (a)[3], dir); \\\n" - " float2 c = (a)[1]; \\\n" - " (a)[1] = (a)[2]; \\\n" - " (a)[2] = c; \\\n" - "}\n" - "\n" - "#define fftKernel4s(a0,a1,a2,a3,dir) \\\n" - "{ \\\n" - " fftKernel2S((a0), (a2), dir); \\\n" - " fftKernel2S((a1), (a3), dir); \\\n" - " fftKernel2S((a0), (a1), dir); \\\n" - " (a3) = (float2)(dir)*(conjTransp((a3))); \\\n" - " fftKernel2S((a2), (a3), dir); \\\n" - " float2 c = (a1); \\\n" - " (a1) = (a2); \\\n" - " (a2) = c; \\\n" - "}\n" - "\n" - "#define bitreverse8(a) \\\n" - "{ \\\n" - " float2 c; \\\n" - " c = (a)[1]; \\\n" - " (a)[1] = (a)[4]; \\\n" - " (a)[4] = c; \\\n" - " c = (a)[3]; \\\n" - " (a)[3] = (a)[6]; \\\n" - " (a)[6] = c; \\\n" - "}\n" - "\n" - "#define fftKernel8(a,dir) \\\n" - "{ \\\n" - " const float2 w1 = (float2)(0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f); \\\n" - " const float2 w3 = (float2)(-0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f); \\\n" - " float2 c; \\\n" - " fftKernel2S((a)[0], (a)[4], dir); \\\n" - " fftKernel2S((a)[1], (a)[5], dir); \\\n" - " fftKernel2S((a)[2], (a)[6], dir); \\\n" - " fftKernel2S((a)[3], (a)[7], dir); \\\n" - " (a)[5] = complexMul(w1, (a)[5]); \\\n" - " (a)[6] = (float2)(dir)*(conjTransp((a)[6])); \\\n" - " (a)[7] = complexMul(w3, (a)[7]); \\\n" - " fftKernel2S((a)[0], (a)[2], dir); \\\n" - " fftKernel2S((a)[1], (a)[3], dir); \\\n" - " fftKernel2S((a)[4], (a)[6], dir); \\\n" - " fftKernel2S((a)[5], (a)[7], dir); \\\n" - " (a)[3] = (float2)(dir)*(conjTransp((a)[3])); \\\n" - " (a)[7] = (float2)(dir)*(conjTransp((a)[7])); \\\n" - " fftKernel2S((a)[0], (a)[1], dir); \\\n" - " fftKernel2S((a)[2], (a)[3], dir); \\\n" - " fftKernel2S((a)[4], (a)[5], dir); \\\n" - " fftKernel2S((a)[6], (a)[7], dir); \\\n" - " bitreverse8((a)); \\\n" - "}\n" - "\n" - "#define bitreverse4x4(a) \\\n" - "{ \\\n" - " float2 c; \\\n" - " c = (a)[1]; (a)[1] = (a)[4]; (a)[4] = c; \\\n" - " c = (a)[2]; (a)[2] = (a)[8]; (a)[8] = c; \\\n" - " c = (a)[3]; (a)[3] = (a)[12]; (a)[12] = c; \\\n" - " c = (a)[6]; (a)[6] = (a)[9]; (a)[9] = c; \\\n" - " c = (a)[7]; (a)[7] = (a)[13]; (a)[13] = c; \\\n" - " c = (a)[11]; (a)[11] = (a)[14]; (a)[14] = c; \\\n" - "}\n" - "\n" - "#define fftKernel16(a,dir) \\\n" - "{ \\\n" - " const float w0 = 0x1.d906bcp-1f; \\\n" - " const float w1 = 0x1.87de2ap-2f; \\\n" - " const float w2 = 0x1.6a09e6p-1f; \\\n" - " fftKernel4s((a)[0], (a)[4], (a)[8], (a)[12], dir); \\\n" - " fftKernel4s((a)[1], (a)[5], (a)[9], (a)[13], dir); \\\n" - " fftKernel4s((a)[2], (a)[6], (a)[10], (a)[14], dir); \\\n" - " fftKernel4s((a)[3], (a)[7], (a)[11], (a)[15], dir); \\\n" - " (a)[5] = complexMul((a)[5], (float2)(w0, dir*w1)); \\\n" - " (a)[6] = complexMul((a)[6], (float2)(w2, dir*w2)); \\\n" - " (a)[7] = complexMul((a)[7], (float2)(w1, dir*w0)); \\\n" - " (a)[9] = complexMul((a)[9], (float2)(w2, dir*w2)); \\\n" - " (a)[10] = (float2)(dir)*(conjTransp((a)[10])); \\\n" - " (a)[11] = complexMul((a)[11], (float2)(-w2, dir*w2)); \\\n" - " (a)[13] = complexMul((a)[13], (float2)(w1, dir*w0)); \\\n" - " (a)[14] = complexMul((a)[14], (float2)(-w2, dir*w2)); \\\n" - " (a)[15] = complexMul((a)[15], (float2)(-w0, dir*-w1)); \\\n" - " fftKernel4((a), dir); \\\n" - " fftKernel4((a) + 4, dir); \\\n" - " fftKernel4((a) + 8, dir); \\\n" - " fftKernel4((a) + 12, dir); \\\n" - " bitreverse4x4((a)); \\\n" - "}\n" - "\n" - "#define bitreverse32(a) \\\n" - "{ \\\n" - " float2 c1, c2; \\\n" - " c1 = (a)[2]; (a)[2] = (a)[1]; c2 = (a)[4]; (a)[4] = c1; c1 = (a)[8]; (a)[8] = c2; c2 = (a)[16]; (a)[16] = c1; (a)[1] = c2; \\\n" - " c1 = (a)[6]; (a)[6] = (a)[3]; c2 = (a)[12]; (a)[12] = c1; c1 = (a)[24]; (a)[24] = c2; c2 = (a)[17]; (a)[17] = c1; (a)[3] = c2; \\\n" - " c1 = (a)[10]; (a)[10] = (a)[5]; c2 = (a)[20]; (a)[20] = c1; c1 = (a)[9]; (a)[9] = c2; c2 = (a)[18]; (a)[18] = c1; (a)[5] = c2; \\\n" - " c1 = (a)[14]; (a)[14] = (a)[7]; c2 = (a)[28]; (a)[28] = c1; c1 = (a)[25]; (a)[25] = c2; c2 = (a)[19]; (a)[19] = c1; (a)[7] = c2; \\\n" - " c1 = (a)[22]; (a)[22] = (a)[11]; c2 = (a)[13]; (a)[13] = c1; c1 = (a)[26]; (a)[26] = c2; c2 = (a)[21]; (a)[21] = c1; (a)[11] = c2; \\\n" - " c1 = (a)[30]; (a)[30] = (a)[15]; c2 = (a)[29]; (a)[29] = c1; c1 = (a)[27]; (a)[27] = c2; c2 = (a)[23]; (a)[23] = c1; (a)[15] = c2; \\\n" - "}\n" - "\n" - "#define fftKernel32(a,dir) \\\n" - "{ \\\n" - " fftKernel2S((a)[0], (a)[16], dir); \\\n" - " fftKernel2S((a)[1], (a)[17], dir); \\\n" - " fftKernel2S((a)[2], (a)[18], dir); \\\n" - " fftKernel2S((a)[3], (a)[19], dir); \\\n" - " fftKernel2S((a)[4], (a)[20], dir); \\\n" - " fftKernel2S((a)[5], (a)[21], dir); \\\n" - " fftKernel2S((a)[6], (a)[22], dir); \\\n" - " fftKernel2S((a)[7], (a)[23], dir); \\\n" - " fftKernel2S((a)[8], (a)[24], dir); \\\n" - " fftKernel2S((a)[9], (a)[25], dir); \\\n" - " fftKernel2S((a)[10], (a)[26], dir); \\\n" - " fftKernel2S((a)[11], (a)[27], dir); \\\n" - " fftKernel2S((a)[12], (a)[28], dir); \\\n" - " fftKernel2S((a)[13], (a)[29], dir); \\\n" - " fftKernel2S((a)[14], (a)[30], dir); \\\n" - " fftKernel2S((a)[15], (a)[31], dir); \\\n" - " (a)[17] = complexMul((a)[17], (float2)(0x1.f6297cp-1f, dir*0x1.8f8b84p-3f)); \\\n" - " (a)[18] = complexMul((a)[18], (float2)(0x1.d906bcp-1f, dir*0x1.87de2ap-2f)); \\\n" - " (a)[19] = complexMul((a)[19], (float2)(0x1.a9b662p-1f, dir*0x1.1c73b4p-1f)); \\\n" - " (a)[20] = complexMul((a)[20], (float2)(0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f)); \\\n" - " (a)[21] = complexMul((a)[21], (float2)(0x1.1c73b4p-1f, dir*0x1.a9b662p-1f)); \\\n" - " (a)[22] = complexMul((a)[22], (float2)(0x1.87de2ap-2f, dir*0x1.d906bcp-1f)); \\\n" - " (a)[23] = complexMul((a)[23], (float2)(0x1.8f8b84p-3f, dir*0x1.f6297cp-1f)); \\\n" - " (a)[24] = complexMul((a)[24], (float2)(0x0p+0f, dir*0x1p+0f)); \\\n" - " (a)[25] = complexMul((a)[25], (float2)(-0x1.8f8b84p-3f, dir*0x1.f6297cp-1f)); \\\n" - " (a)[26] = complexMul((a)[26], (float2)(-0x1.87de2ap-2f, dir*0x1.d906bcp-1f)); \\\n" - " (a)[27] = complexMul((a)[27], (float2)(-0x1.1c73b4p-1f, dir*0x1.a9b662p-1f)); \\\n" - " (a)[28] = complexMul((a)[28], (float2)(-0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f)); \\\n" - " (a)[29] = complexMul((a)[29], (float2)(-0x1.a9b662p-1f, dir*0x1.1c73b4p-1f)); \\\n" - " (a)[30] = complexMul((a)[30], (float2)(-0x1.d906bcp-1f, dir*0x1.87de2ap-2f)); \\\n" - " (a)[31] = complexMul((a)[31], (float2)(-0x1.f6297cp-1f, dir*0x1.8f8b84p-3f)); \\\n" - " fftKernel16((a), dir); \\\n" - " fftKernel16((a) + 16, dir); \\\n" - " bitreverse32((a)); \\\n" - "}\n\n" -); +#ifndef __CL_FFT_BASE_KERNELS_ +#define __CL_FFT_BASE_KERNELS_ -static string twistKernelInterleaved = string -( - "__kernel void \\\n" - "clFFT_1DTwistInterleaved(__global float2 *in, unsigned int startRow, unsigned int numCols, unsigned int N, unsigned int numRowsToProcess, int dir) \\\n" - "{ \\\n" - " float2 a, w; \\\n" - " float ang; \\\n" - " unsigned int j; \\\n" - " unsigned int i = get_global_id(0); \\\n" - " unsigned int startIndex = i; \\\n" - " \\\n" - " if(i < numCols) \\\n" - " { \\\n" - " for(j = 0; j < numRowsToProcess; j++) \\\n" - " { \\\n" - " a = in[startIndex]; \\\n" - " ang = 2.0f * M_PI * dir * i * (startRow + j) / N; \\\n" - " w = (float2)(native_cos(ang), native_sin(ang)); \\\n" - " a = complexMul(a, w); \\\n" - " in[startIndex] = a; \\\n" - " startIndex += numCols; \\\n" - " } \\\n" - " } \\\n" - "} \\\n" -); +#include <string> -static string twistKernelPlannar = string -( - "__kernel void \\\n" - "clFFT_1DTwistSplit(__global float *in_real, __global float *in_imag , unsigned int startRow, unsigned int numCols, unsigned int N, unsigned int numRowsToProcess, int dir) \\\n" - "{ \\\n" - " float2 a, w; \\\n" - " float ang; \\\n" - " unsigned int j; \\\n" - " unsigned int i = get_global_id(0); \\\n" - " unsigned int startIndex = i; \\\n" - " \\\n" - " if(i < numCols) \\\n" - " { \\\n" - " for(j = 0; j < numRowsToProcess; j++) \\\n" - " { \\\n" - " a = (float2)(in_real[startIndex], in_imag[startIndex]); \\\n" - " ang = 2.0f * M_PI * dir * i * (startRow + j) / N; \\\n" - " w = (float2)(native_cos(ang), native_sin(ang)); \\\n" - " a = complexMul(a, w); \\\n" - " in_real[startIndex] = a.x; \\\n" - " in_imag[startIndex] = a.y; \\\n" - " startIndex += numCols; \\\n" - " } \\\n" - " } \\\n" - "} \\\n" -); +using namespace std; +static string baseKernels = string( + "#ifndef M_PI\n" + "#define M_PI 0x1.921fb54442d18p+1\n" + "#endif\n" + "#define complexMul(a,b) ((float2)(mad(-(a).y, (b).y, (a).x * (b).x), mad((a).y, (b).x, (a).x * (b).y)))\n" + "#define conj(a) ((float2)((a).x, -(a).y))\n" + "#define conjTransp(a) ((float2)(-(a).y, (a).x))\n" + "\n" + "#define fftKernel2(a,dir) \\\n" + "{ \\\n" + " float2 c = (a)[0]; \\\n" + " (a)[0] = c + (a)[1]; \\\n" + " (a)[1] = c - (a)[1]; \\\n" + "}\n" + "\n" + "#define fftKernel2S(d1,d2,dir) \\\n" + "{ \\\n" + " float2 c = (d1); \\\n" + " (d1) = c + (d2); \\\n" + " (d2) = c - (d2); \\\n" + "}\n" + "\n" + "#define fftKernel4(a,dir) \\\n" + "{ \\\n" + " fftKernel2S((a)[0], (a)[2], dir); \\\n" + " fftKernel2S((a)[1], (a)[3], dir); \\\n" + " fftKernel2S((a)[0], (a)[1], dir); \\\n" + " (a)[3] = (float2)(dir)*(conjTransp((a)[3])); \\\n" + " fftKernel2S((a)[2], (a)[3], dir); \\\n" + " float2 c = (a)[1]; \\\n" + " (a)[1] = (a)[2]; \\\n" + " (a)[2] = c; \\\n" + "}\n" + "\n" + "#define fftKernel4s(a0,a1,a2,a3,dir) \\\n" + "{ \\\n" + " fftKernel2S((a0), (a2), dir); \\\n" + " fftKernel2S((a1), (a3), dir); \\\n" + " fftKernel2S((a0), (a1), dir); \\\n" + " (a3) = (float2)(dir)*(conjTransp((a3))); \\\n" + " fftKernel2S((a2), (a3), dir); \\\n" + " float2 c = (a1); \\\n" + " (a1) = (a2); \\\n" + " (a2) = c; \\\n" + "}\n" + "\n" + "#define bitreverse8(a) \\\n" + "{ \\\n" + " float2 c; \\\n" + " c = (a)[1]; \\\n" + " (a)[1] = (a)[4]; \\\n" + " (a)[4] = c; \\\n" + " c = (a)[3]; \\\n" + " (a)[3] = (a)[6]; \\\n" + " (a)[6] = c; \\\n" + "}\n" + "\n" + "#define fftKernel8(a,dir) \\\n" + "{ \\\n" + " const float2 w1 = (float2)(0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f); \\\n" + " const float2 w3 = (float2)(-0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f); \\\n" + " float2 c; \\\n" + " fftKernel2S((a)[0], (a)[4], dir); \\\n" + " fftKernel2S((a)[1], (a)[5], dir); \\\n" + " fftKernel2S((a)[2], (a)[6], dir); \\\n" + " fftKernel2S((a)[3], (a)[7], dir); \\\n" + " (a)[5] = complexMul(w1, (a)[5]); \\\n" + " (a)[6] = (float2)(dir)*(conjTransp((a)[6])); \\\n" + " (a)[7] = complexMul(w3, (a)[7]); \\\n" + " fftKernel2S((a)[0], (a)[2], dir); \\\n" + " fftKernel2S((a)[1], (a)[3], dir); \\\n" + " fftKernel2S((a)[4], (a)[6], dir); \\\n" + " fftKernel2S((a)[5], (a)[7], dir); \\\n" + " (a)[3] = (float2)(dir)*(conjTransp((a)[3])); \\\n" + " (a)[7] = (float2)(dir)*(conjTransp((a)[7])); \\\n" + " fftKernel2S((a)[0], (a)[1], dir); \\\n" + " fftKernel2S((a)[2], (a)[3], dir); \\\n" + " fftKernel2S((a)[4], (a)[5], dir); \\\n" + " fftKernel2S((a)[6], (a)[7], dir); \\\n" + " bitreverse8((a)); \\\n" + "}\n" + "\n" + "#define bitreverse4x4(a) \\\n" + "{ \\\n" + " float2 c; \\\n" + " c = (a)[1]; (a)[1] = (a)[4]; (a)[4] = c; \\\n" + " c = (a)[2]; (a)[2] = (a)[8]; (a)[8] = c; \\\n" + " c = (a)[3]; (a)[3] = (a)[12]; (a)[12] = c; \\\n" + " c = (a)[6]; (a)[6] = (a)[9]; (a)[9] = c; \\\n" + " c = (a)[7]; (a)[7] = (a)[13]; (a)[13] = c; \\\n" + " c = (a)[11]; (a)[11] = (a)[14]; (a)[14] = c; \\\n" + "}\n" + "\n" + "#define fftKernel16(a,dir) \\\n" + "{ \\\n" + " const float w0 = 0x1.d906bcp-1f; \\\n" + " const float w1 = 0x1.87de2ap-2f; \\\n" + " const float w2 = 0x1.6a09e6p-1f; \\\n" + " fftKernel4s((a)[0], (a)[4], (a)[8], (a)[12], dir); \\\n" + " fftKernel4s((a)[1], (a)[5], (a)[9], (a)[13], dir); \\\n" + " fftKernel4s((a)[2], (a)[6], (a)[10], (a)[14], dir); \\\n" + " fftKernel4s((a)[3], (a)[7], (a)[11], (a)[15], dir); \\\n" + " (a)[5] = complexMul((a)[5], (float2)(w0, dir*w1)); \\\n" + " (a)[6] = complexMul((a)[6], (float2)(w2, dir*w2)); \\\n" + " (a)[7] = complexMul((a)[7], (float2)(w1, dir*w0)); \\\n" + " (a)[9] = complexMul((a)[9], (float2)(w2, dir*w2)); \\\n" + " (a)[10] = (float2)(dir)*(conjTransp((a)[10])); \\\n" + " (a)[11] = complexMul((a)[11], (float2)(-w2, dir*w2)); \\\n" + " (a)[13] = complexMul((a)[13], (float2)(w1, dir*w0)); \\\n" + " (a)[14] = complexMul((a)[14], (float2)(-w2, dir*w2)); \\\n" + " (a)[15] = complexMul((a)[15], (float2)(-w0, dir*-w1)); \\\n" + " fftKernel4((a), dir); \\\n" + " fftKernel4((a) + 4, dir); \\\n" + " fftKernel4((a) + 8, dir); \\\n" + " fftKernel4((a) + 12, dir); \\\n" + " bitreverse4x4((a)); \\\n" + "}\n" + "\n" + "#define bitreverse32(a) \\\n" + "{ \\\n" + " float2 c1, c2; \\\n" + " c1 = (a)[2]; (a)[2] = (a)[1]; c2 = (a)[4]; (a)[4] = c1; c1 = (a)[8]; (a)[8] = c2; c2 = (a)[16]; (a)[16] = c1; (a)[1] = c2; \\\n" + " c1 = (a)[6]; (a)[6] = (a)[3]; c2 = (a)[12]; (a)[12] = c1; c1 = (a)[24]; (a)[24] = c2; c2 = (a)[17]; (a)[17] = c1; (a)[3] = c2; \\\n" + " c1 = (a)[10]; (a)[10] = (a)[5]; c2 = (a)[20]; (a)[20] = c1; c1 = (a)[9]; (a)[9] = c2; c2 = (a)[18]; (a)[18] = c1; (a)[5] = c2; \\\n" + " c1 = (a)[14]; (a)[14] = (a)[7]; c2 = (a)[28]; (a)[28] = c1; c1 = (a)[25]; (a)[25] = c2; c2 = (a)[19]; (a)[19] = c1; (a)[7] = c2; \\\n" + " c1 = (a)[22]; (a)[22] = (a)[11]; c2 = (a)[13]; (a)[13] = c1; c1 = (a)[26]; (a)[26] = c2; c2 = (a)[21]; (a)[21] = c1; (a)[11] = c2; \\\n" + " c1 = (a)[30]; (a)[30] = (a)[15]; c2 = (a)[29]; (a)[29] = c1; c1 = (a)[27]; (a)[27] = c2; c2 = (a)[23]; (a)[23] = c1; (a)[15] = c2; \\\n" + "}\n" + "\n" + "#define fftKernel32(a,dir) \\\n" + "{ \\\n" + " fftKernel2S((a)[0], (a)[16], dir); \\\n" + " fftKernel2S((a)[1], (a)[17], dir); \\\n" + " fftKernel2S((a)[2], (a)[18], dir); \\\n" + " fftKernel2S((a)[3], (a)[19], dir); \\\n" + " fftKernel2S((a)[4], (a)[20], dir); \\\n" + " fftKernel2S((a)[5], (a)[21], dir); \\\n" + " fftKernel2S((a)[6], (a)[22], dir); \\\n" + " fftKernel2S((a)[7], (a)[23], dir); \\\n" + " fftKernel2S((a)[8], (a)[24], dir); \\\n" + " fftKernel2S((a)[9], (a)[25], dir); \\\n" + " fftKernel2S((a)[10], (a)[26], dir); \\\n" + " fftKernel2S((a)[11], (a)[27], dir); \\\n" + " fftKernel2S((a)[12], (a)[28], dir); \\\n" + " fftKernel2S((a)[13], (a)[29], dir); \\\n" + " fftKernel2S((a)[14], (a)[30], dir); \\\n" + " fftKernel2S((a)[15], (a)[31], dir); \\\n" + " (a)[17] = complexMul((a)[17], (float2)(0x1.f6297cp-1f, dir*0x1.8f8b84p-3f)); \\\n" + " (a)[18] = complexMul((a)[18], (float2)(0x1.d906bcp-1f, dir*0x1.87de2ap-2f)); \\\n" + " (a)[19] = complexMul((a)[19], (float2)(0x1.a9b662p-1f, dir*0x1.1c73b4p-1f)); \\\n" + " (a)[20] = complexMul((a)[20], (float2)(0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f)); \\\n" + " (a)[21] = complexMul((a)[21], (float2)(0x1.1c73b4p-1f, dir*0x1.a9b662p-1f)); \\\n" + " (a)[22] = complexMul((a)[22], (float2)(0x1.87de2ap-2f, dir*0x1.d906bcp-1f)); \\\n" + " (a)[23] = complexMul((a)[23], (float2)(0x1.8f8b84p-3f, dir*0x1.f6297cp-1f)); \\\n" + " (a)[24] = complexMul((a)[24], (float2)(0x0p+0f, dir*0x1p+0f)); \\\n" + " (a)[25] = complexMul((a)[25], (float2)(-0x1.8f8b84p-3f, dir*0x1.f6297cp-1f)); \\\n" + " (a)[26] = complexMul((a)[26], (float2)(-0x1.87de2ap-2f, dir*0x1.d906bcp-1f)); \\\n" + " (a)[27] = complexMul((a)[27], (float2)(-0x1.1c73b4p-1f, dir*0x1.a9b662p-1f)); \\\n" + " (a)[28] = complexMul((a)[28], (float2)(-0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f)); \\\n" + " (a)[29] = complexMul((a)[29], (float2)(-0x1.a9b662p-1f, dir*0x1.1c73b4p-1f)); \\\n" + " (a)[30] = complexMul((a)[30], (float2)(-0x1.d906bcp-1f, dir*0x1.87de2ap-2f)); \\\n" + " (a)[31] = complexMul((a)[31], (float2)(-0x1.f6297cp-1f, dir*0x1.8f8b84p-3f)); \\\n" + " fftKernel16((a), dir); \\\n" + " fftKernel16((a) + 16, dir); \\\n" + " bitreverse32((a)); \\\n" + "}\n\n" + ); +static string twistKernelInterleaved = string( + "__kernel void \\\n" + "clFFT_1DTwistInterleaved(__global float2 *in, unsigned int startRow, unsigned int numCols, unsigned int N, unsigned int numRowsToProcess, int dir) \\\n" + "{ \\\n" + " float2 a, w; \\\n" + " float ang; \\\n" + " unsigned int j; \\\n" + " unsigned int i = get_global_id(0); \\\n" + " unsigned int startIndex = i; \\\n" + " \\\n" + " if(i < numCols) \\\n" + " { \\\n" + " for(j = 0; j < numRowsToProcess; j++) \\\n" + " { \\\n" + " a = in[startIndex]; \\\n" + " ang = 2.0f * M_PI * dir * i * (startRow + j) / N; \\\n" + " w = (float2)(native_cos(ang), native_sin(ang)); \\\n" + " a = complexMul(a, w); \\\n" + " in[startIndex] = a; \\\n" + " startIndex += numCols; \\\n" + " } \\\n" + " } \\\n" + "} \\\n" + ); -#endif +static string twistKernelPlannar = string( + "__kernel void \\\n" + "clFFT_1DTwistSplit(__global float *in_real, __global float *in_imag , unsigned int startRow, unsigned int numCols, unsigned int N, unsigned int numRowsToProcess, int dir) \\\n" + "{ \\\n" + " float2 a, w; \\\n" + " float ang; \\\n" + " unsigned int j; \\\n" + " unsigned int i = get_global_id(0); \\\n" + " unsigned int startIndex = i; \\\n" + " \\\n" + " if(i < numCols) \\\n" + " { \\\n" + " for(j = 0; j < numRowsToProcess; j++) \\\n" + " { \\\n" + " a = (float2)(in_real[startIndex], in_imag[startIndex]); \\\n" + " ang = 2.0f * M_PI * dir * i * (startRow + j) / N; \\\n" + " w = (float2)(native_cos(ang), native_sin(ang)); \\\n" + " a = complexMul(a, w); \\\n" + " in_real[startIndex] = a.x; \\\n" + " in_imag[startIndex] = a.y; \\\n" + " startIndex += numCols; \\\n" + " } \\\n" + " } \\\n" + "} \\\n" + ); + + + +#endif diff --git a/src/fft_execute.cpp b/src/fft_execute.cpp index 9ab1f1342d54fb76e5df7a609f0765c3a8d343fb..90d24dde0686aebb091021490953131123c3908c 100644 --- a/src/fft_execute.cpp +++ b/src/fft_execute.cpp @@ -1,357 +1,405 @@ -#include "fft_internal.h" -#include <clFFT.h> -#include <stdlib.h> -#include <stdio.h> -#include <math.h> - -#define max(a,b) (((a)>(b)) ? (a) : (b)) -#define min(a,b) (((a)<(b)) ? (a) : (b)) - -static cl_int -allocateTemporaryBufferInterleaved(cl_fft_plan *plan, cl_uint batchSize) -{ - cl_int err = CL_SUCCESS; - if(plan->temp_buffer_needed && plan->last_batch_size != batchSize) - { - plan->last_batch_size = batchSize; - size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * 2 * sizeof(cl_float); - - if(plan->tempmemobj) - clReleaseMemObject(plan->tempmemobj); - - plan->tempmemobj = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &err); - } - return err; -} - -static cl_int -allocateTemporaryBufferPlannar(cl_fft_plan *plan, cl_uint batchSize) -{ - cl_int err = CL_SUCCESS; - cl_int terr; - if(plan->temp_buffer_needed && plan->last_batch_size != batchSize) - { - plan->last_batch_size = batchSize; - size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * sizeof(cl_float); - - if(plan->tempmemobj_real) - clReleaseMemObject(plan->tempmemobj_real); - - if(plan->tempmemobj_imag) - clReleaseMemObject(plan->tempmemobj_imag); - - plan->tempmemobj_real = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &err); - plan->tempmemobj_imag = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &terr); - err |= terr; - } - return err; -} - -void -getKernelWorkDimensions(cl_fft_plan *plan, cl_fft_kernel_info *kernelInfo, cl_int *batchSize, size_t *gWorkItems, size_t *lWorkItems) -{ - *lWorkItems = kernelInfo->num_workitems_per_workgroup; - int numWorkGroups = kernelInfo->num_workgroups; - int numXFormsPerWG = kernelInfo->num_xforms_per_workgroup; - - switch(kernelInfo->dir) - { - case cl_fft_kernel_x: - *batchSize *= (plan->n.y * plan->n.z); - numWorkGroups = (*batchSize % numXFormsPerWG) ? (*batchSize/numXFormsPerWG + 1) : (*batchSize/numXFormsPerWG); - numWorkGroups *= kernelInfo->num_workgroups; - break; - case cl_fft_kernel_y: - *batchSize *= plan->n.z; - numWorkGroups *= *batchSize; - break; - case cl_fft_kernel_z: - numWorkGroups *= *batchSize; - break; - } - - *gWorkItems = numWorkGroups * *lWorkItems; -} - -cl_int -clFFT_ExecuteInterleaved( cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir, - cl_mem data_in, cl_mem data_out, - cl_int num_events, cl_event *event_list, cl_event *event ) -{ - int s; - cl_fft_plan *plan = (cl_fft_plan *) Plan; - if(plan->format != clFFT_InterleavedComplexFormat) - return CL_INVALID_VALUE; - - cl_int err; - size_t gWorkItems, lWorkItems; - int inPlaceDone = 0; - - cl_int isInPlace = data_in == data_out ? 1 : 0; - - if((err = allocateTemporaryBufferInterleaved(plan, batchSize)) != CL_SUCCESS) - return err; - - cl_mem memObj[3]; - memObj[0] = data_in; - memObj[1] = data_out; - memObj[2] = plan->tempmemobj; - cl_fft_kernel_info *kernelInfo = plan->kernel_info; - int numKernels = plan->num_kernels; - - int numKernelsOdd = numKernels & 1; - int currRead = 0; - int currWrite = 1; - - // at least one external dram shuffle (transpose) required - if(plan->temp_buffer_needed) - { - // in-place transform - if(isInPlace) - { - inPlaceDone = 0; - currRead = 1; - currWrite = 2; - } - else - { - currWrite = (numKernels & 1) ? 1 : 2; - } - - while(kernelInfo) - { - if( isInPlace && numKernelsOdd && !inPlaceDone && kernelInfo->in_place_possible) - { - currWrite = currRead; - inPlaceDone = 1; - } - - s = batchSize; - getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems); - err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj[currRead]); - err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj[currWrite]); - err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_int), &dir); - err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_int), &s); - - err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL); - if(err) - return err; - - currRead = (currWrite == 1) ? 1 : 2; - currWrite = (currWrite == 1) ? 2 : 1; - - kernelInfo = kernelInfo->next; - } - } - // no dram shuffle (transpose required) transform - // all kernels can execute in-place. - else { - - while(kernelInfo) - { - s = batchSize; - getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems); - err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj[currRead]); - err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj[currWrite]); - err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_int), &dir); - err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_int), &s); - - err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL); - if(err) - return err; - - currRead = 1; - currWrite = 1; - - kernelInfo = kernelInfo->next; - } - } - - return err; -} - -cl_int -clFFT_ExecutePlannar( cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir, - cl_mem data_in_real, cl_mem data_in_imag, cl_mem data_out_real, cl_mem data_out_imag, - cl_int num_events, cl_event *event_list, cl_event *event) -{ - int s; - cl_fft_plan *plan = (cl_fft_plan *) Plan; - - if(plan->format != clFFT_SplitComplexFormat) - return CL_INVALID_VALUE; - - cl_int err; - size_t gWorkItems, lWorkItems; - int inPlaceDone = 0; - - cl_int isInPlace = ((data_in_real == data_out_real) && (data_in_imag == data_out_imag)) ? 1 : 0; - - if((err = allocateTemporaryBufferPlannar(plan, batchSize)) != CL_SUCCESS) - return err; - - cl_mem memObj_real[3]; - cl_mem memObj_imag[3]; - memObj_real[0] = data_in_real; - memObj_real[1] = data_out_real; - memObj_real[2] = plan->tempmemobj_real; - memObj_imag[0] = data_in_imag; - memObj_imag[1] = data_out_imag; - memObj_imag[2] = plan->tempmemobj_imag; - - cl_fft_kernel_info *kernelInfo = plan->kernel_info; - int numKernels = plan->num_kernels; - - int numKernelsOdd = numKernels & 1; - int currRead = 0; - int currWrite = 1; - - // at least one external dram shuffle (transpose) required - if(plan->temp_buffer_needed) - { - // in-place transform - if(isInPlace) - { - inPlaceDone = 0; - currRead = 1; - currWrite = 2; - } - else - { - currWrite = (numKernels & 1) ? 1 : 2; - } - - while(kernelInfo) - { - if( isInPlace && numKernelsOdd && !inPlaceDone && kernelInfo->in_place_possible) - { - currWrite = currRead; - inPlaceDone = 1; - } - - s = batchSize; - getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems); - err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj_real[currRead]); - err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj_imag[currRead]); - err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_mem), &memObj_real[currWrite]); - err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_mem), &memObj_imag[currWrite]); - err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_int), &dir); - err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_int), &s); - - err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL); - if(err) - return err; - - currRead = (currWrite == 1) ? 1 : 2; - currWrite = (currWrite == 1) ? 2 : 1; - - kernelInfo = kernelInfo->next; - } - } - // no dram shuffle (transpose required) transform - else { - - while(kernelInfo) - { - s = batchSize; - getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems); - err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj_real[currRead]); - err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj_imag[currRead]); - err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_mem), &memObj_real[currWrite]); - err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_mem), &memObj_imag[currWrite]); - err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_int), &dir); - err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_int), &s); - - err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL); - if(err) - return err; - - currRead = 1; - currWrite = 1; - - kernelInfo = kernelInfo->next; - } - } - - return err; -} - -cl_int -clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array, - size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir) -{ - puts ("X"); - cl_fft_plan *plan = (cl_fft_plan *) Plan; - - unsigned int N = numRows*numCols; - unsigned int nCols = numCols; - unsigned int sRow = startRow; - unsigned int rToProcess = rowsToProcess; - int d = dir; - int err = 0; - - cl_device_id device_id; - err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, NULL); - if(err) - return err; - - size_t gSize; - err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, NULL); - if(err) - return err; - - gSize = min(128, gSize); - size_t numGlobalThreads[1] = { max(numCols / gSize, 1)*gSize }; - size_t numLocalThreads[1] = { gSize }; - - err |= clSetKernelArg(plan->twist_kernel, 0, sizeof(cl_mem), &array); - err |= clSetKernelArg(plan->twist_kernel, 1, sizeof(unsigned int), &sRow); - err |= clSetKernelArg(plan->twist_kernel, 2, sizeof(unsigned int), &nCols); - err |= clSetKernelArg(plan->twist_kernel, 3, sizeof(unsigned int), &N); - err |= clSetKernelArg(plan->twist_kernel, 4, sizeof(unsigned int), &rToProcess); - err |= clSetKernelArg(plan->twist_kernel, 5, sizeof(int), &d); - - err |= clEnqueueNDRangeKernel(queue, plan->twist_kernel, 1, NULL, numGlobalThreads, numLocalThreads, 0, NULL, NULL); - - return err; -} -cl_int -clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real, cl_mem array_imag, - size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir) -{ - puts ("Y"); - cl_fft_plan *plan = (cl_fft_plan *) Plan; - - unsigned int N = numRows*numCols; - unsigned int nCols = numCols; - unsigned int sRow = startRow; - unsigned int rToProcess = rowsToProcess; - int d = dir; - int err = 0; - - cl_device_id device_id; - err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, NULL); - if(err) - return err; - - size_t gSize; - err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, NULL); - if(err) - return err; - - gSize = min(128, gSize); - size_t numGlobalThreads[1] = { max(numCols / gSize, 1)*gSize }; - size_t numLocalThreads[1] = { gSize }; - - err |= clSetKernelArg(plan->twist_kernel, 0, sizeof(cl_mem), &array_real); - err |= clSetKernelArg(plan->twist_kernel, 1, sizeof(cl_mem), &array_imag); - err |= clSetKernelArg(plan->twist_kernel, 2, sizeof(unsigned int), &sRow); - err |= clSetKernelArg(plan->twist_kernel, 3, sizeof(unsigned int), &nCols); - err |= clSetKernelArg(plan->twist_kernel, 4, sizeof(unsigned int), &N); - err |= clSetKernelArg(plan->twist_kernel, 5, sizeof(unsigned int), &rToProcess); - err |= clSetKernelArg(plan->twist_kernel, 6, sizeof(int), &d); - - err |= clEnqueueNDRangeKernel(queue, plan->twist_kernel, 1, NULL, numGlobalThreads, numLocalThreads, 0, NULL, NULL); - - return err; -} + +// +// File: fft_execute.cpp +// +// Version: <1.0> +// +// Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple") +// in consideration of your agreement to the following terms, and your use, +// installation, modification or redistribution of this Apple software +// constitutes acceptance of these terms. If you do not agree with these +// terms, please do not use, install, modify or redistribute this Apple +// software.¬ +// +// In consideration of your agreement to abide by the following terms, and +// subject to these terms, Apple grants you a personal, non - exclusive +// license, under Apple's copyrights in this original Apple software ( the +// "Apple Software" ), to use, reproduce, modify and redistribute the Apple +// Software, with or without modifications, in source and / or binary forms; +// provided that if you redistribute the Apple Software in its entirety and +// without modifications, you must retain this notice and the following text +// and disclaimers in all such redistributions of the Apple Software. Neither +// the name, trademarks, service marks or logos of Apple Inc. may be used to +// endorse or promote products derived from the Apple Software without specific +// prior written permission from Apple. Except as expressly stated in this +// notice, no other rights or licenses, express or implied, are granted by +// Apple herein, including but not limited to any patent rights that may be +// infringed by your derivative works or by other works in which the Apple +// Software may be incorporated. +// +// The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO +// WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED +// WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION +// ALONE OR IN COMBINATION WITH YOUR PRODUCTS. +// +// IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR +// CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION +// AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER +// UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR +// OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Copyright ( C ) 2008 Apple Inc. All Rights Reserved. +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +#include "fft_internal.h" +#include "clFFT.h" +#include <stdlib.h> +#include <stdio.h> +#include <math.h> + +#define max(a,b) (((a)>(b)) ? (a) : (b)) +#define min(a,b) (((a)<(b)) ? (a) : (b)) + +static cl_int +allocateTemporaryBufferInterleaved(cl_fft_plan *plan, cl_uint batchSize) +{ + cl_int err = CL_SUCCESS; + if(plan->temp_buffer_needed && plan->last_batch_size != batchSize) + { + plan->last_batch_size = batchSize; + size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * 2 * sizeof(cl_float); + + if(plan->tempmemobj) + clReleaseMemObject(plan->tempmemobj); + + plan->tempmemobj = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &err); + } + return err; +} + +static cl_int +allocateTemporaryBufferPlannar(cl_fft_plan *plan, cl_uint batchSize) +{ + cl_int err = CL_SUCCESS; + cl_int terr; + if(plan->temp_buffer_needed && plan->last_batch_size != batchSize) + { + plan->last_batch_size = batchSize; + size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * sizeof(cl_float); + + if(plan->tempmemobj_real) + clReleaseMemObject(plan->tempmemobj_real); + + if(plan->tempmemobj_imag) + clReleaseMemObject(plan->tempmemobj_imag); + + plan->tempmemobj_real = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &err); + plan->tempmemobj_imag = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &terr); + err |= terr; + } + return err; +} + +void +getKernelWorkDimensions(cl_fft_plan *plan, cl_fft_kernel_info *kernelInfo, cl_int *batchSize, size_t *gWorkItems, size_t *lWorkItems) +{ + *lWorkItems = kernelInfo->num_workitems_per_workgroup; + int numWorkGroups = kernelInfo->num_workgroups; + int numXFormsPerWG = kernelInfo->num_xforms_per_workgroup; + + switch(kernelInfo->dir) + { + case cl_fft_kernel_x: + *batchSize *= (plan->n.y * plan->n.z); + numWorkGroups = (*batchSize % numXFormsPerWG) ? (*batchSize/numXFormsPerWG + 1) : (*batchSize/numXFormsPerWG); + numWorkGroups *= kernelInfo->num_workgroups; + break; + case cl_fft_kernel_y: + *batchSize *= plan->n.z; + numWorkGroups *= *batchSize; + break; + case cl_fft_kernel_z: + numWorkGroups *= *batchSize; + break; + } + + *gWorkItems = numWorkGroups * *lWorkItems; +} + +cl_int +clFFT_ExecuteInterleaved( cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir, + cl_mem data_in, cl_mem data_out, + cl_int num_events, cl_event *event_list, cl_event *event ) +{ + int s; + cl_fft_plan *plan = (cl_fft_plan *) Plan; + if(plan->format != clFFT_InterleavedComplexFormat) + return CL_INVALID_VALUE; + + cl_int err; + size_t gWorkItems, lWorkItems; + int inPlaceDone; + + cl_int isInPlace = data_in == data_out ? 1 : 0; + + if((err = allocateTemporaryBufferInterleaved(plan, batchSize)) != CL_SUCCESS) + return err; + + cl_mem memObj[3]; + memObj[0] = data_in; + memObj[1] = data_out; + memObj[2] = plan->tempmemobj; + cl_fft_kernel_info *kernelInfo = plan->kernel_info; + int numKernels = plan->num_kernels; + + int numKernelsOdd = numKernels & 1; + int currRead = 0; + int currWrite = 1; + + // at least one external dram shuffle (transpose) required + if(plan->temp_buffer_needed) + { + // in-place transform + if(isInPlace) + { + inPlaceDone = 0; + currRead = 1; + currWrite = 2; + } + else + { + currWrite = (numKernels & 1) ? 1 : 2; + } + + while(kernelInfo) + { + if( isInPlace && numKernelsOdd && !inPlaceDone && kernelInfo->in_place_possible) + { + currWrite = currRead; + inPlaceDone = 1; + } + + s = batchSize; + getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems); + err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj[currRead]); + err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj[currWrite]); + err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_int), &dir); + err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_int), &s); + + err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL); + if(err) + return err; + + currRead = (currWrite == 1) ? 1 : 2; + currWrite = (currWrite == 1) ? 2 : 1; + + kernelInfo = kernelInfo->next; + } + } + // no dram shuffle (transpose required) transform + // all kernels can execute in-place. + else { + + while(kernelInfo) + { + s = batchSize; + getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems); + err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj[currRead]); + err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj[currWrite]); + err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_int), &dir); + err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_int), &s); + + err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL); + if(err) + return err; + + currRead = 1; + currWrite = 1; + + kernelInfo = kernelInfo->next; + } + } + + return err; +} + +cl_int +clFFT_ExecutePlannar( cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir, + cl_mem data_in_real, cl_mem data_in_imag, cl_mem data_out_real, cl_mem data_out_imag, + cl_int num_events, cl_event *event_list, cl_event *event) +{ + int s; + cl_fft_plan *plan = (cl_fft_plan *) Plan; + + if(plan->format != clFFT_SplitComplexFormat) + return CL_INVALID_VALUE; + + cl_int err; + size_t gWorkItems, lWorkItems; + int inPlaceDone; + + cl_int isInPlace = ((data_in_real == data_out_real) && (data_in_imag == data_out_imag)) ? 1 : 0; + + if((err = allocateTemporaryBufferPlannar(plan, batchSize)) != CL_SUCCESS) + return err; + + cl_mem memObj_real[3]; + cl_mem memObj_imag[3]; + memObj_real[0] = data_in_real; + memObj_real[1] = data_out_real; + memObj_real[2] = plan->tempmemobj_real; + memObj_imag[0] = data_in_imag; + memObj_imag[1] = data_out_imag; + memObj_imag[2] = plan->tempmemobj_imag; + + cl_fft_kernel_info *kernelInfo = plan->kernel_info; + int numKernels = plan->num_kernels; + + int numKernelsOdd = numKernels & 1; + int currRead = 0; + int currWrite = 1; + + // at least one external dram shuffle (transpose) required + if(plan->temp_buffer_needed) + { + // in-place transform + if(isInPlace) + { + inPlaceDone = 0; + currRead = 1; + currWrite = 2; + } + else + { + currWrite = (numKernels & 1) ? 1 : 2; + } + + while(kernelInfo) + { + if( isInPlace && numKernelsOdd && !inPlaceDone && kernelInfo->in_place_possible) + { + currWrite = currRead; + inPlaceDone = 1; + } + + s = batchSize; + getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems); + err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj_real[currRead]); + err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj_imag[currRead]); + err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_mem), &memObj_real[currWrite]); + err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_mem), &memObj_imag[currWrite]); + err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_int), &dir); + err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_int), &s); + + err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL); + if(err) + return err; + + currRead = (currWrite == 1) ? 1 : 2; + currWrite = (currWrite == 1) ? 2 : 1; + + kernelInfo = kernelInfo->next; + } + } + // no dram shuffle (transpose required) transform + else { + + while(kernelInfo) + { + s = batchSize; + getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems); + err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj_real[currRead]); + err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj_imag[currRead]); + err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_mem), &memObj_real[currWrite]); + err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_mem), &memObj_imag[currWrite]); + err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_int), &dir); + err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_int), &s); + + err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL); + if(err) + return err; + + currRead = 1; + currWrite = 1; + + kernelInfo = kernelInfo->next; + } + } + + return err; +} + +cl_int +clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array, + size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir) +{ + cl_fft_plan *plan = (cl_fft_plan *) Plan; + + unsigned int N = numRows*numCols; + unsigned int nCols = numCols; + unsigned int sRow = startRow; + unsigned int rToProcess = rowsToProcess; + int d = dir; + int err = 0; + + cl_device_id device_id; + err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, NULL); + if(err) + return err; + + size_t gSize; + err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, NULL); + if(err) + return err; + + gSize = min(128, gSize); + size_t numGlobalThreads[1] = { max(numCols / gSize, 1)*gSize }; + size_t numLocalThreads[1] = { gSize }; + + err |= clSetKernelArg(plan->twist_kernel, 0, sizeof(cl_mem), &array); + err |= clSetKernelArg(plan->twist_kernel, 1, sizeof(unsigned int), &sRow); + err |= clSetKernelArg(plan->twist_kernel, 2, sizeof(unsigned int), &nCols); + err |= clSetKernelArg(plan->twist_kernel, 3, sizeof(unsigned int), &N); + err |= clSetKernelArg(plan->twist_kernel, 4, sizeof(unsigned int), &rToProcess); + err |= clSetKernelArg(plan->twist_kernel, 5, sizeof(int), &d); + + err |= clEnqueueNDRangeKernel(queue, plan->twist_kernel, 1, NULL, numGlobalThreads, numLocalThreads, 0, NULL, NULL); + + return err; +} + +cl_int +clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real, cl_mem array_imag, + size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir) +{ + cl_fft_plan *plan = (cl_fft_plan *) Plan; + + unsigned int N = numRows*numCols; + unsigned int nCols = numCols; + unsigned int sRow = startRow; + unsigned int rToProcess = rowsToProcess; + int d = dir; + int err = 0; + + cl_device_id device_id; + err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, NULL); + if(err) + return err; + + size_t gSize; + err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, NULL); + if(err) + return err; + + gSize = min(128, gSize); + size_t numGlobalThreads[1] = { max(numCols / gSize, 1)*gSize }; + size_t numLocalThreads[1] = { gSize }; + + err |= clSetKernelArg(plan->twist_kernel, 0, sizeof(cl_mem), &array_real); + err |= clSetKernelArg(plan->twist_kernel, 1, sizeof(cl_mem), &array_imag); + err |= clSetKernelArg(plan->twist_kernel, 2, sizeof(unsigned int), &sRow); + err |= clSetKernelArg(plan->twist_kernel, 3, sizeof(unsigned int), &nCols); + err |= clSetKernelArg(plan->twist_kernel, 4, sizeof(unsigned int), &N); + err |= clSetKernelArg(plan->twist_kernel, 5, sizeof(unsigned int), &rToProcess); + err |= clSetKernelArg(plan->twist_kernel, 6, sizeof(int), &d); + + err |= clEnqueueNDRangeKernel(queue, plan->twist_kernel, 1, NULL, numGlobalThreads, numLocalThreads, 0, NULL, NULL); + + return err; +} + diff --git a/src/fft_internal.h b/src/fft_internal.h index eafff7c65b6c01d7ba78e95789c57c0a7a4f06f6..a45b69c98af037b2228aa29b4a046b1c4f1cf86f 100644 --- a/src/fft_internal.h +++ b/src/fft_internal.h @@ -1,115 +1,163 @@ -#ifndef __CLFFT_INTERNAL_H -#define __CLFFT_INTERNAL_H - -#include "clFFT.h" -#include <iostream> -#include <string> -#include <sstream> - -using namespace std; - -typedef enum kernel_dir_t -{ - cl_fft_kernel_x, - cl_fft_kernel_y, - cl_fft_kernel_z -}cl_fft_kernel_dir; - -typedef struct kernel_info_t -{ - cl_kernel kernel; - char *kernel_name; - size_t lmem_size; - size_t num_workgroups; - size_t num_xforms_per_workgroup; - size_t num_workitems_per_workgroup; - cl_fft_kernel_dir dir; - int in_place_possible; - kernel_info_t *next; -}cl_fft_kernel_info; - -typedef struct -{ - // context in which fft resources are created and kernels are executed - cl_context context; - - // size of signal - clFFT_Dim3 n; - - // dimension of transform ... must be either 1D, 2D or 3D - clFFT_Dimension dim; - - // data format ... must be either interleaved or plannar - clFFT_DataFormat format; - - // string containing kernel source. Generated at runtime based on - // n, dim, format and other parameters - string *kernel_string; - - // CL program containing source and kernel this particular - // n, dim, data format - cl_program program; - - // linked list of kernels which needs to be executed for this fft - cl_fft_kernel_info *kernel_info; - - // number of kernels - int num_kernels; - - // twist kernel for virtualizing fft of very large sizes that do not - // fit in GPU global memory - cl_kernel twist_kernel; - - // flag indicating if temporary intermediate buffer is needed or not. - // this depends on fft kernels being executed and if transform is - // in-place or out-of-place. e.g. Local memory fft (say 1D 1024 ... - // one that does not require global transpose do not need temporary buffer) - // 2D 1024x1024 out-of-place fft however do require intermediate buffer. - // If temp buffer is needed, its allocation is lazy i.e. its not allocated - // until its needed - cl_int temp_buffer_needed; - - // Batch size is runtime parameter and size of temporary buffer (if needed) - // depends on batch size. Allocation of temporary buffer is lazy i.e. its - // only created when needed. Once its created at first call of clFFT_Executexxx - // it is not allocated next time if next time clFFT_Executexxx is called with - // batch size different than the first call. last_batch_size caches the last - // batch size with which this plan is used so that we dont keep allocating/deallocating - // temp buffer if same batch size is used again and again. - size_t last_batch_size; - - // temporary buffer for interleaved plan - cl_mem tempmemobj; - - // temporary buffer for planner plan. Only one of tempmemobj or - // (tempmemobj_real, tempmemobj_imag) pair is valid (allocated) depending - // data format of plan (plannar or interleaved) - cl_mem tempmemobj_real, tempmemobj_imag; - - // Maximum size of signal for which local memory transposed based - // fft is sufficient i.e. no global mem transpose (communication) - // is needed - size_t max_localmem_fft_size; - - // Maximum work items per work group allowed. This, along with max_radix below controls - // maximum local memory being used by fft kernels of this plan. Set to 256 by default - size_t max_work_item_per_workgroup; - - // Maximum base radix for local memory fft ... this controls the maximum register - // space used by work items. Currently defaults to 16 - size_t max_radix; - - // Device depended parameter that tells how many work-items need to be read consecutive - // values to make sure global memory access by work-items of a work-group result in - // coalesced memory access to utilize full bandwidth e.g. on NVidia tesla, this is 16 - size_t min_mem_coalesce_width; - - // Number of local memory banks. This is used to geneate kernel with local memory - // transposes with appropriate padding to avoid bank conflicts to local memory - // e.g. on NVidia it is 16. - size_t num_local_mem_banks; -}cl_fft_plan; - -void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir); - -#endif + +// +// File: fft_internal.h +// +// Version: <1.0> +// +// Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple") +// in consideration of your agreement to the following terms, and your use, +// installation, modification or redistribution of this Apple software +// constitutes acceptance of these terms. If you do not agree with these +// terms, please do not use, install, modify or redistribute this Apple +// software. +// +// In consideration of your agreement to abide by the following terms, and +// subject to these terms, Apple grants you a personal, non - exclusive +// license, under Apple's copyrights in this original Apple software ( the +// "Apple Software" ), to use, reproduce, modify and redistribute the Apple +// Software, with or without modifications, in source and / or binary forms; +// provided that if you redistribute the Apple Software in its entirety and +// without modifications, you must retain this notice and the following text +// and disclaimers in all such redistributions of the Apple Software. Neither +// the name, trademarks, service marks or logos of Apple Inc. may be used to +// endorse or promote products derived from the Apple Software without specific +// prior written permission from Apple. Except as expressly stated in this +// notice, no other rights or licenses, express or implied, are granted by +// Apple herein, including but not limited to any patent rights that may be +// infringed by your derivative works or by other works in which the Apple +// Software may be incorporated. +// +// The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO +// WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED +// WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION +// ALONE OR IN COMBINATION WITH YOUR PRODUCTS. +// +// IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR +// CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION +// AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER +// UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR +// OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Copyright ( C ) 2008 Apple Inc. All Rights Reserved. +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +#ifndef __CLFFT_INTERNAL_H +#define __CLFFT_INTERNAL_H + +#include "clFFT.h" +#include <iostream> +#include <string> +#include <sstream> + +using namespace std; + +typedef enum kernel_dir_t +{ + cl_fft_kernel_x, + cl_fft_kernel_y, + cl_fft_kernel_z +}cl_fft_kernel_dir; + +typedef struct kernel_info_t +{ + cl_kernel kernel; + char *kernel_name; + size_t lmem_size; + size_t num_workgroups; + size_t num_xforms_per_workgroup; + size_t num_workitems_per_workgroup; + cl_fft_kernel_dir dir; + int in_place_possible; + kernel_info_t *next; +}cl_fft_kernel_info; + +typedef struct +{ + // context in which fft resources are created and kernels are executed + cl_context context; + + // size of signal + clFFT_Dim3 n; + + // dimension of transform ... must be either 1D, 2D or 3D + clFFT_Dimension dim; + + // data format ... must be either interleaved or plannar + clFFT_DataFormat format; + + // string containing kernel source. Generated at runtime based on + // n, dim, format and other parameters + string *kernel_string; + + // CL program containing source and kernel this particular + // n, dim, data format + cl_program program; + + // linked list of kernels which needs to be executed for this fft + cl_fft_kernel_info *kernel_info; + + // number of kernels + int num_kernels; + + // twist kernel for virtualizing fft of very large sizes that do not + // fit in GPU global memory + cl_kernel twist_kernel; + + // flag indicating if temporary intermediate buffer is needed or not. + // this depends on fft kernels being executed and if transform is + // in-place or out-of-place. e.g. Local memory fft (say 1D 1024 ... + // one that does not require global transpose do not need temporary buffer) + // 2D 1024x1024 out-of-place fft however do require intermediate buffer. + // If temp buffer is needed, its allocation is lazy i.e. its not allocated + // until its needed + cl_int temp_buffer_needed; + + // Batch size is runtime parameter and size of temporary buffer (if needed) + // depends on batch size. Allocation of temporary buffer is lazy i.e. its + // only created when needed. Once its created at first call of clFFT_Executexxx + // it is not allocated next time if next time clFFT_Executexxx is called with + // batch size different than the first call. last_batch_size caches the last + // batch size with which this plan is used so that we dont keep allocating/deallocating + // temp buffer if same batch size is used again and again. + size_t last_batch_size; + + // temporary buffer for interleaved plan + cl_mem tempmemobj; + + // temporary buffer for planner plan. Only one of tempmemobj or + // (tempmemobj_real, tempmemobj_imag) pair is valid (allocated) depending + // data format of plan (plannar or interleaved) + cl_mem tempmemobj_real, tempmemobj_imag; + + // Maximum size of signal for which local memory transposed based + // fft is sufficient i.e. no global mem transpose (communication) + // is needed + size_t max_localmem_fft_size; + + // Maximum work items per work group allowed. This, along with max_radix below controls + // maximum local memory being used by fft kernels of this plan. Set to 256 by default + size_t max_work_item_per_workgroup; + + // Maximum base radix for local memory fft ... this controls the maximum register + // space used by work items. Currently defaults to 16 + size_t max_radix; + + // Device depended parameter that tells how many work-items need to be read consecutive + // values to make sure global memory access by work-items of a work-group result in + // coalesced memory access to utilize full bandwidth e.g. on NVidia tesla, this is 16 + size_t min_mem_coalesce_width; + + // Number of local memory banks. This is used to geneate kernel with local memory + // transposes with appropriate padding to avoid bank conflicts to local memory + // e.g. on NVidia it is 16. + size_t num_local_mem_banks; +}cl_fft_plan; + +void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir); + +#endif diff --git a/src/fft_kernelstring.cpp b/src/fft_kernelstring.cpp index 75750d97dfa309ef17d6b7a0769593b349138206..e603ecdc4b2662d48cebd22039584d0a4b802830 100644 --- a/src/fft_kernelstring.cpp +++ b/src/fft_kernelstring.cpp @@ -1,1207 +1,1256 @@ -#include <stdio.h> -#include <stdlib.h> -#include <math.h> -#include <iostream> -#include <sstream> -#include <string.h> -#include <assert.h> -#include "fft_internal.h" -#include <clFFT.h> - -using namespace std; - -#define max(A,B) ((A) > (B) ? (A) : (B)) -#define min(A,B) ((A) < (B) ? (A) : (B)) - -static string -num2str(int num) -{ - char temp[200]; - sprintf(temp, "%d", num); - return string(temp); -} - -// For any n, this function decomposes n into factors for loacal memory tranpose -// based fft. Factors (radices) are sorted such that the first one (radixArray[0]) -// is the largest. This base radix determines the number of registers used by each -// work item and product of remaining radices determine the size of work group needed. -// To make things concrete with and example, suppose n = 1024. It is decomposed into -// 1024 = 16 x 16 x 4. Hence kernel uses float2 a[16], for local in-register fft and -// needs 16 x 4 = 64 work items per work group. So kernel first performance 64 length -// 16 ffts (64 work items working in parallel) following by transpose using local -// memory followed by again 64 length 16 ffts followed by transpose using local memory -// followed by 256 length 4 ffts. For the last step since with size of work group is -// 64 and each work item can array for 16 values, 64 work items can compute 256 length -// 4 ffts by each work item computing 4 length 4 ffts. -// Similarly for n = 2048 = 8 x 8 x 8 x 4, each work group has 8 x 8 x 4 = 256 work -// iterms which each computes 256 (in-parallel) length 8 ffts in-register, followed -// by transpose using local memory, followed by 256 length 8 in-register ffts, followed -// by transpose using local memory, followed by 256 length 8 in-register ffts, followed -// by transpose using local memory, followed by 512 length 4 in-register ffts. Again, -// for the last step, each work item computes two length 4 in-register ffts and thus -// 256 work items are needed to compute all 512 ffts. -// For n = 32 = 8 x 4, 4 work items first compute 4 in-register -// lenth 8 ffts, followed by transpose using local memory followed by 8 in-register -// length 4 ffts, where each work item computes two length 4 ffts thus 4 work items -// can compute 8 length 4 ffts. However if work group size of say 64 is choosen, -// each work group can compute 64/ 4 = 16 size 32 ffts (batched transform). -// Users can play with these parameters to figure what gives best performance on -// their particular device i.e. some device have less register space thus using -// smaller base radix can avoid spilling ... some has small local memory thus -// using smaller work group size may be required etc - -static void -getRadixArray(unsigned int n, unsigned int *radixArray, unsigned int *numRadices, unsigned int maxRadix) -{ - if(maxRadix > 1) - { - maxRadix = min(n, maxRadix); - unsigned int cnt = 0; - while(n > maxRadix) - { - radixArray[cnt++] = maxRadix; - n /= maxRadix; - } - radixArray[cnt++] = n; - *numRadices = cnt; - return; - } - - switch(n) - { - case 2: - *numRadices = 1; - radixArray[0] = 2; - break; - - case 4: - *numRadices = 1; - radixArray[0] = 4; - break; - - case 8: - *numRadices = 1; - radixArray[0] = 8; - break; - - case 16: - *numRadices = 2; - radixArray[0] = 8; radixArray[1] = 2; - break; - - case 32: - *numRadices = 2; - radixArray[0] = 8; radixArray[1] = 4; - break; - - case 64: - *numRadices = 2; - radixArray[0] = 8; radixArray[1] = 8; - break; - - case 128: - *numRadices = 3; - radixArray[0] = 8; radixArray[1] = 4; radixArray[2] = 4; - break; - - case 256: - *numRadices = 4; - radixArray[0] = 4; radixArray[1] = 4; radixArray[2] = 4; radixArray[3] = 4; - break; - - case 512: - *numRadices = 3; - radixArray[0] = 8; radixArray[1] = 8; radixArray[2] = 8; - break; - - case 1024: - *numRadices = 3; - radixArray[0] = 16; radixArray[1] = 16; radixArray[2] = 4; - break; - case 2048: - *numRadices = 4; - radixArray[0] = 8; radixArray[1] = 8; radixArray[2] = 8; radixArray[3] = 4; - break; - default: - *numRadices = 0; - return; - } -} - -static void -insertHeader(string &kernelString, string &kernelName, clFFT_DataFormat dataFormat) -{ - if(dataFormat == clFFT_SplitComplexFormat) - kernelString += string("__kernel void ") + kernelName + string("(__global float *in_real, __global float *in_imag, __global float *out_real, __global float *out_imag, int dir, int S)\n"); - else - kernelString += string("__kernel void ") + kernelName + string("(__global float2 *in, __global float2 *out, int dir, int S)\n"); -} - -static void -insertVariables(string &kStream, int maxRadix) -{ - kStream += string(" int i, j, r, indexIn, indexOut, index, tid, bNum, xNum, k, l;\n"); - kStream += string(" int s, ii, jj, offset;\n"); - kStream += string(" float2 w;\n"); - kStream += string(" float ang, angf, ang1;\n"); - kStream += string(" __local float *lMemStore, *lMemLoad;\n"); - kStream += string(" float2 a[") + num2str(maxRadix) + string("];\n"); - kStream += string(" int lId = get_local_id( 0 );\n"); - kStream += string(" int groupId = get_group_id( 0 );\n"); -} - -static void -formattedLoad(string &kernelString, int aIndex, int gIndex, clFFT_DataFormat dataFormat) -{ - if(dataFormat == clFFT_InterleavedComplexFormat) - kernelString += string(" a[") + num2str(aIndex) + string("] = in[") + num2str(gIndex) + string("];\n"); - else - { - kernelString += string(" a[") + num2str(aIndex) + string("].x = in_real[") + num2str(gIndex) + string("];\n"); - kernelString += string(" a[") + num2str(aIndex) + string("].y = in_imag[") + num2str(gIndex) + string("];\n"); - } -} - -static void -formattedStore(string &kernelString, int aIndex, int gIndex, clFFT_DataFormat dataFormat) -{ - if(dataFormat == clFFT_InterleavedComplexFormat) - kernelString += string(" out[") + num2str(gIndex) + string("] = a[") + num2str(aIndex) + string("];\n"); - else - { - kernelString += string(" out_real[") + num2str(gIndex) + string("] = a[") + num2str(aIndex) + string("].x;\n"); - kernelString += string(" out_imag[") + num2str(gIndex) + string("] = a[") + num2str(aIndex) + string("].y;\n"); - } -} - -static int -insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXForm, int numXFormsPerWG, int R0, int mem_coalesce_width, clFFT_DataFormat dataFormat) -{ - int log2NumWorkItemsPerXForm = (int) log2(numWorkItemsPerXForm); - int groupSize = numWorkItemsPerXForm * numXFormsPerWG; - int i, j; - int lMemSize = 0; - - if(numXFormsPerWG > 1) - kernelString += string(" s = S & ") + num2str(numXFormsPerWG - 1) + string(";\n"); - - if(numWorkItemsPerXForm >= mem_coalesce_width) - { - if(numXFormsPerWG > 1) - { - kernelString += string(" ii = lId & ") + num2str(numWorkItemsPerXForm-1) + string(";\n"); - kernelString += string(" jj = lId >> ") + num2str(log2NumWorkItemsPerXForm) + string(";\n"); - kernelString += string(" if( !s || (groupId < get_num_groups(0)-1) || (jj < s) ) {\n"); - kernelString += string(" offset = mad24( mad24(groupId, ") + num2str(numXFormsPerWG) + string(", jj), ") + num2str(N) + string(", ii );\n"); - if(dataFormat == clFFT_InterleavedComplexFormat) - { - kernelString += string(" in += offset;\n"); - kernelString += string(" out += offset;\n"); - } - else - { - kernelString += string(" in_real += offset;\n"); - kernelString += string(" in_imag += offset;\n"); - kernelString += string(" out_real += offset;\n"); - kernelString += string(" out_imag += offset;\n"); - } - for(i = 0; i < R0; i++) - formattedLoad(kernelString, i, i*numWorkItemsPerXForm, dataFormat); - kernelString += string(" }\n"); - } - else - { - kernelString += string(" ii = lId;\n"); - kernelString += string(" jj = 0;\n"); - kernelString += string(" offset = mad24(groupId, ") + num2str(N) + string(", ii);\n"); - if(dataFormat == clFFT_InterleavedComplexFormat) - { - kernelString += string(" in += offset;\n"); - kernelString += string(" out += offset;\n"); - } - else - { - kernelString += string(" in_real += offset;\n"); - kernelString += string(" in_imag += offset;\n"); - kernelString += string(" out_real += offset;\n"); - kernelString += string(" out_imag += offset;\n"); - } - for(i = 0; i < R0; i++) - formattedLoad(kernelString, i, i*numWorkItemsPerXForm, dataFormat); - } - } - else if( N >= mem_coalesce_width ) - { - int numInnerIter = N / mem_coalesce_width; - int numOuterIter = numXFormsPerWG / ( groupSize / mem_coalesce_width ); - - kernelString += string(" ii = lId & ") + num2str(mem_coalesce_width - 1) + string(";\n"); - kernelString += string(" jj = lId >> ") + num2str((int)log2(mem_coalesce_width)) + string(";\n"); - kernelString += string(" lMemStore = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); - kernelString += string(" offset = mad24( groupId, ") + num2str(numXFormsPerWG) + string(", jj);\n"); - kernelString += string(" offset = mad24( offset, ") + num2str(N) + string(", ii );\n"); - if(dataFormat == clFFT_InterleavedComplexFormat) - { - kernelString += string(" in += offset;\n"); - kernelString += string(" out += offset;\n"); - } - else - { - kernelString += string(" in_real += offset;\n"); - kernelString += string(" in_imag += offset;\n"); - kernelString += string(" out_real += offset;\n"); - kernelString += string(" out_imag += offset;\n"); - } - - kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n"); - for(i = 0; i < numOuterIter; i++ ) - { - kernelString += string(" if( jj < s ) {\n"); - for(j = 0; j < numInnerIter; j++ ) - formattedLoad(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * N, dataFormat); - kernelString += string(" }\n"); - if(i != numOuterIter - 1) - kernelString += string(" jj += ") + num2str(groupSize / mem_coalesce_width) + string(";\n"); - } - kernelString += string("}\n "); - kernelString += string("else {\n"); - for(i = 0; i < numOuterIter; i++ ) - { - for(j = 0; j < numInnerIter; j++ ) - formattedLoad(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * N, dataFormat); - } - kernelString += string("}\n"); - - kernelString += string(" ii = lId & ") + num2str(numWorkItemsPerXForm - 1) + string(";\n"); - kernelString += string(" jj = lId >> ") + num2str(log2NumWorkItemsPerXForm) + string(";\n"); - kernelString += string(" lMemLoad = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii);\n"); - - for( i = 0; i < numOuterIter; i++ ) - { - for( j = 0; j < numInnerIter; j++ ) - { - kernelString += string(" lMemStore[") + num2str(j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * (N + numWorkItemsPerXForm )) + string("] = a[") + - num2str(i * numInnerIter + j) + string("].x;\n"); - } - } - kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); - - for( i = 0; i < R0; i++ ) - kernelString += string(" a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n"); - kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); - - for( i = 0; i < numOuterIter; i++ ) - { - for( j = 0; j < numInnerIter; j++ ) - { - kernelString += string(" lMemStore[") + num2str(j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * (N + numWorkItemsPerXForm )) + string("] = a[") + - num2str(i * numInnerIter + j) + string("].y;\n"); - } - } - kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); - - for( i = 0; i < R0; i++ ) - kernelString += string(" a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n"); - kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); - - lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG; - } - else - { - kernelString += string(" offset = mad24( groupId, ") + num2str(N * numXFormsPerWG) + string(", lId );\n"); - if(dataFormat == clFFT_InterleavedComplexFormat) - { - kernelString += string(" in += offset;\n"); - kernelString += string(" out += offset;\n"); - } - else - { - kernelString += string(" in_real += offset;\n"); - kernelString += string(" in_imag += offset;\n"); - kernelString += string(" out_real += offset;\n"); - kernelString += string(" out_imag += offset;\n"); - } - - kernelString += string(" ii = lId & ") + num2str(N-1) + string(";\n"); - kernelString += string(" jj = lId >> ") + num2str((int)log2(N)) + string(";\n"); - kernelString += string(" lMemStore = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); - - kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n"); - for( i = 0; i < R0; i++ ) - { - kernelString += string(" if(jj < s )\n"); - formattedLoad(kernelString, i, i*groupSize, dataFormat); - if(i != R0 - 1) - kernelString += string(" jj += ") + num2str(groupSize / N) + string(";\n"); - } - kernelString += string("}\n"); - kernelString += string("else {\n"); - for( i = 0; i < R0; i++ ) - { - formattedLoad(kernelString, i, i*groupSize, dataFormat); - } - kernelString += string("}\n"); - - if(numWorkItemsPerXForm > 1) - { - kernelString += string(" ii = lId & ") + num2str(numWorkItemsPerXForm - 1) + string(";\n"); - kernelString += string(" jj = lId >> ") + num2str(log2NumWorkItemsPerXForm) + string(";\n"); - kernelString += string(" lMemLoad = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); - } - else - { - kernelString += string(" ii = 0;\n"); - kernelString += string(" jj = lId;\n"); - kernelString += string(" lMemLoad = sMem + mul24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(");\n"); - } - - - for( i = 0; i < R0; i++ ) - kernelString += string(" lMemStore[") + num2str(i * ( groupSize / N ) * ( N + numWorkItemsPerXForm )) + string("] = a[") + num2str(i) + string("].x;\n"); - kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); - - for( i = 0; i < R0; i++ ) - kernelString += string(" a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n"); - kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); - - for( i = 0; i < R0; i++ ) - kernelString += string(" lMemStore[") + num2str(i * ( groupSize / N ) * ( N + numWorkItemsPerXForm )) + string("] = a[") + num2str(i) + string("].y;\n"); - kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); - - for( i = 0; i < R0; i++ ) - kernelString += string(" a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n"); - kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); - - lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG; - } - - return lMemSize; -} - -static int -insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr, int numWorkItemsPerXForm, int numXFormsPerWG, int mem_coalesce_width, clFFT_DataFormat dataFormat) -{ - int groupSize = numWorkItemsPerXForm * numXFormsPerWG; - int i, j, k, ind; - int lMemSize = 0; - int numIter = maxRadix / Nr; - string indent = string(""); - - if( numWorkItemsPerXForm >= mem_coalesce_width ) - { - if(numXFormsPerWG > 1) - { - kernelString += string(" if( !s || (groupId < get_num_groups(0)-1) || (jj < s) ) {\n"); - indent = string(" "); - } - for(i = 0; i < maxRadix; i++) - { - j = i % numIter; - k = i / numIter; - ind = j * Nr + k; - formattedStore(kernelString, ind, i*numWorkItemsPerXForm, dataFormat); - } - if(numXFormsPerWG > 1) - kernelString += string(" }\n"); - } - else if( N >= mem_coalesce_width ) - { - int numInnerIter = N / mem_coalesce_width; - int numOuterIter = numXFormsPerWG / ( groupSize / mem_coalesce_width ); - - kernelString += string(" lMemLoad = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); - kernelString += string(" ii = lId & ") + num2str(mem_coalesce_width - 1) + string(";\n"); - kernelString += string(" jj = lId >> ") + num2str((int)log2(mem_coalesce_width)) + string(";\n"); - kernelString += string(" lMemStore = sMem + mad24( jj,") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); - - for( i = 0; i < maxRadix; i++ ) - { - j = i % numIter; - k = i / numIter; - ind = j * Nr + k; - kernelString += string(" lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].x;\n"); - } - kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); - - for( i = 0; i < numOuterIter; i++ ) - for( j = 0; j < numInnerIter; j++ ) - kernelString += string(" a[") + num2str(i*numInnerIter + j) + string("].x = lMemStore[") + num2str(j*mem_coalesce_width + i*( groupSize / mem_coalesce_width )*(N + numWorkItemsPerXForm)) + string("];\n"); - kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); - - for( i = 0; i < maxRadix; i++ ) - { - j = i % numIter; - k = i / numIter; - ind = j * Nr + k; - kernelString += string(" lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].y;\n"); - } - kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); - - for( i = 0; i < numOuterIter; i++ ) - for( j = 0; j < numInnerIter; j++ ) - kernelString += string(" a[") + num2str(i*numInnerIter + j) + string("].y = lMemStore[") + num2str(j*mem_coalesce_width + i*( groupSize / mem_coalesce_width )*(N + numWorkItemsPerXForm)) + string("];\n"); - kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); - - kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n"); - for(i = 0; i < numOuterIter; i++ ) - { - kernelString += string(" if( jj < s ) {\n"); - for(j = 0; j < numInnerIter; j++ ) - formattedStore(kernelString, i*numInnerIter + j, j*mem_coalesce_width + i*(groupSize/mem_coalesce_width)*N, dataFormat); - kernelString += string(" }\n"); - if(i != numOuterIter - 1) - kernelString += string(" jj += ") + num2str(groupSize / mem_coalesce_width) + string(";\n"); - } - kernelString += string("}\n"); - kernelString += string("else {\n"); - for(i = 0; i < numOuterIter; i++ ) - { - for(j = 0; j < numInnerIter; j++ ) - formattedStore(kernelString, i*numInnerIter + j, j*mem_coalesce_width + i*(groupSize/mem_coalesce_width)*N, dataFormat); - } - kernelString += string("}\n"); - - lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG; - } - else - { - kernelString += string(" lMemLoad = sMem + mad24( jj,") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); - - kernelString += string(" ii = lId & ") + num2str(N - 1) + string(";\n"); - kernelString += string(" jj = lId >> ") + num2str((int) log2(N)) + string(";\n"); - kernelString += string(" lMemStore = sMem + mad24( jj,") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); - - for( i = 0; i < maxRadix; i++ ) - { - j = i % numIter; - k = i / numIter; - ind = j * Nr + k; - kernelString += string(" lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].x;\n"); - } - kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); - - for( i = 0; i < maxRadix; i++ ) - kernelString += string(" a[") + num2str(i) + string("].x = lMemStore[") + num2str(i*( groupSize / N )*( N + numWorkItemsPerXForm )) + string("];\n"); - kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); - - for( i = 0; i < maxRadix; i++ ) - { - j = i % numIter; - k = i / numIter; - ind = j * Nr + k; - kernelString += string(" lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].y;\n"); - } - kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); - - for( i = 0; i < maxRadix; i++ ) - kernelString += string(" a[") + num2str(i) + string("].y = lMemStore[") + num2str(i*( groupSize / N )*( N + numWorkItemsPerXForm )) + string("];\n"); - kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); - - kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n"); - for( i = 0; i < maxRadix; i++ ) - { - kernelString += string(" if(jj < s ) {\n"); - formattedStore(kernelString, i, i*groupSize, dataFormat); - kernelString += string(" }\n"); - if( i != maxRadix - 1) - kernelString += string(" jj +=") + num2str(groupSize / N) + string(";\n"); - } - kernelString += string("}\n"); - kernelString += string("else {\n"); - for( i = 0; i < maxRadix; i++ ) - { - formattedStore(kernelString, i, i*groupSize, dataFormat); - } - kernelString += string("}\n"); - - lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG; - } - - return lMemSize; -} - -static void -insertfftKernel(string &kernelString, int Nr, int numIter) -{ - int i; - for(i = 0; i < numIter; i++) - { - kernelString += string(" fftKernel") + num2str(Nr) + string("(a+") + num2str(i*Nr) + string(", dir);\n"); - } -} - -static void -insertTwiddleKernel(string &kernelString, int Nr, int numIter, int Nprev, int len, int numWorkItemsPerXForm) -{ - int z, k; - int logNPrev = log2(Nprev); - - for(z = 0; z < numIter; z++) - { - if(z == 0) - { - if(Nprev > 1) - kernelString += string(" angf = (float) (ii >> ") + num2str(logNPrev) + string(");\n"); - else - kernelString += string(" angf = (float) ii;\n"); - } - else - { - if(Nprev > 1) - kernelString += string(" angf = (float) ((") + num2str(z*numWorkItemsPerXForm) + string(" + ii) >>") + num2str(logNPrev) + string(");\n"); - else - kernelString += string(" angf = (float) (") + num2str(z*numWorkItemsPerXForm) + string(" + ii);\n"); - } - - for(k = 1; k < Nr; k++) { - int ind = z*Nr + k; - //float fac = (float) (2.0 * M_PI * (double) k / (double) len); - kernelString += string(" ang = dir * ( 2.0f * M_PI * ") + num2str(k) + string(".0f / ") + num2str(len) + string(".0f )") + string(" * angf;\n"); - kernelString += string(" w = (float2)(native_cos(ang), native_sin(ang));\n"); - kernelString += string(" a[") + num2str(ind) + string("] = complexMul(a[") + num2str(ind) + string("], w);\n"); - } - } -} - -static int -getPadding(int numWorkItemsPerXForm, int Nprev, int numWorkItemsReq, int numXFormsPerWG, int Nr, int numBanks, int *offset, int *midPad) -{ - if((numWorkItemsPerXForm <= Nprev) || (Nprev >= numBanks)) - *offset = 0; - else { - int numRowsReq = ((numWorkItemsPerXForm < numBanks) ? numWorkItemsPerXForm : numBanks) / Nprev; - int numColsReq = 1; - if(numRowsReq > Nr) - numColsReq = numRowsReq / Nr; - numColsReq = Nprev * numColsReq; - *offset = numColsReq; - } - - if(numWorkItemsPerXForm >= numBanks || numXFormsPerWG == 1) - *midPad = 0; - else { - int bankNum = ( (numWorkItemsReq + *offset) * Nr ) & (numBanks - 1); - if( bankNum >= numWorkItemsPerXForm ) - *midPad = 0; - else - *midPad = numWorkItemsPerXForm - bankNum; - } - - int lMemSize = ( numWorkItemsReq + *offset) * Nr * numXFormsPerWG + *midPad * (numXFormsPerWG - 1); - return lMemSize; -} - - -static void -insertLocalStores(string &kernelString, int numIter, int Nr, int numWorkItemsPerXForm, int numWorkItemsReq, int offset, string &comp) -{ - int z, k; - - for(z = 0; z < numIter; z++) { - for(k = 0; k < Nr; k++) { - int index = k*(numWorkItemsReq + offset) + z*numWorkItemsPerXForm; - kernelString += string(" lMemStore[") + num2str(index) + string("] = a[") + num2str(z*Nr + k) + string("].") + comp + string(";\n"); - } - } - kernelString += string(" barrier(CLK_LOCAL_MEM_FENCE);\n"); -} - -static void -insertLocalLoads(string &kernelString, int n, int Nr, int Nrn, int Nprev, int Ncurr, int numWorkItemsPerXForm, int numWorkItemsReq, int offset, string &comp) -{ - int numWorkItemsReqN = n / Nrn; - int interBlockHNum = max( Nprev / numWorkItemsPerXForm, 1 ); - int interBlockHStride = numWorkItemsPerXForm; - int vertWidth = max(numWorkItemsPerXForm / Nprev, 1); - vertWidth = min( vertWidth, Nr); - int vertNum = Nr / vertWidth; - int vertStride = ( n / Nr + offset ) * vertWidth; - int iter = max( numWorkItemsReqN / numWorkItemsPerXForm, 1); - int intraBlockHStride = (numWorkItemsPerXForm / (Nprev*Nr)) > 1 ? (numWorkItemsPerXForm / (Nprev*Nr)) : 1; - intraBlockHStride *= Nprev; - - int stride = numWorkItemsReq / Nrn; - int i; - for(i = 0; i < iter; i++) { - int ii = i / (interBlockHNum * vertNum); - int zz = i % (interBlockHNum * vertNum); - int jj = zz % interBlockHNum; - int kk = zz / interBlockHNum; - int z; - for(z = 0; z < Nrn; z++) { - int st = kk * vertStride + jj * interBlockHStride + ii * intraBlockHStride + z * stride; - kernelString += string(" a[") + num2str(i*Nrn + z) + string("].") + comp + string(" = lMemLoad[") + num2str(st) + string("];\n"); - } - } - kernelString += string(" barrier(CLK_LOCAL_MEM_FENCE);\n"); -} - -static void -insertLocalLoadIndexArithmatic(string &kernelString, int Nprev, int Nr, int numWorkItemsReq, int numWorkItemsPerXForm, int numXFormsPerWG, int offset, int midPad) -{ - int Ncurr = Nprev * Nr; - int logNcurr = log2(Ncurr); - int logNprev = log2(Nprev); - int incr = (numWorkItemsReq + offset) * Nr + midPad; - - if(Ncurr < numWorkItemsPerXForm) - { - if(Nprev == 1) - kernelString += string(" j = ii & ") + num2str(Ncurr - 1) + string(";\n"); - else - kernelString += string(" j = (ii & ") + num2str(Ncurr - 1) + string(") >> ") + num2str(logNprev) + string(";\n"); - - if(Nprev == 1) - kernelString += string(" i = ii >> ") + num2str(logNcurr) + string(";\n"); - else - kernelString += string(" i = mad24(ii >> ") + num2str(logNcurr) + string(", ") + num2str(Nprev) + string(", ii & ") + num2str(Nprev - 1) + string(");\n"); - } - else - { - if(Nprev == 1) - kernelString += string(" j = ii;\n"); - else - kernelString += string(" j = ii >> ") + num2str(logNprev) + string(";\n"); - if(Nprev == 1) - kernelString += string(" i = 0;\n"); - else - kernelString += string(" i = ii & ") + num2str(Nprev - 1) + string(";\n"); - } - - if(numXFormsPerWG > 1) - kernelString += string(" i = mad24(jj, ") + num2str(incr) + string(", i);\n"); - - kernelString += string(" lMemLoad = sMem + mad24(j, ") + num2str(numWorkItemsReq + offset) + string(", i);\n"); -} - -static void -insertLocalStoreIndexArithmatic(string &kernelString, int numWorkItemsReq, int numXFormsPerWG, int Nr, int offset, int midPad) -{ - if(numXFormsPerWG == 1) { - kernelString += string(" lMemStore = sMem + ii;\n"); - } - else { - kernelString += string(" lMemStore = sMem + mad24(jj, ") + num2str((numWorkItemsReq + offset)*Nr + midPad) + string(", ii);\n"); - } -} - - -static void -createLocalMemfftKernelString(cl_fft_plan *plan) -{ - unsigned int radixArray[10]; - unsigned int numRadix; - - unsigned int n = plan->n.x; - - assert(n <= plan->max_work_item_per_workgroup * plan->max_radix && "signal lenght too big for local mem fft\n"); - - getRadixArray(n, radixArray, &numRadix, 0); - assert(numRadix > 0 && "no radix array supplied\n"); - - if(n/radixArray[0] > plan->max_work_item_per_workgroup) - getRadixArray(n, radixArray, &numRadix, plan->max_radix); - - assert(radixArray[0] <= plan->max_radix && "max radix choosen is greater than allowed\n"); - assert(n/radixArray[0] <= plan->max_work_item_per_workgroup && "required work items per xform greater than maximum work items allowed per work group for local mem fft\n"); - - unsigned int tmpLen = 1; - unsigned int i; - for(i = 0; i < numRadix; i++) - { - assert( radixArray[i] && !( (radixArray[i] - 1) & radixArray[i] ) ); - tmpLen *= radixArray[i]; - } - assert(tmpLen == n && "product of radices choosen doesnt match the length of signal\n"); - - int offset, midPad; - string localString(""), kernelName(""); - - clFFT_DataFormat dataFormat = plan->format; - string *kernelString = plan->kernel_string; - - - cl_fft_kernel_info **kInfo = &plan->kernel_info; - int kCount = 0; - - while(*kInfo) - { - kInfo = &(*kInfo)->next; - kCount++; - } - - kernelName = string("fft") + num2str(kCount); - - *kInfo = (cl_fft_kernel_info *) malloc(sizeof(cl_fft_kernel_info)); - (*kInfo)->kernel = 0; - (*kInfo)->lmem_size = 0; - (*kInfo)->num_workgroups = 0; - (*kInfo)->num_workitems_per_workgroup = 0; - (*kInfo)->dir = cl_fft_kernel_x; - (*kInfo)->in_place_possible = 1; - (*kInfo)->next = NULL; - (*kInfo)->kernel_name = (char *) malloc(sizeof(char)*(kernelName.size()+1)); - strcpy((*kInfo)->kernel_name, kernelName.c_str()); - - unsigned int numWorkItemsPerXForm = n / radixArray[0]; - unsigned int numWorkItemsPerWG = numWorkItemsPerXForm <= 64 ? 64 : numWorkItemsPerXForm; - assert(numWorkItemsPerWG <= plan->max_work_item_per_workgroup); - int numXFormsPerWG = numWorkItemsPerWG / numWorkItemsPerXForm; - (*kInfo)->num_workgroups = 1; - (*kInfo)->num_xforms_per_workgroup = numXFormsPerWG; - (*kInfo)->num_workitems_per_workgroup = numWorkItemsPerWG; - - unsigned int *N = radixArray; - unsigned int maxRadix = N[0]; - unsigned int lMemSize = 0; - - insertVariables(localString, maxRadix); - - lMemSize = insertGlobalLoadsAndTranspose(localString, n, numWorkItemsPerXForm, numXFormsPerWG, maxRadix, plan->min_mem_coalesce_width, dataFormat); - (*kInfo)->lmem_size = (lMemSize > (*kInfo)->lmem_size) ? lMemSize : (*kInfo)->lmem_size; - - string xcomp = string("x"); - string ycomp = string("y"); - - unsigned int Nprev = 1; - unsigned int len = n; - unsigned int r; - for(r = 0; r < numRadix; r++) - { - int numIter = N[0] / N[r]; - int numWorkItemsReq = n / N[r]; - int Ncurr = Nprev * N[r]; - insertfftKernel(localString, N[r], numIter); - - if(r < (numRadix - 1)) { - insertTwiddleKernel(localString, N[r], numIter, Nprev, len, numWorkItemsPerXForm); - lMemSize = getPadding(numWorkItemsPerXForm, Nprev, numWorkItemsReq, numXFormsPerWG, N[r], plan->num_local_mem_banks, &offset, &midPad); - (*kInfo)->lmem_size = (lMemSize > (*kInfo)->lmem_size) ? lMemSize : (*kInfo)->lmem_size; - insertLocalStoreIndexArithmatic(localString, numWorkItemsReq, numXFormsPerWG, N[r], offset, midPad); - insertLocalLoadIndexArithmatic(localString, Nprev, N[r], numWorkItemsReq, numWorkItemsPerXForm, numXFormsPerWG, offset, midPad); - insertLocalStores(localString, numIter, N[r], numWorkItemsPerXForm, numWorkItemsReq, offset, xcomp); - insertLocalLoads(localString, n, N[r], N[r+1], Nprev, Ncurr, numWorkItemsPerXForm, numWorkItemsReq, offset, xcomp); - insertLocalStores(localString, numIter, N[r], numWorkItemsPerXForm, numWorkItemsReq, offset, ycomp); - insertLocalLoads(localString, n, N[r], N[r+1], Nprev, Ncurr, numWorkItemsPerXForm, numWorkItemsReq, offset, ycomp); - Nprev = Ncurr; - len = len / N[r]; - } - } - - lMemSize = insertGlobalStoresAndTranspose(localString, n, maxRadix, N[numRadix - 1], numWorkItemsPerXForm, numXFormsPerWG, plan->min_mem_coalesce_width, dataFormat); - (*kInfo)->lmem_size = (lMemSize > (*kInfo)->lmem_size) ? lMemSize : (*kInfo)->lmem_size; - - insertHeader(*kernelString, kernelName, dataFormat); - *kernelString += string("{\n"); - if((*kInfo)->lmem_size) - *kernelString += string(" __local float sMem[") + num2str((*kInfo)->lmem_size) + string("];\n"); - *kernelString += localString; - *kernelString += string("}\n"); -} - -// For n larger than what can be computed using local memory fft, global transposes -// multiple kernel launces is needed. For these sizes, n can be decomposed using -// much larger base radices i.e. say n = 262144 = 128 x 64 x 32. Thus three kernel -// launches will be needed, first computing 64 x 32, length 128 ffts, second computing -// 128 x 32 length 64 ffts, and finally a kernel computing 128 x 64 length 32 ffts. -// Each of these base radices can futher be divided into factors so that each of these -// base ffts can be computed within one kernel launch using in-register ffts and local -// memory transposes i.e for the first kernel above which computes 64 x 32 ffts on length -// 128, 128 can be decomposed into 128 = 16 x 8 i.e. 8 work items can compute 8 length -// 16 ffts followed by transpose using local memory followed by each of these eight -// work items computing 2 length 8 ffts thus computing 16 length 8 ffts in total. This -// means only 8 work items are needed for computing one length 128 fft. If we choose -// work group size of say 64, we can compute 64/8 = 8 length 128 ffts within one -// work group. Since we need to compute 64 x 32 length 128 ffts in first kernel, this -// means we need to launch 64 x 32 / 8 = 256 work groups with 64 work items in each -// work group where each work group is computing 8 length 128 ffts where each length -// 128 fft is computed by 8 work items. Same logic can be applied to other two kernels -// in this example. Users can play with difference base radices and difference -// decompositions of base radices to generates different kernels and see which gives -// best performance. Following function is just fixed to use 128 as base radix - -void -getGlobalRadixInfo(int n, int *radix, int *R1, int *R2, int *numRadices) -{ - int baseRadix = min(n, 128); - - int numR = 0; - int N = n; - while(N > baseRadix) - { - N /= baseRadix; - numR++; - } - - for(int i = 0; i < numR; i++) - radix[i] = baseRadix; - - radix[numR] = N; - numR++; - *numRadices = numR; - - for(int i = 0; i < numR; i++) - { - int B = radix[i]; - if(B <= 8) - { - R1[i] = B; - R2[i] = 1; - continue; - } - - int r1 = 2; - int r2 = B / r1; - while(r2 > r1) - { - r1 *=2; - r2 = B / r1; - } - R1[i] = r1; - R2[i] = r2; - } -} - -static void -createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir dir, int vertBS) -{ - int i, j, k, t; - int radixArr[10] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; - int R1Arr[10] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; - int R2Arr[10] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; - int radix, R1, R2; - int numRadices; - - int maxThreadsPerBlock = plan->max_work_item_per_workgroup; - int maxArrayLen = plan->max_radix; - int batchSize = plan->min_mem_coalesce_width; - clFFT_DataFormat dataFormat = plan->format; - int vertical = (dir == cl_fft_kernel_x) ? 0 : 1; - - getGlobalRadixInfo(n, radixArr, R1Arr, R2Arr, &numRadices); - - int numPasses = numRadices; - - string localString(""), kernelName(""); - string *kernelString = plan->kernel_string; - cl_fft_kernel_info **kInfo = &plan->kernel_info; - int kCount = 0; - - while(*kInfo) - { - kInfo = &(*kInfo)->next; - kCount++; - } - - int N = n; - int m = (int)log2(n); - int Rinit = vertical ? BS : 1; - batchSize = vertical ? min(BS, batchSize) : batchSize; - int passNum; - - for(passNum = 0; passNum < numPasses; passNum++) - { - - localString.clear(); - kernelName.clear(); - - radix = radixArr[passNum]; - R1 = R1Arr[passNum]; - R2 = R2Arr[passNum]; - - int strideI = Rinit; - for(i = 0; i < numPasses; i++) - if(i != passNum) - strideI *= radixArr[i]; - - int strideO = Rinit; - for(i = 0; i < passNum; i++) - strideO *= radixArr[i]; - - int threadsPerXForm = R2; - batchSize = R2 == 1 ? plan->max_work_item_per_workgroup : batchSize; - batchSize = min(batchSize, strideI); - int threadsPerBlock = batchSize * threadsPerXForm; - threadsPerBlock = min(threadsPerBlock, maxThreadsPerBlock); - batchSize = threadsPerBlock / threadsPerXForm; - assert(R2 <= R1); - assert(R1*R2 == radix); - assert(R1 <= maxArrayLen); - assert(threadsPerBlock <= maxThreadsPerBlock); - - int numIter = R1 / R2; - int gInInc = threadsPerBlock / batchSize; - - - int lgStrideO = log2(strideO); - int numBlocksPerXForm = strideI / batchSize; - int numBlocks = numBlocksPerXForm; - if(!vertical) - numBlocks *= BS; - else - numBlocks *= vertBS; - - kernelName = string("fft") + num2str(kCount); - *kInfo = (cl_fft_kernel_info *) malloc(sizeof(cl_fft_kernel_info)); - (*kInfo)->kernel = 0; - if(R2 == 1) - (*kInfo)->lmem_size = 0; - else - { - if(strideO == 1) - (*kInfo)->lmem_size = (radix + 1)*batchSize; - else - (*kInfo)->lmem_size = threadsPerBlock*R1; - } - (*kInfo)->num_workgroups = numBlocks; - (*kInfo)->num_xforms_per_workgroup = 1; - (*kInfo)->num_workitems_per_workgroup = threadsPerBlock; - (*kInfo)->dir = dir; - if( (passNum == (numPasses - 1)) && (numPasses & 1) ) - (*kInfo)->in_place_possible = 1; - else - (*kInfo)->in_place_possible = 0; - (*kInfo)->next = NULL; - (*kInfo)->kernel_name = (char *) malloc(sizeof(char)*(kernelName.size()+1)); - strcpy((*kInfo)->kernel_name, kernelName.c_str()); - - insertVariables(localString, R1); - - if(vertical) - { - localString += string("xNum = groupId >> ") + num2str((int)log2(numBlocksPerXForm)) + string(";\n"); - localString += string("groupId = groupId & ") + num2str(numBlocksPerXForm - 1) + string(";\n"); - localString += string("indexIn = mad24(groupId, ") + num2str(batchSize) + string(", xNum << ") + num2str((int)log2(n*BS)) + string(");\n"); - localString += string("tid = mul24(groupId, ") + num2str(batchSize) + string(");\n"); - localString += string("i = tid >> ") + num2str(lgStrideO) + string(";\n"); - localString += string("j = tid & ") + num2str(strideO - 1) + string(";\n"); - int stride = radix*Rinit; - for(i = 0; i < passNum; i++) - stride *= radixArr[i]; - localString += string("indexOut = mad24(i, ") + num2str(stride) + string(", j + ") + string("(xNum << ") + num2str((int) log2(n*BS)) + string("));\n"); - localString += string("bNum = groupId;\n"); - } - else - { - int lgNumBlocksPerXForm = log2(numBlocksPerXForm); - localString += string("bNum = groupId & ") + num2str(numBlocksPerXForm - 1) + string(";\n"); - localString += string("xNum = groupId >> ") + num2str(lgNumBlocksPerXForm) + string(";\n"); - localString += string("indexIn = mul24(bNum, ") + num2str(batchSize) + string(");\n"); - localString += string("tid = indexIn;\n"); - localString += string("i = tid >> ") + num2str(lgStrideO) + string(";\n"); - localString += string("j = tid & ") + num2str(strideO - 1) + string(";\n"); - int stride = radix*Rinit; - for(i = 0; i < passNum; i++) - stride *= radixArr[i]; - localString += string("indexOut = mad24(i, ") + num2str(stride) + string(", j);\n"); - localString += string("indexIn += (xNum << ") + num2str(m) + string(");\n"); - localString += string("indexOut += (xNum << ") + num2str(m) + string(");\n"); - } - - // Load Data - int lgBatchSize = log2(batchSize); - localString += string("tid = lId;\n"); - localString += string("i = tid & ") + num2str(batchSize - 1) + string(";\n"); - localString += string("j = tid >> ") + num2str(lgBatchSize) + string(";\n"); - localString += string("indexIn += mad24(j, ") + num2str(strideI) + string(", i);\n"); - - if(dataFormat == clFFT_SplitComplexFormat) - { - localString += string("in_real += indexIn;\n"); - localString += string("in_imag += indexIn;\n"); - for(j = 0; j < R1; j++) - localString += string("a[") + num2str(j) + string("].x = in_real[") + num2str(j*gInInc*strideI) + string("];\n"); - for(j = 0; j < R1; j++) - localString += string("a[") + num2str(j) + string("].y = in_imag[") + num2str(j*gInInc*strideI) + string("];\n"); - } - else - { - localString += string("in += indexIn;\n"); - for(j = 0; j < R1; j++) - localString += string("a[") + num2str(j) + string("] = in[") + num2str(j*gInInc*strideI) + string("];\n"); - } - - localString += string("fftKernel") + num2str(R1) + string("(a, dir);\n"); - - if(R2 > 1) - { - // twiddle - for(k = 1; k < R1; k++) - { - localString += string("ang = dir*(2.0f*M_PI*") + num2str(k) + string("/") + num2str(radix) + string(")*j;\n"); - localString += string("w = (float2)(native_cos(ang), native_sin(ang));\n"); - localString += string("a[") + num2str(k) + string("] = complexMul(a[") + num2str(k) + string("], w);\n"); - } - - // shuffle - numIter = R1 / R2; - localString += string("indexIn = mad24(j, ") + num2str(threadsPerBlock*numIter) + string(", i);\n"); - localString += string("lMemStore = sMem + tid;\n"); - localString += string("lMemLoad = sMem + indexIn;\n"); - for(k = 0; k < R1; k++) - localString += string("lMemStore[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].x;\n"); - localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); - for(k = 0; k < numIter; k++) - for(t = 0; t < R2; t++) - localString += string("a[") + num2str(k*R2+t) + string("].x = lMemLoad[") + num2str(t*batchSize + k*threadsPerBlock) + string("];\n"); - localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); - for(k = 0; k < R1; k++) - localString += string("lMemStore[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].y;\n"); - localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); - for(k = 0; k < numIter; k++) - for(t = 0; t < R2; t++) - localString += string("a[") + num2str(k*R2+t) + string("].y = lMemLoad[") + num2str(t*batchSize + k*threadsPerBlock) + string("];\n"); - localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); - - for(j = 0; j < numIter; j++) - localString += string("fftKernel") + num2str(R2) + string("(a + ") + num2str(j*R2) + string(", dir);\n"); - } - - // twiddle - if(passNum < (numPasses - 1)) - { - localString += string("l = ((bNum << ") + num2str(lgBatchSize) + string(") + i) >> ") + num2str(lgStrideO) + string(";\n"); - localString += string("k = j << ") + num2str((int)log2(R1/R2)) + string(";\n"); - localString += string("ang1 = dir*(2.0f*M_PI/") + num2str(N) + string(")*l;\n"); - for(t = 0; t < R1; t++) - { - localString += string("ang = ang1*(k + ") + num2str((t%R2)*R1 + (t/R2)) + string(");\n"); - localString += string("w = (float2)(native_cos(ang), native_sin(ang));\n"); - localString += string("a[") + num2str(t) + string("] = complexMul(a[") + num2str(t) + string("], w);\n"); - } - } - - // Store Data - if(strideO == 1) - { - - localString += string("lMemStore = sMem + mad24(i, ") + num2str(radix + 1) + string(", j << ") + num2str((int)log2(R1/R2)) + string(");\n"); - localString += string("lMemLoad = sMem + mad24(tid >> ") + num2str((int)log2(radix)) + string(", ") + num2str(radix+1) + string(", tid & ") + num2str(radix-1) + string(");\n"); - - for(int i = 0; i < R1/R2; i++) - for(int j = 0; j < R2; j++) - localString += string("lMemStore[ ") + num2str(i + j*R1) + string("] = a[") + num2str(i*R2+j) + string("].x;\n"); - localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); - if(threadsPerBlock >= radix) - { - for(int i = 0; i < R1; i++) - localString += string("a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i*(radix+1)*(threadsPerBlock/radix)) + string("];\n"); - } - else - { - int innerIter = radix/threadsPerBlock; - int outerIter = R1/innerIter; - for(int i = 0; i < outerIter; i++) - for(int j = 0; j < innerIter; j++) - localString += string("a[") + num2str(i*innerIter+j) + string("].x = lMemLoad[") + num2str(j*threadsPerBlock + i*(radix+1)) + string("];\n"); - } - localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); - - for(int i = 0; i < R1/R2; i++) - for(int j = 0; j < R2; j++) - localString += string("lMemStore[ ") + num2str(i + j*R1) + string("] = a[") + num2str(i*R2+j) + string("].y;\n"); - localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); - if(threadsPerBlock >= radix) - { - for(int i = 0; i < R1; i++) - localString += string("a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i*(radix+1)*(threadsPerBlock/radix)) + string("];\n"); - } - else - { - int innerIter = radix/threadsPerBlock; - int outerIter = R1/innerIter; - for(int i = 0; i < outerIter; i++) - for(int j = 0; j < innerIter; j++) - localString += string("a[") + num2str(i*innerIter+j) + string("].y = lMemLoad[") + num2str(j*threadsPerBlock + i*(radix+1)) + string("];\n"); - } - localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); - - localString += string("indexOut += tid;\n"); - if(dataFormat == clFFT_SplitComplexFormat) { - localString += string("out_real += indexOut;\n"); - localString += string("out_imag += indexOut;\n"); - for(k = 0; k < R1; k++) - localString += string("out_real[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].x;\n"); - for(k = 0; k < R1; k++) - localString += string("out_imag[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].y;\n"); - } - else { - localString += string("out += indexOut;\n"); - for(k = 0; k < R1; k++) - localString += string("out[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("];\n"); - } - - } - else - { - localString += string("indexOut += mad24(j, ") + num2str(numIter*strideO) + string(", i);\n"); - if(dataFormat == clFFT_SplitComplexFormat) { - localString += string("out_real += indexOut;\n"); - localString += string("out_imag += indexOut;\n"); - for(k = 0; k < R1; k++) - localString += string("out_real[") + num2str(((k%R2)*R1 + (k/R2))*strideO) + string("] = a[") + num2str(k) + string("].x;\n"); - for(k = 0; k < R1; k++) - localString += string("out_imag[") + num2str(((k%R2)*R1 + (k/R2))*strideO) + string("] = a[") + num2str(k) + string("].y;\n"); - } - else { - localString += string("out += indexOut;\n"); - for(k = 0; k < R1; k++) - localString += string("out[") + num2str(((k%R2)*R1 + (k/R2))*strideO) + string("] = a[") + num2str(k) + string("];\n"); - } - } - - insertHeader(*kernelString, kernelName, dataFormat); - *kernelString += string("{\n"); - if((*kInfo)->lmem_size) - *kernelString += string(" __local float sMem[") + num2str((*kInfo)->lmem_size) + string("];\n"); - *kernelString += localString; - *kernelString += string("}\n"); - - N /= radix; - kInfo = &(*kInfo)->next; - kCount++; - } -} - -void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir) -{ - unsigned int radixArray[10]; - unsigned int numRadix; - - switch(dir) - { - case cl_fft_kernel_x: - if(plan->n.x > plan->max_localmem_fft_size) - { - createGlobalFFTKernelString(plan, plan->n.x, 1, cl_fft_kernel_x, 1); - } - else if(plan->n.x > 1) - { - getRadixArray(plan->n.x, radixArray, &numRadix, 0); - if(plan->n.x / radixArray[0] <= plan->max_work_item_per_workgroup) - { - createLocalMemfftKernelString(plan); - } - else - { - getRadixArray(plan->n.x, radixArray, &numRadix, plan->max_radix); - if(plan->n.x / radixArray[0] <= plan->max_work_item_per_workgroup) - createLocalMemfftKernelString(plan); - else - createGlobalFFTKernelString(plan, plan->n.x, 1, cl_fft_kernel_x, 1); - } - } - break; - - case cl_fft_kernel_y: - if(plan->n.y > 1) - createGlobalFFTKernelString(plan, plan->n.y, plan->n.x, cl_fft_kernel_y, 1); - break; - - case cl_fft_kernel_z: - if(plan->n.z > 1) - createGlobalFFTKernelString(plan, plan->n.z, plan->n.x*plan->n.y, cl_fft_kernel_z, 1); - default: - return; - } -} + +// +// File: fft_kernelstring.cpp +// +// Version: <1.0> +// +// Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple") +// in consideration of your agreement to the following terms, and your use, +// installation, modification or redistribution of this Apple software +// constitutes acceptance of these terms. If you do not agree with these +// terms, please do not use, install, modify or redistribute this Apple +// software. +// +// In consideration of your agreement to abide by the following terms, and +// subject to these terms, Apple grants you a personal, non - exclusive +// license, under Apple's copyrights in this original Apple software ( the +// "Apple Software" ), to use, reproduce, modify and redistribute the Apple +// Software, with or without modifications, in source and / or binary forms; +// provided that if you redistribute the Apple Software in its entirety and +// without modifications, you must retain this notice and the following text +// and disclaimers in all such redistributions of the Apple Software. Neither +// the name, trademarks, service marks or logos of Apple Inc. may be used to +// endorse or promote products derived from the Apple Software without specific +// prior written permission from Apple. Except as expressly stated in this +// notice, no other rights or licenses, express or implied, are granted by +// Apple herein, including but not limited to any patent rights that may be +// infringed by your derivative works or by other works in which the Apple +// Software may be incorporated. +// +// The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO +// WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED +// WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION +// ALONE OR IN COMBINATION WITH YOUR PRODUCTS. +// +// IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR +// CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION +// AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER +// UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR +// OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Copyright ( C ) 2008 Apple Inc. All Rights Reserved. +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include <iostream> +#include <sstream> +#include <string> +#include <assert.h> +#include "fft_internal.h" +#include "clFFT.h" + +using namespace std; + +#define max(A,B) ((A) > (B) ? (A) : (B)) +#define min(A,B) ((A) < (B) ? (A) : (B)) + +static string +num2str(int num) +{ + char temp[200]; + sprintf(temp, "%d", num); + return string(temp); +} + +// For any n, this function decomposes n into factors for loacal memory tranpose +// based fft. Factors (radices) are sorted such that the first one (radixArray[0]) +// is the largest. This base radix determines the number of registers used by each +// work item and product of remaining radices determine the size of work group needed. +// To make things concrete with and example, suppose n = 1024. It is decomposed into +// 1024 = 16 x 16 x 4. Hence kernel uses float2 a[16], for local in-register fft and +// needs 16 x 4 = 64 work items per work group. So kernel first performance 64 length +// 16 ffts (64 work items working in parallel) following by transpose using local +// memory followed by again 64 length 16 ffts followed by transpose using local memory +// followed by 256 length 4 ffts. For the last step since with size of work group is +// 64 and each work item can array for 16 values, 64 work items can compute 256 length +// 4 ffts by each work item computing 4 length 4 ffts. +// Similarly for n = 2048 = 8 x 8 x 8 x 4, each work group has 8 x 8 x 4 = 256 work +// iterms which each computes 256 (in-parallel) length 8 ffts in-register, followed +// by transpose using local memory, followed by 256 length 8 in-register ffts, followed +// by transpose using local memory, followed by 256 length 8 in-register ffts, followed +// by transpose using local memory, followed by 512 length 4 in-register ffts. Again, +// for the last step, each work item computes two length 4 in-register ffts and thus +// 256 work items are needed to compute all 512 ffts. +// For n = 32 = 8 x 4, 4 work items first compute 4 in-register +// lenth 8 ffts, followed by transpose using local memory followed by 8 in-register +// length 4 ffts, where each work item computes two length 4 ffts thus 4 work items +// can compute 8 length 4 ffts. However if work group size of say 64 is choosen, +// each work group can compute 64/ 4 = 16 size 32 ffts (batched transform). +// Users can play with these parameters to figure what gives best performance on +// their particular device i.e. some device have less register space thus using +// smaller base radix can avoid spilling ... some has small local memory thus +// using smaller work group size may be required etc + +static void +getRadixArray(unsigned int n, unsigned int *radixArray, unsigned int *numRadices, unsigned int maxRadix) +{ + if(maxRadix > 1) + { + maxRadix = min(n, maxRadix); + unsigned int cnt = 0; + while(n > maxRadix) + { + radixArray[cnt++] = maxRadix; + n /= maxRadix; + } + radixArray[cnt++] = n; + *numRadices = cnt; + return; + } + + switch(n) + { + case 2: + *numRadices = 1; + radixArray[0] = 2; + break; + + case 4: + *numRadices = 1; + radixArray[0] = 4; + break; + + case 8: + *numRadices = 1; + radixArray[0] = 8; + break; + + case 16: + *numRadices = 2; + radixArray[0] = 8; radixArray[1] = 2; + break; + + case 32: + *numRadices = 2; + radixArray[0] = 8; radixArray[1] = 4; + break; + + case 64: + *numRadices = 2; + radixArray[0] = 8; radixArray[1] = 8; + break; + + case 128: + *numRadices = 3; + radixArray[0] = 8; radixArray[1] = 4; radixArray[2] = 4; + break; + + case 256: + *numRadices = 4; + radixArray[0] = 4; radixArray[1] = 4; radixArray[2] = 4; radixArray[3] = 4; + break; + + case 512: + *numRadices = 3; + radixArray[0] = 8; radixArray[1] = 8; radixArray[2] = 8; + break; + + case 1024: + *numRadices = 3; + radixArray[0] = 16; radixArray[1] = 16; radixArray[2] = 4; + break; + case 2048: + *numRadices = 4; + radixArray[0] = 8; radixArray[1] = 8; radixArray[2] = 8; radixArray[3] = 4; + break; + default: + *numRadices = 0; + return; + } +} + +static void +insertHeader(string &kernelString, string &kernelName, clFFT_DataFormat dataFormat) +{ + if(dataFormat == clFFT_SplitComplexFormat) + kernelString += string("__kernel void ") + kernelName + string("(__global float *in_real, __global float *in_imag, __global float *out_real, __global float *out_imag, int dir, int S)\n"); + else + kernelString += string("__kernel void ") + kernelName + string("(__global float2 *in, __global float2 *out, int dir, int S)\n"); +} + +static void +insertVariables(string &kStream, int maxRadix) +{ + kStream += string(" int i, j, r, indexIn, indexOut, index, tid, bNum, xNum, k, l;\n"); + kStream += string(" int s, ii, jj, offset;\n"); + kStream += string(" float2 w;\n"); + kStream += string(" float ang, angf, ang1;\n"); + kStream += string(" __local float *lMemStore, *lMemLoad;\n"); + kStream += string(" float2 a[") + num2str(maxRadix) + string("];\n"); + kStream += string(" int lId = get_local_id( 0 );\n"); + kStream += string(" int groupId = get_group_id( 0 );\n"); +} + +static void +formattedLoad(string &kernelString, int aIndex, int gIndex, clFFT_DataFormat dataFormat) +{ + if(dataFormat == clFFT_InterleavedComplexFormat) + kernelString += string(" a[") + num2str(aIndex) + string("] = in[") + num2str(gIndex) + string("];\n"); + else + { + kernelString += string(" a[") + num2str(aIndex) + string("].x = in_real[") + num2str(gIndex) + string("];\n"); + kernelString += string(" a[") + num2str(aIndex) + string("].y = in_imag[") + num2str(gIndex) + string("];\n"); + } +} + +static void +formattedStore(string &kernelString, int aIndex, int gIndex, clFFT_DataFormat dataFormat) +{ + if(dataFormat == clFFT_InterleavedComplexFormat) + kernelString += string(" out[") + num2str(gIndex) + string("] = a[") + num2str(aIndex) + string("];\n"); + else + { + kernelString += string(" out_real[") + num2str(gIndex) + string("] = a[") + num2str(aIndex) + string("].x;\n"); + kernelString += string(" out_imag[") + num2str(gIndex) + string("] = a[") + num2str(aIndex) + string("].y;\n"); + } +} + +static int +insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXForm, int numXFormsPerWG, int R0, int mem_coalesce_width, clFFT_DataFormat dataFormat) +{ + int log2NumWorkItemsPerXForm = (int) log2(numWorkItemsPerXForm); + int groupSize = numWorkItemsPerXForm * numXFormsPerWG; + int i, j; + int lMemSize = 0; + + if(numXFormsPerWG > 1) + kernelString += string(" s = S & ") + num2str(numXFormsPerWG - 1) + string(";\n"); + + if(numWorkItemsPerXForm >= mem_coalesce_width) + { + if(numXFormsPerWG > 1) + { + kernelString += string(" ii = lId & ") + num2str(numWorkItemsPerXForm-1) + string(";\n"); + kernelString += string(" jj = lId >> ") + num2str(log2NumWorkItemsPerXForm) + string(";\n"); + kernelString += string(" if( !s || (groupId < get_num_groups(0)-1) || (jj < s) ) {\n"); + kernelString += string(" offset = mad24( mad24(groupId, ") + num2str(numXFormsPerWG) + string(", jj), ") + num2str(N) + string(", ii );\n"); + if(dataFormat == clFFT_InterleavedComplexFormat) + { + kernelString += string(" in += offset;\n"); + kernelString += string(" out += offset;\n"); + } + else + { + kernelString += string(" in_real += offset;\n"); + kernelString += string(" in_imag += offset;\n"); + kernelString += string(" out_real += offset;\n"); + kernelString += string(" out_imag += offset;\n"); + } + for(i = 0; i < R0; i++) + formattedLoad(kernelString, i, i*numWorkItemsPerXForm, dataFormat); + kernelString += string(" }\n"); + } + else + { + kernelString += string(" ii = lId;\n"); + kernelString += string(" jj = 0;\n"); + kernelString += string(" offset = mad24(groupId, ") + num2str(N) + string(", ii);\n"); + if(dataFormat == clFFT_InterleavedComplexFormat) + { + kernelString += string(" in += offset;\n"); + kernelString += string(" out += offset;\n"); + } + else + { + kernelString += string(" in_real += offset;\n"); + kernelString += string(" in_imag += offset;\n"); + kernelString += string(" out_real += offset;\n"); + kernelString += string(" out_imag += offset;\n"); + } + for(i = 0; i < R0; i++) + formattedLoad(kernelString, i, i*numWorkItemsPerXForm, dataFormat); + } + } + else if( N >= mem_coalesce_width ) + { + int numInnerIter = N / mem_coalesce_width; + int numOuterIter = numXFormsPerWG / ( groupSize / mem_coalesce_width ); + + kernelString += string(" ii = lId & ") + num2str(mem_coalesce_width - 1) + string(";\n"); + kernelString += string(" jj = lId >> ") + num2str((int)log2(mem_coalesce_width)) + string(";\n"); + kernelString += string(" lMemStore = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); + kernelString += string(" offset = mad24( groupId, ") + num2str(numXFormsPerWG) + string(", jj);\n"); + kernelString += string(" offset = mad24( offset, ") + num2str(N) + string(", ii );\n"); + if(dataFormat == clFFT_InterleavedComplexFormat) + { + kernelString += string(" in += offset;\n"); + kernelString += string(" out += offset;\n"); + } + else + { + kernelString += string(" in_real += offset;\n"); + kernelString += string(" in_imag += offset;\n"); + kernelString += string(" out_real += offset;\n"); + kernelString += string(" out_imag += offset;\n"); + } + + kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n"); + for(i = 0; i < numOuterIter; i++ ) + { + kernelString += string(" if( jj < s ) {\n"); + for(j = 0; j < numInnerIter; j++ ) + formattedLoad(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * N, dataFormat); + kernelString += string(" }\n"); + if(i != numOuterIter - 1) + kernelString += string(" jj += ") + num2str(groupSize / mem_coalesce_width) + string(";\n"); + } + kernelString += string("}\n "); + kernelString += string("else {\n"); + for(i = 0; i < numOuterIter; i++ ) + { + for(j = 0; j < numInnerIter; j++ ) + formattedLoad(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * N, dataFormat); + } + kernelString += string("}\n"); + + kernelString += string(" ii = lId & ") + num2str(numWorkItemsPerXForm - 1) + string(";\n"); + kernelString += string(" jj = lId >> ") + num2str(log2NumWorkItemsPerXForm) + string(";\n"); + kernelString += string(" lMemLoad = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii);\n"); + + for( i = 0; i < numOuterIter; i++ ) + { + for( j = 0; j < numInnerIter; j++ ) + { + kernelString += string(" lMemStore[") + num2str(j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * (N + numWorkItemsPerXForm )) + string("] = a[") + + num2str(i * numInnerIter + j) + string("].x;\n"); + } + } + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + for( i = 0; i < R0; i++ ) + kernelString += string(" a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n"); + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + for( i = 0; i < numOuterIter; i++ ) + { + for( j = 0; j < numInnerIter; j++ ) + { + kernelString += string(" lMemStore[") + num2str(j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * (N + numWorkItemsPerXForm )) + string("] = a[") + + num2str(i * numInnerIter + j) + string("].y;\n"); + } + } + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + for( i = 0; i < R0; i++ ) + kernelString += string(" a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n"); + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG; + } + else + { + kernelString += string(" offset = mad24( groupId, ") + num2str(N * numXFormsPerWG) + string(", lId );\n"); + if(dataFormat == clFFT_InterleavedComplexFormat) + { + kernelString += string(" in += offset;\n"); + kernelString += string(" out += offset;\n"); + } + else + { + kernelString += string(" in_real += offset;\n"); + kernelString += string(" in_imag += offset;\n"); + kernelString += string(" out_real += offset;\n"); + kernelString += string(" out_imag += offset;\n"); + } + + kernelString += string(" ii = lId & ") + num2str(N-1) + string(";\n"); + kernelString += string(" jj = lId >> ") + num2str((int)log2(N)) + string(";\n"); + kernelString += string(" lMemStore = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); + + kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n"); + for( i = 0; i < R0; i++ ) + { + kernelString += string(" if(jj < s )\n"); + formattedLoad(kernelString, i, i*groupSize, dataFormat); + if(i != R0 - 1) + kernelString += string(" jj += ") + num2str(groupSize / N) + string(";\n"); + } + kernelString += string("}\n"); + kernelString += string("else {\n"); + for( i = 0; i < R0; i++ ) + { + formattedLoad(kernelString, i, i*groupSize, dataFormat); + } + kernelString += string("}\n"); + + if(numWorkItemsPerXForm > 1) + { + kernelString += string(" ii = lId & ") + num2str(numWorkItemsPerXForm - 1) + string(";\n"); + kernelString += string(" jj = lId >> ") + num2str(log2NumWorkItemsPerXForm) + string(";\n"); + kernelString += string(" lMemLoad = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); + } + else + { + kernelString += string(" ii = 0;\n"); + kernelString += string(" jj = lId;\n"); + kernelString += string(" lMemLoad = sMem + mul24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(");\n"); + } + + + for( i = 0; i < R0; i++ ) + kernelString += string(" lMemStore[") + num2str(i * ( groupSize / N ) * ( N + numWorkItemsPerXForm )) + string("] = a[") + num2str(i) + string("].x;\n"); + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + for( i = 0; i < R0; i++ ) + kernelString += string(" a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n"); + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + for( i = 0; i < R0; i++ ) + kernelString += string(" lMemStore[") + num2str(i * ( groupSize / N ) * ( N + numWorkItemsPerXForm )) + string("] = a[") + num2str(i) + string("].y;\n"); + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + for( i = 0; i < R0; i++ ) + kernelString += string(" a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n"); + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG; + } + + return lMemSize; +} + +static int +insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr, int numWorkItemsPerXForm, int numXFormsPerWG, int mem_coalesce_width, clFFT_DataFormat dataFormat) +{ + int groupSize = numWorkItemsPerXForm * numXFormsPerWG; + int i, j, k, ind; + int lMemSize = 0; + int numIter = maxRadix / Nr; + string indent = string(""); + + if( numWorkItemsPerXForm >= mem_coalesce_width ) + { + if(numXFormsPerWG > 1) + { + kernelString += string(" if( !s || (groupId < get_num_groups(0)-1) || (jj < s) ) {\n"); + indent = string(" "); + } + for(i = 0; i < maxRadix; i++) + { + j = i % numIter; + k = i / numIter; + ind = j * Nr + k; + formattedStore(kernelString, ind, i*numWorkItemsPerXForm, dataFormat); + } + if(numXFormsPerWG > 1) + kernelString += string(" }\n"); + } + else if( N >= mem_coalesce_width ) + { + int numInnerIter = N / mem_coalesce_width; + int numOuterIter = numXFormsPerWG / ( groupSize / mem_coalesce_width ); + + kernelString += string(" lMemLoad = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); + kernelString += string(" ii = lId & ") + num2str(mem_coalesce_width - 1) + string(";\n"); + kernelString += string(" jj = lId >> ") + num2str((int)log2(mem_coalesce_width)) + string(";\n"); + kernelString += string(" lMemStore = sMem + mad24( jj,") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); + + for( i = 0; i < maxRadix; i++ ) + { + j = i % numIter; + k = i / numIter; + ind = j * Nr + k; + kernelString += string(" lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].x;\n"); + } + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + for( i = 0; i < numOuterIter; i++ ) + for( j = 0; j < numInnerIter; j++ ) + kernelString += string(" a[") + num2str(i*numInnerIter + j) + string("].x = lMemStore[") + num2str(j*mem_coalesce_width + i*( groupSize / mem_coalesce_width )*(N + numWorkItemsPerXForm)) + string("];\n"); + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + for( i = 0; i < maxRadix; i++ ) + { + j = i % numIter; + k = i / numIter; + ind = j * Nr + k; + kernelString += string(" lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].y;\n"); + } + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + for( i = 0; i < numOuterIter; i++ ) + for( j = 0; j < numInnerIter; j++ ) + kernelString += string(" a[") + num2str(i*numInnerIter + j) + string("].y = lMemStore[") + num2str(j*mem_coalesce_width + i*( groupSize / mem_coalesce_width )*(N + numWorkItemsPerXForm)) + string("];\n"); + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n"); + for(i = 0; i < numOuterIter; i++ ) + { + kernelString += string(" if( jj < s ) {\n"); + for(j = 0; j < numInnerIter; j++ ) + formattedStore(kernelString, i*numInnerIter + j, j*mem_coalesce_width + i*(groupSize/mem_coalesce_width)*N, dataFormat); + kernelString += string(" }\n"); + if(i != numOuterIter - 1) + kernelString += string(" jj += ") + num2str(groupSize / mem_coalesce_width) + string(";\n"); + } + kernelString += string("}\n"); + kernelString += string("else {\n"); + for(i = 0; i < numOuterIter; i++ ) + { + for(j = 0; j < numInnerIter; j++ ) + formattedStore(kernelString, i*numInnerIter + j, j*mem_coalesce_width + i*(groupSize/mem_coalesce_width)*N, dataFormat); + } + kernelString += string("}\n"); + + lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG; + } + else + { + kernelString += string(" lMemLoad = sMem + mad24( jj,") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); + + kernelString += string(" ii = lId & ") + num2str(N - 1) + string(";\n"); + kernelString += string(" jj = lId >> ") + num2str((int) log2(N)) + string(";\n"); + kernelString += string(" lMemStore = sMem + mad24( jj,") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); + + for( i = 0; i < maxRadix; i++ ) + { + j = i % numIter; + k = i / numIter; + ind = j * Nr + k; + kernelString += string(" lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].x;\n"); + } + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + for( i = 0; i < maxRadix; i++ ) + kernelString += string(" a[") + num2str(i) + string("].x = lMemStore[") + num2str(i*( groupSize / N )*( N + numWorkItemsPerXForm )) + string("];\n"); + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + for( i = 0; i < maxRadix; i++ ) + { + j = i % numIter; + k = i / numIter; + ind = j * Nr + k; + kernelString += string(" lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].y;\n"); + } + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + for( i = 0; i < maxRadix; i++ ) + kernelString += string(" a[") + num2str(i) + string("].y = lMemStore[") + num2str(i*( groupSize / N )*( N + numWorkItemsPerXForm )) + string("];\n"); + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n"); + for( i = 0; i < maxRadix; i++ ) + { + kernelString += string(" if(jj < s ) {\n"); + formattedStore(kernelString, i, i*groupSize, dataFormat); + kernelString += string(" }\n"); + if( i != maxRadix - 1) + kernelString += string(" jj +=") + num2str(groupSize / N) + string(";\n"); + } + kernelString += string("}\n"); + kernelString += string("else {\n"); + for( i = 0; i < maxRadix; i++ ) + { + formattedStore(kernelString, i, i*groupSize, dataFormat); + } + kernelString += string("}\n"); + + lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG; + } + + return lMemSize; +} + +static void +insertfftKernel(string &kernelString, int Nr, int numIter) +{ + int i; + for(i = 0; i < numIter; i++) + { + kernelString += string(" fftKernel") + num2str(Nr) + string("(a+") + num2str(i*Nr) + string(", dir);\n"); + } +} + +static void +insertTwiddleKernel(string &kernelString, int Nr, int numIter, int Nprev, int len, int numWorkItemsPerXForm) +{ + int z, k; + int logNPrev = log2(Nprev); + + for(z = 0; z < numIter; z++) + { + if(z == 0) + { + if(Nprev > 1) + kernelString += string(" angf = (float) (ii >> ") + num2str(logNPrev) + string(");\n"); + else + kernelString += string(" angf = (float) ii;\n"); + } + else + { + if(Nprev > 1) + kernelString += string(" angf = (float) ((") + num2str(z*numWorkItemsPerXForm) + string(" + ii) >>") + num2str(logNPrev) + string(");\n"); + else + kernelString += string(" angf = (float) (") + num2str(z*numWorkItemsPerXForm) + string(" + ii);\n"); + } + + for(k = 1; k < Nr; k++) { + int ind = z*Nr + k; + //float fac = (float) (2.0 * M_PI * (double) k / (double) len); + kernelString += string(" ang = dir * ( 2.0f * M_PI * ") + num2str(k) + string(".0f / ") + num2str(len) + string(".0f )") + string(" * angf;\n"); + kernelString += string(" w = (float2)(native_cos(ang), native_sin(ang));\n"); + kernelString += string(" a[") + num2str(ind) + string("] = complexMul(a[") + num2str(ind) + string("], w);\n"); + } + } +} + +static int +getPadding(int numWorkItemsPerXForm, int Nprev, int numWorkItemsReq, int numXFormsPerWG, int Nr, int numBanks, int *offset, int *midPad) +{ + if((numWorkItemsPerXForm <= Nprev) || (Nprev >= numBanks)) + *offset = 0; + else { + int numRowsReq = ((numWorkItemsPerXForm < numBanks) ? numWorkItemsPerXForm : numBanks) / Nprev; + int numColsReq = 1; + if(numRowsReq > Nr) + numColsReq = numRowsReq / Nr; + numColsReq = Nprev * numColsReq; + *offset = numColsReq; + } + + if(numWorkItemsPerXForm >= numBanks || numXFormsPerWG == 1) + *midPad = 0; + else { + int bankNum = ( (numWorkItemsReq + *offset) * Nr ) & (numBanks - 1); + if( bankNum >= numWorkItemsPerXForm ) + *midPad = 0; + else + *midPad = numWorkItemsPerXForm - bankNum; + } + + int lMemSize = ( numWorkItemsReq + *offset) * Nr * numXFormsPerWG + *midPad * (numXFormsPerWG - 1); + return lMemSize; +} + + +static void +insertLocalStores(string &kernelString, int numIter, int Nr, int numWorkItemsPerXForm, int numWorkItemsReq, int offset, string &comp) +{ + int z, k; + + for(z = 0; z < numIter; z++) { + for(k = 0; k < Nr; k++) { + int index = k*(numWorkItemsReq + offset) + z*numWorkItemsPerXForm; + kernelString += string(" lMemStore[") + num2str(index) + string("] = a[") + num2str(z*Nr + k) + string("].") + comp + string(";\n"); + } + } + kernelString += string(" barrier(CLK_LOCAL_MEM_FENCE);\n"); +} + +static void +insertLocalLoads(string &kernelString, int n, int Nr, int Nrn, int Nprev, int Ncurr, int numWorkItemsPerXForm, int numWorkItemsReq, int offset, string &comp) +{ + int numWorkItemsReqN = n / Nrn; + int interBlockHNum = max( Nprev / numWorkItemsPerXForm, 1 ); + int interBlockHStride = numWorkItemsPerXForm; + int vertWidth = max(numWorkItemsPerXForm / Nprev, 1); + vertWidth = min( vertWidth, Nr); + int vertNum = Nr / vertWidth; + int vertStride = ( n / Nr + offset ) * vertWidth; + int iter = max( numWorkItemsReqN / numWorkItemsPerXForm, 1); + int intraBlockHStride = (numWorkItemsPerXForm / (Nprev*Nr)) > 1 ? (numWorkItemsPerXForm / (Nprev*Nr)) : 1; + intraBlockHStride *= Nprev; + + int stride = numWorkItemsReq / Nrn; + int i; + for(i = 0; i < iter; i++) { + int ii = i / (interBlockHNum * vertNum); + int zz = i % (interBlockHNum * vertNum); + int jj = zz % interBlockHNum; + int kk = zz / interBlockHNum; + int z; + for(z = 0; z < Nrn; z++) { + int st = kk * vertStride + jj * interBlockHStride + ii * intraBlockHStride + z * stride; + kernelString += string(" a[") + num2str(i*Nrn + z) + string("].") + comp + string(" = lMemLoad[") + num2str(st) + string("];\n"); + } + } + kernelString += string(" barrier(CLK_LOCAL_MEM_FENCE);\n"); +} + +static void +insertLocalLoadIndexArithmatic(string &kernelString, int Nprev, int Nr, int numWorkItemsReq, int numWorkItemsPerXForm, int numXFormsPerWG, int offset, int midPad) +{ + int Ncurr = Nprev * Nr; + int logNcurr = log2(Ncurr); + int logNprev = log2(Nprev); + int incr = (numWorkItemsReq + offset) * Nr + midPad; + + if(Ncurr < numWorkItemsPerXForm) + { + if(Nprev == 1) + kernelString += string(" j = ii & ") + num2str(Ncurr - 1) + string(";\n"); + else + kernelString += string(" j = (ii & ") + num2str(Ncurr - 1) + string(") >> ") + num2str(logNprev) + string(";\n"); + + if(Nprev == 1) + kernelString += string(" i = ii >> ") + num2str(logNcurr) + string(";\n"); + else + kernelString += string(" i = mad24(ii >> ") + num2str(logNcurr) + string(", ") + num2str(Nprev) + string(", ii & ") + num2str(Nprev - 1) + string(");\n"); + } + else + { + if(Nprev == 1) + kernelString += string(" j = ii;\n"); + else + kernelString += string(" j = ii >> ") + num2str(logNprev) + string(";\n"); + if(Nprev == 1) + kernelString += string(" i = 0;\n"); + else + kernelString += string(" i = ii & ") + num2str(Nprev - 1) + string(";\n"); + } + + if(numXFormsPerWG > 1) + kernelString += string(" i = mad24(jj, ") + num2str(incr) + string(", i);\n"); + + kernelString += string(" lMemLoad = sMem + mad24(j, ") + num2str(numWorkItemsReq + offset) + string(", i);\n"); +} + +static void +insertLocalStoreIndexArithmatic(string &kernelString, int numWorkItemsReq, int numXFormsPerWG, int Nr, int offset, int midPad) +{ + if(numXFormsPerWG == 1) { + kernelString += string(" lMemStore = sMem + ii;\n"); + } + else { + kernelString += string(" lMemStore = sMem + mad24(jj, ") + num2str((numWorkItemsReq + offset)*Nr + midPad) + string(", ii);\n"); + } +} + + +static void +createLocalMemfftKernelString(cl_fft_plan *plan) +{ + unsigned int radixArray[10]; + unsigned int numRadix; + + unsigned int n = plan->n.x; + + assert(n <= plan->max_work_item_per_workgroup * plan->max_radix && "signal lenght too big for local mem fft\n"); + + getRadixArray(n, radixArray, &numRadix, 0); + assert(numRadix > 0 && "no radix array supplied\n"); + + if(n/radixArray[0] > plan->max_work_item_per_workgroup) + getRadixArray(n, radixArray, &numRadix, plan->max_radix); + + assert(radixArray[0] <= plan->max_radix && "max radix choosen is greater than allowed\n"); + assert(n/radixArray[0] <= plan->max_work_item_per_workgroup && "required work items per xform greater than maximum work items allowed per work group for local mem fft\n"); + + unsigned int tmpLen = 1; + unsigned int i; + for(i = 0; i < numRadix; i++) + { + assert( radixArray[i] && !( (radixArray[i] - 1) & radixArray[i] ) ); + tmpLen *= radixArray[i]; + } + assert(tmpLen == n && "product of radices choosen doesnt match the length of signal\n"); + + int offset, midPad; + string localString(""), kernelName(""); + + clFFT_DataFormat dataFormat = plan->format; + string *kernelString = plan->kernel_string; + + + cl_fft_kernel_info **kInfo = &plan->kernel_info; + int kCount = 0; + + while(*kInfo) + { + kInfo = &(*kInfo)->next; + kCount++; + } + + kernelName = string("fft") + num2str(kCount); + + *kInfo = (cl_fft_kernel_info *) malloc(sizeof(cl_fft_kernel_info)); + (*kInfo)->kernel = 0; + (*kInfo)->lmem_size = 0; + (*kInfo)->num_workgroups = 0; + (*kInfo)->num_workitems_per_workgroup = 0; + (*kInfo)->dir = cl_fft_kernel_x; + (*kInfo)->in_place_possible = 1; + (*kInfo)->next = NULL; + (*kInfo)->kernel_name = (char *) malloc(sizeof(char)*(kernelName.size()+1)); + strcpy((*kInfo)->kernel_name, kernelName.c_str()); + + unsigned int numWorkItemsPerXForm = n / radixArray[0]; + unsigned int numWorkItemsPerWG = numWorkItemsPerXForm <= 64 ? 64 : numWorkItemsPerXForm; + assert(numWorkItemsPerWG <= plan->max_work_item_per_workgroup); + int numXFormsPerWG = numWorkItemsPerWG / numWorkItemsPerXForm; + (*kInfo)->num_workgroups = 1; + (*kInfo)->num_xforms_per_workgroup = numXFormsPerWG; + (*kInfo)->num_workitems_per_workgroup = numWorkItemsPerWG; + + unsigned int *N = radixArray; + unsigned int maxRadix = N[0]; + unsigned int lMemSize = 0; + + insertVariables(localString, maxRadix); + + lMemSize = insertGlobalLoadsAndTranspose(localString, n, numWorkItemsPerXForm, numXFormsPerWG, maxRadix, plan->min_mem_coalesce_width, dataFormat); + (*kInfo)->lmem_size = (lMemSize > (*kInfo)->lmem_size) ? lMemSize : (*kInfo)->lmem_size; + + string xcomp = string("x"); + string ycomp = string("y"); + + unsigned int Nprev = 1; + unsigned int len = n; + unsigned int r; + for(r = 0; r < numRadix; r++) + { + int numIter = N[0] / N[r]; + int numWorkItemsReq = n / N[r]; + int Ncurr = Nprev * N[r]; + insertfftKernel(localString, N[r], numIter); + + if(r < (numRadix - 1)) { + insertTwiddleKernel(localString, N[r], numIter, Nprev, len, numWorkItemsPerXForm); + lMemSize = getPadding(numWorkItemsPerXForm, Nprev, numWorkItemsReq, numXFormsPerWG, N[r], plan->num_local_mem_banks, &offset, &midPad); + (*kInfo)->lmem_size = (lMemSize > (*kInfo)->lmem_size) ? lMemSize : (*kInfo)->lmem_size; + insertLocalStoreIndexArithmatic(localString, numWorkItemsReq, numXFormsPerWG, N[r], offset, midPad); + insertLocalLoadIndexArithmatic(localString, Nprev, N[r], numWorkItemsReq, numWorkItemsPerXForm, numXFormsPerWG, offset, midPad); + insertLocalStores(localString, numIter, N[r], numWorkItemsPerXForm, numWorkItemsReq, offset, xcomp); + insertLocalLoads(localString, n, N[r], N[r+1], Nprev, Ncurr, numWorkItemsPerXForm, numWorkItemsReq, offset, xcomp); + insertLocalStores(localString, numIter, N[r], numWorkItemsPerXForm, numWorkItemsReq, offset, ycomp); + insertLocalLoads(localString, n, N[r], N[r+1], Nprev, Ncurr, numWorkItemsPerXForm, numWorkItemsReq, offset, ycomp); + Nprev = Ncurr; + len = len / N[r]; + } + } + + lMemSize = insertGlobalStoresAndTranspose(localString, n, maxRadix, N[numRadix - 1], numWorkItemsPerXForm, numXFormsPerWG, plan->min_mem_coalesce_width, dataFormat); + (*kInfo)->lmem_size = (lMemSize > (*kInfo)->lmem_size) ? lMemSize : (*kInfo)->lmem_size; + + insertHeader(*kernelString, kernelName, dataFormat); + *kernelString += string("{\n"); + if((*kInfo)->lmem_size) + *kernelString += string(" __local float sMem[") + num2str((*kInfo)->lmem_size) + string("];\n"); + *kernelString += localString; + *kernelString += string("}\n"); +} + +// For n larger than what can be computed using local memory fft, global transposes +// multiple kernel launces is needed. For these sizes, n can be decomposed using +// much larger base radices i.e. say n = 262144 = 128 x 64 x 32. Thus three kernel +// launches will be needed, first computing 64 x 32, length 128 ffts, second computing +// 128 x 32 length 64 ffts, and finally a kernel computing 128 x 64 length 32 ffts. +// Each of these base radices can futher be divided into factors so that each of these +// base ffts can be computed within one kernel launch using in-register ffts and local +// memory transposes i.e for the first kernel above which computes 64 x 32 ffts on length +// 128, 128 can be decomposed into 128 = 16 x 8 i.e. 8 work items can compute 8 length +// 16 ffts followed by transpose using local memory followed by each of these eight +// work items computing 2 length 8 ffts thus computing 16 length 8 ffts in total. This +// means only 8 work items are needed for computing one length 128 fft. If we choose +// work group size of say 64, we can compute 64/8 = 8 length 128 ffts within one +// work group. Since we need to compute 64 x 32 length 128 ffts in first kernel, this +// means we need to launch 64 x 32 / 8 = 256 work groups with 64 work items in each +// work group where each work group is computing 8 length 128 ffts where each length +// 128 fft is computed by 8 work items. Same logic can be applied to other two kernels +// in this example. Users can play with difference base radices and difference +// decompositions of base radices to generates different kernels and see which gives +// best performance. Following function is just fixed to use 128 as base radix + +void +getGlobalRadixInfo(int n, int *radix, int *R1, int *R2, int *numRadices) +{ + int baseRadix = min(n, 128); + + int numR = 0; + int N = n; + while(N > baseRadix) + { + N /= baseRadix; + numR++; + } + + for(int i = 0; i < numR; i++) + radix[i] = baseRadix; + + radix[numR] = N; + numR++; + *numRadices = numR; + + for(int i = 0; i < numR; i++) + { + int B = radix[i]; + if(B <= 8) + { + R1[i] = B; + R2[i] = 1; + continue; + } + + int r1 = 2; + int r2 = B / r1; + while(r2 > r1) + { + r1 *=2; + r2 = B / r1; + } + R1[i] = r1; + R2[i] = r2; + } +} + +static void +createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir dir, int vertBS) +{ + int i, j, k, t; + int radixArr[10] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + int R1Arr[10] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + int R2Arr[10] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + int radix, R1, R2; + int numRadices; + + int maxThreadsPerBlock = plan->max_work_item_per_workgroup; + int maxArrayLen = plan->max_radix; + int batchSize = plan->min_mem_coalesce_width; + clFFT_DataFormat dataFormat = plan->format; + int vertical = (dir == cl_fft_kernel_x) ? 0 : 1; + + getGlobalRadixInfo(n, radixArr, R1Arr, R2Arr, &numRadices); + + int numPasses = numRadices; + + string localString(""), kernelName(""); + string *kernelString = plan->kernel_string; + cl_fft_kernel_info **kInfo = &plan->kernel_info; + int kCount = 0; + + while(*kInfo) + { + kInfo = &(*kInfo)->next; + kCount++; + } + + int N = n; + int m = (int)log2(n); + int Rinit = vertical ? BS : 1; + batchSize = vertical ? min(BS, batchSize) : batchSize; + int passNum; + + for(passNum = 0; passNum < numPasses; passNum++) + { + + localString.clear(); + kernelName.clear(); + + radix = radixArr[passNum]; + R1 = R1Arr[passNum]; + R2 = R2Arr[passNum]; + + int strideI = Rinit; + for(i = 0; i < numPasses; i++) + if(i != passNum) + strideI *= radixArr[i]; + + int strideO = Rinit; + for(i = 0; i < passNum; i++) + strideO *= radixArr[i]; + + int threadsPerXForm = R2; + batchSize = R2 == 1 ? plan->max_work_item_per_workgroup : batchSize; + batchSize = min(batchSize, strideI); + int threadsPerBlock = batchSize * threadsPerXForm; + threadsPerBlock = min(threadsPerBlock, maxThreadsPerBlock); + batchSize = threadsPerBlock / threadsPerXForm; + assert(R2 <= R1); + assert(R1*R2 == radix); + assert(R1 <= maxArrayLen); + assert(threadsPerBlock <= maxThreadsPerBlock); + + int numIter = R1 / R2; + int gInInc = threadsPerBlock / batchSize; + + + int lgStrideO = log2(strideO); + int numBlocksPerXForm = strideI / batchSize; + int numBlocks = numBlocksPerXForm; + if(!vertical) + numBlocks *= BS; + else + numBlocks *= vertBS; + + kernelName = string("fft") + num2str(kCount); + *kInfo = (cl_fft_kernel_info *) malloc(sizeof(cl_fft_kernel_info)); + (*kInfo)->kernel = 0; + if(R2 == 1) + (*kInfo)->lmem_size = 0; + else + { + if(strideO == 1) + (*kInfo)->lmem_size = (radix + 1)*batchSize; + else + (*kInfo)->lmem_size = threadsPerBlock*R1; + } + (*kInfo)->num_workgroups = numBlocks; + (*kInfo)->num_xforms_per_workgroup = 1; + (*kInfo)->num_workitems_per_workgroup = threadsPerBlock; + (*kInfo)->dir = dir; + if( (passNum == (numPasses - 1)) && (numPasses & 1) ) + (*kInfo)->in_place_possible = 1; + else + (*kInfo)->in_place_possible = 0; + (*kInfo)->next = NULL; + (*kInfo)->kernel_name = (char *) malloc(sizeof(char)*(kernelName.size()+1)); + strcpy((*kInfo)->kernel_name, kernelName.c_str()); + + insertVariables(localString, R1); + + if(vertical) + { + localString += string("xNum = groupId >> ") + num2str((int)log2(numBlocksPerXForm)) + string(";\n"); + localString += string("groupId = groupId & ") + num2str(numBlocksPerXForm - 1) + string(";\n"); + localString += string("indexIn = mad24(groupId, ") + num2str(batchSize) + string(", xNum << ") + num2str((int)log2(n*BS)) + string(");\n"); + localString += string("tid = mul24(groupId, ") + num2str(batchSize) + string(");\n"); + localString += string("i = tid >> ") + num2str(lgStrideO) + string(";\n"); + localString += string("j = tid & ") + num2str(strideO - 1) + string(";\n"); + int stride = radix*Rinit; + for(i = 0; i < passNum; i++) + stride *= radixArr[i]; + localString += string("indexOut = mad24(i, ") + num2str(stride) + string(", j + ") + string("(xNum << ") + num2str((int) log2(n*BS)) + string("));\n"); + localString += string("bNum = groupId;\n"); + } + else + { + int lgNumBlocksPerXForm = log2(numBlocksPerXForm); + localString += string("bNum = groupId & ") + num2str(numBlocksPerXForm - 1) + string(";\n"); + localString += string("xNum = groupId >> ") + num2str(lgNumBlocksPerXForm) + string(";\n"); + localString += string("indexIn = mul24(bNum, ") + num2str(batchSize) + string(");\n"); + localString += string("tid = indexIn;\n"); + localString += string("i = tid >> ") + num2str(lgStrideO) + string(";\n"); + localString += string("j = tid & ") + num2str(strideO - 1) + string(";\n"); + int stride = radix*Rinit; + for(i = 0; i < passNum; i++) + stride *= radixArr[i]; + localString += string("indexOut = mad24(i, ") + num2str(stride) + string(", j);\n"); + localString += string("indexIn += (xNum << ") + num2str(m) + string(");\n"); + localString += string("indexOut += (xNum << ") + num2str(m) + string(");\n"); + } + + // Load Data + int lgBatchSize = log2(batchSize); + localString += string("tid = lId;\n"); + localString += string("i = tid & ") + num2str(batchSize - 1) + string(";\n"); + localString += string("j = tid >> ") + num2str(lgBatchSize) + string(";\n"); + localString += string("indexIn += mad24(j, ") + num2str(strideI) + string(", i);\n"); + + if(dataFormat == clFFT_SplitComplexFormat) + { + localString += string("in_real += indexIn;\n"); + localString += string("in_imag += indexIn;\n"); + for(j = 0; j < R1; j++) + localString += string("a[") + num2str(j) + string("].x = in_real[") + num2str(j*gInInc*strideI) + string("];\n"); + for(j = 0; j < R1; j++) + localString += string("a[") + num2str(j) + string("].y = in_imag[") + num2str(j*gInInc*strideI) + string("];\n"); + } + else + { + localString += string("in += indexIn;\n"); + for(j = 0; j < R1; j++) + localString += string("a[") + num2str(j) + string("] = in[") + num2str(j*gInInc*strideI) + string("];\n"); + } + + localString += string("fftKernel") + num2str(R1) + string("(a, dir);\n"); + + if(R2 > 1) + { + // twiddle + for(k = 1; k < R1; k++) + { + localString += string("ang = dir*(2.0f*M_PI*") + num2str(k) + string("/") + num2str(radix) + string(")*j;\n"); + localString += string("w = (float2)(native_cos(ang), native_sin(ang));\n"); + localString += string("a[") + num2str(k) + string("] = complexMul(a[") + num2str(k) + string("], w);\n"); + } + + // shuffle + numIter = R1 / R2; + localString += string("indexIn = mad24(j, ") + num2str(threadsPerBlock*numIter) + string(", i);\n"); + localString += string("lMemStore = sMem + tid;\n"); + localString += string("lMemLoad = sMem + indexIn;\n"); + for(k = 0; k < R1; k++) + localString += string("lMemStore[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].x;\n"); + localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); + for(k = 0; k < numIter; k++) + for(t = 0; t < R2; t++) + localString += string("a[") + num2str(k*R2+t) + string("].x = lMemLoad[") + num2str(t*batchSize + k*threadsPerBlock) + string("];\n"); + localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); + for(k = 0; k < R1; k++) + localString += string("lMemStore[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].y;\n"); + localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); + for(k = 0; k < numIter; k++) + for(t = 0; t < R2; t++) + localString += string("a[") + num2str(k*R2+t) + string("].y = lMemLoad[") + num2str(t*batchSize + k*threadsPerBlock) + string("];\n"); + localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); + + for(j = 0; j < numIter; j++) + localString += string("fftKernel") + num2str(R2) + string("(a + ") + num2str(j*R2) + string(", dir);\n"); + } + + // twiddle + if(passNum < (numPasses - 1)) + { + localString += string("l = ((bNum << ") + num2str(lgBatchSize) + string(") + i) >> ") + num2str(lgStrideO) + string(";\n"); + localString += string("k = j << ") + num2str((int)log2(R1/R2)) + string(";\n"); + localString += string("ang1 = dir*(2.0f*M_PI/") + num2str(N) + string(")*l;\n"); + for(t = 0; t < R1; t++) + { + localString += string("ang = ang1*(k + ") + num2str((t%R2)*R1 + (t/R2)) + string(");\n"); + localString += string("w = (float2)(native_cos(ang), native_sin(ang));\n"); + localString += string("a[") + num2str(t) + string("] = complexMul(a[") + num2str(t) + string("], w);\n"); + } + } + + // Store Data + if(strideO == 1) + { + + localString += string("lMemStore = sMem + mad24(i, ") + num2str(radix + 1) + string(", j << ") + num2str((int)log2(R1/R2)) + string(");\n"); + localString += string("lMemLoad = sMem + mad24(tid >> ") + num2str((int)log2(radix)) + string(", ") + num2str(radix+1) + string(", tid & ") + num2str(radix-1) + string(");\n"); + + for(int i = 0; i < R1/R2; i++) + for(int j = 0; j < R2; j++) + localString += string("lMemStore[ ") + num2str(i + j*R1) + string("] = a[") + num2str(i*R2+j) + string("].x;\n"); + localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); + if(threadsPerBlock >= radix) + { + for(int i = 0; i < R1; i++) + localString += string("a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i*(radix+1)*(threadsPerBlock/radix)) + string("];\n"); + } + else + { + int innerIter = radix/threadsPerBlock; + int outerIter = R1/innerIter; + for(int i = 0; i < outerIter; i++) + for(int j = 0; j < innerIter; j++) + localString += string("a[") + num2str(i*innerIter+j) + string("].x = lMemLoad[") + num2str(j*threadsPerBlock + i*(radix+1)) + string("];\n"); + } + localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); + + for(int i = 0; i < R1/R2; i++) + for(int j = 0; j < R2; j++) + localString += string("lMemStore[ ") + num2str(i + j*R1) + string("] = a[") + num2str(i*R2+j) + string("].y;\n"); + localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); + if(threadsPerBlock >= radix) + { + for(int i = 0; i < R1; i++) + localString += string("a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i*(radix+1)*(threadsPerBlock/radix)) + string("];\n"); + } + else + { + int innerIter = radix/threadsPerBlock; + int outerIter = R1/innerIter; + for(int i = 0; i < outerIter; i++) + for(int j = 0; j < innerIter; j++) + localString += string("a[") + num2str(i*innerIter+j) + string("].y = lMemLoad[") + num2str(j*threadsPerBlock + i*(radix+1)) + string("];\n"); + } + localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); + + localString += string("indexOut += tid;\n"); + if(dataFormat == clFFT_SplitComplexFormat) { + localString += string("out_real += indexOut;\n"); + localString += string("out_imag += indexOut;\n"); + for(k = 0; k < R1; k++) + localString += string("out_real[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].x;\n"); + for(k = 0; k < R1; k++) + localString += string("out_imag[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].y;\n"); + } + else { + localString += string("out += indexOut;\n"); + for(k = 0; k < R1; k++) + localString += string("out[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("];\n"); + } + + } + else + { + localString += string("indexOut += mad24(j, ") + num2str(numIter*strideO) + string(", i);\n"); + if(dataFormat == clFFT_SplitComplexFormat) { + localString += string("out_real += indexOut;\n"); + localString += string("out_imag += indexOut;\n"); + for(k = 0; k < R1; k++) + localString += string("out_real[") + num2str(((k%R2)*R1 + (k/R2))*strideO) + string("] = a[") + num2str(k) + string("].x;\n"); + for(k = 0; k < R1; k++) + localString += string("out_imag[") + num2str(((k%R2)*R1 + (k/R2))*strideO) + string("] = a[") + num2str(k) + string("].y;\n"); + } + else { + localString += string("out += indexOut;\n"); + for(k = 0; k < R1; k++) + localString += string("out[") + num2str(((k%R2)*R1 + (k/R2))*strideO) + string("] = a[") + num2str(k) + string("];\n"); + } + } + + insertHeader(*kernelString, kernelName, dataFormat); + *kernelString += string("{\n"); + if((*kInfo)->lmem_size) + *kernelString += string(" __local float sMem[") + num2str((*kInfo)->lmem_size) + string("];\n"); + *kernelString += localString; + *kernelString += string("}\n"); + + N /= radix; + kInfo = &(*kInfo)->next; + kCount++; + } +} + +void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir) +{ + unsigned int radixArray[10]; + unsigned int numRadix; + + switch(dir) + { + case cl_fft_kernel_x: + if(plan->n.x > plan->max_localmem_fft_size) + { + createGlobalFFTKernelString(plan, plan->n.x, 1, cl_fft_kernel_x, 1); + } + else if(plan->n.x > 1) + { + getRadixArray(plan->n.x, radixArray, &numRadix, 0); + if(plan->n.x / radixArray[0] <= plan->max_work_item_per_workgroup) + { + createLocalMemfftKernelString(plan); + } + else + { + getRadixArray(plan->n.x, radixArray, &numRadix, plan->max_radix); + if(plan->n.x / radixArray[0] <= plan->max_work_item_per_workgroup) + createLocalMemfftKernelString(plan); + else + createGlobalFFTKernelString(plan, plan->n.x, 1, cl_fft_kernel_x, 1); + } + } + break; + + case cl_fft_kernel_y: + if(plan->n.y > 1) + createGlobalFFTKernelString(plan, plan->n.y, plan->n.x, cl_fft_kernel_y, 1); + break; + + case cl_fft_kernel_z: + if(plan->n.z > 1) + createGlobalFFTKernelString(plan, plan->n.z, plan->n.x*plan->n.y, cl_fft_kernel_z, 1); + default: + return; + } +} + diff --git a/src/fft_setup.cpp b/src/fft_setup.cpp index 1221bb88672207806eece85313206fb975f18479..4eefeb119b045e35ce3386007699d33eaf66e2b5 100644 --- a/src/fft_setup.cpp +++ b/src/fft_setup.cpp @@ -1,356 +1,401 @@ -#include "fft_internal.h" -#include "fft_base_kernels.h" -#include <stdlib.h> -#include <string.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <iostream> -#include <string> -#include <sstream> - -using namespace std; - -extern void getKernelWorkDimensions(cl_fft_plan *plan, cl_fft_kernel_info *kernelInfo, cl_int *batchSize, size_t *gWorkItems, size_t *lWorkItems); - -static void -getBlockConfigAndKernelString(cl_fft_plan *plan) -{ - plan->temp_buffer_needed = 0; - *plan->kernel_string += baseKernels; - - if(plan->format == clFFT_SplitComplexFormat) - *plan->kernel_string += twistKernelPlannar; - else - *plan->kernel_string += twistKernelInterleaved; - - switch(plan->dim) - { - case clFFT_1D: - FFT1D(plan, cl_fft_kernel_x); - break; - - case clFFT_2D: - FFT1D(plan, cl_fft_kernel_x); - FFT1D(plan, cl_fft_kernel_y); - break; - - case clFFT_3D: - FFT1D(plan, cl_fft_kernel_x); - FFT1D(plan, cl_fft_kernel_y); - FFT1D(plan, cl_fft_kernel_z); - break; - - default: - return; - } - - plan->temp_buffer_needed = 0; - cl_fft_kernel_info *kInfo = plan->kernel_info; - while(kInfo) - { - plan->temp_buffer_needed |= !kInfo->in_place_possible; - kInfo = kInfo->next; - } -} - - + +// +// File: fft_setup.cpp +// +// Version: <1.0> +// +// Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple") +// in consideration of your agreement to the following terms, and your use, +// installation, modification or redistribution of this Apple software +// constitutes acceptance of these terms. If you do not agree with these +// terms, please do not use, install, modify or redistribute this Apple +// software. +// +// In consideration of your agreement to abide by the following terms, and +// subject to these terms, Apple grants you a personal, non - exclusive +// license, under Apple's copyrights in this original Apple software ( the +// "Apple Software" ), to use, reproduce, modify and redistribute the Apple +// Software, with or without modifications, in source and / or binary forms; +// provided that if you redistribute the Apple Software in its entirety and +// without modifications, you must retain this notice and the following text +// and disclaimers in all such redistributions of the Apple Software. Neither +// the name, trademarks, service marks or logos of Apple Inc. may be used to +// endorse or promote products derived from the Apple Software without specific +// prior written permission from Apple. Except as expressly stated in this +// notice, no other rights or licenses, express or implied, are granted by +// Apple herein, including but not limited to any patent rights that may be +// infringed by your derivative works or by other works in which the Apple +// Software may be incorporated. +// +// The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO +// WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED +// WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION +// ALONE OR IN COMBINATION WITH YOUR PRODUCTS. +// +// IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR +// CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION +// AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER +// UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR +// OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Copyright ( C ) 2008 Apple Inc. All Rights Reserved. +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +#include "fft_internal.h" +#include "fft_base_kernels.h" +#include <stdlib.h> +#include <string.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <iostream> +#include <string> +#include <sstream> + +using namespace std; + +extern void getKernelWorkDimensions(cl_fft_plan *plan, cl_fft_kernel_info *kernelInfo, cl_int *batchSize, size_t *gWorkItems, size_t *lWorkItems); + static void -deleteKernelInfo(cl_fft_kernel_info *kInfo) -{ - if(kInfo) - { - if(kInfo->kernel_name) - free(kInfo->kernel_name); - if(kInfo->kernel) - clReleaseKernel(kInfo->kernel); - free(kInfo); - } -} - -static void -destroy_plan(cl_fft_plan *Plan) -{ - cl_fft_kernel_info *kernel_info = Plan->kernel_info; - - while(kernel_info) - { - cl_fft_kernel_info *tmp = kernel_info->next; - deleteKernelInfo(kernel_info); - kernel_info = tmp; - } - - Plan->kernel_info = NULL; - - if(Plan->kernel_string) - { - delete Plan->kernel_string; - Plan->kernel_string = NULL; - } - if(Plan->twist_kernel) - { - clReleaseKernel(Plan->twist_kernel); - Plan->twist_kernel = NULL; - } - if(Plan->program) - { - clReleaseProgram(Plan->program); - Plan->program = NULL; - } - if(Plan->tempmemobj) - { - clReleaseMemObject(Plan->tempmemobj); - Plan->tempmemobj = NULL; - } - if(Plan->tempmemobj_real) - { - clReleaseMemObject(Plan->tempmemobj_real); - Plan->tempmemobj_real = NULL; - } - if(Plan->tempmemobj_imag) - { - clReleaseMemObject(Plan->tempmemobj_imag); - Plan->tempmemobj_imag = NULL; - } -} - -static int -createKernelList(cl_fft_plan *plan) -{ - cl_program program = plan->program; - cl_fft_kernel_info *kernel_info = plan->kernel_info; - - cl_int err; - while(kernel_info) - { - kernel_info->kernel = clCreateKernel(program, kernel_info->kernel_name, &err); - if(!kernel_info->kernel || err != CL_SUCCESS) - return err; - kernel_info = kernel_info->next; - } - - if(plan->format == clFFT_SplitComplexFormat) - plan->twist_kernel = clCreateKernel(program, "clFFT_1DTwistSplit", &err); - else - plan->twist_kernel = clCreateKernel(program, "clFFT_1DTwistInterleaved", &err); - - if(!plan->twist_kernel || err) - return err; - - return CL_SUCCESS; -} - -int getMaxKernelWorkGroupSize(cl_fft_plan *plan, unsigned int *max_wg_size, unsigned int num_devices, cl_device_id *devices) -{ - int reg_needed = 0; - //*max_wg_size = INT_MAX; - *max_wg_size = 0x7fffffff; - int err; - size_t wg_size; - - unsigned int i; - for(i = 0; i < num_devices; i++) - { - cl_fft_kernel_info *kInfo = plan->kernel_info; - while(kInfo) - { - err = clGetKernelWorkGroupInfo(kInfo->kernel, devices[i], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wg_size, NULL); - if(err != CL_SUCCESS) - return -1; - - if(wg_size < kInfo->num_workitems_per_workgroup) - reg_needed |= 1; - - if(*max_wg_size > wg_size) - *max_wg_size = wg_size; - - kInfo = kInfo->next; - } - } - - return reg_needed; -} - +getBlockConfigAndKernelString(cl_fft_plan *plan) +{ + plan->temp_buffer_needed = 0; + *plan->kernel_string += baseKernels; + + if(plan->format == clFFT_SplitComplexFormat) + *plan->kernel_string += twistKernelPlannar; + else + *plan->kernel_string += twistKernelInterleaved; + + switch(plan->dim) + { + case clFFT_1D: + FFT1D(plan, cl_fft_kernel_x); + break; + + case clFFT_2D: + FFT1D(plan, cl_fft_kernel_x); + FFT1D(plan, cl_fft_kernel_y); + break; + + case clFFT_3D: + FFT1D(plan, cl_fft_kernel_x); + FFT1D(plan, cl_fft_kernel_y); + FFT1D(plan, cl_fft_kernel_z); + break; + + default: + return; + } + + plan->temp_buffer_needed = 0; + cl_fft_kernel_info *kInfo = plan->kernel_info; + while(kInfo) + { + plan->temp_buffer_needed |= !kInfo->in_place_possible; + kInfo = kInfo->next; + } +} + + +static void +deleteKernelInfo(cl_fft_kernel_info *kInfo) +{ + if(kInfo) + { + if(kInfo->kernel_name) + free(kInfo->kernel_name); + if(kInfo->kernel) + clReleaseKernel(kInfo->kernel); + free(kInfo); + } +} + +static void +destroy_plan(cl_fft_plan *Plan) +{ + cl_fft_kernel_info *kernel_info = Plan->kernel_info; + + while(kernel_info) + { + cl_fft_kernel_info *tmp = kernel_info->next; + deleteKernelInfo(kernel_info); + kernel_info = tmp; + } + + Plan->kernel_info = NULL; + + if(Plan->kernel_string) + { + delete Plan->kernel_string; + Plan->kernel_string = NULL; + } + if(Plan->twist_kernel) + { + clReleaseKernel(Plan->twist_kernel); + Plan->twist_kernel = NULL; + } + if(Plan->program) + { + clReleaseProgram(Plan->program); + Plan->program = NULL; + } + if(Plan->tempmemobj) + { + clReleaseMemObject(Plan->tempmemobj); + Plan->tempmemobj = NULL; + } + if(Plan->tempmemobj_real) + { + clReleaseMemObject(Plan->tempmemobj_real); + Plan->tempmemobj_real = NULL; + } + if(Plan->tempmemobj_imag) + { + clReleaseMemObject(Plan->tempmemobj_imag); + Plan->tempmemobj_imag = NULL; + } +} + +static int +createKernelList(cl_fft_plan *plan) +{ + cl_program program = plan->program; + cl_fft_kernel_info *kernel_info = plan->kernel_info; + + cl_int err; + while(kernel_info) + { + kernel_info->kernel = clCreateKernel(program, kernel_info->kernel_name, &err); + if(!kernel_info->kernel || err != CL_SUCCESS) + return err; + kernel_info = kernel_info->next; + } + + if(plan->format == clFFT_SplitComplexFormat) + plan->twist_kernel = clCreateKernel(program, "clFFT_1DTwistSplit", &err); + else + plan->twist_kernel = clCreateKernel(program, "clFFT_1DTwistInterleaved", &err); + + if(!plan->twist_kernel || err) + return err; + + return CL_SUCCESS; +} + +int getMaxKernelWorkGroupSize(cl_fft_plan *plan, unsigned int *max_wg_size, unsigned int num_devices, cl_device_id *devices) +{ + int reg_needed = 0; + *max_wg_size = INT_MAX; + int err; + size_t wg_size; + + unsigned int i; + for(i = 0; i < num_devices; i++) + { + cl_fft_kernel_info *kInfo = plan->kernel_info; + while(kInfo) + { + err = clGetKernelWorkGroupInfo(kInfo->kernel, devices[i], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wg_size, NULL); + if(err != CL_SUCCESS) + return -1; + + if(wg_size < kInfo->num_workitems_per_workgroup) + reg_needed |= 1; + + if(*max_wg_size > wg_size) + *max_wg_size = wg_size; + + kInfo = kInfo->next; + } + } + + return reg_needed; +} + #define ERR_MACRO(err) { \ if( err != CL_SUCCESS) \ { \ if(error_code) \ *error_code = err; \ clFFT_DestroyPlan((clFFT_Plan) plan); \ - return (clFFT_Plan) NULL; \ + return (clFFT_Plan) NULL; \ } \ - } - -clFFT_Plan -clFFT_CreatePlan(cl_context context, clFFT_Dim3 n, clFFT_Dimension dim, clFFT_DataFormat dataFormat, cl_int *error_code ) -{ - int i; - cl_int err; - int isPow2 = 1; - cl_fft_plan *plan = NULL; - ostringstream kString; - int num_devices; - int gpu_found = 0; - cl_device_id devices[16]; - size_t ret_size; - cl_device_type device_type; - - if(!context) - ERR_MACRO(CL_INVALID_VALUE); - - isPow2 |= n.x && !( (n.x - 1) & n.x ); - isPow2 |= n.y && !( (n.y - 1) & n.y ); - isPow2 |= n.z && !( (n.z - 1) & n.z ); - - if(!isPow2) - ERR_MACRO(CL_INVALID_VALUE); - - if( (dim == clFFT_1D && (n.y != 1 || n.z != 1)) || (dim == clFFT_2D && n.z != 1) ) - ERR_MACRO(CL_INVALID_VALUE); - - plan = (cl_fft_plan *) malloc(sizeof(cl_fft_plan)); - if(!plan) - ERR_MACRO(CL_OUT_OF_RESOURCES); - - plan->context = context; - clRetainContext(context); - plan->n = n; - plan->dim = dim; - plan->format = dataFormat; - plan->kernel_info = 0; - plan->num_kernels = 0; - plan->twist_kernel = 0; - plan->program = 0; - plan->temp_buffer_needed = 0; - plan->last_batch_size = 0; - plan->tempmemobj = 0; - plan->tempmemobj_real = 0; - plan->tempmemobj_imag = 0; - plan->max_localmem_fft_size = 2048; - plan->max_work_item_per_workgroup = 256; - plan->max_radix = 16; - plan->min_mem_coalesce_width = 16; - plan->num_local_mem_banks = 16; -patch_kernel_source: - - plan->kernel_string = new string(""); - if(!plan->kernel_string) - ERR_MACRO(CL_OUT_OF_RESOURCES); - - getBlockConfigAndKernelString(plan); - - const char *source_str = plan->kernel_string->c_str(); - plan->program = clCreateProgramWithSource(context, 1, (const char**) &source_str, NULL, &err); - ERR_MACRO(err); - - err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(devices), devices, &ret_size); - ERR_MACRO(err); - - num_devices = ret_size / sizeof(cl_device_id); - - for(i = 0; i < num_devices; i++) - { - err = clGetDeviceInfo(devices[i], CL_DEVICE_TYPE, sizeof(device_type), &device_type, NULL); - ERR_MACRO(err); - - if(device_type == CL_DEVICE_TYPE_GPU) - { - gpu_found = 1; - err = clBuildProgram(plan->program, 1, &devices[i], "-cl-mad-enable", NULL, NULL); - if (err != CL_SUCCESS) - { - char *build_log; - char devicename[200]; - size_t log_size; - - err = clGetProgramBuildInfo(plan->program, devices[i], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); - printf ("E: %d\n", err); - ERR_MACRO(err); - - build_log = (char *) malloc(log_size + 1); - - err = clGetProgramBuildInfo(plan->program, devices[i], CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL); - printf ("E: %d\n", err); - ERR_MACRO(err); - - err = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(devicename), devicename, NULL); - printf ("E: %d\n", err); - ERR_MACRO(err); - - fprintf(stdout, "FFT program build log on device %s\n", devicename); - fprintf(stdout, "%s\n", build_log); - free(build_log); - - ERR_MACRO(err); - } - } - } - - if(!gpu_found) - ERR_MACRO(CL_INVALID_CONTEXT); - - err = createKernelList(plan); - ERR_MACRO(err); - - // we created program and kernels based on "some max work group size (default 256)" ... this work group size - // may be larger than what kernel may execute with ... if thats the case we need to regenerate the kernel source - // setting this as limit i.e max group size and rebuild. - unsigned int max_kernel_wg_size; - int patching_req = getMaxKernelWorkGroupSize(plan, &max_kernel_wg_size, num_devices, devices); - if(patching_req == -1) - { - ERR_MACRO(err); - } - - if(patching_req) - { - destroy_plan(plan); - plan->max_work_item_per_workgroup = max_kernel_wg_size; - goto patch_kernel_source; - } - - cl_fft_kernel_info *kInfo = plan->kernel_info; - while(kInfo) - { - plan->num_kernels++; - kInfo = kInfo->next; - } - - if(error_code) - *error_code = CL_SUCCESS; - - return (clFFT_Plan) plan; -} - -void -clFFT_DestroyPlan(clFFT_Plan plan) -{ - cl_fft_plan *Plan = (cl_fft_plan *) plan; - if(Plan) - { - destroy_plan(Plan); - clReleaseContext(Plan->context); - free(Plan); - } -} - -void clFFT_DumpPlan( clFFT_Plan Plan, FILE *file) -{ - size_t gDim, lDim; - FILE *out; - if(!file) - out = stdout; - else - out = file; - - cl_fft_plan *plan = (cl_fft_plan *) Plan; - cl_fft_kernel_info *kInfo = plan->kernel_info; - - while(kInfo) - { - cl_int s = 1; - getKernelWorkDimensions(plan, kInfo, &s, &gDim, &lDim); - fprintf(out, "Run kernel %s with global dim = {%zd*BatchSize}, local dim={%zd}\n", kInfo->kernel_name, gDim, lDim); - kInfo = kInfo->next; - } - fprintf(out, "%s\n", plan->kernel_string->c_str()); -} + } + +clFFT_Plan +clFFT_CreatePlan(cl_context context, clFFT_Dim3 n, clFFT_Dimension dim, clFFT_DataFormat dataFormat, cl_int *error_code ) +{ + int i; + cl_int err; + int isPow2 = 1; + cl_fft_plan *plan = NULL; + ostringstream kString; + int num_devices; + int gpu_found = 0; + cl_device_id devices[16]; + size_t ret_size; + cl_device_type device_type; + + if(!context) + ERR_MACRO(CL_INVALID_VALUE); + + isPow2 |= n.x && !( (n.x - 1) & n.x ); + isPow2 |= n.y && !( (n.y - 1) & n.y ); + isPow2 |= n.z && !( (n.z - 1) & n.z ); + + if(!isPow2) + ERR_MACRO(CL_INVALID_VALUE); + + if( (dim == clFFT_1D && (n.y != 1 || n.z != 1)) || (dim == clFFT_2D && n.z != 1) ) + ERR_MACRO(CL_INVALID_VALUE); + + plan = (cl_fft_plan *) malloc(sizeof(cl_fft_plan)); + if(!plan) + ERR_MACRO(CL_OUT_OF_RESOURCES); + + plan->context = context; + clRetainContext(context); + plan->n = n; + plan->dim = dim; + plan->format = dataFormat; + plan->kernel_info = 0; + plan->num_kernels = 0; + plan->twist_kernel = 0; + plan->program = 0; + plan->temp_buffer_needed = 0; + plan->last_batch_size = 0; + plan->tempmemobj = 0; + plan->tempmemobj_real = 0; + plan->tempmemobj_imag = 0; + plan->max_localmem_fft_size = 2048; + plan->max_work_item_per_workgroup = 256; + plan->max_radix = 16; + plan->min_mem_coalesce_width = 16; + plan->num_local_mem_banks = 16; + +patch_kernel_source: + + plan->kernel_string = new string(""); + if(!plan->kernel_string) + ERR_MACRO(CL_OUT_OF_RESOURCES); + + getBlockConfigAndKernelString(plan); + + const char *source_str = plan->kernel_string->c_str(); + plan->program = clCreateProgramWithSource(context, 1, (const char**) &source_str, NULL, &err); + ERR_MACRO(err); + + err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(devices), devices, &ret_size); + ERR_MACRO(err); + + num_devices = ret_size / sizeof(cl_device_id); + + for(i = 0; i < num_devices; i++) + { + err = clGetDeviceInfo(devices[i], CL_DEVICE_TYPE, sizeof(device_type), &device_type, NULL); + ERR_MACRO(err); + + if(device_type == CL_DEVICE_TYPE_GPU) + { + gpu_found = 1; + err = clBuildProgram(plan->program, 1, &devices[i], "-cl-mad-enable", NULL, NULL); + if (err != CL_SUCCESS) + { + char *build_log; + char devicename[200]; + size_t log_size; + + err = clGetProgramBuildInfo(plan->program, devices[i], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); + ERR_MACRO(err); + + build_log = (char *) malloc(log_size + 1); + + err = clGetProgramBuildInfo(plan->program, devices[i], CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL); + ERR_MACRO(err); + + err = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(devicename), devicename, NULL); + ERR_MACRO(err); + + fprintf(stdout, "FFT program build log on device %s\n", devicename); + fprintf(stdout, "%s\n", build_log); + free(build_log); + + ERR_MACRO(err); + } + } + } + + if(!gpu_found) + ERR_MACRO(CL_INVALID_CONTEXT); + + err = createKernelList(plan); + ERR_MACRO(err); + + // we created program and kernels based on "some max work group size (default 256)" ... this work group size + // may be larger than what kernel may execute with ... if thats the case we need to regenerate the kernel source + // setting this as limit i.e max group size and rebuild. + unsigned int max_kernel_wg_size; + int patching_req = getMaxKernelWorkGroupSize(plan, &max_kernel_wg_size, num_devices, devices); + if(patching_req == -1) + { + ERR_MACRO(err); + } + + if(patching_req) + { + destroy_plan(plan); + plan->max_work_item_per_workgroup = max_kernel_wg_size; + goto patch_kernel_source; + } + + cl_fft_kernel_info *kInfo = plan->kernel_info; + while(kInfo) + { + plan->num_kernels++; + kInfo = kInfo->next; + } + + if(error_code) + *error_code = CL_SUCCESS; + + return (clFFT_Plan) plan; +} + +void +clFFT_DestroyPlan(clFFT_Plan plan) +{ + cl_fft_plan *Plan = (cl_fft_plan *) plan; + if(Plan) + { + destroy_plan(Plan); + clReleaseContext(Plan->context); + free(Plan); + } +} + +void clFFT_DumpPlan( clFFT_Plan Plan, FILE *file) +{ + size_t gDim, lDim; + FILE *out; + if(!file) + out = stdout; + else + out = file; + + cl_fft_plan *plan = (cl_fft_plan *) Plan; + cl_fft_kernel_info *kInfo = plan->kernel_info; + + while(kInfo) + { + cl_int s = 1; + getKernelWorkDimensions(plan, kInfo, &s, &gDim, &lDim); + fprintf(out, "Run kernel %s with global dim = {%zd*BatchSize}, local dim={%zd}\n", kInfo->kernel_name, gDim, lDim); + kInfo = kInfo->next; + } + fprintf(out, "%s\n", plan->kernel_string->c_str()); +} \ No newline at end of file