diff --git a/src/fft_base_kernels.h b/src/fft_base_kernels.h index fe70c8b86a0fe59ab51bddab94e530f5f4334650..67f7d2ede92c4be6da24f6ab248fbd1325893f42 100644 --- a/src/fft_base_kernels.h +++ b/src/fft_base_kernels.h @@ -58,6 +58,22 @@ static string baseKernels = string( "#define M_PI 0x1.921fb54442d18p+1\n" "#endif\n" "#define complexMul(a,b) ((float2)(mad(-(a).y, (b).y, (a).x * (b).x), mad((a).y, (b).x, (a).x * (b).y)))\n" + "\n" + "#define cos_sinLUT1(res,dir,i,sinLUT,cosLUT)\\\n" + "{\\\n" + "(res)=(float2)((cosLUT)[i] , (dir)*(sinLUT)[i]);\\\n" + "}\n" + "\n" + "#define cos_sinLUT2(res,dir,_i,_k,sinLUT1,cosLUT1,sinLUT2,cosLUT2) \\\n" + "{ float _sin_1= (sinLUT1)[_i]; \\\n" + " float _sin_2= (sinLUT2)[_k]; \\\n" + " float _cos_1= (cosLUT1)[_i]; \\\n" + " float _cos_2= (cosLUT2)[_k]; \\\n" + " float _cos_res = _cos_1 * _cos_2 - _sin_1 * _sin_2; \\\n" + " float _sin_res = (dir) * (_sin_1 * _cos_2 + _cos_1 * _sin_2); \\\n" + " (res)=(float2)(_cos_res,_sin_res); \\\n" + "}\n" + "\n" "#define conj(a) ((float2)((a).x, -(a).y))\n" "#define conjTransp(a) ((float2)(-(a).y, (a).x))\n" "\n" diff --git a/src/fft_execute.cpp b/src/fft_execute.cpp index bdeb7a3b3624c297a7a053433511dbe142e57180..21d72dbd79b3c584081dee7853941a77adb188d9 100644 --- a/src/fft_execute.cpp +++ b/src/fft_execute.cpp @@ -180,6 +180,11 @@ clFFT_ExecuteInterleaved( cl_command_queue queue, clFFT_Plan Plan, cl_int batchS err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj[currWrite]); err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_int), &dir); err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_int), &s); + err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_mem), &(plan->sin_LUT_d1)); + err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_mem), &(plan->cos_LUT_d1)); + err |= clSetKernelArg(kernelInfo->kernel, 6, sizeof(cl_mem), &(plan->sin_LUT_d2)); + err |= clSetKernelArg(kernelInfo->kernel, 7, sizeof(cl_mem), &(plan->cos_LUT_d2)); + err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL); if(err) @@ -203,6 +208,10 @@ clFFT_ExecuteInterleaved( cl_command_queue queue, clFFT_Plan Plan, cl_int batchS err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj[currWrite]); err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_int), &dir); err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_int), &s); + err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_mem), &(plan->sin_LUT_d1)); + err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_mem), &(plan->cos_LUT_d1)); + err |= clSetKernelArg(kernelInfo->kernel, 6, sizeof(cl_mem), &(plan->sin_LUT_d2)); + err |= clSetKernelArg(kernelInfo->kernel, 7, sizeof(cl_mem), &(plan->cos_LUT_d2)); err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL); if(err) @@ -285,6 +294,12 @@ clFFT_ExecutePlannar( cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_mem), &memObj_imag[currWrite]); err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_int), &dir); err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_int), &s); + err |= clSetKernelArg(kernelInfo->kernel, 6, sizeof(cl_mem), &(plan->sin_LUT_d1)); + err |= clSetKernelArg(kernelInfo->kernel, 7, sizeof(cl_mem), &(plan->cos_LUT_d1)); + err |= clSetKernelArg(kernelInfo->kernel, 8, sizeof(cl_mem), &(plan->sin_LUT_d2)); + err |= clSetKernelArg(kernelInfo->kernel, 9, sizeof(cl_mem), &(plan->cos_LUT_d2)); + + err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL); if(err) @@ -309,6 +324,10 @@ clFFT_ExecutePlannar( cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_mem), &memObj_imag[currWrite]); err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_int), &dir); err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_int), &s); + err |= clSetKernelArg(kernelInfo->kernel, 6, sizeof(cl_mem), &(plan->sin_LUT_d1)); + err |= clSetKernelArg(kernelInfo->kernel, 7, sizeof(cl_mem), &(plan->cos_LUT_d1)); + err |= clSetKernelArg(kernelInfo->kernel, 8, sizeof(cl_mem), &(plan->sin_LUT_d2)); + err |= clSetKernelArg(kernelInfo->kernel, 9, sizeof(cl_mem), &(plan->cos_LUT_d2)); err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL); if(err) diff --git a/src/fft_internal.h b/src/fft_internal.h index 22c3969aa982e6ca1398df43fb1b7bc387098023..98557ab8fff3627d21bc64c5407aca932c9f9fe0 100644 --- a/src/fft_internal.h +++ b/src/fft_internal.h @@ -94,7 +94,7 @@ typedef struct // n, dim, format and other parameters string *kernel_string; - // CL program containing source and kernel this particular + // CL program containing source and kernel for this particular // n, dim, data format cl_program program; @@ -134,6 +134,17 @@ typedef struct // data format of plan (plannar or interleaved) cl_mem tempmemobj_real, tempmemobj_imag; + + // precomputed lookup tables for sin,cos calculations, each of size + // sqrt(n) or 2*sqrt(n), n is size of signal; + + cl_mem sin_LUT_d1,sin_LUT_d2; + cl_mem cos_LUT_d1,cos_LUT_d2; + int logN1; + int logN2; + size_t N1; + size_t N2; + // Maximum size of signal for which local memory transposed based // fft is sufficient i.e. no global mem transpose (communication) // is needed diff --git a/src/fft_kernelstring.cpp b/src/fft_kernelstring.cpp index 52a667c6a7f27124bd17c8856050d76718f04046..0fbe523606adb64461df9c2623648e4fe07331eb 100644 --- a/src/fft_kernelstring.cpp +++ b/src/fft_kernelstring.cpp @@ -181,9 +181,15 @@ static void insertHeader(string &kernelString, string &kernelName, clFFT_DataFormat dataFormat) { if(dataFormat == clFFT_SplitComplexFormat) - kernelString += string("__kernel void ") + kernelName + string("(__global float *in_real, __global float *in_imag, __global float *out_real, __global float *out_imag, int dir, int S)\n"); - else - kernelString += string("__kernel void ") + kernelName + string("(__global float2 *in, __global float2 *out, int dir, int S)\n"); + kernelString += string("__kernel void ") + kernelName + string("(__global float *in_real, __global float *in_imag, __global float *out_real, __global float *out_imag, int dir, int S," + + "__global float * sinLUT1, __global float * cosLUT1, __global float * sinLUT2, __global float * cosLUT2)\n"); + +// "__constant float * sinLUT1, __constant float * cosLUT1, __constant float * sinLUT2, __constant float * cosLUT2)\n"); + else +// kernelString += string("__kernel void ") + kernelName + string("(__global float2 *in, __global float2 *out, int dir, int S, __constant float * sinLUT1, __constant float * cosLUT1, __constant float * sinLUT2, __constant float * cosLUT2)\n"); + kernelString += string("__kernel void ") + kernelName + string("(__global float2 *in, __global float2 *out, int dir, int S, __global float * sinLUT1, __global float * cosLUT1, __global float * sinLUT2, __global float * cosLUT2)\n"); + } static void @@ -579,6 +585,8 @@ insertfftKernel(string &kernelString, int Nr, int numIter) } } + + static void insertTwiddleKernel(string &kernelString, int Nr, int numIter, int Nprev, int len, int numWorkItemsPerXForm) { @@ -612,6 +620,10 @@ insertTwiddleKernel(string &kernelString, int Nr, int numIter, int Nprev, int le } } + + + + static int getPadding(int numWorkItemsPerXForm, int Nprev, int numWorkItemsReq, int numXFormsPerWG, int Nr, int numBanks, int *offset, int *midPad) { @@ -911,6 +923,89 @@ getGlobalRadixInfo(int n, int *radix, int *R1, int *R2, int *numRadices) } } +// generate code to calculate cos(w),sin(w) and assign to a variable varRes +// where w= dir * 2.0 * PI * num/denom * {expr} +// and dir is a variable in the generated code +// PI can be assumed to be defined by the macro M_PI +// num and denom are integers passed to the function +// expr is C++ code for an expression evaluating to an integer, +// such that num * value(expr)< denom + +static void +insertSinCosCalcDirect(string & kernel_string, cl_fft_plan *plan, int num, int denom , string & expr, string & varRes) +{ + if(denom & (denom-1)) { + kernel_string += string("ang = dir*(2.0f*M_PI*") + num2str(num) + string("/") + num2str(denom) + string(")*("+expr+");\n"); + } else { + // denom is a power of two + int logDenom =0; + int d= denom; + while (d != 1) { + logDenom++; + d >>= 1; + } + int pi_exp=1+1-logDenom; + string pi_mult_hex = string("0x1.921fb54442d18p") + (pi_exp>0 ? string("+") : string("")) + num2str(pi_exp); + switch (num) { + case 0 : + kernel_string += string("ang = 0.0f;\n"); + break; + case 1 : + kernel_string += string("ang = dir*(" + pi_mult_hex + string(") * (")+expr+");\n"); + break; + default: + float pi2=0x1.921fb54442d18p+2; + char tw[200]; + sprintf(tw,"%a",pi2*num / (float) denom); + kernel_string += string("ang = dir*(" + string(tw) + string(") * (")+expr+");\n"); + break; + } + } + kernel_string += varRes+string(" = (float2)(native_cos(ang), native_sin(ang));\n"); +} + +static void +insertSinCosCalc(string & kernel_string, cl_fft_plan *plan, int num, int denom , string & expr, string & varRes) +{ + if(denom & (denom-1)) { + kernel_string += string("ang = dir*(2.0f*M_PI*") + num2str(num) + string("/") + num2str(denom) + string(")*("+expr+");\n"); + kernel_string += varRes+string(" = (float2)(native_cos(ang), native_sin(ang));\n"); + } else { + + switch (num) { + case 0 : + kernel_string += string("ang = 0.0f;\n"); + kernel_string += varRes+string(" = (float2)(1.0f, 0.0f);\n"); + break; + default: + int num_norm = num*(plan->N1 * plan->N2)/denom; + + if(num_norm % plan->N1 == 0) { + + kernel_string += string("{ int ang_index = ") + num2str(num_norm) + string(" * ( ") + expr + string(") ; \n"); + kernel_string += string(" int ang_index_k = (ang_index >> " ) + num2str(plan->logN1)+ string(") & "+num2str(plan->N2-1)+ ";\n"); + kernel_string += string(" cos_sinLUT1("+varRes+",dir,ang_index_k,sinLUT2,cosLUT2);\n"); + kernel_string += string("}\n"); + + + + } else { +// insertSinCosCalcDirect(kernel_string,plan,num,denom,expr,varRes); + + kernel_string += string("{ int ang_index = ") + num2str(num_norm) + string(" * ( ") + expr + string(") ; \n"); + kernel_string += string(" int ang_index_k = ((ang_index >> " ) + num2str(plan->logN1)+ string(") & "+num2str(plan->N2-1)+ ");\n"); + kernel_string += string(" int ang_index_i = (ang_index & " ) + num2str(plan->N1-1)+ string(");\n"); + kernel_string += string(" cos_sinLUT2("+varRes+",dir,ang_index_i,ang_index_k,sinLUT1,cosLUT1,sinLUT2,cosLUT2);\n"); + kernel_string += string("}\n"); + + } + break; + } + } +} + + + static void createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir dir, int vertBS) { @@ -1075,10 +1170,14 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir if(R2 > 1) { // twiddle + string expr = string("j"); + string resVar = string("w"); + for(k = 1; k < R1; k++) { - localString += string("ang = dir*(2.0f*M_PI*") + num2str(k) + string("/") + num2str(radix) + string(")*j;\n"); - localString += string("w = (float2)(native_cos(ang), native_sin(ang));\n"); + insertSinCosCalc(localString,plan, k, radix , expr, resVar); +// localString += string("ang = dir*(2.0f*M_PI*") + num2str(k) + string("/") + num2str(radix) + string(")*j;\n"); +// localString += string("w = (float2)(native_cos(ang), native_sin(ang));\n"); localString += string("a[") + num2str(k) + string("] = complexMul(a[") + num2str(k) + string("], w);\n"); } @@ -1111,11 +1210,20 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir { localString += string("l = ((bNum << ") + num2str(lgBatchSize) + string(") + i) >> ") + num2str(lgStrideO) + string(";\n"); localString += string("k = j << ") + num2str((int)log2(R1/R2)) + string(";\n"); - localString += string("ang1 = dir*(2.0f*M_PI/") + num2str(N) + string(")*l;\n"); +// localString += string("ang1 = dir*(2.0f*M_PI/") + num2str(N) + string(")*l;\n"); + + string varRes=string("w"); + + for(t = 0; t < R1; t++) { - localString += string("ang = ang1*(k + ") + num2str((t%R2)*R1 + (t/R2)) + string(");\n"); - localString += string("w = (float2)(native_cos(ang), native_sin(ang));\n"); + string expr = string("l * (k + ") + num2str((t%R2)*R1 + (t/R2)) + string(")"); + + insertSinCosCalc(localString, plan, 1, N , expr, varRes) ; + + +// localString += string("ang = ang1*(k + ") + num2str((t%R2)*R1 + (t/R2)) + string(");\n"); +// localString += string("w = (float2)(native_cos(ang), native_sin(ang));\n"); localString += string("a[") + num2str(t) + string("] = complexMul(a[") + num2str(t) + string("], w);\n"); } } @@ -1255,3 +1363,4 @@ void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir) } } + diff --git a/src/fft_setup.cpp b/src/fft_setup.cpp index f91365440edde10ef322e58148b85329e0203863..05b2680f2f8a2f36ca5984986bdcac0928ad46d8 100644 --- a/src/fft_setup.cpp +++ b/src/fft_setup.cpp @@ -55,6 +55,7 @@ #include <iostream> #include <string> #include <sstream> +#include <cmath> #include <limits.h> using namespace std; @@ -160,6 +161,27 @@ destroy_plan(cl_fft_plan *Plan) clReleaseMemObject(Plan->tempmemobj_imag); Plan->tempmemobj_imag = NULL; } + + if(Plan->sin_LUT_d1) + { + clReleaseMemObject(Plan->sin_LUT_d1); + } + + if(Plan->sin_LUT_d2) + { + clReleaseMemObject(Plan->sin_LUT_d2); + } + + if(Plan->cos_LUT_d1) + { + clReleaseMemObject(Plan->cos_LUT_d1); + } + + if(Plan->cos_LUT_d2) + { + clReleaseMemObject(Plan->cos_LUT_d2); + } + } static int @@ -228,6 +250,146 @@ int getMaxKernelWorkGroupSize(cl_fft_plan *plan, unsigned int *max_wg_size, unsi } \ } +static +int precomputeSinCosLUTs(cl_fft_plan * plan,cl_int *error_code) { + + size_t i=0; + cl_int err; + + float * tmpLUT; + + // find logN1,logN2, where + // n = 2^logN1 * 2^logN2 , and logN1=logN2 +/- 1 + + size_t N=plan->n.x*plan->n.y*plan->n.z; + + plan->logN1=0; + plan->logN2=0; + + size_t Nrem=N; + + while(Nrem > 1) { + plan->logN1++; + Nrem >>= 1; + + if(Nrem > 1) { + plan->logN2++; + Nrem >>= 1; + } + } + + plan->N1 = 1 << plan->logN1; + + plan->N2 = 1 << plan->logN2; + + // we use logN1 >= logN2 to allocate a buffer that fits + // the greater of the two LUTs + tmpLUT = (float*) malloc( plan->N1 * sizeof(float)); + + float * tmpLUT_sin1 = (float*) malloc( plan->N1 * sizeof(float)); + float * tmpLUT_sin2 = (float*) malloc( plan->N1 * sizeof(float)); + float * tmpLUT_cos1 = (float*) malloc( plan->N1 * sizeof(float)); + float * tmpLUT_cos2 = (float*) malloc( plan->N1 * sizeof(float)); + + +/* + if(tmpLUT==NULL) { + // TODO: error handling + } +*/ + double PI2 = 8.0*atan(1.0); + + for(i=0; i < plan->N1; i++) { + tmpLUT_sin1[i]=(float)sin(PI2 * (float) i / (float)N); + } + plan->sin_LUT_d1 = clCreateBuffer(plan->context,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, plan->N1*sizeof(float),tmpLUT_sin1, &err); + + if( err != CL_SUCCESS) + { + if(error_code) + *error_code = err; + free(tmpLUT); + return 1; + } + + + for(i=0; i < plan->N1; i++) { + tmpLUT_cos1[i]=(float)cos(PI2 * (float) i / (float)N); + } + plan->cos_LUT_d1 = clCreateBuffer(plan->context,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, plan->N1*sizeof(float),tmpLUT_cos1, &err); + if( err != CL_SUCCESS) + { + if(error_code) + *error_code = err; + free(tmpLUT); + return 1; + } + + for(i=0; i < plan->N2; i++) { + tmpLUT_sin2[i]=(float)sin(PI2 * (float) i / (float) plan->N2); + } + plan->sin_LUT_d2 = clCreateBuffer(plan->context,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, plan->N2*sizeof(float),tmpLUT_sin2, &err); + if( err != CL_SUCCESS) + { + if(error_code) + *error_code = err; + free(tmpLUT); + return 1; + } + + for(i=0; i < plan->N2; i++) { + tmpLUT_cos2[i]=(float)cos(PI2 * (float) i / (float) plan->N2); + } + plan->cos_LUT_d2 = clCreateBuffer(plan->context,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, plan->N2*sizeof(float),tmpLUT_cos2, &err); + if( err != CL_SUCCESS) + { + if(error_code) + *error_code = err; + free(tmpLUT); + return 1; + } + + + + float diff=0.0f; + + for(int i = 0 ; i < N ; i++) { + int ind1 = i & (plan->N1-1); + int ind2 = i >> plan->logN1; + + float cos_ref = cos(-PI2 * (float) i / (float) N); + float sin_ref = sin(-PI2 * (float) i / (float) N); + + float test_sin =-1.0*(tmpLUT_sin1[ind1]*tmpLUT_cos2[ind2]+tmpLUT_cos1[ind1] * tmpLUT_sin2[ind2]); + float test_cos = tmpLUT_cos1[ind1]*tmpLUT_cos2[ind2]-tmpLUT_sin1[ind1] * tmpLUT_sin2[ind2]; + + float test_diff= std::abs(test_sin - sin_ref); + if(test_diff > diff ) diff=test_diff; + test_diff= std::abs(test_cos - cos_ref); + if(test_diff > diff ) diff=test_diff; + + + + } + fprintf(stdout,"!!!!!!!!! %e !!!!!!!!!!!!\n",diff); + + + free(tmpLUT); + + free(tmpLUT_sin1); + free(tmpLUT_sin2); + free(tmpLUT_cos1); + free(tmpLUT_cos2); + + + + + + return 0; +} + + + clFFT_Plan clFFT_CreatePlan(cl_context context, clFFT_Dim3 n, clFFT_Dimension dim, clFFT_DataFormat dataFormat, cl_int *error_code ) { @@ -273,12 +435,18 @@ clFFT_CreatePlan(cl_context context, clFFT_Dim3 n, clFFT_Dimension dim, clFFT_Da plan->tempmemobj = 0; plan->tempmemobj_real = 0; plan->tempmemobj_imag = 0; + plan->sin_LUT_d1=0; + plan->sin_LUT_d2=0; + plan->cos_LUT_d1=0; + plan->cos_LUT_d2=0; plan->max_localmem_fft_size = 2048; plan->max_work_item_per_workgroup = 256; plan->max_radix = 16; plan->min_mem_coalesce_width = 16; plan->num_local_mem_banks = 16; + precomputeSinCosLUTs(plan,error_code); + patch_kernel_source: plan->kernel_string = new string("");