diff --git a/example/test.single.2 b/example/test.single.2 new file mode 100644 index 0000000000000000000000000000000000000000..0d941e085c328f8232ecfa67f7298813df566d09 --- /dev/null +++ b/example/test.single.2 @@ -0,0 +1 @@ +-n 4194304 1 1 -batchsize 3 -dir forward -dim 1D -format interleaved -numiter 100 -testtype out-of-place -inputdatafile test_waveform.dat2 diff --git a/include/clFFT.h b/include/clFFT.h index 5f59869850369f7925cdc7bfb37b2dd51529609e..32dbbd97949ba8cd4108789c416ef9a7fe9717a9 100644 --- a/include/clFFT.h +++ b/include/clFFT.h @@ -84,6 +84,14 @@ typedef enum clFFT_InterleavedComplexFormat = 1 }clFFT_DataFormat; +typedef enum +{ + clFFT_native_trig = 0, + clFFT_sincosfunc = 1, + clFFT_BigLUT = 2, + clFFT_TaylorLUT = 3 +} clFFT_TwiddleFactorMethod; + typedef struct { unsigned int x; diff --git a/src/fft_base_kernels.h b/src/fft_base_kernels.h index 67f7d2ede92c4be6da24f6ab248fbd1325893f42..00c5bcf58fb53a51bc5feb72867b0ec6f527ff11 100644 --- a/src/fft_base_kernels.h +++ b/src/fft_base_kernels.h @@ -59,16 +59,16 @@ static string baseKernels = string( "#endif\n" "#define complexMul(a,b) ((float2)(mad(-(a).y, (b).y, (a).x * (b).x), mad((a).y, (b).x, (a).x * (b).y)))\n" "\n" - "#define cos_sinLUT1(res,dir,i,sinLUT,cosLUT)\\\n" + "#define cos_sinLUT1(res,dir,i,cossinLUT)\\\n" "{\\\n" - "(res)=(float2)((cosLUT)[i] , (dir)*(sinLUT)[i]);\\\n" + "(res)=(float2)((cossinLUT)[i].x , (dir)*(cossinLUT)[i].y);\\\n" "}\n" "\n" - "#define cos_sinLUT2(res,dir,_i,_k,sinLUT1,cosLUT1,sinLUT2,cosLUT2) \\\n" - "{ float _sin_1= (sinLUT1)[_i]; \\\n" - " float _sin_2= (sinLUT2)[_k]; \\\n" - " float _cos_1= (cosLUT1)[_i]; \\\n" - " float _cos_2= (cosLUT2)[_k]; \\\n" + "#define cos_sinLUT2(res,dir,_i,_k,cossinLUT1,cossinLUT2) \\\n" + "{ float _sin_1= (cossinLUT1)[_i].y; \\\n" + " float _sin_2= (cossinLUT2)[_k].y; \\\n" + " float _cos_1= (cossinLUT1)[_i].x; \\\n" + " float _cos_2= (cossinLUT2)[_k].x; \\\n" " float _cos_res = _cos_1 * _cos_2 - _sin_1 * _sin_2; \\\n" " float _sin_res = (dir) * (_sin_1 * _cos_2 + _cos_1 * _sin_2); \\\n" " (res)=(float2)(_cos_res,_sin_res); \\\n" diff --git a/src/fft_execute.cpp b/src/fft_execute.cpp index 21d72dbd79b3c584081dee7853941a77adb188d9..2ae30585b2f01167edcc5757f1e2ba37efb346bf 100644 --- a/src/fft_execute.cpp +++ b/src/fft_execute.cpp @@ -180,12 +180,9 @@ clFFT_ExecuteInterleaved( cl_command_queue queue, clFFT_Plan Plan, cl_int batchS err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj[currWrite]); err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_int), &dir); err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_int), &s); - err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_mem), &(plan->sin_LUT_d1)); - err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_mem), &(plan->cos_LUT_d1)); - err |= clSetKernelArg(kernelInfo->kernel, 6, sizeof(cl_mem), &(plan->sin_LUT_d2)); - err |= clSetKernelArg(kernelInfo->kernel, 7, sizeof(cl_mem), &(plan->cos_LUT_d2)); + err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_mem), &(plan->cossin_LUT_d1)); + err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_mem), &(plan->cossin_LUT_d2)); - err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL); if(err) return err; @@ -208,10 +205,8 @@ clFFT_ExecuteInterleaved( cl_command_queue queue, clFFT_Plan Plan, cl_int batchS err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj[currWrite]); err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_int), &dir); err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_int), &s); - err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_mem), &(plan->sin_LUT_d1)); - err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_mem), &(plan->cos_LUT_d1)); - err |= clSetKernelArg(kernelInfo->kernel, 6, sizeof(cl_mem), &(plan->sin_LUT_d2)); - err |= clSetKernelArg(kernelInfo->kernel, 7, sizeof(cl_mem), &(plan->cos_LUT_d2)); + err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_mem), &(plan->cossin_LUT_d1)); + err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_mem), &(plan->cossin_LUT_d2)); err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL); if(err) @@ -294,12 +289,8 @@ clFFT_ExecutePlannar( cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_mem), &memObj_imag[currWrite]); err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_int), &dir); err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_int), &s); - err |= clSetKernelArg(kernelInfo->kernel, 6, sizeof(cl_mem), &(plan->sin_LUT_d1)); - err |= clSetKernelArg(kernelInfo->kernel, 7, sizeof(cl_mem), &(plan->cos_LUT_d1)); - err |= clSetKernelArg(kernelInfo->kernel, 8, sizeof(cl_mem), &(plan->sin_LUT_d2)); - err |= clSetKernelArg(kernelInfo->kernel, 9, sizeof(cl_mem), &(plan->cos_LUT_d2)); - - + err |= clSetKernelArg(kernelInfo->kernel, 6, sizeof(cl_mem), &(plan->cossin_LUT_d1)); + err |= clSetKernelArg(kernelInfo->kernel, 7, sizeof(cl_mem), &(plan->cossin_LUT_d2)); err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL); if(err) @@ -324,10 +315,8 @@ clFFT_ExecutePlannar( cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_mem), &memObj_imag[currWrite]); err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_int), &dir); err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_int), &s); - err |= clSetKernelArg(kernelInfo->kernel, 6, sizeof(cl_mem), &(plan->sin_LUT_d1)); - err |= clSetKernelArg(kernelInfo->kernel, 7, sizeof(cl_mem), &(plan->cos_LUT_d1)); - err |= clSetKernelArg(kernelInfo->kernel, 8, sizeof(cl_mem), &(plan->sin_LUT_d2)); - err |= clSetKernelArg(kernelInfo->kernel, 9, sizeof(cl_mem), &(plan->cos_LUT_d2)); + err |= clSetKernelArg(kernelInfo->kernel, 6, sizeof(cl_mem), &(plan->cossin_LUT_d1)); + err |= clSetKernelArg(kernelInfo->kernel, 7, sizeof(cl_mem), &(plan->cossin_LUT_d2)); err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL); if(err) diff --git a/src/fft_internal.h b/src/fft_internal.h index 98557ab8fff3627d21bc64c5407aca932c9f9fe0..03d7deba1b3cf102dde9039aad362ac1aba54f78 100644 --- a/src/fft_internal.h +++ b/src/fft_internal.h @@ -138,12 +138,13 @@ typedef struct // precomputed lookup tables for sin,cos calculations, each of size // sqrt(n) or 2*sqrt(n), n is size of signal; - cl_mem sin_LUT_d1,sin_LUT_d2; - cl_mem cos_LUT_d1,cos_LUT_d2; + cl_mem cossin_LUT_d1; + cl_mem cossin_LUT_d2; int logN1; int logN2; size_t N1; size_t N2; + clFFT_TwiddleFactorMethod twiddleMethod; // Maximum size of signal for which local memory transposed based // fft is sufficient i.e. no global mem transpose (communication) diff --git a/src/fft_kernelstring.cpp b/src/fft_kernelstring.cpp index 0fbe523606adb64461df9c2623648e4fe07331eb..416fbfcd84e9ada83ad8acb5fe21ace43abab51d 100644 --- a/src/fft_kernelstring.cpp +++ b/src/fft_kernelstring.cpp @@ -183,12 +183,12 @@ insertHeader(string &kernelString, string &kernelName, clFFT_DataFormat dataForm if(dataFormat == clFFT_SplitComplexFormat) kernelString += string("__kernel void ") + kernelName + string("(__global float *in_real, __global float *in_imag, __global float *out_real, __global float *out_imag, int dir, int S," - "__global float * sinLUT1, __global float * cosLUT1, __global float * sinLUT2, __global float * cosLUT2)\n"); + "__global float2 * cossinLUT1, __global float2 * cossinLUT2 )\n"); // "__constant float * sinLUT1, __constant float * cosLUT1, __constant float * sinLUT2, __constant float * cosLUT2)\n"); else // kernelString += string("__kernel void ") + kernelName + string("(__global float2 *in, __global float2 *out, int dir, int S, __constant float * sinLUT1, __constant float * cosLUT1, __constant float * sinLUT2, __constant float * cosLUT2)\n"); - kernelString += string("__kernel void ") + kernelName + string("(__global float2 *in, __global float2 *out, int dir, int S, __global float * sinLUT1, __global float * cosLUT1, __global float * sinLUT2, __global float * cosLUT2)\n"); + kernelString += string("__kernel void ") + kernelName + string("(__global float2 *in, __global float2 *out, int dir, int S, __global float2 * cossinLUT1, __global float2 * cossinLUT2 )\n"); } @@ -694,6 +694,7 @@ insertLocalLoads(string &kernelString, int n, int Nr, int Nrn, int Nprev, int Nc kernelString += string(" a[") + num2str(i*Nrn + z) + string("].") + comp + string(" = lMemLoad[") + num2str(st) + string("];\n"); } } + kernelString += string(" barrier(CLK_LOCAL_MEM_FENCE);\n"); } @@ -931,8 +932,10 @@ getGlobalRadixInfo(int n, int *radix, int *R1, int *R2, int *numRadices) // expr is C++ code for an expression evaluating to an integer, // such that num * value(expr)< denom + + static void -insertSinCosCalcDirect(string & kernel_string, cl_fft_plan *plan, int num, int denom , string & expr, string & varRes) +insertSinCosCalcDirectNative(string & kernel_string, cl_fft_plan *plan, int num, int denom , string & expr, string & varRes) { if(denom & (denom-1)) { kernel_string += string("ang = dir*(2.0f*M_PI*") + num2str(num) + string("/") + num2str(denom) + string(")*("+expr+");\n"); @@ -964,6 +967,46 @@ insertSinCosCalcDirect(string & kernel_string, cl_fft_plan *plan, int num, int d kernel_string += varRes+string(" = (float2)(native_cos(ang), native_sin(ang));\n"); } +static void +insertSinCosCalcDirect(string & kernel_string, cl_fft_plan *plan, int num, int denom , string & expr, string & varRes) +{ + if(denom & (denom-1)) { + kernel_string += string("ang = dir*(2.0f*M_PI*") + num2str(num) + string("/") + num2str(denom) + string(")*("+expr+");\n"); + } else { + // denom is a power of two + int logDenom =0; + int d= denom; + while (d != 1) { + logDenom++; + d >>= 1; + } + int pi_exp=1+1-logDenom; + string pi_mult_hex = string("0x1.921fb54442d18p") + (pi_exp>0 ? string("+") : string("")) + num2str(pi_exp); + switch (num) { + case 0 : + kernel_string += string("ang = 0.0f;\n"); + break; + case 1 : + kernel_string += string("ang = dir*(" + pi_mult_hex + string(") * (")+expr+");\n"); + break; + default: + float pi2=0x1.921fb54442d18p+2; + char tw[200]; + sprintf(tw,"%a",pi2*num / (float) denom); + kernel_string += string("ang = dir*(" + string(tw) + string(") * (")+expr+");\n"); + break; + } + } + + kernel_string += string("{\n"); + kernel_string += string(" float _tmpcos,_tmpsin;\n"); + kernel_string += string(" _tmpsin=sincos(ang,&_tmpcos);\n"); + kernel_string += varRes+string(" = (float2)(_tmpcos, _tmpsin);\n"); + kernel_string += string("}\n"); + +} + + static void insertSinCosCalc(string & kernel_string, cl_fft_plan *plan, int num, int denom , string & expr, string & varRes) { @@ -984,18 +1027,13 @@ insertSinCosCalc(string & kernel_string, cl_fft_plan *plan, int num, int denom , kernel_string += string("{ int ang_index = ") + num2str(num_norm) + string(" * ( ") + expr + string(") ; \n"); kernel_string += string(" int ang_index_k = (ang_index >> " ) + num2str(plan->logN1)+ string(") & "+num2str(plan->N2-1)+ ";\n"); - kernel_string += string(" cos_sinLUT1("+varRes+",dir,ang_index_k,sinLUT2,cosLUT2);\n"); + kernel_string += string(" cos_sinLUT1("+varRes+",dir,ang_index_k,cossinLUT2);\n"); kernel_string += string("}\n"); - - - } else { -// insertSinCosCalcDirect(kernel_string,plan,num,denom,expr,varRes); - kernel_string += string("{ int ang_index = ") + num2str(num_norm) + string(" * ( ") + expr + string(") ; \n"); kernel_string += string(" int ang_index_k = ((ang_index >> " ) + num2str(plan->logN1)+ string(") & "+num2str(plan->N2-1)+ ");\n"); kernel_string += string(" int ang_index_i = (ang_index & " ) + num2str(plan->N1-1)+ string(");\n"); - kernel_string += string(" cos_sinLUT2("+varRes+",dir,ang_index_i,ang_index_k,sinLUT1,cosLUT1,sinLUT2,cosLUT2);\n"); + kernel_string += string(" cos_sinLUT2("+varRes+",dir,ang_index_i,ang_index_k,cossinLUT1,cossinLUT2);\n"); kernel_string += string("}\n"); } @@ -1004,7 +1042,114 @@ insertSinCosCalc(string & kernel_string, cl_fft_plan *plan, int num, int denom , } } +// pedantic : use Taylor series approx to degree 3, on 64 grid points between [0,2pi[ + +static void +insertSinCosCalcTaylor3(string & kernel_string, cl_fft_plan *plan, int num, int denom , string & expr, string & varRes) +{ + if(denom & (denom-1)) { + kernel_string += string("ang = dir*(2.0f*M_PI*") + num2str(num) + string("/") + num2str(denom) + string(")*("+expr+");\n"); + kernel_string += varRes+string(" = (float2)(native_cos(ang), native_sin(ang));\n"); + } else { + + switch (num) { + case 0 : + kernel_string += string("ang = 0.0f;\n"); + kernel_string += varRes+string(" = (float2)(1.0f, 0.0f);\n"); + break; + default: + + // normalize num,denom while num is even + while((num % 2 ==0 ) && (denom %2 == 0) ){ + denom >>=1; + num >>=1; + } + // if normalized denom < grid size, pick directly from LUT. + if(denom <= plan->N2) { + kernel_string += string("{ int ang_index = (") + num2str(num) + string(" * ( ") + expr + string(")) & ") + num2str(denom -1) + string("; \n"); + kernel_string += string(" int k = ang_index * ") + num2str(plan->N2 / denom ) + string(";\n"); + kernel_string += string(" float2 cs =cossin_T_LUT[k];\n"); + kernel_string += string(" cs.y *=dir;\n"); + + kernel_string += varRes + string(" = cs;\n"); + kernel_string += string("}\n"); + + + } else { + kernel_string += string("{ int ang_index = (") + num2str(num) + string(" * ( ") + expr + string(")) & ") + num2str(denom -1) + string("; \n"); + + // find nearest bin in grid + + int logDenom=0; + int d = denom; + while (d > 1) { + d>>=1; + logDenom++; + } + + kernel_string += string(" int k = (ang_index << ") + num2str(plan->logN2) + string(" + ") + num2str(denom / 2) + string(") >> ") + num2str(logDenom) + string(";\n"); + + // get cos/sin of grid point from LUT + + kernel_string += string(" float2 csx0 =cossin_T_LUT[k];\n"); + kernel_string += string(" float2 csx0Transp= (float2)(csx0.y,-csx0.x);\n"); + kernel_string += string(" int r=ang_index - k * ( ")+ num2str(denom >> plan->logN2) + string(" );\n") ; + + + // calculate distance (with sign) from this grid point + // DO NOT calculate teh angles here directly and subtract as this will deteriorate precision. + // precompute h0=2*pi/denom; + float pi2=0x1.921fb54442d18p+2; + char tw[200]; + sprintf(tw,"%A",-pi2/(float) denom); + kernel_string += string(" float mh=") +string(tw)+string("*(float)r;\n"); + + // compute taylor series terms to order 3 and add them up, in "reverse" order (highest order first) + + kernel_string += string(" float mhsqr2= mh*mh*(-0.5f);\n"); + kernel_string += string(" float hqub6= mhsqr2*mh*(1.0f/3.0f);\n"); + kernel_string += string(" float2 cs;\n"); // we could use varRes in the first place here.. + kernel_string += string(" cs= hqub6 * csx0Transp;\n"); + + kernel_string += string(" cs += mhsqr2*csx0;\n"); + + kernel_string += string(" cs += mh*csx0Transp;\n"); + + kernel_string += string(" cs += csx0;\n"); + + kernel_string += string(" cs.y *=dir;\n"); + + kernel_string += varRes + string(" = cs;\n"); + kernel_string += string("}\n"); + + } + break; + } + } +} + +static void +insertSinCos(string & kernel_string, cl_fft_plan *plan, int num, int denom , string & expr, string & varRes) +{ + switch (plan->twiddleMethod) { + case clFFT_sincosfunc : insertSinCosCalcDirect(kernel_string,plan,num,denom,expr,varRes); break; + case clFFT_BigLUT : insertSinCosCalc(kernel_string,plan,num,denom,expr,varRes); break; + case clFFT_TaylorLUT : insertSinCosCalcTaylor3(kernel_string,plan,num,denom,expr,varRes); break; + default : insertSinCosCalcDirectNative(kernel_string,plan,num,denom,expr,varRes); break; + } +} + +static void +insertLocalSinCosLUT(string & kernel_string, cl_fft_plan *plan, int workgroupsize) { + + // TODO: conditionally copy to local (shared memory) + + if(plan->twiddleMethod == clFFT_TaylorLUT) { + // second LUT holds grid values for Taylor seres approx + kernel_string += string(" __global float2 * cossin_T_LUT = cossinLUT2;\n"); + } +} static void createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir dir, int vertBS) @@ -1165,6 +1310,8 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir localString += string("a[") + num2str(j) + string("] = in[") + num2str(j*gInInc*strideI) + string("];\n"); } + insertLocalSinCosLUT(localString, plan, threadsPerBlock); + localString += string("fftKernel") + num2str(R1) + string("(a, dir);\n"); if(R2 > 1) @@ -1175,7 +1322,7 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir for(k = 1; k < R1; k++) { - insertSinCosCalc(localString,plan, k, radix , expr, resVar); + insertSinCos(localString,plan, k, radix , expr, resVar); // localString += string("ang = dir*(2.0f*M_PI*") + num2str(k) + string("/") + num2str(radix) + string(")*j;\n"); // localString += string("w = (float2)(native_cos(ang), native_sin(ang));\n"); localString += string("a[") + num2str(k) + string("] = complexMul(a[") + num2str(k) + string("], w);\n"); @@ -1219,7 +1366,7 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir { string expr = string("l * (k + ") + num2str((t%R2)*R1 + (t/R2)) + string(")"); - insertSinCosCalc(localString, plan, 1, N , expr, varRes) ; + insertSinCos(localString, plan, 1, N , expr, varRes) ; // localString += string("ang = ang1*(k + ") + num2str((t%R2)*R1 + (t/R2)) + string(");\n"); diff --git a/src/fft_setup.cpp b/src/fft_setup.cpp index 05b2680f2f8a2f36ca5984986bdcac0928ad46d8..fed40b05e71570d556fcf89c9d3398c91b39d7b2 100644 --- a/src/fft_setup.cpp +++ b/src/fft_setup.cpp @@ -162,24 +162,14 @@ destroy_plan(cl_fft_plan *Plan) Plan->tempmemobj_imag = NULL; } - if(Plan->sin_LUT_d1) + if(Plan->cossin_LUT_d1) { - clReleaseMemObject(Plan->sin_LUT_d1); + clReleaseMemObject(Plan->cossin_LUT_d1); } - if(Plan->sin_LUT_d2) + if(Plan->cossin_LUT_d2) { - clReleaseMemObject(Plan->sin_LUT_d2); - } - - if(Plan->cos_LUT_d1) - { - clReleaseMemObject(Plan->cos_LUT_d1); - } - - if(Plan->cos_LUT_d2) - { - clReleaseMemObject(Plan->cos_LUT_d2); + clReleaseMemObject(Plan->cossin_LUT_d2); } } @@ -256,8 +246,6 @@ int precomputeSinCosLUTs(cl_fft_plan * plan,cl_int *error_code) { size_t i=0; cl_int err; - float * tmpLUT; - // find logN1,logN2, where // n = 2^logN1 * 2^logN2 , and logN1=logN2 +/- 1 @@ -265,124 +253,81 @@ int precomputeSinCosLUTs(cl_fft_plan * plan,cl_int *error_code) { plan->logN1=0; plan->logN2=0; - - size_t Nrem=N; + plan->N1=1; + plan->N2=1; - while(Nrem > 1) { - plan->logN1++; - Nrem >>= 1; - - if(Nrem > 1) { - plan->logN2++; + switch (plan->twiddleMethod) { + case clFFT_native_trig: return 0; + case clFFT_sincosfunc : return 0; + case clFFT_TaylorLUT : + plan->logN1 = 0; + plan->logN2 = 8; + break; + case clFFT_BigLUT : { + + size_t Nrem=N; + + while(Nrem > 1) { + plan->logN1++; Nrem >>= 1; - } - } + if(Nrem > 1) { + plan->logN2++; + Nrem >>= 1; + } + }} + break; + default: return 1; + } + plan->N1 = 1 << plan->logN1; plan->N2 = 1 << plan->logN2; - // we use logN1 >= logN2 to allocate a buffer that fits - // the greater of the two LUTs - tmpLUT = (float*) malloc( plan->N1 * sizeof(float)); - float * tmpLUT_sin1 = (float*) malloc( plan->N1 * sizeof(float)); - float * tmpLUT_sin2 = (float*) malloc( plan->N1 * sizeof(float)); - float * tmpLUT_cos1 = (float*) malloc( plan->N1 * sizeof(float)); - float * tmpLUT_cos2 = (float*) malloc( plan->N1 * sizeof(float)); + float * tmpLUT_cossin1 = (float*) malloc( plan->N1 * 2 * sizeof(float)); + float * tmpLUT_cossin2 = (float*) malloc( plan->N2 * 2 * sizeof(float)); + -/* - if(tmpLUT==NULL) { - // TODO: error handling - } -*/ double PI2 = 8.0*atan(1.0); for(i=0; i < plan->N1; i++) { - tmpLUT_sin1[i]=(float)sin(PI2 * (float) i / (float)N); + tmpLUT_cossin1[i*2] =(float)cos(PI2 * (float) i / (float)N); + tmpLUT_cossin1[i*2+1]=(float)sin(PI2 * (float) i / (float)N); } - plan->sin_LUT_d1 = clCreateBuffer(plan->context,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, plan->N1*sizeof(float),tmpLUT_sin1, &err); + plan->cossin_LUT_d1 = clCreateBuffer(plan->context,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, plan->N1*2*sizeof(float),tmpLUT_cossin1, &err); if( err != CL_SUCCESS) { if(error_code) *error_code = err; - free(tmpLUT); + free(tmpLUT_cossin1 ); + free(tmpLUT_cossin2 ); + return 1; } - for(i=0; i < plan->N1; i++) { - tmpLUT_cos1[i]=(float)cos(PI2 * (float) i / (float)N); - } - plan->cos_LUT_d1 = clCreateBuffer(plan->context,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, plan->N1*sizeof(float),tmpLUT_cos1, &err); - if( err != CL_SUCCESS) - { - if(error_code) - *error_code = err; - free(tmpLUT); - return 1; - } - for(i=0; i < plan->N2; i++) { - tmpLUT_sin2[i]=(float)sin(PI2 * (float) i / (float) plan->N2); + tmpLUT_cossin2[2*i] =(float)cos(PI2 * (float) i / (float) plan->N2); + tmpLUT_cossin2[2*i+1]=(float)sin(PI2 * (float) i / (float) plan->N2); } - plan->sin_LUT_d2 = clCreateBuffer(plan->context,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, plan->N2*sizeof(float),tmpLUT_sin2, &err); + plan->cossin_LUT_d2 = clCreateBuffer(plan->context,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, plan->N2*2*sizeof(float),tmpLUT_cossin2, &err); if( err != CL_SUCCESS) { - if(error_code) - *error_code = err; - free(tmpLUT); - return 1; - } - - for(i=0; i < plan->N2; i++) { - tmpLUT_cos2[i]=(float)cos(PI2 * (float) i / (float) plan->N2); - } - plan->cos_LUT_d2 = clCreateBuffer(plan->context,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, plan->N2*sizeof(float),tmpLUT_cos2, &err); - if( err != CL_SUCCESS) - { - if(error_code) - *error_code = err; - free(tmpLUT); + if(error_code) + *error_code = err; + free(tmpLUT_cossin1 ); + free(tmpLUT_cossin2 ); + return 1; } - float diff=0.0f; - - for(int i = 0 ; i < N ; i++) { - int ind1 = i & (plan->N1-1); - int ind2 = i >> plan->logN1; - - float cos_ref = cos(-PI2 * (float) i / (float) N); - float sin_ref = sin(-PI2 * (float) i / (float) N); - - float test_sin =-1.0*(tmpLUT_sin1[ind1]*tmpLUT_cos2[ind2]+tmpLUT_cos1[ind1] * tmpLUT_sin2[ind2]); - float test_cos = tmpLUT_cos1[ind1]*tmpLUT_cos2[ind2]-tmpLUT_sin1[ind1] * tmpLUT_sin2[ind2]; - - float test_diff= std::abs(test_sin - sin_ref); - if(test_diff > diff ) diff=test_diff; - test_diff= std::abs(test_cos - cos_ref); - if(test_diff > diff ) diff=test_diff; - - - - } - fprintf(stdout,"!!!!!!!!! %e !!!!!!!!!!!!\n",diff); - - - free(tmpLUT); - - free(tmpLUT_sin1); - free(tmpLUT_sin2); - free(tmpLUT_cos1); - free(tmpLUT_cos2); - - - + free(tmpLUT_cossin1); + free(tmpLUT_cossin2); return 0; @@ -435,17 +380,20 @@ clFFT_CreatePlan(cl_context context, clFFT_Dim3 n, clFFT_Dimension dim, clFFT_Da plan->tempmemobj = 0; plan->tempmemobj_real = 0; plan->tempmemobj_imag = 0; - plan->sin_LUT_d1=0; - plan->sin_LUT_d2=0; - plan->cos_LUT_d1=0; - plan->cos_LUT_d2=0; + plan->cossin_LUT_d1=0; + plan->cossin_LUT_d2=0; plan->max_localmem_fft_size = 2048; plan->max_work_item_per_workgroup = 256; plan->max_radix = 16; plan->min_mem_coalesce_width = 16; plan->num_local_mem_banks = 16; + + //TODO: restore native as default + plan->twiddleMethod = clFFT_TaylorLUT; + precomputeSinCosLUTs(plan,error_code); + patch_kernel_source: