Skip to content
Snippets Groups Projects
Commit 20314512 authored by Heinz-Bernd Eggenstein's avatar Heinz-Bernd Eggenstein
Browse files

Bug #1608: clFFT use of native_sin , native_cos can cause validation problems

experimental: -added alternative method for twiddle factor calc, using a smaller LUT (256 * float2 )
               via Taylor series to 3rd order, seems to be almost as accurate as method with 2 bigger LUTs, but faster.
              -improved method w/ 2 bigger LUTs to use LUTs of float2
              -improved method using slow sin/cos functions (now uses sincos combined function), still slow
              - preparaed plan struct to have method switchable at plan creation time.

              TODO: load smaller LUT for Taylor series approx into shared mem.
parent 48a3c019
Branches
Tags
No related merge requests found
-n 4194304 1 1 -batchsize 3 -dir forward -dim 1D -format interleaved -numiter 100 -testtype out-of-place -inputdatafile test_waveform.dat2
...@@ -84,6 +84,14 @@ typedef enum ...@@ -84,6 +84,14 @@ typedef enum
clFFT_InterleavedComplexFormat = 1 clFFT_InterleavedComplexFormat = 1
}clFFT_DataFormat; }clFFT_DataFormat;
typedef enum
{
clFFT_native_trig = 0,
clFFT_sincosfunc = 1,
clFFT_BigLUT = 2,
clFFT_TaylorLUT = 3
} clFFT_TwiddleFactorMethod;
typedef struct typedef struct
{ {
unsigned int x; unsigned int x;
......
...@@ -59,16 +59,16 @@ static string baseKernels = string( ...@@ -59,16 +59,16 @@ static string baseKernels = string(
"#endif\n" "#endif\n"
"#define complexMul(a,b) ((float2)(mad(-(a).y, (b).y, (a).x * (b).x), mad((a).y, (b).x, (a).x * (b).y)))\n" "#define complexMul(a,b) ((float2)(mad(-(a).y, (b).y, (a).x * (b).x), mad((a).y, (b).x, (a).x * (b).y)))\n"
"\n" "\n"
"#define cos_sinLUT1(res,dir,i,sinLUT,cosLUT)\\\n" "#define cos_sinLUT1(res,dir,i,cossinLUT)\\\n"
"{\\\n" "{\\\n"
"(res)=(float2)((cosLUT)[i] , (dir)*(sinLUT)[i]);\\\n" "(res)=(float2)((cossinLUT)[i].x , (dir)*(cossinLUT)[i].y);\\\n"
"}\n" "}\n"
"\n" "\n"
"#define cos_sinLUT2(res,dir,_i,_k,sinLUT1,cosLUT1,sinLUT2,cosLUT2) \\\n" "#define cos_sinLUT2(res,dir,_i,_k,cossinLUT1,cossinLUT2) \\\n"
"{ float _sin_1= (sinLUT1)[_i]; \\\n" "{ float _sin_1= (cossinLUT1)[_i].y; \\\n"
" float _sin_2= (sinLUT2)[_k]; \\\n" " float _sin_2= (cossinLUT2)[_k].y; \\\n"
" float _cos_1= (cosLUT1)[_i]; \\\n" " float _cos_1= (cossinLUT1)[_i].x; \\\n"
" float _cos_2= (cosLUT2)[_k]; \\\n" " float _cos_2= (cossinLUT2)[_k].x; \\\n"
" float _cos_res = _cos_1 * _cos_2 - _sin_1 * _sin_2; \\\n" " float _cos_res = _cos_1 * _cos_2 - _sin_1 * _sin_2; \\\n"
" float _sin_res = (dir) * (_sin_1 * _cos_2 + _cos_1 * _sin_2); \\\n" " float _sin_res = (dir) * (_sin_1 * _cos_2 + _cos_1 * _sin_2); \\\n"
" (res)=(float2)(_cos_res,_sin_res); \\\n" " (res)=(float2)(_cos_res,_sin_res); \\\n"
......
...@@ -180,11 +180,8 @@ clFFT_ExecuteInterleaved( cl_command_queue queue, clFFT_Plan Plan, cl_int batchS ...@@ -180,11 +180,8 @@ clFFT_ExecuteInterleaved( cl_command_queue queue, clFFT_Plan Plan, cl_int batchS
err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj[currWrite]); err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj[currWrite]);
err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_int), &dir); err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_int), &dir);
err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_int), &s); err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_int), &s);
err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_mem), &(plan->sin_LUT_d1)); err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_mem), &(plan->cossin_LUT_d1));
err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_mem), &(plan->cos_LUT_d1)); err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_mem), &(plan->cossin_LUT_d2));
err |= clSetKernelArg(kernelInfo->kernel, 6, sizeof(cl_mem), &(plan->sin_LUT_d2));
err |= clSetKernelArg(kernelInfo->kernel, 7, sizeof(cl_mem), &(plan->cos_LUT_d2));
err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL); err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL);
if(err) if(err)
...@@ -208,10 +205,8 @@ clFFT_ExecuteInterleaved( cl_command_queue queue, clFFT_Plan Plan, cl_int batchS ...@@ -208,10 +205,8 @@ clFFT_ExecuteInterleaved( cl_command_queue queue, clFFT_Plan Plan, cl_int batchS
err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj[currWrite]); err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj[currWrite]);
err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_int), &dir); err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_int), &dir);
err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_int), &s); err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_int), &s);
err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_mem), &(plan->sin_LUT_d1)); err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_mem), &(plan->cossin_LUT_d1));
err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_mem), &(plan->cos_LUT_d1)); err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_mem), &(plan->cossin_LUT_d2));
err |= clSetKernelArg(kernelInfo->kernel, 6, sizeof(cl_mem), &(plan->sin_LUT_d2));
err |= clSetKernelArg(kernelInfo->kernel, 7, sizeof(cl_mem), &(plan->cos_LUT_d2));
err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL); err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL);
if(err) if(err)
...@@ -294,12 +289,8 @@ clFFT_ExecutePlannar( cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, ...@@ -294,12 +289,8 @@ clFFT_ExecutePlannar( cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize,
err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_mem), &memObj_imag[currWrite]); err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_mem), &memObj_imag[currWrite]);
err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_int), &dir); err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_int), &dir);
err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_int), &s); err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_int), &s);
err |= clSetKernelArg(kernelInfo->kernel, 6, sizeof(cl_mem), &(plan->sin_LUT_d1)); err |= clSetKernelArg(kernelInfo->kernel, 6, sizeof(cl_mem), &(plan->cossin_LUT_d1));
err |= clSetKernelArg(kernelInfo->kernel, 7, sizeof(cl_mem), &(plan->cos_LUT_d1)); err |= clSetKernelArg(kernelInfo->kernel, 7, sizeof(cl_mem), &(plan->cossin_LUT_d2));
err |= clSetKernelArg(kernelInfo->kernel, 8, sizeof(cl_mem), &(plan->sin_LUT_d2));
err |= clSetKernelArg(kernelInfo->kernel, 9, sizeof(cl_mem), &(plan->cos_LUT_d2));
err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL); err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL);
if(err) if(err)
...@@ -324,10 +315,8 @@ clFFT_ExecutePlannar( cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, ...@@ -324,10 +315,8 @@ clFFT_ExecutePlannar( cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize,
err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_mem), &memObj_imag[currWrite]); err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_mem), &memObj_imag[currWrite]);
err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_int), &dir); err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_int), &dir);
err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_int), &s); err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_int), &s);
err |= clSetKernelArg(kernelInfo->kernel, 6, sizeof(cl_mem), &(plan->sin_LUT_d1)); err |= clSetKernelArg(kernelInfo->kernel, 6, sizeof(cl_mem), &(plan->cossin_LUT_d1));
err |= clSetKernelArg(kernelInfo->kernel, 7, sizeof(cl_mem), &(plan->cos_LUT_d1)); err |= clSetKernelArg(kernelInfo->kernel, 7, sizeof(cl_mem), &(plan->cossin_LUT_d2));
err |= clSetKernelArg(kernelInfo->kernel, 8, sizeof(cl_mem), &(plan->sin_LUT_d2));
err |= clSetKernelArg(kernelInfo->kernel, 9, sizeof(cl_mem), &(plan->cos_LUT_d2));
err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL); err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL);
if(err) if(err)
......
...@@ -138,12 +138,13 @@ typedef struct ...@@ -138,12 +138,13 @@ typedef struct
// precomputed lookup tables for sin,cos calculations, each of size // precomputed lookup tables for sin,cos calculations, each of size
// sqrt(n) or 2*sqrt(n), n is size of signal; // sqrt(n) or 2*sqrt(n), n is size of signal;
cl_mem sin_LUT_d1,sin_LUT_d2; cl_mem cossin_LUT_d1;
cl_mem cos_LUT_d1,cos_LUT_d2; cl_mem cossin_LUT_d2;
int logN1; int logN1;
int logN2; int logN2;
size_t N1; size_t N1;
size_t N2; size_t N2;
clFFT_TwiddleFactorMethod twiddleMethod;
// Maximum size of signal for which local memory transposed based // Maximum size of signal for which local memory transposed based
// fft is sufficient i.e. no global mem transpose (communication) // fft is sufficient i.e. no global mem transpose (communication)
......
...@@ -183,12 +183,12 @@ insertHeader(string &kernelString, string &kernelName, clFFT_DataFormat dataForm ...@@ -183,12 +183,12 @@ insertHeader(string &kernelString, string &kernelName, clFFT_DataFormat dataForm
if(dataFormat == clFFT_SplitComplexFormat) if(dataFormat == clFFT_SplitComplexFormat)
kernelString += string("__kernel void ") + kernelName + string("(__global float *in_real, __global float *in_imag, __global float *out_real, __global float *out_imag, int dir, int S," kernelString += string("__kernel void ") + kernelName + string("(__global float *in_real, __global float *in_imag, __global float *out_real, __global float *out_imag, int dir, int S,"
"__global float * sinLUT1, __global float * cosLUT1, __global float * sinLUT2, __global float * cosLUT2)\n"); "__global float2 * cossinLUT1, __global float2 * cossinLUT2 )\n");
// "__constant float * sinLUT1, __constant float * cosLUT1, __constant float * sinLUT2, __constant float * cosLUT2)\n"); // "__constant float * sinLUT1, __constant float * cosLUT1, __constant float * sinLUT2, __constant float * cosLUT2)\n");
else else
// kernelString += string("__kernel void ") + kernelName + string("(__global float2 *in, __global float2 *out, int dir, int S, __constant float * sinLUT1, __constant float * cosLUT1, __constant float * sinLUT2, __constant float * cosLUT2)\n"); // kernelString += string("__kernel void ") + kernelName + string("(__global float2 *in, __global float2 *out, int dir, int S, __constant float * sinLUT1, __constant float * cosLUT1, __constant float * sinLUT2, __constant float * cosLUT2)\n");
kernelString += string("__kernel void ") + kernelName + string("(__global float2 *in, __global float2 *out, int dir, int S, __global float * sinLUT1, __global float * cosLUT1, __global float * sinLUT2, __global float * cosLUT2)\n"); kernelString += string("__kernel void ") + kernelName + string("(__global float2 *in, __global float2 *out, int dir, int S, __global float2 * cossinLUT1, __global float2 * cossinLUT2 )\n");
} }
...@@ -694,6 +694,7 @@ insertLocalLoads(string &kernelString, int n, int Nr, int Nrn, int Nprev, int Nc ...@@ -694,6 +694,7 @@ insertLocalLoads(string &kernelString, int n, int Nr, int Nrn, int Nprev, int Nc
kernelString += string(" a[") + num2str(i*Nrn + z) + string("].") + comp + string(" = lMemLoad[") + num2str(st) + string("];\n"); kernelString += string(" a[") + num2str(i*Nrn + z) + string("].") + comp + string(" = lMemLoad[") + num2str(st) + string("];\n");
} }
} }
kernelString += string(" barrier(CLK_LOCAL_MEM_FENCE);\n"); kernelString += string(" barrier(CLK_LOCAL_MEM_FENCE);\n");
} }
...@@ -931,8 +932,10 @@ getGlobalRadixInfo(int n, int *radix, int *R1, int *R2, int *numRadices) ...@@ -931,8 +932,10 @@ getGlobalRadixInfo(int n, int *radix, int *R1, int *R2, int *numRadices)
// expr is C++ code for an expression evaluating to an integer, // expr is C++ code for an expression evaluating to an integer,
// such that num * value(expr)< denom // such that num * value(expr)< denom
static void static void
insertSinCosCalcDirect(string & kernel_string, cl_fft_plan *plan, int num, int denom , string & expr, string & varRes) insertSinCosCalcDirectNative(string & kernel_string, cl_fft_plan *plan, int num, int denom , string & expr, string & varRes)
{ {
if(denom & (denom-1)) { if(denom & (denom-1)) {
kernel_string += string("ang = dir*(2.0f*M_PI*") + num2str(num) + string("/") + num2str(denom) + string(")*("+expr+");\n"); kernel_string += string("ang = dir*(2.0f*M_PI*") + num2str(num) + string("/") + num2str(denom) + string(")*("+expr+");\n");
...@@ -964,6 +967,46 @@ insertSinCosCalcDirect(string & kernel_string, cl_fft_plan *plan, int num, int d ...@@ -964,6 +967,46 @@ insertSinCosCalcDirect(string & kernel_string, cl_fft_plan *plan, int num, int d
kernel_string += varRes+string(" = (float2)(native_cos(ang), native_sin(ang));\n"); kernel_string += varRes+string(" = (float2)(native_cos(ang), native_sin(ang));\n");
} }
static void
insertSinCosCalcDirect(string & kernel_string, cl_fft_plan *plan, int num, int denom , string & expr, string & varRes)
{
if(denom & (denom-1)) {
kernel_string += string("ang = dir*(2.0f*M_PI*") + num2str(num) + string("/") + num2str(denom) + string(")*("+expr+");\n");
} else {
// denom is a power of two
int logDenom =0;
int d= denom;
while (d != 1) {
logDenom++;
d >>= 1;
}
int pi_exp=1+1-logDenom;
string pi_mult_hex = string("0x1.921fb54442d18p") + (pi_exp>0 ? string("+") : string("")) + num2str(pi_exp);
switch (num) {
case 0 :
kernel_string += string("ang = 0.0f;\n");
break;
case 1 :
kernel_string += string("ang = dir*(" + pi_mult_hex + string(") * (")+expr+");\n");
break;
default:
float pi2=0x1.921fb54442d18p+2;
char tw[200];
sprintf(tw,"%a",pi2*num / (float) denom);
kernel_string += string("ang = dir*(" + string(tw) + string(") * (")+expr+");\n");
break;
}
}
kernel_string += string("{\n");
kernel_string += string(" float _tmpcos,_tmpsin;\n");
kernel_string += string(" _tmpsin=sincos(ang,&_tmpcos);\n");
kernel_string += varRes+string(" = (float2)(_tmpcos, _tmpsin);\n");
kernel_string += string("}\n");
}
static void static void
insertSinCosCalc(string & kernel_string, cl_fft_plan *plan, int num, int denom , string & expr, string & varRes) insertSinCosCalc(string & kernel_string, cl_fft_plan *plan, int num, int denom , string & expr, string & varRes)
{ {
...@@ -984,18 +1027,99 @@ insertSinCosCalc(string & kernel_string, cl_fft_plan *plan, int num, int denom , ...@@ -984,18 +1027,99 @@ insertSinCosCalc(string & kernel_string, cl_fft_plan *plan, int num, int denom ,
kernel_string += string("{ int ang_index = ") + num2str(num_norm) + string(" * ( ") + expr + string(") ; \n"); kernel_string += string("{ int ang_index = ") + num2str(num_norm) + string(" * ( ") + expr + string(") ; \n");
kernel_string += string(" int ang_index_k = (ang_index >> " ) + num2str(plan->logN1)+ string(") & "+num2str(plan->N2-1)+ ";\n"); kernel_string += string(" int ang_index_k = (ang_index >> " ) + num2str(plan->logN1)+ string(") & "+num2str(plan->N2-1)+ ";\n");
kernel_string += string(" cos_sinLUT1("+varRes+",dir,ang_index_k,sinLUT2,cosLUT2);\n"); kernel_string += string(" cos_sinLUT1("+varRes+",dir,ang_index_k,cossinLUT2);\n");
kernel_string += string("}\n");
} else {
kernel_string += string("{ int ang_index = ") + num2str(num_norm) + string(" * ( ") + expr + string(") ; \n");
kernel_string += string(" int ang_index_k = ((ang_index >> " ) + num2str(plan->logN1)+ string(") & "+num2str(plan->N2-1)+ ");\n");
kernel_string += string(" int ang_index_i = (ang_index & " ) + num2str(plan->N1-1)+ string(");\n");
kernel_string += string(" cos_sinLUT2("+varRes+",dir,ang_index_i,ang_index_k,cossinLUT1,cossinLUT2);\n");
kernel_string += string("}\n"); kernel_string += string("}\n");
}
break;
}
}
}
// pedantic : use Taylor series approx to degree 3, on 64 grid points between [0,2pi[
static void
insertSinCosCalcTaylor3(string & kernel_string, cl_fft_plan *plan, int num, int denom , string & expr, string & varRes)
{
if(denom & (denom-1)) {
kernel_string += string("ang = dir*(2.0f*M_PI*") + num2str(num) + string("/") + num2str(denom) + string(")*("+expr+");\n");
kernel_string += varRes+string(" = (float2)(native_cos(ang), native_sin(ang));\n");
} else { } else {
// insertSinCosCalcDirect(kernel_string,plan,num,denom,expr,varRes);
kernel_string += string("{ int ang_index = ") + num2str(num_norm) + string(" * ( ") + expr + string(") ; \n"); switch (num) {
kernel_string += string(" int ang_index_k = ((ang_index >> " ) + num2str(plan->logN1)+ string(") & "+num2str(plan->N2-1)+ ");\n"); case 0 :
kernel_string += string(" int ang_index_i = (ang_index & " ) + num2str(plan->N1-1)+ string(");\n"); kernel_string += string("ang = 0.0f;\n");
kernel_string += string(" cos_sinLUT2("+varRes+",dir,ang_index_i,ang_index_k,sinLUT1,cosLUT1,sinLUT2,cosLUT2);\n"); kernel_string += varRes+string(" = (float2)(1.0f, 0.0f);\n");
break;
default:
// normalize num,denom while num is even
while((num % 2 ==0 ) && (denom %2 == 0) ){
denom >>=1;
num >>=1;
}
// if normalized denom < grid size, pick directly from LUT.
if(denom <= plan->N2) {
kernel_string += string("{ int ang_index = (") + num2str(num) + string(" * ( ") + expr + string(")) & ") + num2str(denom -1) + string("; \n");
kernel_string += string(" int k = ang_index * ") + num2str(plan->N2 / denom ) + string(";\n");
kernel_string += string(" float2 cs =cossin_T_LUT[k];\n");
kernel_string += string(" cs.y *=dir;\n");
kernel_string += varRes + string(" = cs;\n");
kernel_string += string("}\n");
} else {
kernel_string += string("{ int ang_index = (") + num2str(num) + string(" * ( ") + expr + string(")) & ") + num2str(denom -1) + string("; \n");
// find nearest bin in grid
int logDenom=0;
int d = denom;
while (d > 1) {
d>>=1;
logDenom++;
}
kernel_string += string(" int k = (ang_index << ") + num2str(plan->logN2) + string(" + ") + num2str(denom / 2) + string(") >> ") + num2str(logDenom) + string(";\n");
// get cos/sin of grid point from LUT
kernel_string += string(" float2 csx0 =cossin_T_LUT[k];\n");
kernel_string += string(" float2 csx0Transp= (float2)(csx0.y,-csx0.x);\n");
kernel_string += string(" int r=ang_index - k * ( ")+ num2str(denom >> plan->logN2) + string(" );\n") ;
// calculate distance (with sign) from this grid point
// DO NOT calculate teh angles here directly and subtract as this will deteriorate precision.
// precompute h0=2*pi/denom;
float pi2=0x1.921fb54442d18p+2;
char tw[200];
sprintf(tw,"%A",-pi2/(float) denom);
kernel_string += string(" float mh=") +string(tw)+string("*(float)r;\n");
// compute taylor series terms to order 3 and add them up, in "reverse" order (highest order first)
kernel_string += string(" float mhsqr2= mh*mh*(-0.5f);\n");
kernel_string += string(" float hqub6= mhsqr2*mh*(1.0f/3.0f);\n");
kernel_string += string(" float2 cs;\n"); // we could use varRes in the first place here..
kernel_string += string(" cs= hqub6 * csx0Transp;\n");
kernel_string += string(" cs += mhsqr2*csx0;\n");
kernel_string += string(" cs += mh*csx0Transp;\n");
kernel_string += string(" cs += csx0;\n");
kernel_string += string(" cs.y *=dir;\n");
kernel_string += varRes + string(" = cs;\n");
kernel_string += string("}\n"); kernel_string += string("}\n");
} }
...@@ -1004,7 +1128,28 @@ insertSinCosCalc(string & kernel_string, cl_fft_plan *plan, int num, int denom , ...@@ -1004,7 +1128,28 @@ insertSinCosCalc(string & kernel_string, cl_fft_plan *plan, int num, int denom ,
} }
} }
static void
insertSinCos(string & kernel_string, cl_fft_plan *plan, int num, int denom , string & expr, string & varRes)
{
switch (plan->twiddleMethod) {
case clFFT_sincosfunc : insertSinCosCalcDirect(kernel_string,plan,num,denom,expr,varRes); break;
case clFFT_BigLUT : insertSinCosCalc(kernel_string,plan,num,denom,expr,varRes); break;
case clFFT_TaylorLUT : insertSinCosCalcTaylor3(kernel_string,plan,num,denom,expr,varRes); break;
default : insertSinCosCalcDirectNative(kernel_string,plan,num,denom,expr,varRes); break;
}
}
static void
insertLocalSinCosLUT(string & kernel_string, cl_fft_plan *plan, int workgroupsize) {
// TODO: conditionally copy to local (shared memory)
if(plan->twiddleMethod == clFFT_TaylorLUT) {
// second LUT holds grid values for Taylor seres approx
kernel_string += string(" __global float2 * cossin_T_LUT = cossinLUT2;\n");
}
}
static void static void
createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir dir, int vertBS) createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir dir, int vertBS)
...@@ -1165,6 +1310,8 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir ...@@ -1165,6 +1310,8 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
localString += string("a[") + num2str(j) + string("] = in[") + num2str(j*gInInc*strideI) + string("];\n"); localString += string("a[") + num2str(j) + string("] = in[") + num2str(j*gInInc*strideI) + string("];\n");
} }
insertLocalSinCosLUT(localString, plan, threadsPerBlock);
localString += string("fftKernel") + num2str(R1) + string("(a, dir);\n"); localString += string("fftKernel") + num2str(R1) + string("(a, dir);\n");
if(R2 > 1) if(R2 > 1)
...@@ -1175,7 +1322,7 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir ...@@ -1175,7 +1322,7 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
for(k = 1; k < R1; k++) for(k = 1; k < R1; k++)
{ {
insertSinCosCalc(localString,plan, k, radix , expr, resVar); insertSinCos(localString,plan, k, radix , expr, resVar);
// localString += string("ang = dir*(2.0f*M_PI*") + num2str(k) + string("/") + num2str(radix) + string(")*j;\n"); // localString += string("ang = dir*(2.0f*M_PI*") + num2str(k) + string("/") + num2str(radix) + string(")*j;\n");
// localString += string("w = (float2)(native_cos(ang), native_sin(ang));\n"); // localString += string("w = (float2)(native_cos(ang), native_sin(ang));\n");
localString += string("a[") + num2str(k) + string("] = complexMul(a[") + num2str(k) + string("], w);\n"); localString += string("a[") + num2str(k) + string("] = complexMul(a[") + num2str(k) + string("], w);\n");
...@@ -1219,7 +1366,7 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir ...@@ -1219,7 +1366,7 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
{ {
string expr = string("l * (k + ") + num2str((t%R2)*R1 + (t/R2)) + string(")"); string expr = string("l * (k + ") + num2str((t%R2)*R1 + (t/R2)) + string(")");
insertSinCosCalc(localString, plan, 1, N , expr, varRes) ; insertSinCos(localString, plan, 1, N , expr, varRes) ;
// localString += string("ang = ang1*(k + ") + num2str((t%R2)*R1 + (t/R2)) + string(");\n"); // localString += string("ang = ang1*(k + ") + num2str((t%R2)*R1 + (t/R2)) + string(");\n");
......
...@@ -162,24 +162,14 @@ destroy_plan(cl_fft_plan *Plan) ...@@ -162,24 +162,14 @@ destroy_plan(cl_fft_plan *Plan)
Plan->tempmemobj_imag = NULL; Plan->tempmemobj_imag = NULL;
} }
if(Plan->sin_LUT_d1) if(Plan->cossin_LUT_d1)
{ {
clReleaseMemObject(Plan->sin_LUT_d1); clReleaseMemObject(Plan->cossin_LUT_d1);
} }
if(Plan->sin_LUT_d2) if(Plan->cossin_LUT_d2)
{ {
clReleaseMemObject(Plan->sin_LUT_d2); clReleaseMemObject(Plan->cossin_LUT_d2);
}
if(Plan->cos_LUT_d1)
{
clReleaseMemObject(Plan->cos_LUT_d1);
}
if(Plan->cos_LUT_d2)
{
clReleaseMemObject(Plan->cos_LUT_d2);
} }
} }
...@@ -256,8 +246,6 @@ int precomputeSinCosLUTs(cl_fft_plan * plan,cl_int *error_code) { ...@@ -256,8 +246,6 @@ int precomputeSinCosLUTs(cl_fft_plan * plan,cl_int *error_code) {
size_t i=0; size_t i=0;
cl_int err; cl_int err;
float * tmpLUT;
// find logN1,logN2, where // find logN1,logN2, where
// n = 2^logN1 * 2^logN2 , and logN1=logN2 +/- 1 // n = 2^logN1 * 2^logN2 , and logN1=logN2 +/- 1
...@@ -265,6 +253,17 @@ int precomputeSinCosLUTs(cl_fft_plan * plan,cl_int *error_code) { ...@@ -265,6 +253,17 @@ int precomputeSinCosLUTs(cl_fft_plan * plan,cl_int *error_code) {
plan->logN1=0; plan->logN1=0;
plan->logN2=0; plan->logN2=0;
plan->N1=1;
plan->N2=1;
switch (plan->twiddleMethod) {
case clFFT_native_trig: return 0;
case clFFT_sincosfunc : return 0;
case clFFT_TaylorLUT :
plan->logN1 = 0;
plan->logN2 = 8;
break;
case clFFT_BigLUT : {
size_t Nrem=N; size_t Nrem=N;
...@@ -276,113 +275,59 @@ int precomputeSinCosLUTs(cl_fft_plan * plan,cl_int *error_code) { ...@@ -276,113 +275,59 @@ int precomputeSinCosLUTs(cl_fft_plan * plan,cl_int *error_code) {
plan->logN2++; plan->logN2++;
Nrem >>= 1; Nrem >>= 1;
} }
}}
break;
default: return 1;
} }
plan->N1 = 1 << plan->logN1; plan->N1 = 1 << plan->logN1;
plan->N2 = 1 << plan->logN2; plan->N2 = 1 << plan->logN2;
// we use logN1 >= logN2 to allocate a buffer that fits
// the greater of the two LUTs
tmpLUT = (float*) malloc( plan->N1 * sizeof(float));
float * tmpLUT_sin1 = (float*) malloc( plan->N1 * sizeof(float)); float * tmpLUT_cossin1 = (float*) malloc( plan->N1 * 2 * sizeof(float));
float * tmpLUT_sin2 = (float*) malloc( plan->N1 * sizeof(float)); float * tmpLUT_cossin2 = (float*) malloc( plan->N2 * 2 * sizeof(float));
float * tmpLUT_cos1 = (float*) malloc( plan->N1 * sizeof(float));
float * tmpLUT_cos2 = (float*) malloc( plan->N1 * sizeof(float));
/*
if(tmpLUT==NULL) {
// TODO: error handling
}
*/
double PI2 = 8.0*atan(1.0); double PI2 = 8.0*atan(1.0);
for(i=0; i < plan->N1; i++) { for(i=0; i < plan->N1; i++) {
tmpLUT_sin1[i]=(float)sin(PI2 * (float) i / (float)N); tmpLUT_cossin1[i*2] =(float)cos(PI2 * (float) i / (float)N);
tmpLUT_cossin1[i*2+1]=(float)sin(PI2 * (float) i / (float)N);
} }
plan->sin_LUT_d1 = clCreateBuffer(plan->context,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, plan->N1*sizeof(float),tmpLUT_sin1, &err); plan->cossin_LUT_d1 = clCreateBuffer(plan->context,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, plan->N1*2*sizeof(float),tmpLUT_cossin1, &err);
if( err != CL_SUCCESS) if( err != CL_SUCCESS)
{ {
if(error_code) if(error_code)
*error_code = err; *error_code = err;
free(tmpLUT); free(tmpLUT_cossin1 );
return 1; free(tmpLUT_cossin2 );
}
for(i=0; i < plan->N1; i++) {
tmpLUT_cos1[i]=(float)cos(PI2 * (float) i / (float)N);
}
plan->cos_LUT_d1 = clCreateBuffer(plan->context,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, plan->N1*sizeof(float),tmpLUT_cos1, &err);
if( err != CL_SUCCESS)
{
if(error_code)
*error_code = err;
free(tmpLUT);
return 1; return 1;
} }
for(i=0; i < plan->N2; i++) {
tmpLUT_sin2[i]=(float)sin(PI2 * (float) i / (float) plan->N2);
}
plan->sin_LUT_d2 = clCreateBuffer(plan->context,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, plan->N2*sizeof(float),tmpLUT_sin2, &err);
if( err != CL_SUCCESS)
{
if(error_code)
*error_code = err;
free(tmpLUT);
return 1;
}
for(i=0; i < plan->N2; i++) { for(i=0; i < plan->N2; i++) {
tmpLUT_cos2[i]=(float)cos(PI2 * (float) i / (float) plan->N2); tmpLUT_cossin2[2*i] =(float)cos(PI2 * (float) i / (float) plan->N2);
tmpLUT_cossin2[2*i+1]=(float)sin(PI2 * (float) i / (float) plan->N2);
} }
plan->cos_LUT_d2 = clCreateBuffer(plan->context,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, plan->N2*sizeof(float),tmpLUT_cos2, &err); plan->cossin_LUT_d2 = clCreateBuffer(plan->context,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, plan->N2*2*sizeof(float),tmpLUT_cossin2, &err);
if( err != CL_SUCCESS) if( err != CL_SUCCESS)
{ {
if(error_code) if(error_code)
*error_code = err; *error_code = err;
free(tmpLUT); free(tmpLUT_cossin1 );
return 1; free(tmpLUT_cossin2 );
}
float diff=0.0f;
for(int i = 0 ; i < N ; i++) {
int ind1 = i & (plan->N1-1);
int ind2 = i >> plan->logN1;
float cos_ref = cos(-PI2 * (float) i / (float) N);
float sin_ref = sin(-PI2 * (float) i / (float) N);
float test_sin =-1.0*(tmpLUT_sin1[ind1]*tmpLUT_cos2[ind2]+tmpLUT_cos1[ind1] * tmpLUT_sin2[ind2]);
float test_cos = tmpLUT_cos1[ind1]*tmpLUT_cos2[ind2]-tmpLUT_sin1[ind1] * tmpLUT_sin2[ind2];
float test_diff= std::abs(test_sin - sin_ref);
if(test_diff > diff ) diff=test_diff;
test_diff= std::abs(test_cos - cos_ref);
if(test_diff > diff ) diff=test_diff;
return 1;
} }
fprintf(stdout,"!!!!!!!!! %e !!!!!!!!!!!!\n",diff);
free(tmpLUT);
free(tmpLUT_sin1);
free(tmpLUT_sin2);
free(tmpLUT_cos1);
free(tmpLUT_cos2);
free(tmpLUT_cossin1);
free(tmpLUT_cossin2);
return 0; return 0;
...@@ -435,18 +380,21 @@ clFFT_CreatePlan(cl_context context, clFFT_Dim3 n, clFFT_Dimension dim, clFFT_Da ...@@ -435,18 +380,21 @@ clFFT_CreatePlan(cl_context context, clFFT_Dim3 n, clFFT_Dimension dim, clFFT_Da
plan->tempmemobj = 0; plan->tempmemobj = 0;
plan->tempmemobj_real = 0; plan->tempmemobj_real = 0;
plan->tempmemobj_imag = 0; plan->tempmemobj_imag = 0;
plan->sin_LUT_d1=0; plan->cossin_LUT_d1=0;
plan->sin_LUT_d2=0; plan->cossin_LUT_d2=0;
plan->cos_LUT_d1=0;
plan->cos_LUT_d2=0;
plan->max_localmem_fft_size = 2048; plan->max_localmem_fft_size = 2048;
plan->max_work_item_per_workgroup = 256; plan->max_work_item_per_workgroup = 256;
plan->max_radix = 16; plan->max_radix = 16;
plan->min_mem_coalesce_width = 16; plan->min_mem_coalesce_width = 16;
plan->num_local_mem_banks = 16; plan->num_local_mem_banks = 16;
//TODO: restore native as default
plan->twiddleMethod = clFFT_TaylorLUT;
precomputeSinCosLUTs(plan,error_code); precomputeSinCosLUTs(plan,error_code);
patch_kernel_source: patch_kernel_source:
plan->kernel_string = new string(""); plan->kernel_string = new string("");
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment