Skip to content
Snippets Groups Projects
Commit 8d6fe913 authored by Heinz-Bernd Eggenstein's avatar Heinz-Bernd Eggenstein
Browse files

Bug #1608: clFFT use of native_sin , native_cos can cause validation problems

experimental: improved Taylor series approx by copying LUT to shared mem.
               TODO: cleanup, expose sin/cos method on plan creation interface,
                     do proper calculation of available shared mem for sin cos LUT
parent 20314512
Branches
Tags
No related merge requests found
...@@ -748,6 +748,38 @@ insertLocalStoreIndexArithmatic(string &kernelString, int numWorkItemsReq, int n ...@@ -748,6 +748,38 @@ insertLocalStoreIndexArithmatic(string &kernelString, int numWorkItemsReq, int n
} }
/*
 * Appends OpenCL source to kernel_string that copies the Taylor-series
 * sin/cos grid table into a __local array named cossin_T_LUT.
 *
 * Nothing is emitted unless plan->twiddleMethod == clFFT_TaylorLUT.
 *
 * kernel_string  kernel source being assembled; text is appended in place.
 * plan           only plan->twiddleMethod is read here.
 * workgroupsize  stride used to advance the per-work-item copy index; the
 *                caller is expected to pass the work-group size of the
 *                kernel being generated.
 *
 * The emitted code refers to `lId` and `cossinLUT2`, which are not declared
 * here -- presumably earlier generated code / the kernel signature provides
 * them (TODO confirm against insertVariables and the kernel prototype).
 */
static void
insertLocalSinCosLUT(string & kernel_string, cl_fft_plan *plan, int workgroupsize) {
    // Number of float2 entries staged; must stay in sync with the literal
    // "256" that appears inside the emitted kernel text below.
    const int lutEntries = 256;

    if(plan->twiddleMethod != clFFT_TaylorLUT) {
        return;
    }

    if(workgroupsize <= 0) {
        // A non-positive stride would make the modulo/division below
        // undefined (or meaningless).  Keep the generated kernel compilable
        // by aliasing the table in global memory instead of staging it.
        kernel_string += string(" __global float2 * cossin_T_LUT = cossinLUT2;\n");
        return;
    }

    // ceil(lutEntries / workgroupsize) in pure integer arithmetic; this form
    // cannot overflow for any positive workgroupsize.
    const int copiesPerItem = (lutEntries - 1) / workgroupsize + 1;

    // When the stride does not divide the table size evenly, some work-items
    // would index past the end, so each copy is wrapped in a bounds check.
    const bool needsBoundsCheck = (lutEntries % workgroupsize) != 0;

    // LUT holds grid values for the Taylor series approximation
    kernel_string += string(" __local float2 cossin_T_LUT[256];\n");
    kernel_string += string(" int lLUTind= lId; \n");

    for(int k = 0; k < copiesPerItem; k++) {
        if(k > 0) {
            // advance to this work-item's next slot
            kernel_string += string(" lLUTind+=") + num2str(workgroupsize) + string(";\n");
        }
        if(needsBoundsCheck) {
            kernel_string += string(" if(lLUTind < 256) { \n");
        }
        kernel_string += string(" cossin_T_LUT[lLUTind]=cossinLUT2[lLUTind]; \n");
        if(needsBoundsCheck) {
            // The first closing brace historically carries a trailing blank;
            // it is kept so the generated source stays byte-identical.
            kernel_string += (k == 0) ? string(" } \n") : string(" }\n");
        }
    }

    // All copies must be visible to the whole work-group before the table is read.
    kernel_string += string(" barrier(CLK_LOCAL_MEM_FENCE);\n");
}
static void static void
createLocalMemfftKernelString(cl_fft_plan *plan) createLocalMemfftKernelString(cl_fft_plan *plan)
{ {
...@@ -818,6 +850,7 @@ createLocalMemfftKernelString(cl_fft_plan *plan) ...@@ -818,6 +850,7 @@ createLocalMemfftKernelString(cl_fft_plan *plan)
unsigned int lMemSize = 0; unsigned int lMemSize = 0;
insertVariables(localString, maxRadix); insertVariables(localString, maxRadix);
insertLocalSinCosLUT(localString, plan, numWorkItemsPerWG);
lMemSize = insertGlobalLoadsAndTranspose(localString, n, numWorkItemsPerXForm, numXFormsPerWG, maxRadix, plan->min_mem_coalesce_width, dataFormat); lMemSize = insertGlobalLoadsAndTranspose(localString, n, numWorkItemsPerXForm, numXFormsPerWG, maxRadix, plan->min_mem_coalesce_width, dataFormat);
(*kInfo)->lmem_size = (lMemSize > (*kInfo)->lmem_size) ? lMemSize : (*kInfo)->lmem_size; (*kInfo)->lmem_size = (lMemSize > (*kInfo)->lmem_size) ? lMemSize : (*kInfo)->lmem_size;
...@@ -1140,16 +1173,7 @@ insertSinCos(string & kernel_string, cl_fft_plan *plan, int num, int denom , str ...@@ -1140,16 +1173,7 @@ insertSinCos(string & kernel_string, cl_fft_plan *plan, int num, int denom , str
} }
/*
 * Appends to kernel_string a declaration that makes cossin_T_LUT an alias of
 * cossinLUT2 in global memory.  Emits nothing unless
 * plan->twiddleMethod == clFFT_TaylorLUT.  workgroupsize is accepted but not
 * used by this variant.
 */
static void
insertLocalSinCosLUT(string & kernel_string, cl_fft_plan *plan, int workgroupsize) {
    // TODO: conditionally copy to local (shared) memory
    if(plan->twiddleMethod != clFFT_TaylorLUT) {
        return;
    }

    // second LUT holds the grid values for the Taylor series approximation
    const string lutAlias(" __global float2 * cossin_T_LUT = cossinLUT2;\n");
    kernel_string += lutAlias;
}
static void static void
createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir dir, int vertBS) createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir dir, int vertBS)
...@@ -1256,6 +1280,9 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir ...@@ -1256,6 +1280,9 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
insertVariables(localString, R1); insertVariables(localString, R1);
if((R2 > 1) || (passNum < (numPasses - 1))) {
insertLocalSinCosLUT(localString, plan, threadsPerBlock);
}
if(vertical) if(vertical)
{ {
localString += string("xNum = groupId >> ") + num2str((int)log2(numBlocksPerXForm)) + string(";\n"); localString += string("xNum = groupId >> ") + num2str((int)log2(numBlocksPerXForm)) + string(";\n");
...@@ -1310,7 +1337,7 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir ...@@ -1310,7 +1337,7 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
localString += string("a[") + num2str(j) + string("] = in[") + num2str(j*gInInc*strideI) + string("];\n"); localString += string("a[") + num2str(j) + string("] = in[") + num2str(j*gInInc*strideI) + string("];\n");
} }
insertLocalSinCosLUT(localString, plan, threadsPerBlock);
localString += string("fftKernel") + num2str(R1) + string("(a, dir);\n"); localString += string("fftKernel") + num2str(R1) + string("(a, dir);\n");
...@@ -1368,7 +1395,6 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir ...@@ -1368,7 +1395,6 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
insertSinCos(localString, plan, 1, N , expr, varRes) ; insertSinCos(localString, plan, 1, N , expr, varRes) ;
// localString += string("ang = ang1*(k + ") + num2str((t%R2)*R1 + (t/R2)) + string(");\n"); // localString += string("ang = ang1*(k + ") + num2str((t%R2)*R1 + (t/R2)) + string(");\n");
// localString += string("w = (float2)(native_cos(ang), native_sin(ang));\n"); // localString += string("w = (float2)(native_cos(ang), native_sin(ang));\n");
localString += string("a[") + num2str(t) + string("] = complexMul(a[") + num2str(t) + string("], w);\n"); localString += string("a[") + num2str(t) + string("] = complexMul(a[") + num2str(t) + string("], w);\n");
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment