diff --git a/src/fft_kernelstring.cpp b/src/fft_kernelstring.cpp
index 416fbfcd84e9ada83ad8acb5fe21ace43abab51d..f1260395dae249b8de585b07289312a264aecd40 100644
--- a/src/fft_kernelstring.cpp
+++ b/src/fft_kernelstring.cpp
@@ -748,6 +748,38 @@ insertLocalStoreIndexArithmatic(string &kernelString, int numWorkItemsReq, int n
 }
 
 
+// Emits OpenCL source that stages the 256-entry Taylor-series cos/sin grid table
+// (cossinLUT2) into __local memory as cossin_T_LUT; emits nothing for other twiddle methods.
+// workgroupsize: work-items per group; each one copies up to ceil(256 / workgroupsize) entries.
+static void
+insertLocalSinCosLUT(string & kernel_string, cl_fft_plan *plan, int workgroupsize) {
+    if(plan->twiddleMethod != clFFT_TaylorLUT) return;
+    if(workgroupsize <= 0) {
+        // No work-items to share the copy (and 256 % 0 is undefined): alias the global table instead.
+        kernel_string += string(" __global float2 * cossin_T_LUT =  cossinLUT2;\n");
+        return;
+    }
+    // LUT holds grid values for Taylor series approx
+    kernel_string += string(" __local  float2  cossin_T_LUT[256];\n");
+    // Integer ceiling of 256 / workgroupsize, written so it cannot overflow for large sizes.
+    const int m = 255 / workgroupsize + 1;
+    // The emitted bounds check is only needed when the group size does not divide 256.
+    const bool guarded = (256 % workgroupsize) != 0;
+    kernel_string += string(" int lLUTind= lId;       \n");
+    if (guarded) kernel_string += string(" if(lLUTind < 256) {     \n");
+    kernel_string += string("     cossin_T_LUT[lLUTind]=cossinLUT2[lLUTind]; \n");
+    if (guarded) kernel_string += string(" }                                      \n");
+    for(int k = 1; k < m; k++) {
+        kernel_string += string(" lLUTind+=") + num2str(workgroupsize) + string(";\n");
+        if (guarded) kernel_string += string(" if(lLUTind < 256) { \n");
+        kernel_string += string("     cossin_T_LUT[lLUTind]=cossinLUT2[lLUTind]; \n");
+        if (guarded) kernel_string += string(" }\n");
+    }
+    // Emit a local-memory barrier after the copies, before the rest of the kernel body.
+    kernel_string += string(" barrier(CLK_LOCAL_MEM_FENCE);\n");
+}
+
+
 static void
 createLocalMemfftKernelString(cl_fft_plan *plan)
 {
@@ -818,6 +850,7 @@ createLocalMemfftKernelString(cl_fft_plan *plan)
     unsigned int lMemSize = 0;
 
     insertVariables(localString, maxRadix);
+    insertLocalSinCosLUT(localString, plan, numWorkItemsPerWG);
 
     lMemSize = insertGlobalLoadsAndTranspose(localString, n, numWorkItemsPerXForm, numXFormsPerWG, maxRadix, plan->min_mem_coalesce_width, dataFormat);
     (*kInfo)->lmem_size = (lMemSize > (*kInfo)->lmem_size) ? lMemSize : (*kInfo)->lmem_size;
@@ -1140,16 +1173,7 @@ insertSinCos(string & kernel_string, cl_fft_plan *plan, int num, int denom , str
 }
 
 
-static void
-insertLocalSinCosLUT(string & kernel_string, cl_fft_plan *plan, int workgroupsize) {
-    
-    // TODO: conditionally copy to local (shared memory) 
-    
-    if(plan->twiddleMethod == clFFT_TaylorLUT) {
-        // second LUT holds grid values for Taylor seres approx
-        kernel_string += string(" __global float2 * cossin_T_LUT =  cossinLUT2;\n");
-    }
-}
+
 
 static void
 createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir dir, int vertBS)
@@ -1187,7 +1211,7 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
     int Rinit = vertical ? BS : 1;
     batchSize = vertical ? min(BS, batchSize) : batchSize;
     int passNum;
-
+    
     for(passNum = 0; passNum < numPasses; passNum++)
     {
 
@@ -1255,7 +1279,10 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
         strcpy((*kInfo)->kernel_name, kernelName.c_str());
 
         insertVariables(localString, R1);
-
+        
+        if((R2 > 1) || (passNum < (numPasses - 1))) {
+            insertLocalSinCosLUT(localString, plan, threadsPerBlock);
+        }    
         if(vertical)
         {
             localString += string("xNum = groupId >> ") + num2str((int)log2(numBlocksPerXForm)) + string(";\n");
@@ -1310,7 +1337,7 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
                 localString += string("a[") + num2str(j) + string("] = in[") + num2str(j*gInInc*strideI) + string("];\n");
         }
 
-        insertLocalSinCosLUT(localString, plan, threadsPerBlock);
+
         
         localString += string("fftKernel") + num2str(R1) + string("(a, dir);\n");
 
@@ -1322,7 +1349,7 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
 
             for(k = 1; k < R1; k++)
             {
-                insertSinCos(localString,plan, k, radix , expr, resVar);              
+                insertSinCos(localString,plan, k, radix , expr, resVar); 
 //                localString += string("ang = dir*(2.0f*M_PI*") + num2str(k) + string("/") + num2str(radix) + string(")*j;\n");
 //                localString += string("w = (float2)(native_cos(ang), native_sin(ang));\n");
                 localString += string("a[") + num2str(k) + string("] = complexMul(a[") + num2str(k) + string("], w);\n");
@@ -1368,7 +1395,6 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
                 
                 insertSinCos(localString, plan, 1, N , expr, varRes) ;
                 
-                
 //                localString += string("ang = ang1*(k + ") + num2str((t%R2)*R1 + (t/R2)) + string(");\n");
 //                localString += string("w = (float2)(native_cos(ang), native_sin(ang));\n");
                 localString += string("a[") + num2str(t) + string("] = complexMul(a[") + num2str(t) + string("], w);\n");