Commit e62aadb5 authored by Heinz-Bernd Eggenstein's avatar Heinz-Bernd Eggenstein
Browse files

make index calculation save for long transforms (2^24 +) by eliminating xxx24...

make index calculation save for long transforms (2^24 +) by eliminating xxx24 integer functions that internally perform ploating point arithmetic using floats (24 bit mantissa)
parent 9c5a4b48
...@@ -71,6 +71,8 @@ static string baseKernels = string( ...@@ -71,6 +71,8 @@ static string baseKernels = string(
"#ifndef M_PI\n" "#ifndef M_PI\n"
"#define M_PI 0x1.921fb54442d18p+1\n" "#define M_PI 0x1.921fb54442d18p+1\n"
"#endif\n" "#endif\n"
"#define INTMULFULL(a,b) ((a)*(b)) \n"
"#define INTMADFULL(a,b,c) ((a)*(b)+(c)) \n"
"#define complexMul(a,b) ((float2)(mad(-(a).y, (b).y, (a).x * (b).x), mad((a).y, (b).x, (a).x * (b).y)))\n" "#define complexMul(a,b) ((float2)(mad(-(a).y, (b).y, (a).x * (b).x), mad((a).y, (b).x, (a).x * (b).y)))\n"
"\n" "\n"
"#define cos_sinLUT1(res,dir,i,cossinLUT)\\\n" "#define cos_sinLUT1(res,dir,i,cossinLUT)\\\n"
......
...@@ -261,7 +261,7 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF ...@@ -261,7 +261,7 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF
kernelString += string(" ii = lId & ") + num2str(numWorkItemsPerXForm-1) + string(";\n"); kernelString += string(" ii = lId & ") + num2str(numWorkItemsPerXForm-1) + string(";\n");
kernelString += string(" jj = lId >> ") + num2str(log2NumWorkItemsPerXForm) + string(";\n"); kernelString += string(" jj = lId >> ") + num2str(log2NumWorkItemsPerXForm) + string(";\n");
kernelString += string(" if( !s || (groupId < get_num_groups(0)-1) || (jj < s) ) {\n"); kernelString += string(" if( !s || (groupId < get_num_groups(0)-1) || (jj < s) ) {\n");
kernelString += string(" offset = mad24( mad24(groupId, ") + num2str(numXFormsPerWG) + string(", jj), ") + num2str(N) + string(", ii );\n"); kernelString += string(" offset = INTMAD( INTMAD(groupId, ") + num2str(numXFormsPerWG) + string(", jj), ") + num2str(N) + string(", ii );\n");
if(dataFormat == clFFT_InterleavedComplexFormat) if(dataFormat == clFFT_InterleavedComplexFormat)
{ {
kernelString += string(" in += offset;\n"); kernelString += string(" in += offset;\n");
...@@ -282,7 +282,7 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF ...@@ -282,7 +282,7 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF
{ {
kernelString += string(" ii = lId;\n"); kernelString += string(" ii = lId;\n");
kernelString += string(" jj = 0;\n"); kernelString += string(" jj = 0;\n");
kernelString += string(" offset = mad24(groupId, ") + num2str(N) + string(", ii);\n"); kernelString += string(" offset = INTMAD(groupId, ") + num2str(N) + string(", ii);\n");
if(dataFormat == clFFT_InterleavedComplexFormat) if(dataFormat == clFFT_InterleavedComplexFormat)
{ {
kernelString += string(" in += offset;\n"); kernelString += string(" in += offset;\n");
...@@ -306,9 +306,9 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF ...@@ -306,9 +306,9 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF
kernelString += string(" ii = lId & ") + num2str(mem_coalesce_width - 1) + string(";\n"); kernelString += string(" ii = lId & ") + num2str(mem_coalesce_width - 1) + string(";\n");
kernelString += string(" jj = lId >> ") + num2str((int)log2(mem_coalesce_width)) + string(";\n"); kernelString += string(" jj = lId >> ") + num2str((int)log2(mem_coalesce_width)) + string(";\n");
kernelString += string(" lMemStore = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); kernelString += string(" lMemStore = sMem + INTMAD( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n");
kernelString += string(" offset = mad24( groupId, ") + num2str(numXFormsPerWG) + string(", jj);\n"); kernelString += string(" offset = INTMAD( groupId, ") + num2str(numXFormsPerWG) + string(", jj);\n");
kernelString += string(" offset = mad24( offset, ") + num2str(N) + string(", ii );\n"); kernelString += string(" offset = INTMAD( offset, ") + num2str(N) + string(", ii );\n");
if(dataFormat == clFFT_InterleavedComplexFormat) if(dataFormat == clFFT_InterleavedComplexFormat)
{ {
kernelString += string(" in += offset;\n"); kernelString += string(" in += offset;\n");
...@@ -343,7 +343,7 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF ...@@ -343,7 +343,7 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF
kernelString += string(" ii = lId & ") + num2str(numWorkItemsPerXForm - 1) + string(";\n"); kernelString += string(" ii = lId & ") + num2str(numWorkItemsPerXForm - 1) + string(";\n");
kernelString += string(" jj = lId >> ") + num2str(log2NumWorkItemsPerXForm) + string(";\n"); kernelString += string(" jj = lId >> ") + num2str(log2NumWorkItemsPerXForm) + string(";\n");
kernelString += string(" lMemLoad = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii);\n"); kernelString += string(" lMemLoad = sMem + INTMAD( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii);\n");
for( i = 0; i < numOuterIter; i++ ) for( i = 0; i < numOuterIter; i++ )
{ {
...@@ -377,7 +377,7 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF ...@@ -377,7 +377,7 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF
} }
else else
{ {
kernelString += string(" offset = mad24( groupId, ") + num2str(N * numXFormsPerWG) + string(", lId );\n"); kernelString += string(" offset = INTMAD( groupId, ") + num2str(N * numXFormsPerWG) + string(", lId );\n");
if(dataFormat == clFFT_InterleavedComplexFormat) if(dataFormat == clFFT_InterleavedComplexFormat)
{ {
kernelString += string(" in += offset;\n"); kernelString += string(" in += offset;\n");
...@@ -393,7 +393,7 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF ...@@ -393,7 +393,7 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF
kernelString += string(" ii = lId & ") + num2str(N-1) + string(";\n"); kernelString += string(" ii = lId & ") + num2str(N-1) + string(";\n");
kernelString += string(" jj = lId >> ") + num2str((int)log2(N)) + string(";\n"); kernelString += string(" jj = lId >> ") + num2str((int)log2(N)) + string(";\n");
kernelString += string(" lMemStore = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); kernelString += string(" lMemStore = sMem + INTMAD( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n");
kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n"); kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n");
for( i = 0; i < R0; i++ ) for( i = 0; i < R0; i++ )
...@@ -415,13 +415,13 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF ...@@ -415,13 +415,13 @@ insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXF
{ {
kernelString += string(" ii = lId & ") + num2str(numWorkItemsPerXForm - 1) + string(";\n"); kernelString += string(" ii = lId & ") + num2str(numWorkItemsPerXForm - 1) + string(";\n");
kernelString += string(" jj = lId >> ") + num2str(log2NumWorkItemsPerXForm) + string(";\n"); kernelString += string(" jj = lId >> ") + num2str(log2NumWorkItemsPerXForm) + string(";\n");
kernelString += string(" lMemLoad = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); kernelString += string(" lMemLoad = sMem + INTMAD( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n");
} }
else else
{ {
kernelString += string(" ii = 0;\n"); kernelString += string(" ii = 0;\n");
kernelString += string(" jj = lId;\n"); kernelString += string(" jj = lId;\n");
kernelString += string(" lMemLoad = sMem + mul24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(");\n"); kernelString += string(" lMemLoad = sMem + INTMUL( jj, ") + num2str(N + numWorkItemsPerXForm) + string(");\n");
} }
...@@ -478,10 +478,10 @@ insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr ...@@ -478,10 +478,10 @@ insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr
int numInnerIter = N / mem_coalesce_width; int numInnerIter = N / mem_coalesce_width;
int numOuterIter = numXFormsPerWG / ( groupSize / mem_coalesce_width ); int numOuterIter = numXFormsPerWG / ( groupSize / mem_coalesce_width );
kernelString += string(" lMemLoad = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); kernelString += string(" lMemLoad = sMem + INTMAD( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n");
kernelString += string(" ii = lId & ") + num2str(mem_coalesce_width - 1) + string(";\n"); kernelString += string(" ii = lId & ") + num2str(mem_coalesce_width - 1) + string(";\n");
kernelString += string(" jj = lId >> ") + num2str((int)log2(mem_coalesce_width)) + string(";\n"); kernelString += string(" jj = lId >> ") + num2str((int)log2(mem_coalesce_width)) + string(";\n");
kernelString += string(" lMemStore = sMem + mad24( jj,") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); kernelString += string(" lMemStore = sMem + INTMAD( jj,") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n");
for( i = 0; i < maxRadix; i++ ) for( i = 0; i < maxRadix; i++ )
{ {
...@@ -534,11 +534,11 @@ insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr ...@@ -534,11 +534,11 @@ insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr
} }
else else
{ {
kernelString += string(" lMemLoad = sMem + mad24( jj,") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); kernelString += string(" lMemLoad = sMem + INTMAD( jj,") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n");
kernelString += string(" ii = lId & ") + num2str(N - 1) + string(";\n"); kernelString += string(" ii = lId & ") + num2str(N - 1) + string(";\n");
kernelString += string(" jj = lId >> ") + num2str((int) log2(N)) + string(";\n"); kernelString += string(" jj = lId >> ") + num2str((int) log2(N)) + string(";\n");
kernelString += string(" lMemStore = sMem + mad24( jj,") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); kernelString += string(" lMemStore = sMem + INTMAD( jj,") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n");
for( i = 0; i < maxRadix; i++ ) for( i = 0; i < maxRadix; i++ )
{ {
...@@ -730,7 +730,7 @@ insertLocalLoadIndexArithmatic(string &kernelString, int Nprev, int Nr, int numW ...@@ -730,7 +730,7 @@ insertLocalLoadIndexArithmatic(string &kernelString, int Nprev, int Nr, int numW
if(Nprev == 1) if(Nprev == 1)
kernelString += string(" i = ii >> ") + num2str(logNcurr) + string(";\n"); kernelString += string(" i = ii >> ") + num2str(logNcurr) + string(";\n");
else else
kernelString += string(" i = mad24(ii >> ") + num2str(logNcurr) + string(", ") + num2str(Nprev) + string(", ii & ") + num2str(Nprev - 1) + string(");\n"); kernelString += string(" i = INTMAD(ii >> ") + num2str(logNcurr) + string(", ") + num2str(Nprev) + string(", ii & ") + num2str(Nprev - 1) + string(");\n");
} }
else else
{ {
...@@ -745,9 +745,9 @@ insertLocalLoadIndexArithmatic(string &kernelString, int Nprev, int Nr, int numW ...@@ -745,9 +745,9 @@ insertLocalLoadIndexArithmatic(string &kernelString, int Nprev, int Nr, int numW
} }
if(numXFormsPerWG > 1) if(numXFormsPerWG > 1)
kernelString += string(" i = mad24(jj, ") + num2str(incr) + string(", i);\n"); kernelString += string(" i = INTMAD(jj, ") + num2str(incr) + string(", i);\n");
kernelString += string(" lMemLoad = sMem + mad24(j, ") + num2str(numWorkItemsReq + offset) + string(", i);\n"); kernelString += string(" lMemLoad = sMem + INTMAD(j, ") + num2str(numWorkItemsReq + offset) + string(", i);\n");
} }
static void static void
...@@ -757,7 +757,7 @@ insertLocalStoreIndexArithmatic(string &kernelString, int numWorkItemsReq, int n ...@@ -757,7 +757,7 @@ insertLocalStoreIndexArithmatic(string &kernelString, int numWorkItemsReq, int n
kernelString += string(" lMemStore = sMem + ii;\n"); kernelString += string(" lMemStore = sMem + ii;\n");
} }
else { else {
kernelString += string(" lMemStore = sMem + mad24(jj, ") + num2str((numWorkItemsReq + offset)*Nr + midPad) + string(", ii);\n"); kernelString += string(" lMemStore = sMem + INTMAD(jj, ") + num2str((numWorkItemsReq + offset)*Nr + midPad) + string(", ii);\n");
} }
} }
...@@ -932,8 +932,8 @@ createLocalMemfftKernelString(cl_fft_plan *plan) ...@@ -932,8 +932,8 @@ createLocalMemfftKernelString(cl_fft_plan *plan)
void void
getGlobalRadixInfo(int n, int *radix, int *R1, int *R2, int *numRadices) getGlobalRadixInfo(int n, int *radix, int *R1, int *R2, int *numRadices)
{ {
int baseRadix = min(n, 128); // int baseRadix = min(n, 128);
int baseRadix = min(n, 128);
int numR = 0; int numR = 0;
int N = n; int N = n;
while(N > baseRadix) while(N > baseRadix)
...@@ -1321,14 +1321,14 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir ...@@ -1321,14 +1321,14 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
{ {
localString += string("xNum = groupId >> ") + num2str((int)log2(numBlocksPerXForm)) + string(";\n"); localString += string("xNum = groupId >> ") + num2str((int)log2(numBlocksPerXForm)) + string(";\n");
localString += string("groupId = groupId & ") + num2str(numBlocksPerXForm - 1) + string(";\n"); localString += string("groupId = groupId & ") + num2str(numBlocksPerXForm - 1) + string(";\n");
localString += string("indexIn = mad24(groupId, ") + num2str(batchSize) + string(", xNum << ") + num2str((int)log2(n*BS)) + string(");\n"); localString += string("indexIn = INTMAD(groupId, ") + num2str(batchSize) + string(", xNum << ") + num2str((int)log2(n*BS)) + string(");\n");
localString += string("tid = mul24(groupId, ") + num2str(batchSize) + string(");\n"); localString += string("tid = INTMUL(groupId, ") + num2str(batchSize) + string(");\n");
localString += string("i = tid >> ") + num2str(lgStrideO) + string(";\n"); localString += string("i = tid >> ") + num2str(lgStrideO) + string(";\n");
localString += string("j = tid & ") + num2str(strideO - 1) + string(";\n"); localString += string("j = tid & ") + num2str(strideO - 1) + string(";\n");
int stride = radix*Rinit; int stride = radix*Rinit;
for(i = 0; i < passNum; i++) for(i = 0; i < passNum; i++)
stride *= radixArr[i]; stride *= radixArr[i];
localString += string("indexOut = mad24(i, ") + num2str(stride) + string(", j + ") + string("(xNum << ") + num2str((int) log2(n*BS)) + string("));\n"); localString += string("indexOut = INTMAD(i, ") + num2str(stride) + string(", j + ") + string("(xNum << ") + num2str((int) log2(n*BS)) + string("));\n");
localString += string("bNum = groupId;\n"); localString += string("bNum = groupId;\n");
} }
else else
...@@ -1336,14 +1336,14 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir ...@@ -1336,14 +1336,14 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
int lgNumBlocksPerXForm = log2(numBlocksPerXForm); int lgNumBlocksPerXForm = log2(numBlocksPerXForm);
localString += string("bNum = groupId & ") + num2str(numBlocksPerXForm - 1) + string(";\n"); localString += string("bNum = groupId & ") + num2str(numBlocksPerXForm - 1) + string(";\n");
localString += string("xNum = groupId >> ") + num2str(lgNumBlocksPerXForm) + string(";\n"); localString += string("xNum = groupId >> ") + num2str(lgNumBlocksPerXForm) + string(";\n");
localString += string("indexIn = mul24(bNum, ") + num2str(batchSize) + string(");\n"); localString += string("indexIn = INTMUL(bNum, ") + num2str(batchSize) + string(");\n");
localString += string("tid = indexIn;\n"); localString += string("tid = indexIn;\n");
localString += string("i = tid >> ") + num2str(lgStrideO) + string(";\n"); localString += string("i = tid >> ") + num2str(lgStrideO) + string(";\n");
localString += string("j = tid & ") + num2str(strideO - 1) + string(";\n"); localString += string("j = tid & ") + num2str(strideO - 1) + string(";\n");
int stride = radix*Rinit; int stride = radix*Rinit;
for(i = 0; i < passNum; i++) for(i = 0; i < passNum; i++)
stride *= radixArr[i]; stride *= radixArr[i];
localString += string("indexOut = mad24(i, ") + num2str(stride) + string(", j);\n"); localString += string("indexOut = INTMAD(i, ") + num2str(stride) + string(", j);\n");
localString += string("indexIn += (xNum << ") + num2str(m) + string(");\n"); localString += string("indexIn += (xNum << ") + num2str(m) + string(");\n");
localString += string("indexOut += (xNum << ") + num2str(m) + string(");\n"); localString += string("indexOut += (xNum << ") + num2str(m) + string(");\n");
} }
...@@ -1353,7 +1353,7 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir ...@@ -1353,7 +1353,7 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
localString += string("tid = lId;\n"); localString += string("tid = lId;\n");
localString += string("i = tid & ") + num2str(batchSize - 1) + string(";\n"); localString += string("i = tid & ") + num2str(batchSize - 1) + string(";\n");
localString += string("j = tid >> ") + num2str(lgBatchSize) + string(";\n"); localString += string("j = tid >> ") + num2str(lgBatchSize) + string(";\n");
localString += string("indexIn += mad24(j, ") + num2str(strideI) + string(", i);\n"); localString += string("indexIn += INTMAD(j, ") + num2str(strideI) + string(", i);\n");
if(dataFormat == clFFT_SplitComplexFormat) if(dataFormat == clFFT_SplitComplexFormat)
{ {
...@@ -1391,7 +1391,7 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir ...@@ -1391,7 +1391,7 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
// shuffle // shuffle
numIter = R1 / R2; numIter = R1 / R2;
localString += string("indexIn = mad24(j, ") + num2str(threadsPerBlock*numIter) + string(", i);\n"); localString += string("indexIn = INTMAD(j, ") + num2str(threadsPerBlock*numIter) + string(", i);\n");
localString += string("lMemStore = sMem + tid;\n"); localString += string("lMemStore = sMem + tid;\n");
localString += string("lMemLoad = sMem + indexIn;\n"); localString += string("lMemLoad = sMem + indexIn;\n");
for(k = 0; k < R1; k++) for(k = 0; k < R1; k++)
...@@ -1439,8 +1439,8 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir ...@@ -1439,8 +1439,8 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
if(strideO == 1) if(strideO == 1)
{ {
localString += string("lMemStore = sMem + mad24(i, ") + num2str(radix + 1) + string(", j << ") + num2str((int)log2(R1/R2)) + string(");\n"); localString += string("lMemStore = sMem + INTMAD(i, ") + num2str(radix + 1) + string(", j << ") + num2str((int)log2(R1/R2)) + string(");\n");
localString += string("lMemLoad = sMem + mad24(tid >> ") + num2str((int)log2(radix)) + string(", ") + num2str(radix+1) + string(", tid & ") + num2str(radix-1) + string(");\n"); localString += string("lMemLoad = sMem + INTMAD(tid >> ") + num2str((int)log2(radix)) + string(", ") + num2str(radix+1) + string(", tid & ") + num2str(radix-1) + string(");\n");
for(int i = 0; i < R1/R2; i++) for(int i = 0; i < R1/R2; i++)
for(int j = 0; j < R2; j++) for(int j = 0; j < R2; j++)
...@@ -1498,7 +1498,7 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir ...@@ -1498,7 +1498,7 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
} }
else else
{ {
localString += string("indexOut += mad24(j, ") + num2str(numIter*strideO) + string(", i);\n"); localString += string("indexOut += INTMAD(j, ") + num2str(numIter*strideO) + string(", i);\n");
if(dataFormat == clFFT_SplitComplexFormat) { if(dataFormat == clFFT_SplitComplexFormat) {
localString += string("out_real += indexOut;\n"); localString += string("out_real += indexOut;\n");
localString += string("out_imag += indexOut;\n"); localString += string("out_imag += indexOut;\n");
......
...@@ -438,7 +438,11 @@ patch_kernel_source: ...@@ -438,7 +438,11 @@ patch_kernel_source:
if(device_type == CL_DEVICE_TYPE_GPU) if(device_type == CL_DEVICE_TYPE_GPU)
{ {
gpu_found = 1; gpu_found = 1;
err = clBuildProgram(plan->program, 1, &devices[i], "-cl-mad-enable -cl-single-precision-constant", NULL, NULL); if (plan->n.x * plan->n.y * plan->n.z <= ( 1 << 24)) {
err = clBuildProgram(plan->program, 1, &devices[i], "-cl-mad-enable -cl-single-precision-constant -DINTMUL=mul24 -DINTMAD=mad24", NULL, NULL);
} else {
err = clBuildProgram(plan->program, 1, &devices[i], "-cl-mad-enable -cl-single-precision-constant -DINTMUL=INTMULFULL -DINTMAD=INTMADFULL", NULL, NULL);
}
if (err != CL_SUCCESS) if (err != CL_SUCCESS)
{ {
char *build_log; char *build_log;
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment