fft_kernelstring.cpp 49.5 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54

//
// File:       fft_kernelstring.cpp
//
// Version:    <1.0>
//
// Disclaimer: IMPORTANT:  This Apple software is supplied to you by Apple Inc. ("Apple")
//             in consideration of your agreement to the following terms, and your use,
//             installation, modification or redistribution of this Apple software
//             constitutes acceptance of these terms.  If you do not agree with these
//             terms, please do not use, install, modify or redistribute this Apple
//             software.
//
//             In consideration of your agreement to abide by the following terms, and
//             subject to these terms, Apple grants you a personal, non - exclusive
//             license, under Apple's copyrights in this original Apple software ( the
//             "Apple Software" ), to use, reproduce, modify and redistribute the Apple
//             Software, with or without modifications, in source and / or binary forms;
//             provided that if you redistribute the Apple Software in its entirety and
//             without modifications, you must retain this notice and the following text
//             and disclaimers in all such redistributions of the Apple Software. Neither
//             the name, trademarks, service marks or logos of Apple Inc. may be used to
//             endorse or promote products derived from the Apple Software without specific
//             prior written permission from Apple.  Except as expressly stated in this
//             notice, no other rights or licenses, express or implied, are granted by
//             Apple herein, including but not limited to any patent rights that may be
//             infringed by your derivative works or by other works in which the Apple
//             Software may be incorporated.
//
//             The Apple Software is provided by Apple on an "AS IS" basis.  APPLE MAKES NO
//             WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
//             WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
//             PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
//             ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
//
//             IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
//             CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
//             SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
//             INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
//             AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
//             UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
//             OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
//
////////////////////////////////////////////////////////////////////////////////////////////////////


#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <iostream>
#include <sstream>
#include <string>
55
#include <string.h>
56
57
58
59
60
61
62
63
64
#include <assert.h>
#include "fft_internal.h"
#include "clFFT.h"

using namespace std;

#define max(A,B) ((A) > (B) ? (A) : (B))
#define min(A,B) ((A) < (B) ? (A) : (B))

Oliver Bock's avatar
Oliver Bock committed
65
static string
66
67
68
69
70
71
72
num2str(int num)
{
	char temp[200];
	sprintf(temp, "%d", num);
	return string(temp);
}

Oliver Bock's avatar
Oliver Bock committed
73
// For any n, this function decomposes n into factors for loacal memory tranpose
74
75
76
77
// based fft. Factors (radices) are sorted such that the first one (radixArray[0])
// is the largest. This base radix determines the number of registers used by each
// work item and product of remaining radices determine the size of work group needed.
// To make things concrete with and example, suppose n = 1024. It is decomposed into
Oliver Bock's avatar
Oliver Bock committed
78
// 1024 = 16 x 16 x 4. Hence kernel uses float2 a[16], for local in-register fft and
79
// needs 16 x 4 = 64 work items per work group. So kernel first performance 64 length
Oliver Bock's avatar
Oliver Bock committed
80
// 16 ffts (64 work items working in parallel) following by transpose using local
81
// memory followed by again 64 length 16 ffts followed by transpose using local memory
Oliver Bock's avatar
Oliver Bock committed
82
// followed by 256 length 4 ffts. For the last step since with size of work group is
83
// 64 and each work item can array for 16 values, 64 work items can compute 256 length
Oliver Bock's avatar
Oliver Bock committed
84
// 4 ffts by each work item computing 4 length 4 ffts.
85
86
87
88
89
90
// Similarly for n = 2048 = 8 x 8 x 8 x 4, each work group has 8 x 8 x 4 = 256 work
// iterms which each computes 256 (in-parallel) length 8 ffts in-register, followed
// by transpose using local memory, followed by 256 length 8 in-register ffts, followed
// by transpose using local memory, followed by 256 length 8 in-register ffts, followed
// by transpose using local memory, followed by 512 length 4 in-register ffts. Again,
// for the last step, each work item computes two length 4 in-register ffts and thus
Oliver Bock's avatar
Oliver Bock committed
91
92
// 256 work items are needed to compute all 512 ffts.
// For n = 32 = 8 x 4, 4 work items first compute 4 in-register
93
94
// lenth 8 ffts, followed by transpose using local memory followed by 8 in-register
// length 4 ffts, where each work item computes two length 4 ffts thus 4 work items
Oliver Bock's avatar
Oliver Bock committed
95
96
// can compute 8 length 4 ffts. However if work group size of say 64 is choosen,
// each work group can compute 64/ 4 = 16 size 32 ffts (batched transform).
97
98
// Users can play with these parameters to figure what gives best performance on
// their particular device i.e. some device have less register space thus using
Oliver Bock's avatar
Oliver Bock committed
99
// smaller base radix can avoid spilling ... some has small local memory thus
100
101
// using smaller work group size may be required etc

Oliver Bock's avatar
Oliver Bock committed
102
static void
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
getRadixArray(unsigned int n, unsigned int *radixArray, unsigned int *numRadices, unsigned int maxRadix)
{
    if(maxRadix > 1)
    {
        maxRadix = min(n, maxRadix);
        unsigned int cnt = 0;
        while(n > maxRadix)
        {
            radixArray[cnt++] = maxRadix;
            n /= maxRadix;
        }
        radixArray[cnt++] = n;
        *numRadices = cnt;
        return;
    }

Oliver Bock's avatar
Oliver Bock committed
119
	switch(n)
120
121
122
123
124
	{
		case 2:
			*numRadices = 1;
			radixArray[0] = 2;
			break;
Oliver Bock's avatar
Oliver Bock committed
125

126
127
128
129
		case 4:
			*numRadices = 1;
			radixArray[0] = 4;
			break;
Oliver Bock's avatar
Oliver Bock committed
130

131
132
133
134
		case 8:
			*numRadices = 1;
			radixArray[0] = 8;
			break;
Oliver Bock's avatar
Oliver Bock committed
135

136
137
		case 16:
			*numRadices = 2;
Oliver Bock's avatar
Oliver Bock committed
138
			radixArray[0] = 8; radixArray[1] = 2;
139
			break;
Oliver Bock's avatar
Oliver Bock committed
140

141
142
143
144
		case 32:
			*numRadices = 2;
			radixArray[0] = 8; radixArray[1] = 4;
			break;
Oliver Bock's avatar
Oliver Bock committed
145

146
147
148
149
		case 64:
			*numRadices = 2;
			radixArray[0] = 8; radixArray[1] = 8;
			break;
Oliver Bock's avatar
Oliver Bock committed
150

151
152
153
154
		case 128:
			*numRadices = 3;
			radixArray[0] = 8; radixArray[1] = 4; radixArray[2] = 4;
			break;
Oliver Bock's avatar
Oliver Bock committed
155

156
157
158
159
		case 256:
			*numRadices = 4;
			radixArray[0] = 4; radixArray[1] = 4; radixArray[2] = 4; radixArray[3] = 4;
			break;
Oliver Bock's avatar
Oliver Bock committed
160

161
162
163
		case 512:
			*numRadices = 3;
			radixArray[0] = 8; radixArray[1] = 8; radixArray[2] = 8;
Oliver Bock's avatar
Oliver Bock committed
164
165
			break;

166
167
168
		case 1024:
			*numRadices = 3;
			radixArray[0] = 16; radixArray[1] = 16; radixArray[2] = 4;
Oliver Bock's avatar
Oliver Bock committed
169
			break;
170
171
172
173
174
175
176
177
178
179
180
181
182
		case 2048:
			*numRadices = 4;
			radixArray[0] = 8; radixArray[1] = 8; radixArray[2] = 8; radixArray[3] = 4;
			break;
		default:
			*numRadices = 0;
			return;
	}
}

static void
insertHeader(string &kernelString, string &kernelName, clFFT_DataFormat dataFormat)
{
Oliver Bock's avatar
Oliver Bock committed
183
	if(dataFormat == clFFT_SplitComplexFormat)
184
		kernelString += string("__kernel void ") + kernelName + string("(__global float *in_real, __global float *in_imag, __global float *out_real, __global float *out_imag, int dir, int S)\n");
Oliver Bock's avatar
Oliver Bock committed
185
	else
186
187
188
		kernelString += string("__kernel void ") + kernelName + string("(__global float2 *in, __global float2 *out, int dir, int S)\n");
}

Oliver Bock's avatar
Oliver Bock committed
189
static void
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
insertVariables(string &kStream, int maxRadix)
{
	kStream += string("    int i, j, r, indexIn, indexOut, index, tid, bNum, xNum, k, l;\n");
    kStream += string("    int s, ii, jj, offset;\n");
	kStream += string("    float2 w;\n");
	kStream += string("    float ang, angf, ang1;\n");
    kStream += string("    __local float *lMemStore, *lMemLoad;\n");
    kStream += string("    float2 a[") +  num2str(maxRadix) + string("];\n");
    kStream += string("    int lId = get_local_id( 0 );\n");
    kStream += string("    int groupId = get_group_id( 0 );\n");
}

static void
formattedLoad(string &kernelString, int aIndex, int gIndex, clFFT_DataFormat dataFormat)
{
	if(dataFormat == clFFT_InterleavedComplexFormat)
		kernelString += string("        a[") + num2str(aIndex) + string("] = in[") + num2str(gIndex) + string("];\n");
	else
	{
		kernelString += string("        a[") + num2str(aIndex) + string("].x = in_real[") + num2str(gIndex) + string("];\n");
		kernelString += string("        a[") + num2str(aIndex) + string("].y = in_imag[") + num2str(gIndex) + string("];\n");
	}
}

static void
formattedStore(string &kernelString, int aIndex, int gIndex, clFFT_DataFormat dataFormat)
{
	if(dataFormat == clFFT_InterleavedComplexFormat)
		kernelString += string("        out[") + num2str(gIndex) + string("] = a[") + num2str(aIndex) + string("];\n");
	else
	{
		kernelString += string("        out_real[") + num2str(gIndex) + string("] = a[") + num2str(aIndex) + string("].x;\n");
		kernelString += string("        out_imag[") + num2str(gIndex) + string("] = a[") + num2str(aIndex) + string("].y;\n");
	}
}

static int
insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXForm, int numXFormsPerWG, int R0, int mem_coalesce_width, clFFT_DataFormat dataFormat)
{
	int log2NumWorkItemsPerXForm = (int) log2(numWorkItemsPerXForm);
	int groupSize = numWorkItemsPerXForm * numXFormsPerWG;
	int i, j;
	int lMemSize = 0;
Oliver Bock's avatar
Oliver Bock committed
233

234
235
	if(numXFormsPerWG > 1)
	    kernelString += string("        s = S & ") + num2str(numXFormsPerWG - 1) + string(";\n");
Oliver Bock's avatar
Oliver Bock committed
236

237
    if(numWorkItemsPerXForm >= mem_coalesce_width)
Oliver Bock's avatar
Oliver Bock committed
238
    {
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
		if(numXFormsPerWG > 1)
		{
            kernelString += string("    ii = lId & ") + num2str(numWorkItemsPerXForm-1) + string(";\n");
            kernelString += string("    jj = lId >> ") + num2str(log2NumWorkItemsPerXForm) + string(";\n");
            kernelString += string("    if( !s || (groupId < get_num_groups(0)-1) || (jj < s) ) {\n");
			kernelString += string("        offset = mad24( mad24(groupId, ") + num2str(numXFormsPerWG) + string(", jj), ") + num2str(N) + string(", ii );\n");
			if(dataFormat == clFFT_InterleavedComplexFormat)
			{
			    kernelString += string("        in += offset;\n");
			    kernelString += string("        out += offset;\n");
			}
			else
			{
			    kernelString += string("        in_real += offset;\n");
				kernelString += string("        in_imag += offset;\n");
			    kernelString += string("        out_real += offset;\n");
				kernelString += string("        out_imag += offset;\n");
			}
			for(i = 0; i < R0; i++)
				formattedLoad(kernelString, i, i*numWorkItemsPerXForm, dataFormat);
			kernelString += string("    }\n");
		}
		else
		{
			kernelString += string("    ii = lId;\n");
			kernelString += string("    jj = 0;\n");
			kernelString += string("    offset =  mad24(groupId, ") + num2str(N) + string(", ii);\n");
			if(dataFormat == clFFT_InterleavedComplexFormat)
			{
			    kernelString += string("        in += offset;\n");
			    kernelString += string("        out += offset;\n");
			}
			else
			{
			    kernelString += string("        in_real += offset;\n");
				kernelString += string("        in_imag += offset;\n");
			    kernelString += string("        out_real += offset;\n");
				kernelString += string("        out_imag += offset;\n");
			}
			for(i = 0; i < R0; i++)
				formattedLoad(kernelString, i, i*numWorkItemsPerXForm, dataFormat);
		}
    }
    else if( N >= mem_coalesce_width )
    {
        int numInnerIter = N / mem_coalesce_width;
        int numOuterIter = numXFormsPerWG / ( groupSize / mem_coalesce_width );
Oliver Bock's avatar
Oliver Bock committed
286

287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
        kernelString += string("    ii = lId & ") + num2str(mem_coalesce_width - 1) + string(";\n");
        kernelString += string("    jj = lId >> ") + num2str((int)log2(mem_coalesce_width)) + string(";\n");
        kernelString += string("    lMemStore = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n");
        kernelString += string("    offset = mad24( groupId, ") + num2str(numXFormsPerWG) + string(", jj);\n");
        kernelString += string("    offset = mad24( offset, ") + num2str(N) + string(", ii );\n");
		if(dataFormat == clFFT_InterleavedComplexFormat)
		{
			kernelString += string("        in += offset;\n");
			kernelString += string("        out += offset;\n");
		}
		else
		{
			kernelString += string("        in_real += offset;\n");
			kernelString += string("        in_imag += offset;\n");
			kernelString += string("        out_real += offset;\n");
			kernelString += string("        out_imag += offset;\n");
		}
Oliver Bock's avatar
Oliver Bock committed
304

305
306
307
308
		kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n");
        for(i = 0; i < numOuterIter; i++ )
        {
            kernelString += string("    if( jj < s ) {\n");
Oliver Bock's avatar
Oliver Bock committed
309
			for(j = 0; j < numInnerIter; j++ )
310
				formattedLoad(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * N, dataFormat);
Oliver Bock's avatar
Oliver Bock committed
311
			kernelString += string("    }\n");
312
			if(i != numOuterIter - 1)
Oliver Bock's avatar
Oliver Bock committed
313
			    kernelString += string("    jj += ") + num2str(groupSize / mem_coalesce_width) + string(";\n");
314
315
316
317
318
        }
		kernelString += string("}\n ");
		kernelString += string("else {\n");
        for(i = 0; i < numOuterIter; i++ )
        {
Oliver Bock's avatar
Oliver Bock committed
319
320
321
			for(j = 0; j < numInnerIter; j++ )
				formattedLoad(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * N, dataFormat);
        }
322
		kernelString += string("}\n");
Oliver Bock's avatar
Oliver Bock committed
323

324
325
		kernelString += string("    ii = lId & ") + num2str(numWorkItemsPerXForm - 1) + string(";\n");
		kernelString += string("    jj = lId >> ") + num2str(log2NumWorkItemsPerXForm) + string(";\n");
Oliver Bock's avatar
Oliver Bock committed
326
327
        kernelString += string("    lMemLoad  = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii);\n");

328
329
330
        for( i = 0; i < numOuterIter; i++ )
		{
			for( j = 0; j < numInnerIter; j++ )
Oliver Bock's avatar
Oliver Bock committed
331
332
			{
				kernelString += string("    lMemStore[") + num2str(j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * (N + numWorkItemsPerXForm )) + string("] = a[") +
333
334
				                num2str(i * numInnerIter + j) + string("].x;\n");
			}
Oliver Bock's avatar
Oliver Bock committed
335
		}
336
        kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n");
Oliver Bock's avatar
Oliver Bock committed
337

338
        for( i = 0; i < R0; i++ )
Oliver Bock's avatar
Oliver Bock committed
339
340
			kernelString += string("    a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n");
		kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n");
341
342
343
344

	    for( i = 0; i < numOuterIter; i++ )
		{
			for( j = 0; j < numInnerIter; j++ )
Oliver Bock's avatar
Oliver Bock committed
345
346
			{
				kernelString += string("    lMemStore[") + num2str(j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * (N + numWorkItemsPerXForm )) + string("] = a[") +
347
348
								num2str(i * numInnerIter + j) + string("].y;\n");
			}
Oliver Bock's avatar
Oliver Bock committed
349
	    }
350
		kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n");
Oliver Bock's avatar
Oliver Bock committed
351

352
		for( i = 0; i < R0; i++ )
Oliver Bock's avatar
Oliver Bock committed
353
354
355
			kernelString += string("    a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n");
		kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n");

356
		lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG;
Oliver Bock's avatar
Oliver Bock committed
357
    }
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
    else
    {
        kernelString += string("    offset = mad24( groupId,  ") + num2str(N * numXFormsPerWG) + string(", lId );\n");
		if(dataFormat == clFFT_InterleavedComplexFormat)
		{
			kernelString += string("        in += offset;\n");
			kernelString += string("        out += offset;\n");
		}
		else
		{
			kernelString += string("        in_real += offset;\n");
			kernelString += string("        in_imag += offset;\n");
			kernelString += string("        out_real += offset;\n");
			kernelString += string("        out_imag += offset;\n");
		}
Oliver Bock's avatar
Oliver Bock committed
373

374
375
376
        kernelString += string("    ii = lId & ") + num2str(N-1) + string(";\n");
        kernelString += string("    jj = lId >> ") + num2str((int)log2(N)) + string(";\n");
        kernelString += string("    lMemStore = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n");
Oliver Bock's avatar
Oliver Bock committed
377

378
379
380
381
382
383
384
385
386
387
388
389
390
		kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n");
        for( i = 0; i < R0; i++ )
        {
            kernelString += string("    if(jj < s )\n");
			formattedLoad(kernelString, i, i*groupSize, dataFormat);
			if(i != R0 - 1)
			    kernelString += string("    jj += ") + num2str(groupSize / N) + string(";\n");
        }
		kernelString += string("}\n");
		kernelString += string("else {\n");
        for( i = 0; i < R0; i++ )
        {
			formattedLoad(kernelString, i, i*groupSize, dataFormat);
Oliver Bock's avatar
Oliver Bock committed
391
        }
392
		kernelString += string("}\n");
Oliver Bock's avatar
Oliver Bock committed
393

394
395
396
397
		if(numWorkItemsPerXForm > 1)
		{
            kernelString += string("    ii = lId & ") + num2str(numWorkItemsPerXForm - 1) + string(";\n");
            kernelString += string("    jj = lId >> ") + num2str(log2NumWorkItemsPerXForm) + string(";\n");
Oliver Bock's avatar
Oliver Bock committed
398
            kernelString += string("    lMemLoad = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n");
399
		}
Oliver Bock's avatar
Oliver Bock committed
400
		else
401
402
403
		{
            kernelString += string("    ii = 0;\n");
            kernelString += string("    jj = lId;\n");
Oliver Bock's avatar
Oliver Bock committed
404
            kernelString += string("    lMemLoad = sMem + mul24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(");\n");
405
406
		}

Oliver Bock's avatar
Oliver Bock committed
407

408
        for( i = 0; i < R0; i++ )
Oliver Bock's avatar
Oliver Bock committed
409
410
411
            kernelString += string("    lMemStore[") + num2str(i * ( groupSize / N ) * ( N + numWorkItemsPerXForm )) + string("] = a[") + num2str(i) + string("].x;\n");
        kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n");

412
413
414
        for( i = 0; i < R0; i++ )
            kernelString += string("    a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n");
		kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n");
Oliver Bock's avatar
Oliver Bock committed
415

416
        for( i = 0; i < R0; i++ )
Oliver Bock's avatar
Oliver Bock committed
417
418
419
            kernelString += string("    lMemStore[") + num2str(i * ( groupSize / N ) * ( N + numWorkItemsPerXForm )) + string("] = a[") + num2str(i) + string("].y;\n");
        kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n");

420
421
422
        for( i = 0; i < R0; i++ )
            kernelString += string("    a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n");
		kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n");
Oliver Bock's avatar
Oliver Bock committed
423

424
425
		lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG;
    }
Oliver Bock's avatar
Oliver Bock committed
426

427
428
429
430
431
432
433
434
435
436
437
	return lMemSize;
}

static int
insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr, int numWorkItemsPerXForm, int numXFormsPerWG, int mem_coalesce_width, clFFT_DataFormat dataFormat)
{
	int groupSize = numWorkItemsPerXForm * numXFormsPerWG;
	int i, j, k, ind;
	int lMemSize = 0;
	int numIter = maxRadix / Nr;
	string indent = string("");
Oliver Bock's avatar
Oliver Bock committed
438

439
    if( numWorkItemsPerXForm >= mem_coalesce_width )
Oliver Bock's avatar
Oliver Bock committed
440
    {
441
442
443
444
		if(numXFormsPerWG > 1)
		{
            kernelString += string("    if( !s || (groupId < get_num_groups(0)-1) || (jj < s) ) {\n");
			indent = string("    ");
Oliver Bock's avatar
Oliver Bock committed
445
446
		}
		for(i = 0; i < maxRadix; i++)
447
448
449
450
451
452
453
454
455
456
457
458
459
		{
			j = i % numIter;
			k = i / numIter;
			ind = j * Nr + k;
			formattedStore(kernelString, ind, i*numWorkItemsPerXForm, dataFormat);
		}
		if(numXFormsPerWG > 1)
		    kernelString += string("    }\n");
    }
    else if( N >= mem_coalesce_width )
    {
        int numInnerIter = N / mem_coalesce_width;
        int numOuterIter = numXFormsPerWG / ( groupSize / mem_coalesce_width );
Oliver Bock's avatar
Oliver Bock committed
460
461

        kernelString += string("    lMemLoad  = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n");
462
463
464
        kernelString += string("    ii = lId & ") + num2str(mem_coalesce_width - 1) + string(";\n");
        kernelString += string("    jj = lId >> ") + num2str((int)log2(mem_coalesce_width)) + string(";\n");
        kernelString += string("    lMemStore = sMem + mad24( jj,") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n");
Oliver Bock's avatar
Oliver Bock committed
465

466
467
468
469
470
        for( i = 0; i < maxRadix; i++ )
		{
			j = i % numIter;
			k = i / numIter;
			ind = j * Nr + k;
Oliver Bock's avatar
Oliver Bock committed
471
472
473
474
            kernelString += string("    lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].x;\n");
		}
        kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n");

475
476
477
478
        for( i = 0; i < numOuterIter; i++ )
			for( j = 0; j < numInnerIter; j++ )
				kernelString += string("    a[") + num2str(i*numInnerIter + j) + string("].x = lMemStore[") + num2str(j*mem_coalesce_width + i*( groupSize / mem_coalesce_width )*(N + numWorkItemsPerXForm)) + string("];\n");
        kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n");
Oliver Bock's avatar
Oliver Bock committed
479

480
481
482
483
484
        for( i = 0; i < maxRadix; i++ )
		{
			j = i % numIter;
			k = i / numIter;
			ind = j * Nr + k;
Oliver Bock's avatar
Oliver Bock committed
485
486
487
488
            kernelString += string("    lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].y;\n");
		}
        kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n");

489
490
491
        for( i = 0; i < numOuterIter; i++ )
			for( j = 0; j < numInnerIter; j++ )
				kernelString += string("    a[") + num2str(i*numInnerIter + j) + string("].y = lMemStore[") + num2str(j*mem_coalesce_width + i*( groupSize / mem_coalesce_width )*(N + numWorkItemsPerXForm)) + string("];\n");
Oliver Bock's avatar
Oliver Bock committed
492
493
        kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n");

494
495
496
497
		kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n");
		for(i = 0; i < numOuterIter; i++ )
        {
            kernelString += string("    if( jj < s ) {\n");
Oliver Bock's avatar
Oliver Bock committed
498
499
500
			for(j = 0; j < numInnerIter; j++ )
				formattedStore(kernelString, i*numInnerIter + j, j*mem_coalesce_width + i*(groupSize/mem_coalesce_width)*N, dataFormat);
			kernelString += string("    }\n");
501
			if(i != numOuterIter - 1)
Oliver Bock's avatar
Oliver Bock committed
502
			    kernelString += string("    jj += ") + num2str(groupSize / mem_coalesce_width) + string(";\n");
503
504
505
506
507
        }
		kernelString += string("}\n");
		kernelString += string("else {\n");
		for(i = 0; i < numOuterIter; i++ )
        {
Oliver Bock's avatar
Oliver Bock committed
508
509
510
			for(j = 0; j < numInnerIter; j++ )
				formattedStore(kernelString, i*numInnerIter + j, j*mem_coalesce_width + i*(groupSize/mem_coalesce_width)*N, dataFormat);
        }
511
		kernelString += string("}\n");
Oliver Bock's avatar
Oliver Bock committed
512

513
		lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG;
Oliver Bock's avatar
Oliver Bock committed
514
	}
515
    else
Oliver Bock's avatar
Oliver Bock committed
516
517
518
    {
        kernelString += string("    lMemLoad  = sMem + mad24( jj,") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n");

519
520
521
		kernelString += string("    ii = lId & ") + num2str(N - 1) + string(";\n");
        kernelString += string("    jj = lId >> ") + num2str((int) log2(N)) + string(";\n");
        kernelString += string("    lMemStore = sMem + mad24( jj,") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n");
Oliver Bock's avatar
Oliver Bock committed
522

523
524
525
526
527
528
        for( i = 0; i < maxRadix; i++ )
		{
			j = i % numIter;
			k = i / numIter;
			ind = j * Nr + k;
            kernelString += string("    lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].x;\n");
Oliver Bock's avatar
Oliver Bock committed
529
		}
530
        kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n");
Oliver Bock's avatar
Oliver Bock committed
531

532
        for( i = 0; i < maxRadix; i++ )
Oliver Bock's avatar
Oliver Bock committed
533
534
535
            kernelString += string("    a[") + num2str(i) + string("].x = lMemStore[") + num2str(i*( groupSize / N )*( N + numWorkItemsPerXForm )) + string("];\n");
        kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n");

536
537
538
539
540
541
        for( i = 0; i < maxRadix; i++ )
		{
			j = i % numIter;
			k = i / numIter;
			ind = j * Nr + k;
            kernelString += string("    lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].y;\n");
Oliver Bock's avatar
Oliver Bock committed
542
		}
543
        kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n");
Oliver Bock's avatar
Oliver Bock committed
544

545
        for( i = 0; i < maxRadix; i++ )
Oliver Bock's avatar
Oliver Bock committed
546
547
548
            kernelString += string("    a[") + num2str(i) + string("].y = lMemStore[") + num2str(i*( groupSize / N )*( N + numWorkItemsPerXForm )) + string("];\n");
        kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n");

549
550
551
552
553
554
555
556
		kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n");
		for( i = 0; i < maxRadix; i++ )
        {
            kernelString += string("    if(jj < s ) {\n");
			formattedStore(kernelString, i, i*groupSize, dataFormat);
			kernelString += string("    }\n");
			if( i != maxRadix - 1)
				kernelString += string("    jj +=") + num2str(groupSize / N) + string(";\n");
Oliver Bock's avatar
Oliver Bock committed
557
        }
558
559
560
561
562
		kernelString += string("}\n");
		kernelString += string("else {\n");
		for( i = 0; i < maxRadix; i++ )
        {
			formattedStore(kernelString, i, i*groupSize, dataFormat);
Oliver Bock's avatar
Oliver Bock committed
563
        }
564
		kernelString += string("}\n");
Oliver Bock's avatar
Oliver Bock committed
565

566
567
		lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG;
    }
Oliver Bock's avatar
Oliver Bock committed
568

569
570
571
	return lMemSize;
}

Oliver Bock's avatar
Oliver Bock committed
572
static void
573
574
575
insertfftKernel(string &kernelString, int Nr, int numIter)
{
	int i;
Oliver Bock's avatar
Oliver Bock committed
576
	for(i = 0; i < numIter; i++)
577
578
579
580
581
582
583
584
585
586
	{
		kernelString += string("    fftKernel") + num2str(Nr) + string("(a+") + num2str(i*Nr) + string(", dir);\n");
	}
}

static void
insertTwiddleKernel(string &kernelString, int Nr, int numIter, int Nprev, int len, int numWorkItemsPerXForm)
{
	int z, k;
	int logNPrev = log2(Nprev);
Oliver Bock's avatar
Oliver Bock committed
587
588

	for(z = 0; z < numIter; z++)
589
590
591
592
593
594
595
	{
		if(z == 0)
		{
			if(Nprev > 1)
			    kernelString += string("    angf = (float) (ii >> ") + num2str(logNPrev) + string(");\n");
			else
				kernelString += string("    angf = (float) ii;\n");
Oliver Bock's avatar
Oliver Bock committed
596
		}
597
598
599
		else
		{
			if(Nprev > 1)
Oliver Bock's avatar
Oliver Bock committed
600
			    kernelString += string("    angf = (float) ((") + num2str(z*numWorkItemsPerXForm) + string(" + ii) >>") + num2str(logNPrev) + string(");\n");
601
602
			else
				kernelString += string("    angf = (float) (") + num2str(z*numWorkItemsPerXForm) + string(" + ii);\n");
Oliver Bock's avatar
Oliver Bock committed
603
604
		}

605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
		for(k = 1; k < Nr; k++) {
			int ind = z*Nr + k;
			//float fac =  (float) (2.0 * M_PI * (double) k / (double) len);
			kernelString += string("    ang = dir * ( 2.0f * M_PI * ") + num2str(k) + string(".0f / ") + num2str(len) + string(".0f )") + string(" * angf;\n");
			kernelString += string("    w = (float2)(native_cos(ang), native_sin(ang));\n");
			kernelString += string("    a[") + num2str(ind) + string("] = complexMul(a[") + num2str(ind) + string("], w);\n");
		}
	}
}

static int
getPadding(int numWorkItemsPerXForm, int Nprev, int numWorkItemsReq, int numXFormsPerWG, int Nr, int numBanks, int *offset, int *midPad)
{
	if((numWorkItemsPerXForm <= Nprev) || (Nprev >= numBanks))
		*offset = 0;
	else {
		int numRowsReq = ((numWorkItemsPerXForm < numBanks) ? numWorkItemsPerXForm : numBanks) / Nprev;
		int numColsReq = 1;
		if(numRowsReq > Nr)
			numColsReq = numRowsReq / Nr;
		numColsReq = Nprev * numColsReq;
		*offset = numColsReq;
	}
Oliver Bock's avatar
Oliver Bock committed
628

629
630
631
632
633
634
635
636
637
	if(numWorkItemsPerXForm >= numBanks || numXFormsPerWG == 1)
		*midPad = 0;
	else {
		int bankNum = ( (numWorkItemsReq + *offset) * Nr ) & (numBanks - 1);
		if( bankNum >= numWorkItemsPerXForm )
			*midPad = 0;
		else
			*midPad = numWorkItemsPerXForm - bankNum;
	}
Oliver Bock's avatar
Oliver Bock committed
638

639
640
641
642
643
	int lMemSize = ( numWorkItemsReq + *offset) * Nr * numXFormsPerWG + *midPad * (numXFormsPerWG - 1);
	return lMemSize;
}


Oliver Bock's avatar
Oliver Bock committed
644
static void
645
646
647
648
649
650
651
652
653
654
655
656
657
insertLocalStores(string &kernelString, int numIter, int Nr, int numWorkItemsPerXForm, int numWorkItemsReq, int offset, string &comp)
{
	int z, k;

	for(z = 0; z < numIter; z++) {
		for(k = 0; k < Nr; k++) {
			int index = k*(numWorkItemsReq + offset) + z*numWorkItemsPerXForm;
			kernelString += string("    lMemStore[") + num2str(index) + string("] = a[") + num2str(z*Nr + k) + string("].") + comp + string(";\n");
		}
	}
	kernelString += string("    barrier(CLK_LOCAL_MEM_FENCE);\n");
}

Oliver Bock's avatar
Oliver Bock committed
658
static void
659
660
insertLocalLoads(string &kernelString, int n, int Nr, int Nrn, int Nprev, int Ncurr, int numWorkItemsPerXForm, int numWorkItemsReq, int offset, string &comp)
{
Oliver Bock's avatar
Oliver Bock committed
661
662
663
664
665
666
667
	int numWorkItemsReqN = n / Nrn;
	int interBlockHNum = max( Nprev / numWorkItemsPerXForm, 1 );
	int interBlockHStride = numWorkItemsPerXForm;
	int vertWidth = max(numWorkItemsPerXForm / Nprev, 1);
	vertWidth = min( vertWidth, Nr);
	int vertNum = Nr / vertWidth;
	int vertStride = ( n / Nr + offset ) * vertWidth;
668
669
670
	int iter = max( numWorkItemsReqN / numWorkItemsPerXForm, 1);
	int intraBlockHStride = (numWorkItemsPerXForm / (Nprev*Nr)) > 1 ? (numWorkItemsPerXForm / (Nprev*Nr)) : 1;
	intraBlockHStride *= Nprev;
Oliver Bock's avatar
Oliver Bock committed
671
672

	int stride = numWorkItemsReq / Nrn;
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
	int i;
	for(i = 0; i < iter; i++) {
		int ii = i / (interBlockHNum * vertNum);
		int zz = i % (interBlockHNum * vertNum);
		int jj = zz % interBlockHNum;
		int kk = zz / interBlockHNum;
		int z;
		for(z = 0; z < Nrn; z++) {
			int st = kk * vertStride + jj * interBlockHStride + ii * intraBlockHStride + z * stride;
			kernelString += string("    a[") + num2str(i*Nrn + z) + string("].") + comp + string(" = lMemLoad[") + num2str(st) + string("];\n");
		}
	}
	kernelString += string("    barrier(CLK_LOCAL_MEM_FENCE);\n");
}

static void
insertLocalLoadIndexArithmatic(string &kernelString, int Nprev, int Nr, int numWorkItemsReq, int numWorkItemsPerXForm, int numXFormsPerWG, int offset, int midPad)
Oliver Bock's avatar
Oliver Bock committed
690
{
691
692
693
694
	int Ncurr = Nprev * Nr;
	int logNcurr = log2(Ncurr);
	int logNprev = log2(Nprev);
	int incr = (numWorkItemsReq + offset) * Nr + midPad;
Oliver Bock's avatar
Oliver Bock committed
695
696

	if(Ncurr < numWorkItemsPerXForm)
697
698
699
700
701
	{
		if(Nprev == 1)
		    kernelString += string("    j = ii & ") + num2str(Ncurr - 1) + string(";\n");
		else
			kernelString += string("    j = (ii & ") + num2str(Ncurr - 1) + string(") >> ") + num2str(logNprev) + string(";\n");
Oliver Bock's avatar
Oliver Bock committed
702
703

		if(Nprev == 1)
704
			kernelString += string("    i = ii >> ") + num2str(logNcurr) + string(";\n");
Oliver Bock's avatar
Oliver Bock committed
705
706
707
708
		else
			kernelString += string("    i = mad24(ii >> ") + num2str(logNcurr) + string(", ") + num2str(Nprev) + string(", ii & ") + num2str(Nprev - 1) + string(");\n");
	}
	else
709
710
711
712
713
	{
		if(Nprev == 1)
		    kernelString += string("    j = ii;\n");
		else
			kernelString += string("    j = ii >> ") + num2str(logNprev) + string(";\n");
Oliver Bock's avatar
Oliver Bock committed
714
715
716
		if(Nprev == 1)
			kernelString += string("    i = 0;\n");
		else
717
718
719
720
			kernelString += string("    i = ii & ") + num2str(Nprev - 1) + string(";\n");
	}

    if(numXFormsPerWG > 1)
Oliver Bock's avatar
Oliver Bock committed
721
        kernelString += string("    i = mad24(jj, ") + num2str(incr) + string(", i);\n");
722

Oliver Bock's avatar
Oliver Bock committed
723
    kernelString += string("    lMemLoad = sMem + mad24(j, ") + num2str(numWorkItemsReq + offset) + string(", i);\n");
724
725
726
727
728
729
}

static void
insertLocalStoreIndexArithmatic(string &kernelString, int numWorkItemsReq, int numXFormsPerWG, int Nr, int offset, int midPad)
{
	if(numXFormsPerWG == 1) {
Oliver Bock's avatar
Oliver Bock committed
730
		kernelString += string("    lMemStore = sMem + ii;\n");
731
732
	}
	else {
Oliver Bock's avatar
Oliver Bock committed
733
		kernelString += string("    lMemStore = sMem + mad24(jj, ") + num2str((numWorkItemsReq + offset)*Nr + midPad) + string(", ii);\n");
734
735
736
737
738
739
740
741
742
	}
}


static void
createLocalMemfftKernelString(cl_fft_plan *plan)
{
	unsigned int radixArray[10];
	unsigned int numRadix;
Oliver Bock's avatar
Oliver Bock committed
743

744
	unsigned int n = plan->n.x;
Oliver Bock's avatar
Oliver Bock committed
745

746
	assert(n <= plan->max_work_item_per_workgroup * plan->max_radix && "signal lenght too big for local mem fft\n");
Oliver Bock's avatar
Oliver Bock committed
747

748
749
	getRadixArray(n, radixArray, &numRadix, 0);
	assert(numRadix > 0 && "no radix array supplied\n");
Oliver Bock's avatar
Oliver Bock committed
750

751
752
753
754
755
	if(n/radixArray[0] > plan->max_work_item_per_workgroup)
	    getRadixArray(n, radixArray, &numRadix, plan->max_radix);

	assert(radixArray[0] <= plan->max_radix && "max radix choosen is greater than allowed\n");
	assert(n/radixArray[0] <= plan->max_work_item_per_workgroup && "required work items per xform greater than maximum work items allowed per work group for local mem fft\n");
Oliver Bock's avatar
Oliver Bock committed
756

757
758
759
	unsigned int tmpLen = 1;
	unsigned int i;
	for(i = 0; i < numRadix; i++)
Oliver Bock's avatar
Oliver Bock committed
760
	{
761
762
763
764
		assert( radixArray[i] && !( (radixArray[i] - 1) & radixArray[i] ) );
	    tmpLen *= radixArray[i];
	}
	assert(tmpLen == n && "product of radices choosen doesnt match the length of signal\n");
Oliver Bock's avatar
Oliver Bock committed
765

766
767
	int offset, midPad;
	string localString(""), kernelName("");
Oliver Bock's avatar
Oliver Bock committed
768

769
770
	clFFT_DataFormat dataFormat = plan->format;
	string *kernelString = plan->kernel_string;
Oliver Bock's avatar
Oliver Bock committed
771
772


773
774
	cl_fft_kernel_info **kInfo = &plan->kernel_info;
	int kCount = 0;
Oliver Bock's avatar
Oliver Bock committed
775

776
777
778
779
780
	while(*kInfo)
	{
		kInfo = &(*kInfo)->next;
		kCount++;
	}
Oliver Bock's avatar
Oliver Bock committed
781

782
	kernelName = string("fft") + num2str(kCount);
Oliver Bock's avatar
Oliver Bock committed
783

784
785
786
787
788
789
790
791
792
793
	*kInfo = (cl_fft_kernel_info *) malloc(sizeof(cl_fft_kernel_info));
	(*kInfo)->kernel = 0;
	(*kInfo)->lmem_size = 0;
	(*kInfo)->num_workgroups = 0;
	(*kInfo)->num_workitems_per_workgroup = 0;
	(*kInfo)->dir = cl_fft_kernel_x;
	(*kInfo)->in_place_possible = 1;
	(*kInfo)->next = NULL;
	(*kInfo)->kernel_name = (char *) malloc(sizeof(char)*(kernelName.size()+1));
	strcpy((*kInfo)->kernel_name, kernelName.c_str());
Oliver Bock's avatar
Oliver Bock committed
794

795
	unsigned int numWorkItemsPerXForm = n / radixArray[0];
Oliver Bock's avatar
Oliver Bock committed
796
	unsigned int numWorkItemsPerWG = numWorkItemsPerXForm <= 64 ? 64 : numWorkItemsPerXForm;
797
798
799
800
801
	assert(numWorkItemsPerWG <= plan->max_work_item_per_workgroup);
	int numXFormsPerWG = numWorkItemsPerWG / numWorkItemsPerXForm;
	(*kInfo)->num_workgroups = 1;
    (*kInfo)->num_xforms_per_workgroup = numXFormsPerWG;
	(*kInfo)->num_workitems_per_workgroup = numWorkItemsPerWG;
Oliver Bock's avatar
Oliver Bock committed
802

803
804
805
	unsigned int *N = radixArray;
	unsigned int maxRadix = N[0];
	unsigned int lMemSize = 0;
Oliver Bock's avatar
Oliver Bock committed
806

807
	insertVariables(localString, maxRadix);
Oliver Bock's avatar
Oliver Bock committed
808

809
810
	lMemSize = insertGlobalLoadsAndTranspose(localString, n, numWorkItemsPerXForm, numXFormsPerWG, maxRadix, plan->min_mem_coalesce_width, dataFormat);
	(*kInfo)->lmem_size = (lMemSize > (*kInfo)->lmem_size) ? lMemSize : (*kInfo)->lmem_size;
Oliver Bock's avatar
Oliver Bock committed
811

812
813
	string xcomp = string("x");
	string ycomp = string("y");
Oliver Bock's avatar
Oliver Bock committed
814

815
816
817
	unsigned int Nprev = 1;
	unsigned int len = n;
	unsigned int r;
Oliver Bock's avatar
Oliver Bock committed
818
	for(r = 0; r < numRadix; r++)
819
820
821
822
823
	{
		int numIter = N[0] / N[r];
		int numWorkItemsReq = n / N[r];
		int Ncurr = Nprev * N[r];
		insertfftKernel(localString, N[r], numIter);
Oliver Bock's avatar
Oliver Bock committed
824

825
826
827
828
829
830
831
832
833
834
835
836
837
838
		if(r < (numRadix - 1)) {
			insertTwiddleKernel(localString, N[r], numIter, Nprev, len, numWorkItemsPerXForm);
			lMemSize = getPadding(numWorkItemsPerXForm, Nprev, numWorkItemsReq, numXFormsPerWG, N[r], plan->num_local_mem_banks, &offset, &midPad);
			(*kInfo)->lmem_size = (lMemSize > (*kInfo)->lmem_size) ? lMemSize : (*kInfo)->lmem_size;
			insertLocalStoreIndexArithmatic(localString, numWorkItemsReq, numXFormsPerWG, N[r], offset, midPad);
			insertLocalLoadIndexArithmatic(localString, Nprev, N[r], numWorkItemsReq, numWorkItemsPerXForm, numXFormsPerWG, offset, midPad);
			insertLocalStores(localString, numIter, N[r], numWorkItemsPerXForm, numWorkItemsReq, offset, xcomp);
			insertLocalLoads(localString, n, N[r], N[r+1], Nprev, Ncurr, numWorkItemsPerXForm, numWorkItemsReq, offset, xcomp);
			insertLocalStores(localString, numIter, N[r], numWorkItemsPerXForm, numWorkItemsReq, offset, ycomp);
			insertLocalLoads(localString, n, N[r], N[r+1], Nprev, Ncurr, numWorkItemsPerXForm, numWorkItemsReq, offset, ycomp);
			Nprev = Ncurr;
			len = len / N[r];
		}
	}
Oliver Bock's avatar
Oliver Bock committed
839

840
841
	lMemSize = insertGlobalStoresAndTranspose(localString, n, maxRadix, N[numRadix - 1], numWorkItemsPerXForm, numXFormsPerWG, plan->min_mem_coalesce_width, dataFormat);
	(*kInfo)->lmem_size = (lMemSize > (*kInfo)->lmem_size) ? lMemSize : (*kInfo)->lmem_size;
Oliver Bock's avatar
Oliver Bock committed
842

843
844
845
846
847
848
849
850
851
852
853
854
	insertHeader(*kernelString, kernelName, dataFormat);
	*kernelString += string("{\n");
	if((*kInfo)->lmem_size)
        *kernelString += string("    __local float sMem[") + num2str((*kInfo)->lmem_size) + string("];\n");
	*kernelString += localString;
	*kernelString += string("}\n");
}

// For n larger than what can be computed using local memory fft, global transposes
// multiple kernel launces is needed. For these sizes, n can be decomposed using
// much larger base radices i.e. say n = 262144 = 128 x 64 x 32. Thus three kernel
// launches will be needed, first computing 64 x 32, length 128 ffts, second computing
Oliver Bock's avatar
Oliver Bock committed
855
856
857
858
859
860
861
// 128 x 32 length 64 ffts, and finally a kernel computing 128 x 64 length 32 ffts.
// Each of these base radices can futher be divided into factors so that each of these
// base ffts can be computed within one kernel launch using in-register ffts and local
// memory transposes i.e for the first kernel above which computes 64 x 32 ffts on length
// 128, 128 can be decomposed into 128 = 16 x 8 i.e. 8 work items can compute 8 length
// 16 ffts followed by transpose using local memory followed by each of these eight
// work items computing 2 length 8 ffts thus computing 16 length 8 ffts in total. This
862
863
// means only 8 work items are needed for computing one length 128 fft. If we choose
// work group size of say 64, we can compute 64/8 = 8 length 128 ffts within one
Oliver Bock's avatar
Oliver Bock committed
864
865
// work group. Since we need to compute 64 x 32 length 128 ffts in first kernel, this
// means we need to launch 64 x 32 / 8 = 256 work groups with 64 work items in each
866
867
// work group where each work group is computing 8 length 128 ffts where each length
// 128 fft is computed by 8 work items. Same logic can be applied to other two kernels
Oliver Bock's avatar
Oliver Bock committed
868
// in this example. Users can play with difference base radices and difference
869
870
871
872
873
874
875
// decompositions of base radices to generates different kernels and see which gives
// best performance. Following function is just fixed to use 128 as base radix

void
getGlobalRadixInfo(int n, int *radix, int *R1, int *R2, int *numRadices)
{
	int baseRadix = min(n, 128);
Oliver Bock's avatar
Oliver Bock committed
876

877
878
	int numR = 0;
	int N = n;
Oliver Bock's avatar
Oliver Bock committed
879
	while(N > baseRadix)
880
881
882
883
	{
		N /= baseRadix;
		numR++;
	}
Oliver Bock's avatar
Oliver Bock committed
884

885
886
	for(int i = 0; i < numR; i++)
		radix[i] = baseRadix;
Oliver Bock's avatar
Oliver Bock committed
887

888
889
890
	radix[numR] = N;
	numR++;
	*numRadices = numR;
Oliver Bock's avatar
Oliver Bock committed
891

892
893
894
895
896
897
898
899
900
	for(int i = 0; i < numR; i++)
	{
		int B = radix[i];
		if(B <= 8)
		{
			R1[i] = B;
			R2[i] = 1;
			continue;
		}
Oliver Bock's avatar
Oliver Bock committed
901
902

		int r1 = 2;
903
904
905
906
907
908
909
910
		int r2 = B / r1;
	    while(r2 > r1)
	    {
		   r1 *=2;
		   r2 = B / r1;
	    }
		R1[i] = r1;
		R2[i] = r2;
Oliver Bock's avatar
Oliver Bock committed
911
	}
912
913
914
915
}

static void
createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir dir, int vertBS)
Oliver Bock's avatar
Oliver Bock committed
916
{
917
918
919
920
921
922
	int i, j, k, t;
	int radixArr[10] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
    int R1Arr[10] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
    int R2Arr[10] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
	int radix, R1, R2;
	int numRadices;
Oliver Bock's avatar
Oliver Bock committed
923

924
925
	int maxThreadsPerBlock = plan->max_work_item_per_workgroup;
	int maxArrayLen = plan->max_radix;
Oliver Bock's avatar
Oliver Bock committed
926
	int batchSize = plan->min_mem_coalesce_width;
927
	clFFT_DataFormat dataFormat = plan->format;
Oliver Bock's avatar
Oliver Bock committed
928
929
	int vertical = (dir == cl_fft_kernel_x) ? 0 : 1;

930
	getGlobalRadixInfo(n, radixArr, R1Arr, R2Arr, &numRadices);
Oliver Bock's avatar
Oliver Bock committed
931

932
	int numPasses = numRadices;
Oliver Bock's avatar
Oliver Bock committed
933

934
935
	string localString(""), kernelName("");
	string *kernelString = plan->kernel_string;
Oliver Bock's avatar
Oliver Bock committed
936
	cl_fft_kernel_info **kInfo = &plan->kernel_info;
937
	int kCount = 0;
Oliver Bock's avatar
Oliver Bock committed
938

939
940
941
942
943
	while(*kInfo)
	{
		kInfo = &(*kInfo)->next;
		kCount++;
	}
Oliver Bock's avatar
Oliver Bock committed
944

945
946
947
948
949
	int N = n;
	int m = (int)log2(n);
	int Rinit = vertical ? BS : 1;
	batchSize = vertical ? min(BS, batchSize) : batchSize;
	int passNum;
Oliver Bock's avatar
Oliver Bock committed
950
951

	for(passNum = 0; passNum < numPasses; passNum++)
952
	{
Oliver Bock's avatar
Oliver Bock committed
953

954
955
		localString.clear();
		kernelName.clear();
Oliver Bock's avatar
Oliver Bock committed
956

957
958
959
		radix = radixArr[passNum];
		R1 = R1Arr[passNum];
		R2 = R2Arr[passNum];
Oliver Bock's avatar
Oliver Bock committed
960

961
962
963
964
		int strideI = Rinit;
		for(i = 0; i < numPasses; i++)
			if(i != passNum)
				strideI *= radixArr[i];
Oliver Bock's avatar
Oliver Bock committed
965

966
967
968
		int strideO = Rinit;
		for(i = 0; i < passNum; i++)
			strideO *= radixArr[i];
Oliver Bock's avatar
Oliver Bock committed
969

970
971
972
973
974
975
976
977
978
979
		int threadsPerXForm = R2;
		batchSize = R2 == 1 ? plan->max_work_item_per_workgroup : batchSize;
		batchSize = min(batchSize, strideI);
		int threadsPerBlock = batchSize * threadsPerXForm;
		threadsPerBlock = min(threadsPerBlock, maxThreadsPerBlock);
		batchSize = threadsPerBlock / threadsPerXForm;
		assert(R2 <= R1);
		assert(R1*R2 == radix);
		assert(R1 <= maxArrayLen);
		assert(threadsPerBlock <= maxThreadsPerBlock);
Oliver Bock's avatar
Oliver Bock committed
980

981
982
		int numIter = R1 / R2;
		int gInInc = threadsPerBlock / batchSize;
Oliver Bock's avatar
Oliver Bock committed
983
984


985
986
987
988
989
990
991
		int lgStrideO = log2(strideO);
		int numBlocksPerXForm = strideI / batchSize;
		int numBlocks = numBlocksPerXForm;
		if(!vertical)
			numBlocks *= BS;
		else
			numBlocks *= vertBS;
Oliver Bock's avatar
Oliver Bock committed
992

993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
		kernelName = string("fft") + num2str(kCount);
		*kInfo = (cl_fft_kernel_info *) malloc(sizeof(cl_fft_kernel_info));
		(*kInfo)->kernel = 0;
		if(R2 == 1)
			(*kInfo)->lmem_size = 0;
		else
		{
		    if(strideO == 1)
		        (*kInfo)->lmem_size = (radix + 1)*batchSize;
		    else
			    (*kInfo)->lmem_size = threadsPerBlock*R1;
		}
		(*kInfo)->num_workgroups = numBlocks;
        (*kInfo)->num_xforms_per_workgroup = 1;
		(*kInfo)->num_workitems_per_workgroup = threadsPerBlock;
		(*kInfo)->dir = dir;
		if( (passNum == (numPasses - 1)) && (numPasses & 1) )
		    (*kInfo)->in_place_possible = 1;
		else
			(*kInfo)->in_place_possible = 0;
		(*kInfo)->next = NULL;
		(*kInfo)->kernel_name = (char *) malloc(sizeof(char)*(kernelName.size()+1));
		strcpy((*kInfo)->kernel_name, kernelName.c_str());
Oliver Bock's avatar
Oliver Bock committed
1016

1017
		insertVariables(localString, R1);
Oliver Bock's avatar
Oliver Bock committed
1018
1019

		if(vertical)
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
		{
			localString += string("xNum = groupId >> ") + num2str((int)log2(numBlocksPerXForm)) + string(";\n");
			localString += string("groupId = groupId & ") + num2str(numBlocksPerXForm - 1) + string(";\n");
			localString += string("indexIn = mad24(groupId, ") + num2str(batchSize) + string(", xNum << ") + num2str((int)log2(n*BS)) + string(");\n");
			localString += string("tid = mul24(groupId, ") + num2str(batchSize) + string(");\n");
			localString += string("i = tid >> ") + num2str(lgStrideO) + string(";\n");
			localString += string("j = tid & ") + num2str(strideO - 1) + string(";\n");
			int stride = radix*Rinit;
			for(i = 0; i < passNum; i++)
				stride *= radixArr[i];
			localString += string("indexOut = mad24(i, ") + num2str(stride) + string(", j + ") + string("(xNum << ") + num2str((int) log2(n*BS)) + string("));\n");
			localString += string("bNum = groupId;\n");
		}
Oliver Bock's avatar
Oliver Bock committed
1033
		else
1034
1035
1036
1037
1038
1039
1040
		{
			int lgNumBlocksPerXForm = log2(numBlocksPerXForm);
			localString += string("bNum = groupId & ") + num2str(numBlocksPerXForm - 1) + string(";\n");
			localString += string("xNum = groupId >> ") + num2str(lgNumBlocksPerXForm) + string(";\n");
			localString += string("indexIn = mul24(bNum, ") + num2str(batchSize) + string(");\n");
			localString += string("tid = indexIn;\n");
			localString += string("i = tid >> ") + num2str(lgStrideO) + string(";\n");
Oliver Bock's avatar
Oliver Bock committed
1041
			localString += string("j = tid & ") + num2str(strideO - 1) + string(";\n");
1042
1043
1044
			int stride = radix*Rinit;
			for(i = 0; i < passNum; i++)
				stride *= radixArr[i];
Oliver Bock's avatar
Oliver Bock committed
1045
			localString += string("indexOut = mad24(i, ") + num2str(stride) + string(", j);\n");
1046
			localString += string("indexIn += (xNum << ") + num2str(m) + string(");\n");
Oliver Bock's avatar
Oliver Bock committed
1047
			localString += string("indexOut += (xNum << ") + num2str(m) + string(");\n");
1048
		}
Oliver Bock's avatar
Oliver Bock committed
1049

1050
1051
1052
1053
		// Load Data
		int lgBatchSize = log2(batchSize);
		localString += string("tid = lId;\n");
		localString += string("i = tid & ") + num2str(batchSize - 1) + string(";\n");
Oliver Bock's avatar
Oliver Bock committed
1054
		localString += string("j = tid >> ") + num2str(lgBatchSize) + string(";\n");
1055
1056
		localString += string("indexIn += mad24(j, ") + num2str(strideI) + string(", i);\n");

Oliver Bock's avatar
Oliver Bock committed
1057
		if(dataFormat == clFFT_SplitComplexFormat)
1058
1059
		{
			localString += string("in_real += indexIn;\n");
Oliver Bock's avatar
Oliver Bock committed
1060
			localString += string("in_imag += indexIn;\n");
1061
1062
			for(j = 0; j < R1; j++)
				localString += string("a[") + num2str(j) + string("].x = in_real[") + num2str(j*gInInc*strideI) + string("];\n");
Oliver Bock's avatar
Oliver Bock committed
1063
			for(j = 0; j < R1; j++)
1064
1065
				localString += string("a[") + num2str(j) + string("].y = in_imag[") + num2str(j*gInInc*strideI) + string("];\n");
		}
Oliver Bock's avatar
Oliver Bock committed
1066
		else
1067
1068
1069
1070
1071
		{
			localString += string("in += indexIn;\n");
			for(j = 0; j < R1; j++)
				localString += string("a[") + num2str(j) + string("] = in[") + num2str(j*gInInc*strideI) + string("];\n");
	    }
Oliver Bock's avatar
Oliver Bock committed
1072
1073
1074

		localString += string("fftKernel") + num2str(R1) + string("(a, dir);\n");

1075
1076
1077
		if(R2 > 1)
		{
		    // twiddle
Oliver Bock's avatar
Oliver Bock committed
1078
		    for(k = 1; k < R1; k++)
1079
1080
1081
		    {
			    localString += string("ang = dir*(2.0f*M_PI*") + num2str(k) + string("/") + num2str(radix) + string(")*j;\n");
			    localString += string("w = (float2)(native_cos(ang), native_sin(ang));\n");
Oliver Bock's avatar
Oliver Bock committed
1082
			    localString += string("a[") + num2str(k) + string("] = complexMul(a[") + num2str(k) + string("], w);\n");
1083
		    }
Oliver Bock's avatar
Oliver Bock committed
1084

1085
		    // shuffle
Oliver Bock's avatar
Oliver Bock committed
1086
		    numIter = R1 / R2;
1087
1088
1089
		    localString += string("indexIn = mad24(j, ") + num2str(threadsPerBlock*numIter) + string(", i);\n");
		    localString += string("lMemStore = sMem + tid;\n");
		    localString += string("lMemLoad = sMem + indexIn;\n");
Oliver Bock's avatar
Oliver Bock committed
1090
		    for(k = 0; k < R1; k++)
1091
			    localString += string("lMemStore[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].x;\n");
Oliver Bock's avatar
Oliver Bock committed
1092
		    localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
1093
1094
1095
1096
		    for(k = 0; k < numIter; k++)
			    for(t = 0; t < R2; t++)
				    localString += string("a[") + num2str(k*R2+t) + string("].x = lMemLoad[") + num2str(t*batchSize + k*threadsPerBlock) + string("];\n");
		    localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
Oliver Bock's avatar
Oliver Bock committed
1097
		    for(k = 0; k < R1; k++)
1098
			    localString += string("lMemStore[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].y;\n");
Oliver Bock's avatar
Oliver Bock committed
1099
		    localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
1100
1101
1102
1103
		    for(k = 0;