fft_setup.cpp 16.1 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/***************************************************************************
 *   Copyright (C) 2012 by Oliver Bock,Heinz-Bernd Eggenstein              *
 *   oliver.bock[AT]aei.mpg.de                                             *
 *   heinz-bernd.eggenstein[AT]aei.mpg.de                                  *
 *                                                                         *
 *   This file is part of libclfft (originally for Einstein@Home)          *
 *   Derived from clFFT,  (C) Apple, see notice below.                     *
 *                                                                         *
 *                                                                         *
 *   libclfft  is distributed in the hope that it will be useful,          *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See              *
 *   notice below for more details.                                        *
 *                                                                         *
 ***************************************************************************/
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
//
// File:       fft_setup.cpp
//
// Version:    <1.0>
//
// Disclaimer: IMPORTANT:  This Apple software is supplied to you by Apple Inc. ("Apple")
//             in consideration of your agreement to the following terms, and your use,
//             installation, modification or redistribution of this Apple software
//             constitutes acceptance of these terms.  If you do not agree with these
//             terms, please do not use, install, modify or redistribute this Apple
//             software.
//
//             In consideration of your agreement to abide by the following terms, and
//             subject to these terms, Apple grants you a personal, non - exclusive
//             license, under Apple's copyrights in this original Apple software ( the
//             "Apple Software" ), to use, reproduce, modify and redistribute the Apple
//             Software, with or without modifications, in source and / or binary forms;
//             provided that if you redistribute the Apple Software in its entirety and
//             without modifications, you must retain this notice and the following text
//             and disclaimers in all such redistributions of the Apple Software. Neither
//             the name, trademarks, service marks or logos of Apple Inc. may be used to
//             endorse or promote products derived from the Apple Software without specific
//             prior written permission from Apple.  Except as expressly stated in this
//             notice, no other rights or licenses, express or implied, are granted by
//             Apple herein, including but not limited to any patent rights that may be
//             infringed by your derivative works or by other works in which the Apple
//             Software may be incorporated.
//
//             The Apple Software is provided by Apple on an "AS IS" basis.  APPLE MAKES NO
//             WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
//             WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
//             PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
//             ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
//
//             IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
//             CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
//             SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
//             INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
//             AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
//             UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
//             OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
//
////////////////////////////////////////////////////////////////////////////////////////////////////


#include "fft_internal.h"
#include "fft_base_kernels.h"
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <iostream>
#include <string>
#include <sstream>
72
#include <cmath>
73
#include <limits.h>
74
75
76
77
78

using namespace std;

extern void getKernelWorkDimensions(cl_fft_plan *plan, cl_fft_kernel_info *kernelInfo, cl_int *batchSize, size_t *gWorkItems, size_t *lWorkItems);

Oliver Bock's avatar
Oliver Bock committed
79
static void
80
81
getBlockConfigAndKernelString(cl_fft_plan *plan)
{
Oliver Bock's avatar
Oliver Bock committed
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
    plan->temp_buffer_needed = 0;
    *plan->kernel_string += baseKernels;

    if(plan->format == clFFT_SplitComplexFormat)
        *plan->kernel_string += twistKernelPlannar;
    else
        *plan->kernel_string += twistKernelInterleaved;

    switch(plan->dim)
    {
        case clFFT_1D:
            FFT1D(plan, cl_fft_kernel_x);
            break;

        case clFFT_2D:
            FFT1D(plan, cl_fft_kernel_x);
            FFT1D(plan, cl_fft_kernel_y);
            break;

        case clFFT_3D:
            FFT1D(plan, cl_fft_kernel_x);
            FFT1D(plan, cl_fft_kernel_y);
            FFT1D(plan, cl_fft_kernel_z);
            break;

        default:
            return;
    }

    plan->temp_buffer_needed = 0;
    cl_fft_kernel_info *kInfo = plan->kernel_info;
    while(kInfo)
    {
        plan->temp_buffer_needed |= !kInfo->in_place_possible;
        kInfo = kInfo->next;
    }
118
119
}

Oliver Bock's avatar
Oliver Bock committed
120

121
122
123
/*
 * Releases one kernel-info node: its name string, its OpenCL kernel (if one
 * was created) and the node itself.  A NULL node is ignored.  The node's
 * `next` pointer is not followed; the caller walks the list.
 */
static void
deleteKernelInfo(cl_fft_kernel_info *kInfo)
{
    if(!kInfo)
        return;

    // free(NULL) is a no-op, so the name needs no separate guard.
    free(kInfo->kernel_name);

    if(kInfo->kernel)
        clReleaseKernel(kInfo->kernel);

    free(kInfo);
}

/*
 * Releases everything a plan owns except the OpenCL context and the plan
 * structure itself: the kernel-info list, the generated kernel source string,
 * the twist kernel, the program, the temporary buffers and the cos/sin lookup
 * tables.
 *
 * Every handle is reset to NULL after it has been released.  This function is
 * called both when a plan is re-generated with a smaller work-group size and
 * again from clFFT_DestroyPlan(), so a handle that is left dangling would be
 * released twice.
 */
static void
destroy_plan(cl_fft_plan *Plan)
{
    cl_fft_kernel_info *kernel_info = Plan->kernel_info;

    while(kernel_info)
    {
        // Fetch the successor first; deleteKernelInfo() frees the node.
        cl_fft_kernel_info *tmp = kernel_info->next;
        deleteKernelInfo(kernel_info);
        kernel_info = tmp;
    }

    Plan->kernel_info = NULL;

    if(Plan->kernel_string)
    {
        delete Plan->kernel_string;
        Plan->kernel_string = NULL;
    }
    if(Plan->twist_kernel)
    {
        clReleaseKernel(Plan->twist_kernel);
        Plan->twist_kernel = NULL;
    }
    if(Plan->program)
    {
        clReleaseProgram(Plan->program);
        Plan->program = NULL;
    }
    if(Plan->tempmemobj)
    {
        clReleaseMemObject(Plan->tempmemobj);
        Plan->tempmemobj = NULL;
    }
    if(Plan->tempmemobj_real)
    {
        clReleaseMemObject(Plan->tempmemobj_real);
        Plan->tempmemobj_real = NULL;
    }
    if(Plan->tempmemobj_imag)
    {
        clReleaseMemObject(Plan->tempmemobj_imag);
        Plan->tempmemobj_imag = NULL;
    }

    if(Plan->cossin_LUT_d1)
    {
        clReleaseMemObject(Plan->cossin_LUT_d1);
        Plan->cossin_LUT_d1 = NULL;   // prevents a second release on the next call
    }

    if(Plan->cossin_LUT_d2)
    {
        clReleaseMemObject(Plan->cossin_LUT_d2);
        Plan->cossin_LUT_d2 = NULL;   // prevents a second release on the next call
    }
}

static int
Oliver Bock's avatar
Oliver Bock committed
192
createKernelList(cl_fft_plan *plan)
193
{
Oliver Bock's avatar
Oliver Bock committed
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
    cl_program program = plan->program;
    cl_fft_kernel_info *kernel_info = plan->kernel_info;

    cl_int err;
    while(kernel_info)
    {
        kernel_info->kernel = clCreateKernel(program, kernel_info->kernel_name, &err);
        if(!kernel_info->kernel || err != CL_SUCCESS)
            return err;
        kernel_info = kernel_info->next;
    }

    if(plan->format == clFFT_SplitComplexFormat)
        plan->twist_kernel = clCreateKernel(program, "clFFT_1DTwistSplit", &err);
    else
        plan->twist_kernel = clCreateKernel(program, "clFFT_1DTwistInterleaved", &err);

    if(!plan->twist_kernel || err)
        return err;

    return CL_SUCCESS;
215
216
217
}

/*
 * Queries CL_KERNEL_WORK_GROUP_SIZE for every kernel of the plan on every
 * given device.
 *
 * *max_wg_size receives the smallest work-group size reported by any
 * kernel/device pair (it stays at INT_MAX if nothing was queried).
 *
 * Returns -1 if a query fails, 1 if at least one kernel was generated for
 * more work items per work group than a device allows for it (the kernel
 * source then has to be regenerated), and 0 otherwise.
 */
int getMaxKernelWorkGroupSize(cl_fft_plan *plan, unsigned int *max_wg_size, unsigned int num_devices, cl_device_id *devices)
{
    int needsRegeneration = 0;
    size_t deviceLimit;

    *max_wg_size = INT_MAX;

    for(unsigned int dev = 0; dev < num_devices; dev++)
    {
        for(cl_fft_kernel_info *info = plan->kernel_info; info; info = info->next)
        {
            int status = clGetKernelWorkGroupInfo(info->kernel, devices[dev], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &deviceLimit, NULL);
            if(status != CL_SUCCESS)
                return -1;

            // The kernel wants more work items than this device grants it.
            if(deviceLimit < info->num_workitems_per_workgroup)
                needsRegeneration = 1;

            // Track the tightest limit seen so far.
            if(*max_wg_size > deviceLimit)
                *max_wg_size = deviceLimit;
        }
    }

    return needsRegeneration;
}
246

Gaurav Khanna's avatar
Gaurav Khanna committed
247
248
249
250
251
252
/*
 * Error exit for the plan-creation functions.  If `err` is not CL_SUCCESS the
 * code is stored through `error_code` (when non-NULL), the partially built
 * `plan` is destroyed and the enclosing function returns a NULL clFFT_Plan.
 *
 * Relies on `plan` and `error_code` being in scope at the point of use and
 * may only be used in functions returning clFFT_Plan.  The do/while(0)
 * wrapper makes the expansion a single statement, so `ERR_MACRO(x);` is safe
 * in unbraced if/else bodies.  Note that `err` is evaluated more than once.
 */
#define ERR_MACRO(err) do { \
                         if( (err) != CL_SUCCESS) \
                         { \
                           if(error_code) \
                               *error_code = (err); \
                           clFFT_DestroyPlan((clFFT_Plan) plan); \
                           return (clFFT_Plan) NULL; \
                         } \
                       } while(0)
256

257
258
259
260
261
262
263
264
265
266
267
268
269
/*
 * Builds the cos/sin lookup tables used for twiddle-factor generation and
 * uploads them as read-only device buffers (plan->cossin_LUT_d1 and
 * plan->cossin_LUT_d2).
 *
 * What is built depends on plan->twiddleMethod:
 *   - clFFT_native_trig, clFFT_sincosfunc: no tables; N1 = N2 = 1 and
 *     logN1 = logN2 = 0 are stored and the function returns 0 right away.
 *   - clFFT_TaylorLUT: logN1 = 0, logN2 = 8 (N1 = 1, N2 = 256).
 *   - clFFT_BigLUT: the bits of N = n.x*n.y*n.z are dealt out alternately to
 *     logN1 and logN2 (logN1 first), so logN1 is logN2 or logN2 + 1.
 *
 * Both tables hold interleaved (cos, sin) float pairs:
 *   table 1, entry i in [0, N1): angle 2*pi*i / N
 *   table 2, entry i in [0, N2): angle 2*pi*i / N2
 *
 * Returns 0 on success and 1 on failure.  On failure *error_code (if
 * non-NULL) receives the reason: CL_INVALID_VALUE for an unknown twiddle
 * method, CL_OUT_OF_HOST_MEMORY if the host staging arrays cannot be
 * allocated, or the status reported by clCreateBuffer().  A buffer that was
 * created before the failure stays in the plan and is released together with
 * it.
 */
static
int precomputeSinCosLUTs(cl_fft_plan * plan,cl_int *error_code) {

    cl_int err = CL_SUCCESS;

    // find logN1,logN2, where
    // n = 2^logN1 * 2^logN2 , and logN1=logN2 +/- 1

    size_t N=plan->n.x*plan->n.y*plan->n.z;

    plan->logN1=0;
    plan->logN2=0;
    plan->N1=1;
    plan->N2=1;

    switch (plan->twiddleMethod) {
        case  clFFT_native_trig: return 0;
        case  clFFT_sincosfunc : return 0;
        case  clFFT_TaylorLUT  :
            plan->logN1 = 0;
            plan->logN2 = 8;
            break;
        case  clFFT_BigLUT     : {
            size_t Nrem=N;

            while(Nrem > 1) {
                plan->logN1++;
                Nrem >>= 1;

                if(Nrem > 1) {
                    plan->logN2++;
                    Nrem >>= 1;
                }
            }
            break;
        }
        default:
            // Unknown method: report it instead of failing silently.
            if(error_code)
                *error_code = CL_INVALID_VALUE;
            return 1;
    }

    plan->N1 = 1 << plan->logN1;
    plan->N2 = 1 << plan->logN2;

    // Host-side staging arrays; clCreateBuffer copies them (CL_MEM_COPY_HOST_PTR).
    float * tmpLUT_cossin1 = (float*) malloc( plan->N1 * 2 * sizeof(float));
    float * tmpLUT_cossin2 = (float*) malloc( plan->N2 * 2 * sizeof(float));

    if(!tmpLUT_cossin1 || !tmpLUT_cossin2)
    {
        err = CL_OUT_OF_HOST_MEMORY;
    }
    else
    {
        const double PI2 = 8.0*atan(1.0);
        size_t i;

        for(i=0; i < plan->N1; i++) {
            tmpLUT_cossin1[i*2]  =(float)cos(PI2 * (float) i / (float)N);
            tmpLUT_cossin1[i*2+1]=(float)sin(PI2 * (float) i / (float)N);
        }
        plan->cossin_LUT_d1 = clCreateBuffer(plan->context,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, plan->N1*2*sizeof(float),tmpLUT_cossin1, &err);

        if(err == CL_SUCCESS)
        {
            for(i=0; i < plan->N2; i++) {
                tmpLUT_cossin2[2*i]  =(float)cos(PI2 * (float) i / (float) plan->N2);
                tmpLUT_cossin2[2*i+1]=(float)sin(PI2 * (float) i / (float) plan->N2);
            }
            plan->cossin_LUT_d2 = clCreateBuffer(plan->context,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, plan->N2*2*sizeof(float),tmpLUT_cossin2, &err);
        }
    }

    // Single cleanup path; free(NULL) is a no-op.
    free(tmpLUT_cossin1);
    free(tmpLUT_cossin2);

    if(err != CL_SUCCESS)
    {
        if(error_code)
            *error_code = err;
        return 1;
    }

    return 0;
}


351
/*
 * Creates an FFT plan with default options.  Equivalent to calling
 * clFFT_CreatePlanAdv() with flags == 0; all other arguments and the result
 * are passed through unchanged.
 */
clFFT_Plan
clFFT_CreatePlan(cl_context context, clFFT_Dim3 n, clFFT_Dimension dim, clFFT_DataFormat dataFormat, cl_int *error_code )
{
    const unsigned long defaultFlags = 0;

    clFFT_Plan created = clFFT_CreatePlanAdv(context, n, dim, dataFormat, defaultFlags, error_code);
    return created;
}



clFFT_Plan 
clFFT_CreatePlanAdv( cl_context context, clFFT_Dim3 n, clFFT_Dimension dim, clFFT_DataFormat dataFormat, unsigned long flags, cl_int *error_code )
360
{
Oliver Bock's avatar
Oliver Bock committed
361
362
363
364
365
366
367
368
369
370
    int i;
    cl_int err;
    int isPow2 = 1;
    cl_fft_plan *plan = NULL;
    ostringstream kString;
    int num_devices;
    int gpu_found = 0;
    cl_device_id devices[16];
    size_t ret_size;
    cl_device_type device_type;
Oliver Bock's avatar
Oliver Bock committed
371

372
    if(!context)
Oliver Bock's avatar
Oliver Bock committed
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
        ERR_MACRO(CL_INVALID_VALUE);

    isPow2 |= n.x && !( (n.x - 1) & n.x );
    isPow2 |= n.y && !( (n.y - 1) & n.y );
    isPow2 |= n.z && !( (n.z - 1) & n.z );

    if(!isPow2)
        ERR_MACRO(CL_INVALID_VALUE);

    if( (dim == clFFT_1D && (n.y != 1 || n.z != 1)) || (dim == clFFT_2D && n.z != 1) )
        ERR_MACRO(CL_INVALID_VALUE);

    plan = (cl_fft_plan *) malloc(sizeof(cl_fft_plan));
    if(!plan)
        ERR_MACRO(CL_OUT_OF_RESOURCES);

    plan->context = context;
    clRetainContext(context);
    plan->n = n;
    plan->dim = dim;
    plan->format = dataFormat;
    plan->kernel_info = 0;
    plan->num_kernels = 0;
    plan->twist_kernel = 0;
    plan->program = 0;
    plan->temp_buffer_needed = 0;
    plan->last_batch_size = 0;
    plan->tempmemobj = 0;
    plan->tempmemobj_real = 0;
    plan->tempmemobj_imag = 0;
403
404
    plan->cossin_LUT_d1=0;
    plan->cossin_LUT_d2=0;
Oliver Bock's avatar
Oliver Bock committed
405
406
407
408
409
    plan->max_localmem_fft_size = 2048;
    plan->max_work_item_per_workgroup = 256;
    plan->max_radix = 16;
    plan->min_mem_coalesce_width = 16;
    plan->num_local_mem_banks = 16;
Oliver Bock's avatar
Oliver Bock committed
410

411
    plan->twiddleMethod = (clFFT_TwiddleFactorMethod)(flags & 7);    
412
    
413
    precomputeSinCosLUTs(plan,error_code);
414

415
    
416
417
patch_kernel_source:

Oliver Bock's avatar
Oliver Bock committed
418
419
    plan->kernel_string = new string("");
    if(!plan->kernel_string)
420
421
        ERR_MACRO(CL_OUT_OF_RESOURCES);

Oliver Bock's avatar
Oliver Bock committed
422
    getBlockConfigAndKernelString(plan);
Oliver Bock's avatar
Oliver Bock committed
423

Oliver Bock's avatar
Oliver Bock committed
424
425
    const char *source_str = plan->kernel_string->c_str();
    plan->program = clCreateProgramWithSource(context, 1, (const char**) &source_str, NULL, &err);
426
427
    ERR_MACRO(err);

Oliver Bock's avatar
Oliver Bock committed
428
429
    err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(devices), devices, &ret_size);
    ERR_MACRO(err);
Oliver Bock's avatar
Oliver Bock committed
430

Oliver Bock's avatar
Oliver Bock committed
431
    num_devices = ret_size / sizeof(cl_device_id);
Oliver Bock's avatar
Oliver Bock committed
432

Oliver Bock's avatar
Oliver Bock committed
433
434
435
436
    for(i = 0; i < num_devices; i++)
    {
        err = clGetDeviceInfo(devices[i], CL_DEVICE_TYPE, sizeof(device_type), &device_type, NULL);
        ERR_MACRO(err);
Oliver Bock's avatar
Oliver Bock committed
437

Oliver Bock's avatar
Oliver Bock committed
438
439
440
        if(device_type == CL_DEVICE_TYPE_GPU)
        {
            gpu_found = 1;
441
            err = clBuildProgram(plan->program, 1, &devices[i], "-cl-mad-enable -cl-single-precision-constant", NULL, NULL);
Oliver Bock's avatar
Oliver Bock committed
442
443
444
445
446
            if (err != CL_SUCCESS)
            {
                char *build_log;
                char devicename[200];
                size_t log_size;
Oliver Bock's avatar
Oliver Bock committed
447

Oliver Bock's avatar
Oliver Bock committed
448
449
                err = clGetProgramBuildInfo(plan->program, devices[i], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
                ERR_MACRO(err);
Oliver Bock's avatar
Oliver Bock committed
450

Oliver Bock's avatar
Oliver Bock committed
451
                build_log = (char *) malloc(log_size + 1);
Oliver Bock's avatar
Oliver Bock committed
452

Oliver Bock's avatar
Oliver Bock committed
453
454
                err = clGetProgramBuildInfo(plan->program, devices[i], CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL);
                ERR_MACRO(err);
Oliver Bock's avatar
Oliver Bock committed
455

Oliver Bock's avatar
Oliver Bock committed
456
457
                err = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(devicename), devicename, NULL);
                ERR_MACRO(err);
Oliver Bock's avatar
Oliver Bock committed
458

Oliver Bock's avatar
Oliver Bock committed
459
460
461
                fprintf(stdout, "FFT program build log on device %s\n", devicename);
                fprintf(stdout, "%s\n", build_log);
                free(build_log);
Oliver Bock's avatar
Oliver Bock committed
462

Oliver Bock's avatar
Oliver Bock committed
463
464
465
466
                ERR_MACRO(err);
            }
        }
    }
Oliver Bock's avatar
Oliver Bock committed
467

Oliver Bock's avatar
Oliver Bock committed
468
469
    if(!gpu_found)
        ERR_MACRO(CL_INVALID_CONTEXT);
Oliver Bock's avatar
Oliver Bock committed
470

Oliver Bock's avatar
Oliver Bock committed
471
    err = createKernelList(plan);
472
    ERR_MACRO(err);
Oliver Bock's avatar
Oliver Bock committed
473

474
    // we created program and kernels based on "some max work group size (default 256)" ... this work group size
Oliver Bock's avatar
Oliver Bock committed
475
476
    // may be larger than what kernel may execute with ... if thats the case we need to regenerate the kernel source
    // setting this as limit i.e max group size and rebuild.
Oliver Bock's avatar
Oliver Bock committed
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
    unsigned int max_kernel_wg_size;
    int patching_req = getMaxKernelWorkGroupSize(plan, &max_kernel_wg_size, num_devices, devices);
    if(patching_req == -1)
    {
        ERR_MACRO(err);
    }

    if(patching_req)
    {
        destroy_plan(plan);
        plan->max_work_item_per_workgroup = max_kernel_wg_size;
        goto patch_kernel_source;
    }

    cl_fft_kernel_info *kInfo = plan->kernel_info;
    while(kInfo)
    {
        plan->num_kernels++;
        kInfo = kInfo->next;
    }

    if(error_code)
        *error_code = CL_SUCCESS;

    return (clFFT_Plan) plan;
502
503
}

Oliver Bock's avatar
Oliver Bock committed
504
void
505
506
507
clFFT_DestroyPlan(clFFT_Plan plan)
{
    cl_fft_plan *Plan = (cl_fft_plan *) plan;
Oliver Bock's avatar
Oliver Bock committed
508
509
510
511
512
513
    if(Plan)
    {
        destroy_plan(Plan);
        clReleaseContext(Plan->context);
        free(Plan);
    }
514
515
516
517
}

/*
 * Prints a description of a plan: one line per kernel with its global and
 * local work size (computed for a batch size of 1), followed by the complete
 * generated kernel source.
 *
 * Plan  the plan to describe; a NULL plan prints nothing.
 * file  destination stream; stdout is used when NULL.
 */
void clFFT_DumpPlan( clFFT_Plan Plan, FILE *file)
{
    size_t gDim, lDim;
    FILE *out;
    if(!file)
        out = stdout;
    else
        out = file;

    cl_fft_plan *plan = (cl_fft_plan *) Plan;
    if(!plan)
        return;

    cl_fft_kernel_info *kInfo = plan->kernel_info;

    while(kInfo)
    {
        cl_int s = 1;
        getKernelWorkDimensions(plan, kInfo, &s, &gDim, &lDim);
        fprintf(out, "Run kernel %s with global dim = {%lu*BatchSize}, local dim={%lu}\n", kInfo->kernel_name, (unsigned long)gDim, (unsigned long)lDim);
        kInfo = kInfo->next;
    }

    // The source string is absent while a plan is being torn down or rebuilt.
    if(plan->kernel_string)
        fprintf(out, "%s\n", plan->kernel_string->c_str());
}