Skip to content
Snippets Groups Projects
Commit bb903c74 authored by Gaurav Khanna's avatar Gaurav Khanna Committed by Oliver Bock
Browse files

Initial version

* Based on Apple's OpenCL FFT implementation
* Derivative work done by Gaurav Khanna
parents
No related branches found
No related tags found
No related merge requests found
build 0 → 100755
#! /bin/csh -f
# Build script for the OpenCL FFT sample.
# NV: root of the NVIDIA GPU Computing SDK (supplies the OpenCL headers).
set NV = ~/NVIDIA_GPU_Computing_SDK
# Compile all sources with debug info (-g) and link against the OpenCL runtime.
g++ -g -o fft -I$NV/OpenCL/common/inc\
main.cpp fft_setup.cpp fft_execute.cpp fft_kernelstring.cpp\
-lOpenCL
clFFT.h 0 → 100644
#ifndef __CLFFT_H
#define __CLFFT_H
#ifdef __cplusplus
extern "C" {
#endif
#include <CL/cl.h>
#include <stdio.h>
// Transform direction: sign of the exponent in the DFT kernel.
typedef enum
{
clFFT_Forward = -1,
clFFT_Inverse = 1
}clFFT_Direction;
// Transform dimensionality.
// NOTE(review): clFFT_3D is 3 while clFFT_2D is 1 — values are not contiguous;
// code comparing against these must use the enumerators, not raw integers.
typedef enum
{
clFFT_1D = 0,
clFFT_2D = 1,
clFFT_3D = 3
}clFFT_Dimension;
// Memory layout of complex data: split (separate real/imag arrays) or
// interleaved (array of {re, im} pairs).
typedef enum
{
clFFT_SplitComplexFormat = 0,
clFFT_InterleavedComplexFormat = 1
}clFFT_DataFormat;
// Signal extent per dimension; unused dimensions should be 1.
typedef struct
{
unsigned int x;
unsigned int y;
unsigned int z;
}clFFT_Dim3;
// Host-side view of split-format complex data.
typedef struct
{
float *real;
float *imag;
} clFFT_SplitComplex;
// Host-side view of one interleaved complex value.
typedef struct
{
float real;
float imag;
}clFFT_Complex;
// Opaque plan handle; internally a cl_fft_plan* (see fft_internal.h).
typedef void* clFFT_Plan;
// Create a plan for the given size/dimension/layout in `context`.
// On failure returns NULL and stores a CL error in *error_code.
clFFT_Plan clFFT_CreatePlan( cl_context context, clFFT_Dim3 n, clFFT_Dimension dim, clFFT_DataFormat dataFormat, cl_int *error_code );
// Release all resources owned by the plan.
void clFFT_DestroyPlan( clFFT_Plan plan );
// Execute an interleaved-format transform. In-place allowed (data_in == data_out).
// NOTE(review): num_events/event_list/event are accepted but currently ignored
// by the implementation — confirm before relying on event synchronization.
cl_int clFFT_ExecuteInterleaved( cl_command_queue queue, clFFT_Plan plan, cl_int batchSize, clFFT_Direction dir,
cl_mem data_in, cl_mem data_out,
cl_int num_events, cl_event *event_list, cl_event *event );
// Execute a split-format ("Plannar" [sic] — historical spelling kept for ABI) transform.
cl_int clFFT_ExecutePlannar( cl_command_queue queue, clFFT_Plan plan, cl_int batchSize, clFFT_Direction dir,
cl_mem data_in_real, cl_mem data_in_imag, cl_mem data_out_real, cl_mem data_out_imag,
cl_int num_events, cl_event *event_list, cl_event *event );
// Apply twiddle ("twist") factors in place to rows [startRow, startRow+rowsToProcess)
// of a numRows x numCols interleaved array — used to virtualize very large 1D FFTs.
cl_int clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array,
size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir);
// Split-format counterpart of clFFT_1DTwistInterleaved.
cl_int clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real, cl_mem array_imag,
size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir);
// Write a human-readable description of the plan (generated kernels etc.) to `file`.
void clFFT_DumpPlan( clFFT_Plan plan, FILE *file);
#ifdef __cplusplus
}
#endif
#endif
#ifndef __CL_FFT_BASE_KERNELS_
#define __CL_FFT_BASE_KERNELS_
#include <string>
#include <string.h>
using namespace std;
// OpenCL C source, kept as a host-side string and prepended to every generated
// FFT kernel. Defines complex arithmetic helpers and in-register radix-2/4/8/16/32
// butterfly macros (`dir` is -1 for forward, +1 for inverse — see clFFT_Direction).
// Twiddle constants are hex float literals (e.g. 0x1.6a09e6p-1f == sqrt(2)/2).
static string baseKernels = string
(
"#ifndef M_PI\n"
"#define M_PI 0x1.921fb54442d18p+1\n"
"#endif\n"
"#define complexMul(a,b) ((float2)(mad(-(a).y, (b).y, (a).x * (b).x), mad((a).y, (b).x, (a).x * (b).y)))\n"
"#define conj(a) ((float2)((a).x, -(a).y))\n"
"#define conjTransp(a) ((float2)(-(a).y, (a).x))\n"
"\n"
// Radix-2 butterflies: array form and scalar (two-operand) form.
"#define fftKernel2(a,dir) \\\n"
"{ \\\n"
" float2 c = (a)[0]; \\\n"
" (a)[0] = c + (a)[1]; \\\n"
" (a)[1] = c - (a)[1]; \\\n"
"}\n"
"\n"
"#define fftKernel2S(d1,d2,dir) \\\n"
"{ \\\n"
" float2 c = (d1); \\\n"
" (d1) = c + (d2); \\\n"
" (d2) = c - (d2); \\\n"
"}\n"
"\n"
// Radix-4 butterflies (array and 4-scalar forms), with the final element swap
// performing the digit-reversal reordering.
"#define fftKernel4(a,dir) \\\n"
"{ \\\n"
" fftKernel2S((a)[0], (a)[2], dir); \\\n"
" fftKernel2S((a)[1], (a)[3], dir); \\\n"
" fftKernel2S((a)[0], (a)[1], dir); \\\n"
" (a)[3] = (float2)(dir)*(conjTransp((a)[3])); \\\n"
" fftKernel2S((a)[2], (a)[3], dir); \\\n"
" float2 c = (a)[1]; \\\n"
" (a)[1] = (a)[2]; \\\n"
" (a)[2] = c; \\\n"
"}\n"
"\n"
"#define fftKernel4s(a0,a1,a2,a3,dir) \\\n"
"{ \\\n"
" fftKernel2S((a0), (a2), dir); \\\n"
" fftKernel2S((a1), (a3), dir); \\\n"
" fftKernel2S((a0), (a1), dir); \\\n"
" (a3) = (float2)(dir)*(conjTransp((a3))); \\\n"
" fftKernel2S((a2), (a3), dir); \\\n"
" float2 c = (a1); \\\n"
" (a1) = (a2); \\\n"
" (a2) = c; \\\n"
"}\n"
"\n"
"#define bitreverse8(a) \\\n"
"{ \\\n"
" float2 c; \\\n"
" c = (a)[1]; \\\n"
" (a)[1] = (a)[4]; \\\n"
" (a)[4] = c; \\\n"
" c = (a)[3]; \\\n"
" (a)[3] = (a)[6]; \\\n"
" (a)[6] = c; \\\n"
"}\n"
"\n"
// Radix-8 butterfly; w1/w3 are the e^(±i*pi/4) twiddles.
"#define fftKernel8(a,dir) \\\n"
"{ \\\n"
"	const float2 w1 = (float2)(0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f); \\\n"
"	const float2 w3 = (float2)(-0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f); \\\n"
"	float2 c; \\\n"
"	fftKernel2S((a)[0], (a)[4], dir); \\\n"
"	fftKernel2S((a)[1], (a)[5], dir); \\\n"
"	fftKernel2S((a)[2], (a)[6], dir); \\\n"
"	fftKernel2S((a)[3], (a)[7], dir); \\\n"
"	(a)[5] = complexMul(w1, (a)[5]); \\\n"
"	(a)[6] = (float2)(dir)*(conjTransp((a)[6])); \\\n"
"	(a)[7] = complexMul(w3, (a)[7]); \\\n"
"	fftKernel2S((a)[0], (a)[2], dir); \\\n"
"	fftKernel2S((a)[1], (a)[3], dir); \\\n"
"	fftKernel2S((a)[4], (a)[6], dir); \\\n"
"	fftKernel2S((a)[5], (a)[7], dir); \\\n"
"	(a)[3] = (float2)(dir)*(conjTransp((a)[3])); \\\n"
"	(a)[7] = (float2)(dir)*(conjTransp((a)[7])); \\\n"
"	fftKernel2S((a)[0], (a)[1], dir); \\\n"
"	fftKernel2S((a)[2], (a)[3], dir); \\\n"
"	fftKernel2S((a)[4], (a)[5], dir); \\\n"
"	fftKernel2S((a)[6], (a)[7], dir); \\\n"
"	bitreverse8((a)); \\\n"
"}\n"
"\n"
"#define bitreverse4x4(a) \\\n"
"{ \\\n"
"	float2 c; \\\n"
"	c = (a)[1]; (a)[1] = (a)[4]; (a)[4] = c; \\\n"
"	c = (a)[2]; (a)[2] = (a)[8]; (a)[8] = c; \\\n"
"	c = (a)[3]; (a)[3] = (a)[12]; (a)[12] = c; \\\n"
"	c = (a)[6]; (a)[6] = (a)[9]; (a)[9] = c; \\\n"
"	c = (a)[7]; (a)[7] = (a)[13]; (a)[13] = c; \\\n"
"	c = (a)[11]; (a)[11] = (a)[14]; (a)[14] = c; \\\n"
"}\n"
"\n"
// Radix-16 as 4x radix-4 + twiddles + 4x radix-4 + 4x4 digit reversal.
"#define fftKernel16(a,dir) \\\n"
"{ \\\n"
"	const float w0 = 0x1.d906bcp-1f; \\\n"
"	const float w1 = 0x1.87de2ap-2f; \\\n"
"	const float w2 = 0x1.6a09e6p-1f; \\\n"
"	fftKernel4s((a)[0], (a)[4], (a)[8], (a)[12], dir); \\\n"
"	fftKernel4s((a)[1], (a)[5], (a)[9], (a)[13], dir); \\\n"
"	fftKernel4s((a)[2], (a)[6], (a)[10], (a)[14], dir); \\\n"
"	fftKernel4s((a)[3], (a)[7], (a)[11], (a)[15], dir); \\\n"
"	(a)[5] = complexMul((a)[5], (float2)(w0, dir*w1)); \\\n"
"	(a)[6] = complexMul((a)[6], (float2)(w2, dir*w2)); \\\n"
"	(a)[7] = complexMul((a)[7], (float2)(w1, dir*w0)); \\\n"
"	(a)[9] = complexMul((a)[9], (float2)(w2, dir*w2)); \\\n"
"	(a)[10] = (float2)(dir)*(conjTransp((a)[10])); \\\n"
"	(a)[11] = complexMul((a)[11], (float2)(-w2, dir*w2)); \\\n"
"	(a)[13] = complexMul((a)[13], (float2)(w1, dir*w0)); \\\n"
"	(a)[14] = complexMul((a)[14], (float2)(-w2, dir*w2)); \\\n"
"	(a)[15] = complexMul((a)[15], (float2)(-w0, dir*-w1)); \\\n"
"	fftKernel4((a), dir); \\\n"
"	fftKernel4((a) + 4, dir); \\\n"
"	fftKernel4((a) + 8, dir); \\\n"
"	fftKernel4((a) + 12, dir); \\\n"
"	bitreverse4x4((a)); \\\n"
"}\n"
"\n"
"#define bitreverse32(a) \\\n"
"{ \\\n"
"	float2 c1, c2; \\\n"
"	c1 = (a)[2];   (a)[2] = (a)[1];   c2 = (a)[4];   (a)[4] = c1;   c1 = (a)[8];   (a)[8] = c2;   c2 = (a)[16];  (a)[16] = c1;  (a)[1] = c2; \\\n"
"	c1 = (a)[6];   (a)[6] = (a)[3];   c2 = (a)[12];  (a)[12] = c1;  c1 = (a)[24];  (a)[24] = c2;  c2 = (a)[17];  (a)[17] = c1;  (a)[3] = c2; \\\n"
"	c1 = (a)[10];  (a)[10] = (a)[5];  c2 = (a)[20];  (a)[20] = c1;  c1 = (a)[9];   (a)[9] = c2;   c2 = (a)[18];  (a)[18] = c1;  (a)[5] = c2; \\\n"
"	c1 = (a)[14];  (a)[14] = (a)[7];  c2 = (a)[28];  (a)[28] = c1;  c1 = (a)[25];  (a)[25] = c2;  c2 = (a)[19];  (a)[19] = c1;  (a)[7] = c2; \\\n"
"	c1 = (a)[22];  (a)[22] = (a)[11]; c2 = (a)[13];  (a)[13] = c1;  c1 = (a)[26];  (a)[26] = c2;  c2 = (a)[21];  (a)[21] = c1;  (a)[11] = c2; \\\n"
"	c1 = (a)[30];  (a)[30] = (a)[15]; c2 = (a)[29];  (a)[29] = c1;  c1 = (a)[27];  (a)[27] = c2;  c2 = (a)[23];  (a)[23] = c1;  (a)[15] = c2; \\\n"
"}\n"
"\n"
// Radix-32 as two radix-16 halves joined by 16 radix-2 butterflies + twiddles.
"#define fftKernel32(a,dir) \\\n"
"{ \\\n"
"	fftKernel2S((a)[0],  (a)[16], dir); \\\n"
"	fftKernel2S((a)[1],  (a)[17], dir); \\\n"
"	fftKernel2S((a)[2],  (a)[18], dir); \\\n"
"	fftKernel2S((a)[3],  (a)[19], dir); \\\n"
"	fftKernel2S((a)[4],  (a)[20], dir); \\\n"
"	fftKernel2S((a)[5],  (a)[21], dir); \\\n"
"	fftKernel2S((a)[6],  (a)[22], dir); \\\n"
"	fftKernel2S((a)[7],  (a)[23], dir); \\\n"
"	fftKernel2S((a)[8],  (a)[24], dir); \\\n"
"	fftKernel2S((a)[9],  (a)[25], dir); \\\n"
"	fftKernel2S((a)[10], (a)[26], dir); \\\n"
"	fftKernel2S((a)[11], (a)[27], dir); \\\n"
"	fftKernel2S((a)[12], (a)[28], dir); \\\n"
"	fftKernel2S((a)[13], (a)[29], dir); \\\n"
"	fftKernel2S((a)[14], (a)[30], dir); \\\n"
"	fftKernel2S((a)[15], (a)[31], dir); \\\n"
"	(a)[17] = complexMul((a)[17], (float2)(0x1.f6297cp-1f, dir*0x1.8f8b84p-3f)); \\\n"
"	(a)[18] = complexMul((a)[18], (float2)(0x1.d906bcp-1f, dir*0x1.87de2ap-2f)); \\\n"
"	(a)[19] = complexMul((a)[19], (float2)(0x1.a9b662p-1f, dir*0x1.1c73b4p-1f)); \\\n"
"	(a)[20] = complexMul((a)[20], (float2)(0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f)); \\\n"
"	(a)[21] = complexMul((a)[21], (float2)(0x1.1c73b4p-1f, dir*0x1.a9b662p-1f)); \\\n"
"	(a)[22] = complexMul((a)[22], (float2)(0x1.87de2ap-2f, dir*0x1.d906bcp-1f)); \\\n"
"	(a)[23] = complexMul((a)[23], (float2)(0x1.8f8b84p-3f, dir*0x1.f6297cp-1f)); \\\n"
"	(a)[24] = complexMul((a)[24], (float2)(0x0p+0f, dir*0x1p+0f)); \\\n"
"	(a)[25] = complexMul((a)[25], (float2)(-0x1.8f8b84p-3f, dir*0x1.f6297cp-1f)); \\\n"
"	(a)[26] = complexMul((a)[26], (float2)(-0x1.87de2ap-2f, dir*0x1.d906bcp-1f)); \\\n"
"	(a)[27] = complexMul((a)[27], (float2)(-0x1.1c73b4p-1f, dir*0x1.a9b662p-1f)); \\\n"
"	(a)[28] = complexMul((a)[28], (float2)(-0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f)); \\\n"
"	(a)[29] = complexMul((a)[29], (float2)(-0x1.a9b662p-1f, dir*0x1.1c73b4p-1f)); \\\n"
"	(a)[30] = complexMul((a)[30], (float2)(-0x1.d906bcp-1f, dir*0x1.87de2ap-2f)); \\\n"
"	(a)[31] = complexMul((a)[31], (float2)(-0x1.f6297cp-1f, dir*0x1.8f8b84p-3f)); \\\n"
"	fftKernel16((a), dir); \\\n"
"	fftKernel16((a) + 16, dir); \\\n"
"	bitreverse32((a)); \\\n"
"}\n\n"
);
// OpenCL kernel source for the interleaved twist pass: multiplies element
// (row, col) of an in-place array by e^(2*pi*i*dir*col*row/N). Each work item
// handles one column across numRowsToProcess rows. The trailing backslashes
// are preprocessor line splices and are harmless in OpenCL C.
// NOTE(review): relies on M_PI — presumably compiled together with baseKernels,
// which defines it; verify at program-build call site.
static string twistKernelInterleaved = string
(
"__kernel void \\\n"
"clFFT_1DTwistInterleaved(__global float2 *in, unsigned int startRow, unsigned int numCols, unsigned int N, unsigned int numRowsToProcess, int dir) \\\n"
"{ \\\n"
"	float2 a, w; \\\n"
"	float ang; \\\n"
"	unsigned int j; \\\n"
"	unsigned int i = get_global_id(0); \\\n"
"	unsigned int startIndex = i; \\\n"
"	 \\\n"
"	if(i < numCols) \\\n"
"	{ \\\n"
"	    for(j = 0; j < numRowsToProcess; j++) \\\n"
"	    { \\\n"
"	        a = in[startIndex]; \\\n"
"	        ang = 2.0f * M_PI * dir * i * (startRow + j) / N; \\\n"
"	        w = (float2)(native_cos(ang), native_sin(ang)); \\\n"
"	        a = complexMul(a, w); \\\n"
"	        in[startIndex] = a; \\\n"
"	        startIndex += numCols; \\\n"
"	    } \\\n"
"	} \\\n"
"} \\\n"
);
// Split-format counterpart of twistKernelInterleaved: identical math, but
// reads/writes separate real and imaginary arrays.
static string twistKernelPlannar = string
(
"__kernel void \\\n"
"clFFT_1DTwistSplit(__global float *in_real, __global float *in_imag , unsigned int startRow, unsigned int numCols, unsigned int N, unsigned int numRowsToProcess, int dir) \\\n"
"{ \\\n"
"	float2 a, w; \\\n"
"	float ang; \\\n"
"	unsigned int j; \\\n"
"	unsigned int i = get_global_id(0); \\\n"
"	unsigned int startIndex = i; \\\n"
"	 \\\n"
"	if(i < numCols) \\\n"
"	{ \\\n"
"	    for(j = 0; j < numRowsToProcess; j++) \\\n"
"	    { \\\n"
"	        a = (float2)(in_real[startIndex], in_imag[startIndex]); \\\n"
"	        ang = 2.0f * M_PI * dir * i * (startRow + j) / N; \\\n"
"	        w = (float2)(native_cos(ang), native_sin(ang)); \\\n"
"	        a = complexMul(a, w); \\\n"
"	        in_real[startIndex] = a.x; \\\n"
"	        in_imag[startIndex] = a.y; \\\n"
"	        startIndex += numCols; \\\n"
"	    } \\\n"
"	} \\\n"
"} \\\n"
);
#endif
#include "fft_internal.h"
#include "clFFT.h"
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#define max(a,b) (((a)>(b)) ? (a) : (b))
#define min(a,b) (((a)<(b)) ? (a) : (b))
// Lazily (re)allocate the interleaved ping-pong buffer when the batch size
// changes. Buffer holds n.x*n.y*n.z*batchSize float2 values.
//
// Fix: the original cached last_batch_size BEFORE calling clCreateBuffer, so a
// failed allocation followed by a retry with the same batch size would skip the
// reallocation and leave kernels reading a NULL/stale buffer. The cache is now
// updated only on success, and reset to 0 on failure so the next call retries.
static cl_int
allocateTemporaryBufferInterleaved(cl_fft_plan *plan, cl_uint batchSize)
{
    cl_int err = CL_SUCCESS;
    if(plan->temp_buffer_needed && plan->last_batch_size != batchSize)
    {
        // 2 floats per element: interleaved (re, im) pairs.
        size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * 2 * sizeof(cl_float);

        if(plan->tempmemobj)
            clReleaseMemObject(plan->tempmemobj);

        plan->tempmemobj = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &err);
        if(err == CL_SUCCESS)
        {
            plan->last_batch_size = batchSize;
        }
        else
        {
            plan->tempmemobj = NULL;
            plan->last_batch_size = 0;  // force reallocation on next call
        }
    }
    return err;
}
// Lazily (re)allocate the split-format (real + imaginary) ping-pong buffers
// when the batch size changes. Each buffer holds n.x*n.y*n.z*batchSize floats.
//
// Fixes over the original:
//  * last_batch_size was cached before allocation, so a failed allocation
//    followed by a same-batch retry silently reused invalid buffers; the cache
//    is now updated only when both allocations succeed.
//  * `err |= terr` bitwise-OR'ed two (negative) CL error codes into a garbage
//    value; errors are now propagated individually.
static cl_int
allocateTemporaryBufferPlannar(cl_fft_plan *plan, cl_uint batchSize)
{
    cl_int err = CL_SUCCESS;
    if(plan->temp_buffer_needed && plan->last_batch_size != batchSize)
    {
        size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * sizeof(cl_float);

        if(plan->tempmemobj_real)
            clReleaseMemObject(plan->tempmemobj_real);
        if(plan->tempmemobj_imag)
            clReleaseMemObject(plan->tempmemobj_imag);
        plan->tempmemobj_imag = NULL;

        plan->tempmemobj_real = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &err);
        if(err == CL_SUCCESS)
            plan->tempmemobj_imag = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &err);

        if(err == CL_SUCCESS)
        {
            plan->last_batch_size = batchSize;
        }
        else
        {
            // Leave the plan in a consistent "nothing allocated" state.
            if(plan->tempmemobj_real)
                clReleaseMemObject(plan->tempmemobj_real);
            plan->tempmemobj_real = NULL;
            plan->tempmemobj_imag = NULL;
            plan->last_batch_size = 0;  // force reallocation on next call
        }
    }
    return err;
}
// Compute the NDRange sizes for one generated kernel and scale *batchSize to
// the number of transforms the kernel actually sees for its axis.
//  - x axis: batch covers all y*z lines; work groups = ceil(batch / xforms-per-WG)
//    times the kernel's base group count.
//  - y axis: batch covers all z planes; one base group set per transform.
//  - z axis: one base group set per transform.
void
getKernelWorkDimensions(cl_fft_plan *plan, cl_fft_kernel_info *kernelInfo, cl_int *batchSize, size_t *gWorkItems, size_t *lWorkItems)
{
    int groups = kernelInfo->num_workgroups;
    int xformsPerGroup = kernelInfo->num_xforms_per_workgroup;

    *lWorkItems = kernelInfo->num_workitems_per_workgroup;

    switch(kernelInfo->dir)
    {
        case cl_fft_kernel_x:
        {
            *batchSize *= (plan->n.y * plan->n.z);
            // ceil(batch / xformsPerGroup) groups per base group.
            int needed = *batchSize / xformsPerGroup;
            if(*batchSize % xformsPerGroup)
                needed += 1;
            groups = needed * kernelInfo->num_workgroups;
            break;
        }
        case cl_fft_kernel_y:
            *batchSize *= plan->n.z;
            groups *= *batchSize;
            break;
        case cl_fft_kernel_z:
            groups *= *batchSize;
            break;
    }

    *gWorkItems = groups * *lWorkItems;
}
// Execute an interleaved-format FFT described by `Plan` on `queue`.
// Ping-pongs between data_in, data_out and the plan's temp buffer across the
// plan's kernel chain; returns the first CL error encountered.
// NOTE(review): num_events, event_list and event are accepted but never used
// here — callers cannot rely on event synchronization from this function.
cl_int
clFFT_ExecuteInterleaved( cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir,
cl_mem data_in, cl_mem data_out,
cl_int num_events, cl_event *event_list, cl_event *event )
{
int s;
cl_fft_plan *plan = (cl_fft_plan *) Plan;
// Plan layout must match this entry point.
if(plan->format != clFFT_InterleavedComplexFormat)
return CL_INVALID_VALUE;
cl_int err;
size_t gWorkItems, lWorkItems;
int inPlaceDone;
cl_int isInPlace = data_in == data_out ? 1 : 0;
// Lazily (re)allocate the temp buffer for this batch size, if the plan needs one.
if((err = allocateTemporaryBufferInterleaved(plan, batchSize)) != CL_SUCCESS)
return err;
// Buffer ring used for ping-ponging: 0 = input, 1 = output, 2 = temp.
cl_mem memObj[3];
memObj[0] = data_in;
memObj[1] = data_out;
memObj[2] = plan->tempmemobj;
cl_fft_kernel_info *kernelInfo = plan->kernel_info;
int numKernels = plan->num_kernels;
int numKernelsOdd = numKernels & 1;
int currRead = 0;
int currWrite = 1;
// at least one external dram shuffle (transpose) required
if(plan->temp_buffer_needed)
{
// In-place: never write back to the caller's buffer until the chain permits
// it; start by bouncing between output (1) and temp (2).
if(isInPlace)
{
inPlaceDone = 0;
currRead = 1;
currWrite = 2;
}
else
{
// Choose the first write target so the LAST kernel lands in the real output:
// with an odd kernel count the ping-pong must start at buffer 1.
currWrite = (numKernels & 1) ? 1 : 2;
}
while(kernelInfo)
{
// For odd-length in-place chains, collapse one step into an in-place kernel
// (when the kernel supports it) so parity works out to end at the input buffer.
if( isInPlace && numKernelsOdd && !inPlaceDone && kernelInfo->in_place_possible)
{
currWrite = currRead;
inPlaceDone = 1;
}
s = batchSize;
getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems);
err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj[currRead]);
err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj[currWrite]);
err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_int), &dir);
err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_int), &s);
err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL);
if(err)
return err;
// Swap read/write between buffers 1 and 2 for the next kernel.
currRead = (currWrite == 1) ? 1 : 2;
currWrite = (currWrite == 1) ? 2 : 1;
kernelInfo = kernelInfo->next;
}
}
// no dram shuffle (transpose required) transform
// all kernels can execute in-place.
else {
while(kernelInfo)
{
s = batchSize;
getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems);
err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj[currRead]);
err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj[currWrite]);
err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_int), &dir);
err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_int), &s);
err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL);
if(err)
return err;
// First kernel reads input (0) and writes output (1); all later kernels
// run output -> output.
currRead = 1;
currWrite = 1;
kernelInfo = kernelInfo->next;
}
}
return err;
}
// Execute a split-format (separate real/imag buffers) FFT; structurally the
// same ping-pong scheme as clFFT_ExecuteInterleaved, with parallel real and
// imaginary buffer rings.
// NOTE(review): num_events, event_list and event are accepted but never used.
cl_int
clFFT_ExecutePlannar( cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir,
cl_mem data_in_real, cl_mem data_in_imag, cl_mem data_out_real, cl_mem data_out_imag,
cl_int num_events, cl_event *event_list, cl_event *event)
{
int s;
cl_fft_plan *plan = (cl_fft_plan *) Plan;
// Plan layout must match this entry point.
if(plan->format != clFFT_SplitComplexFormat)
return CL_INVALID_VALUE;
cl_int err;
size_t gWorkItems, lWorkItems;
int inPlaceDone;
// In-place only when BOTH planes alias their outputs.
cl_int isInPlace = ((data_in_real == data_out_real) && (data_in_imag == data_out_imag)) ? 1 : 0;
if((err = allocateTemporaryBufferPlannar(plan, batchSize)) != CL_SUCCESS)
return err;
// Buffer rings: index 0 = input, 1 = output, 2 = temp (per plane).
cl_mem memObj_real[3];
cl_mem memObj_imag[3];
memObj_real[0] = data_in_real;
memObj_real[1] = data_out_real;
memObj_real[2] = plan->tempmemobj_real;
memObj_imag[0] = data_in_imag;
memObj_imag[1] = data_out_imag;
memObj_imag[2] = plan->tempmemobj_imag;
cl_fft_kernel_info *kernelInfo = plan->kernel_info;
int numKernels = plan->num_kernels;
int numKernelsOdd = numKernels & 1;
int currRead = 0;
int currWrite = 1;
// at least one external dram shuffle (transpose) required
if(plan->temp_buffer_needed)
{
// in-place transform: bounce between output (1) and temp (2) only.
if(isInPlace)
{
inPlaceDone = 0;
currRead = 1;
currWrite = 2;
}
else
{
// Start the ping-pong so the last kernel writes the real output buffer.
currWrite = (numKernels & 1) ? 1 : 2;
}
while(kernelInfo)
{
// Odd in-place chains: fold one step into an in-place-capable kernel so
// the final result ends up back in the caller's buffers.
if( isInPlace && numKernelsOdd && !inPlaceDone && kernelInfo->in_place_possible)
{
currWrite = currRead;
inPlaceDone = 1;
}
s = batchSize;
getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems);
err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj_real[currRead]);
err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj_imag[currRead]);
err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_mem), &memObj_real[currWrite]);
err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_mem), &memObj_imag[currWrite]);
err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_int), &dir);
err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_int), &s);
err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL);
if(err)
return err;
// Swap read/write between buffers 1 and 2 for the next kernel.
currRead = (currWrite == 1) ? 1 : 2;
currWrite = (currWrite == 1) ? 2 : 1;
kernelInfo = kernelInfo->next;
}
}
// no dram shuffle (transpose required) transform
else {
while(kernelInfo)
{
s = batchSize;
getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems);
err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj_real[currRead]);
err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj_imag[currRead]);
err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_mem), &memObj_real[currWrite]);
err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_mem), &memObj_imag[currWrite]);
err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_int), &dir);
err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_int), &s);
err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL);
if(err)
return err;
// First kernel: input -> output; subsequent kernels run output -> output.
currRead = 1;
currWrite = 1;
kernelInfo = kernelInfo->next;
}
}
return err;
}
// Apply twiddle factors in place to rows [startRow, startRow+rowsToProcess) of
// a numRows x numCols interleaved array, using the plan's twist kernel. Used
// to virtualize 1D FFTs too large for device memory. Returns a CL error code.
//
// Fix: removed leftover debug `puts("X")` that wrote to stdout on every call.
cl_int
clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array,
                         size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir)
{
    cl_fft_plan *plan = (cl_fft_plan *) Plan;

    // Kernel takes 32-bit parameters; narrow the size_t arguments.
    unsigned int N = numRows*numCols;
    unsigned int nCols = numCols;
    unsigned int sRow = startRow;
    unsigned int rToProcess = rowsToProcess;
    int d = dir;
    int err = 0;

    cl_device_id device_id;
    err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, NULL);
    if(err)
        return err;

    // Work-group size: device limit for this kernel, capped at 128.
    size_t gSize;
    err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, NULL);
    if(err)
        return err;

    gSize = min(128, gSize);
    // One work item per column, rounded up to a whole number of groups.
    size_t numGlobalThreads[1] = { max(numCols / gSize, 1)*gSize };
    size_t numLocalThreads[1]  = { gSize };

    err |= clSetKernelArg(plan->twist_kernel, 0, sizeof(cl_mem), &array);
    err |= clSetKernelArg(plan->twist_kernel, 1, sizeof(unsigned int), &sRow);
    err |= clSetKernelArg(plan->twist_kernel, 2, sizeof(unsigned int), &nCols);
    err |= clSetKernelArg(plan->twist_kernel, 3, sizeof(unsigned int), &N);
    err |= clSetKernelArg(plan->twist_kernel, 4, sizeof(unsigned int), &rToProcess);
    err |= clSetKernelArg(plan->twist_kernel, 5, sizeof(int), &d);
    err |= clEnqueueNDRangeKernel(queue, plan->twist_kernel, 1, NULL, numGlobalThreads, numLocalThreads, 0, NULL, NULL);

    return err;
}
// Split-format counterpart of clFFT_1DTwistInterleaved: applies twiddle factors
// in place to separate real/imag arrays via the plan's twist kernel.
//
// Fix: removed leftover debug `puts("Y")` that wrote to stdout on every call.
cl_int
clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real, cl_mem array_imag,
                     size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir)
{
    cl_fft_plan *plan = (cl_fft_plan *) Plan;

    // Kernel takes 32-bit parameters; narrow the size_t arguments.
    unsigned int N = numRows*numCols;
    unsigned int nCols = numCols;
    unsigned int sRow = startRow;
    unsigned int rToProcess = rowsToProcess;
    int d = dir;
    int err = 0;

    cl_device_id device_id;
    err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, NULL);
    if(err)
        return err;

    // Work-group size: device limit for this kernel, capped at 128.
    size_t gSize;
    err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, NULL);
    if(err)
        return err;

    gSize = min(128, gSize);
    // One work item per column, rounded up to a whole number of groups.
    size_t numGlobalThreads[1] = { max(numCols / gSize, 1)*gSize };
    size_t numLocalThreads[1]  = { gSize };

    err |= clSetKernelArg(plan->twist_kernel, 0, sizeof(cl_mem), &array_real);
    err |= clSetKernelArg(plan->twist_kernel, 1, sizeof(cl_mem), &array_imag);
    err |= clSetKernelArg(plan->twist_kernel, 2, sizeof(unsigned int), &sRow);
    err |= clSetKernelArg(plan->twist_kernel, 3, sizeof(unsigned int), &nCols);
    err |= clSetKernelArg(plan->twist_kernel, 4, sizeof(unsigned int), &N);
    err |= clSetKernelArg(plan->twist_kernel, 5, sizeof(unsigned int), &rToProcess);
    err |= clSetKernelArg(plan->twist_kernel, 6, sizeof(int), &d);
    err |= clEnqueueNDRangeKernel(queue, plan->twist_kernel, 1, NULL, numGlobalThreads, numLocalThreads, 0, NULL, NULL);

    return err;
}
#ifndef __CLFFT_INTERNAL_H
#define __CLFFT_INTERNAL_H
#include "clFFT.h"
#include <iostream>
#include <string>
#include <sstream>
using namespace std;
// Axis along which a generated kernel transforms data.
typedef enum kernel_dir_t
{
cl_fft_kernel_x,
cl_fft_kernel_y,
cl_fft_kernel_z
}cl_fft_kernel_dir;
// One node of the plan's linked list of generated kernels, executed in order.
typedef struct kernel_info_t
{
cl_kernel kernel;
// name of the kernel function inside the generated source (heap-owned)
char *kernel_name;
// local (shared) memory bytes this kernel requires
size_t lmem_size;
// base number of work groups for one transform
size_t num_workgroups;
// transforms packed into a single work group
size_t num_xforms_per_workgroup;
size_t num_workitems_per_workgroup;
// axis this kernel operates on
cl_fft_kernel_dir dir;
// nonzero if this kernel may read and write the same buffer
int in_place_possible;
// next kernel in execution order, NULL-terminated
kernel_info_t *next;
}cl_fft_kernel_info;
typedef struct
{
// context in which fft resources are created and kernels are executed
cl_context context;
// size of signal
clFFT_Dim3 n;
// dimension of transform ... must be either 1D, 2D or 3D
clFFT_Dimension dim;
// data format ... must be either interleaved or planar
clFFT_DataFormat format;
// string containing kernel source. Generated at runtime based on
// n, dim, format and other parameters
string *kernel_string;
// CL program containing source and kernel this particular
// n, dim, data format
cl_program program;
// linked list of kernels which needs to be executed for this fft
cl_fft_kernel_info *kernel_info;
// number of kernels
int num_kernels;
// twist kernel for virtualizing fft of very large sizes that do not
// fit in GPU global memory
cl_kernel twist_kernel;
// flag indicating if temporary intermediate buffer is needed or not.
// this depends on fft kernels being executed and if transform is
// in-place or out-of-place. e.g. Local memory fft (say 1D 1024 ...
// one that does not require global transpose do not need temporary buffer)
// 2D 1024x1024 out-of-place fft however do require intermediate buffer.
// If temp buffer is needed, its allocation is lazy i.e. its not allocated
// until its needed
cl_int temp_buffer_needed;
// Batch size is a runtime parameter and the size of the temporary buffer (if
// needed) depends on it. Allocation of the temporary buffer is lazy: it is
// created on the first clFFT_Executexxx call and reallocated only when a later
// call uses a different batch size. last_batch_size caches the batch size the
// buffer was allocated for, so repeated calls with the same batch size don't
// keep allocating/deallocating.
size_t last_batch_size;
// temporary buffer for interleaved plan
cl_mem tempmemobj;
// temporary buffers for planar plan. Only one of tempmemobj or the
// (tempmemobj_real, tempmemobj_imag) pair is valid (allocated) depending on
// the data format of the plan (planar or interleaved)
cl_mem tempmemobj_real, tempmemobj_imag;
// Maximum size of signal for which local memory transposed based
// fft is sufficient i.e. no global mem transpose (communication)
// is needed
size_t max_localmem_fft_size;
// Maximum work items per work group allowed. This, along with max_radix below controls
// maximum local memory being used by fft kernels of this plan. Set to 256 by default
size_t max_work_item_per_workgroup;
// Maximum base radix for local memory fft ... this controls the maximum register
// space used by work items. Currently defaults to 16
size_t max_radix;
// Device dependent parameter that tells how many work-items need to read consecutive
// values to make sure global memory access by work-items of a work-group result in
// coalesced memory access to utilize full bandwidth e.g. on NVidia tesla, this is 16
size_t min_mem_coalesce_width;
// Number of local memory banks. This is used to generate kernels with local memory
// transposes with appropriate padding to avoid bank conflicts to local memory
// e.g. on NVidia it is 16.
size_t num_local_mem_banks;
}cl_fft_plan;
// Generate and append the 1D FFT kernel(s) for the given axis to the plan.
void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir);
#endif
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <iostream>
#include <sstream>
#include <string.h>
#include <assert.h>
#include "fft_internal.h"
#include "clFFT.h"
using namespace std;
#define max(A,B) ((A) > (B) ? (A) : (B))
#define min(A,B) ((A) < (B) ? (A) : (B))
// Convert an integer to its decimal string representation.
// Fix: replaced unbounded sprintf with snprintf (bounded, cannot overflow the
// stack buffer; any int fits easily in 200 bytes regardless).
static std::string
num2str(int num)
{
    char temp[200];
    snprintf(temp, sizeof(temp), "%d", num);
    return std::string(temp);
}
// For any n, this function decomposes n into factors for local memory transpose
// based fft. Factors (radices) are sorted such that the first one (radixArray[0])
// is the largest. This base radix determines the number of registers used by each
// work item and product of remaining radices determine the size of work group needed.
// To make things concrete with an example, suppose n = 1024. It is decomposed into
// 1024 = 16 x 16 x 4. Hence kernel uses float2 a[16], for local in-register fft and
// needs 16 x 4 = 64 work items per work group. So kernel first performs 64 length
// 16 ffts (64 work items working in parallel) following by transpose using local
// memory followed by again 64 length 16 ffts followed by transpose using local memory
// followed by 256 length 4 ffts. For the last step since with size of work group is
// 64 and each work item can array for 16 values, 64 work items can compute 256 length
// 4 ffts by each work item computing 4 length 4 ffts.
// Similarly for n = 2048 = 8 x 8 x 8 x 4, each work group has 8 x 8 x 4 = 256 work
// items which each computes 256 (in-parallel) length 8 ffts in-register, followed
// by transpose using local memory, followed by 256 length 8 in-register ffts, followed
// by transpose using local memory, followed by 256 length 8 in-register ffts, followed
// by transpose using local memory, followed by 512 length 4 in-register ffts. Again,
// for the last step, each work item computes two length 4 in-register ffts and thus
// 256 work items are needed to compute all 512 ffts.
// For n = 32 = 8 x 4, 4 work items first compute 4 in-register
// length 8 ffts, followed by transpose using local memory followed by 8 in-register
// length 4 ffts, where each work item computes two length 4 ffts thus 4 work items
// can compute 8 length 4 ffts. However if a work group size of say 64 is chosen,
// each work group can compute 64/ 4 = 16 size 32 ffts (batched transform).
// Users can play with these parameters to figure what gives best performance on
// their particular device i.e. some device have less register space thus using
// smaller base radix can avoid spilling ... some has small local memory thus
// using smaller work group size may be required etc
// Decompose n into the radix factors used by the local-memory FFT kernels.
// With maxRadix > 1 the caller caps the base radix: factors of
// min(n, maxRadix) are peeled off greedily and the remainder (<= cap)
// becomes the last entry. With maxRadix <= 1 a built-in table of
// decompositions for the supported power-of-two sizes is used; unsupported
// sizes report zero radices.
static void
getRadixArray(unsigned int n, unsigned int *radixArray, unsigned int *numRadices, unsigned int maxRadix)
{
    if(maxRadix > 1)
    {
        // Greedy decomposition capped at min(n, maxRadix).
        unsigned int cap = (n < maxRadix) ? n : maxRadix;
        unsigned int rem = n;
        unsigned int count = 0;
        while(rem > cap)
        {
            radixArray[count++] = cap;
            rem /= cap;
        }
        radixArray[count++] = rem;
        *numRadices = count;
        return;
    }

    // Built-in decompositions; anything not listed is unsupported.
    *numRadices = 0;
    switch(n)
    {
        case 2:
            radixArray[0] = 2;
            *numRadices = 1;
            break;
        case 4:
            radixArray[0] = 4;
            *numRadices = 1;
            break;
        case 8:
            radixArray[0] = 8;
            *numRadices = 1;
            break;
        case 16:
            radixArray[0] = 8; radixArray[1] = 2;
            *numRadices = 2;
            break;
        case 32:
            radixArray[0] = 8; radixArray[1] = 4;
            *numRadices = 2;
            break;
        case 64:
            radixArray[0] = 8; radixArray[1] = 8;
            *numRadices = 2;
            break;
        case 128:
            radixArray[0] = 8; radixArray[1] = 4; radixArray[2] = 4;
            *numRadices = 3;
            break;
        case 256:
            radixArray[0] = 4; radixArray[1] = 4; radixArray[2] = 4; radixArray[3] = 4;
            *numRadices = 4;
            break;
        case 512:
            radixArray[0] = 8; radixArray[1] = 8; radixArray[2] = 8;
            *numRadices = 3;
            break;
        case 1024:
            radixArray[0] = 16; radixArray[1] = 16; radixArray[2] = 4;
            *numRadices = 3;
            break;
        case 2048:
            radixArray[0] = 8; radixArray[1] = 8; radixArray[2] = 8; radixArray[3] = 4;
            *numRadices = 4;
            break;
        default:
            break;
    }
}
// Append the generated kernel's signature: split format takes four float
// buffers (real/imag in and out), interleaved takes two float2 buffers.
static void
insertHeader(string &kernelString, string &kernelName, clFFT_DataFormat dataFormat)
{
    const char *paramList = (dataFormat == clFFT_SplitComplexFormat)
        ? "(__global float *in_real, __global float *in_imag, __global float *out_real, __global float *out_imag, int dir, int S)\n"
        : "(__global float2 *in, __global float2 *out, int dir, int S)\n";
    kernelString += string("__kernel void ") + kernelName + string(paramList);
}
// Append the local-variable declarations every generated kernel body relies on;
// the in-register working array `a` is sized by the base radix.
static void
insertVariables(string &kStream, int maxRadix)
{
    kStream += " int i, j, r, indexIn, indexOut, index, tid, bNum, xNum, k, l;\n";
    kStream += " int s, ii, jj, offset;\n";
    kStream += " float2 w;\n";
    kStream += " float ang, angf, ang1;\n";
    kStream += " __local float *lMemStore, *lMemLoad;\n";
    kStream += " float2 a[" + num2str(maxRadix) + "];\n";
    kStream += " int lId = get_local_id( 0 );\n";
    kStream += " int groupId = get_group_id( 0 );\n";
}
// Emit code loading global element `gIndex` into register a[aIndex], as one
// float2 read (interleaved) or two float reads (split format).
static void
formattedLoad(string &kernelString, int aIndex, int gIndex, clFFT_DataFormat dataFormat)
{
    string regIdx = num2str(aIndex);
    string memIdx = num2str(gIndex);
    if(dataFormat == clFFT_InterleavedComplexFormat)
    {
        kernelString += " a[" + regIdx + "] = in[" + memIdx + "];\n";
    }
    else
    {
        kernelString += " a[" + regIdx + "].x = in_real[" + memIdx + "];\n";
        kernelString += " a[" + regIdx + "].y = in_imag[" + memIdx + "];\n";
    }
}
// Emit code storing register a[aIndex] to global element `gIndex`, mirroring
// formattedLoad: one float2 write (interleaved) or two float writes (split).
static void
formattedStore(string &kernelString, int aIndex, int gIndex, clFFT_DataFormat dataFormat)
{
    string regIdx = num2str(aIndex);
    string memIdx = num2str(gIndex);
    if(dataFormat == clFFT_InterleavedComplexFormat)
    {
        kernelString += " out[" + memIdx + "] = a[" + regIdx + "];\n";
    }
    else
    {
        kernelString += " out_real[" + memIdx + "] = a[" + regIdx + "].x;\n";
        kernelString += " out_imag[" + memIdx + "] = a[" + regIdx + "].y;\n";
    }
}
/*
 * Emits the load preamble of a local-memory FFT kernel: each work item ends
 * up with its R0 complex inputs in the private array a[].  Three strategies
 * are generated depending on how the transform length N and the work-group
 * shape relate to the device's memory-coalescing width:
 *   1. numWorkItemsPerXForm >= mem_coalesce_width: direct coalesced loads.
 *   2. N >= mem_coalesce_width: coalesced loads followed by a transpose
 *      through local memory (sMem).
 *   3. small N: strided loads followed by a local-memory transpose.
 * Returns the number of local-memory floats the emitted code requires
 * (0 when no transpose through sMem is needed).
 * NOTE(review): strategies 2 and 3 assume N, numWorkItemsPerXForm and
 * mem_coalesce_width are powers of two (the &-masks and log2 shifts rely
 * on it) — callers appear to guarantee this; confirm.
 */
static int
insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXForm, int numXFormsPerWG, int R0, int mem_coalesce_width, clFFT_DataFormat dataFormat)
{
    int log2NumWorkItemsPerXForm = (int) log2(numWorkItemsPerXForm);
    int groupSize = numWorkItemsPerXForm * numXFormsPerWG;
    int i, j;
    int lMemSize = 0;
    // s = number of transforms actually present for the last work group
    // (S is the kernel argument holding the total number of transforms).
    if(numXFormsPerWG > 1)
        kernelString += string(" s = S & ") + num2str(numXFormsPerWG - 1) + string(";\n");
    if(numWorkItemsPerXForm >= mem_coalesce_width)
    {
        // Strategy 1: consecutive work items of one transform already touch
        // consecutive global addresses, so loads coalesce without a transpose.
        if(numXFormsPerWG > 1)
        {
            // ii = lane within the transform, jj = which transform in the WG.
            kernelString += string(" ii = lId & ") + num2str(numWorkItemsPerXForm-1) + string(";\n");
            kernelString += string(" jj = lId >> ") + num2str(log2NumWorkItemsPerXForm) + string(";\n");
            // Guard the (possibly partial) last work group.
            kernelString += string(" if( !s || (groupId < get_num_groups(0)-1) || (jj < s) ) {\n");
            kernelString += string(" offset = mad24( mad24(groupId, ") + num2str(numXFormsPerWG) + string(", jj), ") + num2str(N) + string(", ii );\n");
            if(dataFormat == clFFT_InterleavedComplexFormat)
            {
                kernelString += string(" in += offset;\n");
                kernelString += string(" out += offset;\n");
            }
            else
            {
                kernelString += string(" in_real += offset;\n");
                kernelString += string(" in_imag += offset;\n");
                kernelString += string(" out_real += offset;\n");
                kernelString += string(" out_imag += offset;\n");
            }
            for(i = 0; i < R0; i++)
                formattedLoad(kernelString, i, i*numWorkItemsPerXForm, dataFormat);
            kernelString += string(" }\n");
        }
        else
        {
            kernelString += string(" ii = lId;\n");
            kernelString += string(" jj = 0;\n");
            kernelString += string(" offset = mad24(groupId, ") + num2str(N) + string(", ii);\n");
            if(dataFormat == clFFT_InterleavedComplexFormat)
            {
                kernelString += string(" in += offset;\n");
                kernelString += string(" out += offset;\n");
            }
            else
            {
                kernelString += string(" in_real += offset;\n");
                kernelString += string(" in_imag += offset;\n");
                kernelString += string(" out_real += offset;\n");
                kernelString += string(" out_imag += offset;\n");
            }
            for(i = 0; i < R0; i++)
                formattedLoad(kernelString, i, i*numWorkItemsPerXForm, dataFormat);
        }
    }
    else if( N >= mem_coalesce_width )
    {
        // Strategy 2: load coalesced (ii spans mem_coalesce_width lanes),
        // stage through local memory, then re-read in FFT order.
        int numInnerIter = N / mem_coalesce_width;
        int numOuterIter = numXFormsPerWG / ( groupSize / mem_coalesce_width );
        kernelString += string(" ii = lId & ") + num2str(mem_coalesce_width - 1) + string(";\n");
        kernelString += string(" jj = lId >> ") + num2str((int)log2(mem_coalesce_width)) + string(";\n");
        // Row pitch is N + numWorkItemsPerXForm: the extra padding avoids
        // local-memory bank conflicts during the transpose.
        kernelString += string(" lMemStore = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n");
        kernelString += string(" offset = mad24( groupId, ") + num2str(numXFormsPerWG) + string(", jj);\n");
        kernelString += string(" offset = mad24( offset, ") + num2str(N) + string(", ii );\n");
        if(dataFormat == clFFT_InterleavedComplexFormat)
        {
            kernelString += string(" in += offset;\n");
            kernelString += string(" out += offset;\n");
        }
        else
        {
            kernelString += string(" in_real += offset;\n");
            kernelString += string(" in_imag += offset;\n");
            kernelString += string(" out_real += offset;\n");
            kernelString += string(" out_imag += offset;\n");
        }
        // Last (partial) work group: load only rows belonging to valid transforms.
        kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n");
        for(i = 0; i < numOuterIter; i++ )
        {
            kernelString += string(" if( jj < s ) {\n");
            for(j = 0; j < numInnerIter; j++ )
                formattedLoad(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * N, dataFormat);
            kernelString += string(" }\n");
            if(i != numOuterIter - 1)
                kernelString += string(" jj += ") + num2str(groupSize / mem_coalesce_width) + string(";\n");
        }
        kernelString += string("}\n ");
        // Full work group: unconditional loads.
        kernelString += string("else {\n");
        for(i = 0; i < numOuterIter; i++ )
        {
            for(j = 0; j < numInnerIter; j++ )
                formattedLoad(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * N, dataFormat);
        }
        kernelString += string("}\n");
        // Re-derive (ii, jj) in FFT order for reading back from local memory.
        kernelString += string(" ii = lId & ") + num2str(numWorkItemsPerXForm - 1) + string(";\n");
        kernelString += string(" jj = lId >> ") + num2str(log2NumWorkItemsPerXForm) + string(";\n");
        kernelString += string(" lMemLoad = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii);\n");
        // Transpose the real components through sMem...
        for( i = 0; i < numOuterIter; i++ )
        {
            for( j = 0; j < numInnerIter; j++ )
            {
                kernelString += string(" lMemStore[") + num2str(j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * (N + numWorkItemsPerXForm )) + string("] = a[") +
                num2str(i * numInnerIter + j) + string("].x;\n");
            }
        }
        kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
        for( i = 0; i < R0; i++ )
            kernelString += string(" a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n");
        kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
        // ...then the imaginary components, reusing the same sMem area.
        for( i = 0; i < numOuterIter; i++ )
        {
            for( j = 0; j < numInnerIter; j++ )
            {
                kernelString += string(" lMemStore[") + num2str(j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * (N + numWorkItemsPerXForm )) + string("] = a[") +
                num2str(i * numInnerIter + j) + string("].y;\n");
            }
        }
        kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
        for( i = 0; i < R0; i++ )
            kernelString += string(" a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n");
        kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
        lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG;
    }
    else
    {
        // Strategy 3: N smaller than the coalescing width — each work item
        // loads with stride groupSize, then data is transposed through sMem.
        kernelString += string(" offset = mad24( groupId, ") + num2str(N * numXFormsPerWG) + string(", lId );\n");
        if(dataFormat == clFFT_InterleavedComplexFormat)
        {
            kernelString += string(" in += offset;\n");
            kernelString += string(" out += offset;\n");
        }
        else
        {
            kernelString += string(" in_real += offset;\n");
            kernelString += string(" in_imag += offset;\n");
            kernelString += string(" out_real += offset;\n");
            kernelString += string(" out_imag += offset;\n");
        }
        kernelString += string(" ii = lId & ") + num2str(N-1) + string(";\n");
        kernelString += string(" jj = lId >> ") + num2str((int)log2(N)) + string(";\n");
        kernelString += string(" lMemStore = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n");
        // Partial last work group: guard each strided load.
        kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n");
        for( i = 0; i < R0; i++ )
        {
            kernelString += string(" if(jj < s )\n");
            formattedLoad(kernelString, i, i*groupSize, dataFormat);
            if(i != R0 - 1)
                kernelString += string(" jj += ") + num2str(groupSize / N) + string(";\n");
        }
        kernelString += string("}\n");
        kernelString += string("else {\n");
        for( i = 0; i < R0; i++ )
        {
            formattedLoad(kernelString, i, i*groupSize, dataFormat);
        }
        kernelString += string("}\n");
        // Read-back indices in FFT order.
        if(numWorkItemsPerXForm > 1)
        {
            kernelString += string(" ii = lId & ") + num2str(numWorkItemsPerXForm - 1) + string(";\n");
            kernelString += string(" jj = lId >> ") + num2str(log2NumWorkItemsPerXForm) + string(";\n");
            kernelString += string(" lMemLoad = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n");
        }
        else
        {
            kernelString += string(" ii = 0;\n");
            kernelString += string(" jj = lId;\n");
            kernelString += string(" lMemLoad = sMem + mul24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(");\n");
        }
        // Transpose real then imaginary components through local memory.
        for( i = 0; i < R0; i++ )
            kernelString += string(" lMemStore[") + num2str(i * ( groupSize / N ) * ( N + numWorkItemsPerXForm )) + string("] = a[") + num2str(i) + string("].x;\n");
        kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
        for( i = 0; i < R0; i++ )
            kernelString += string(" a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n");
        kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
        for( i = 0; i < R0; i++ )
            kernelString += string(" lMemStore[") + num2str(i * ( groupSize / N ) * ( N + numWorkItemsPerXForm )) + string("] = a[") + num2str(i) + string("].y;\n");
        kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
        for( i = 0; i < R0; i++ )
            kernelString += string(" a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n");
        kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
        lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG;
    }
    return lMemSize;
}
/*
 * Emits the store epilogue of a local-memory FFT kernel: the maxRadix
 * results held in a[] (in the bit-reversed order produced by the last
 * radix-Nr pass, hence the j*Nr+k reindexing) are written back to global
 * memory.  Mirrors insertGlobalLoadsAndTranspose: direct coalesced stores,
 * or a transpose through local memory first, depending on N and the
 * work-group shape versus mem_coalesce_width.
 * Returns the number of local-memory floats needed (0 for direct stores).
 */
static int
insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr, int numWorkItemsPerXForm, int numXFormsPerWG, int mem_coalesce_width, clFFT_DataFormat dataFormat)
{
    int groupSize = numWorkItemsPerXForm * numXFormsPerWG;
    int i, j, k, ind;
    int lMemSize = 0;
    int numIter = maxRadix / Nr;
    string indent = string("");
    if( numWorkItemsPerXForm >= mem_coalesce_width )
    {
        // Direct stores; guard the (possibly partial) last work group.
        if(numXFormsPerWG > 1)
        {
            kernelString += string(" if( !s || (groupId < get_num_groups(0)-1) || (jj < s) ) {\n");
            indent = string(" ");
        }
        for(i = 0; i < maxRadix; i++)
        {
            // Undo the digit-reversed register layout of the final pass.
            j = i % numIter;
            k = i / numIter;
            ind = j * Nr + k;
            formattedStore(kernelString, ind, i*numWorkItemsPerXForm, dataFormat);
        }
        if(numXFormsPerWG > 1)
            kernelString += string(" }\n");
    }
    else if( N >= mem_coalesce_width )
    {
        // Transpose through padded local memory, then store coalesced.
        int numInnerIter = N / mem_coalesce_width;
        int numOuterIter = numXFormsPerWG / ( groupSize / mem_coalesce_width );
        kernelString += string(" lMemLoad = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n");
        kernelString += string(" ii = lId & ") + num2str(mem_coalesce_width - 1) + string(";\n");
        kernelString += string(" jj = lId >> ") + num2str((int)log2(mem_coalesce_width)) + string(";\n");
        kernelString += string(" lMemStore = sMem + mad24( jj,") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n");
        // Real components: write in FFT order, read back in coalesced order.
        for( i = 0; i < maxRadix; i++ )
        {
            j = i % numIter;
            k = i / numIter;
            ind = j * Nr + k;
            kernelString += string(" lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].x;\n");
        }
        kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
        for( i = 0; i < numOuterIter; i++ )
            for( j = 0; j < numInnerIter; j++ )
                kernelString += string(" a[") + num2str(i*numInnerIter + j) + string("].x = lMemStore[") + num2str(j*mem_coalesce_width + i*( groupSize / mem_coalesce_width )*(N + numWorkItemsPerXForm)) + string("];\n");
        kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
        // Imaginary components, same two-phase transpose.
        for( i = 0; i < maxRadix; i++ )
        {
            j = i % numIter;
            k = i / numIter;
            ind = j * Nr + k;
            kernelString += string(" lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].y;\n");
        }
        kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
        for( i = 0; i < numOuterIter; i++ )
            for( j = 0; j < numInnerIter; j++ )
                kernelString += string(" a[") + num2str(i*numInnerIter + j) + string("].y = lMemStore[") + num2str(j*mem_coalesce_width + i*( groupSize / mem_coalesce_width )*(N + numWorkItemsPerXForm)) + string("];\n");
        kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
        // Partial last work group: only store rows of valid transforms.
        kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n");
        for(i = 0; i < numOuterIter; i++ )
        {
            kernelString += string(" if( jj < s ) {\n");
            for(j = 0; j < numInnerIter; j++ )
                formattedStore(kernelString, i*numInnerIter + j, j*mem_coalesce_width + i*(groupSize/mem_coalesce_width)*N, dataFormat);
            kernelString += string(" }\n");
            if(i != numOuterIter - 1)
                kernelString += string(" jj += ") + num2str(groupSize / mem_coalesce_width) + string(";\n");
        }
        kernelString += string("}\n");
        kernelString += string("else {\n");
        for(i = 0; i < numOuterIter; i++ )
        {
            for(j = 0; j < numInnerIter; j++ )
                formattedStore(kernelString, i*numInnerIter + j, j*mem_coalesce_width + i*(groupSize/mem_coalesce_width)*N, dataFormat);
        }
        kernelString += string("}\n");
        lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG;
    }
    else
    {
        // Small N: transpose through local memory, then strided stores.
        kernelString += string(" lMemLoad = sMem + mad24( jj,") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n");
        kernelString += string(" ii = lId & ") + num2str(N - 1) + string(";\n");
        kernelString += string(" jj = lId >> ") + num2str((int) log2(N)) + string(";\n");
        kernelString += string(" lMemStore = sMem + mad24( jj,") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n");
        for( i = 0; i < maxRadix; i++ )
        {
            j = i % numIter;
            k = i / numIter;
            ind = j * Nr + k;
            kernelString += string(" lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].x;\n");
        }
        kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
        for( i = 0; i < maxRadix; i++ )
            kernelString += string(" a[") + num2str(i) + string("].x = lMemStore[") + num2str(i*( groupSize / N )*( N + numWorkItemsPerXForm )) + string("];\n");
        kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
        for( i = 0; i < maxRadix; i++ )
        {
            j = i % numIter;
            k = i / numIter;
            ind = j * Nr + k;
            kernelString += string(" lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].y;\n");
        }
        kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
        for( i = 0; i < maxRadix; i++ )
            kernelString += string(" a[") + num2str(i) + string("].y = lMemStore[") + num2str(i*( groupSize / N )*( N + numWorkItemsPerXForm )) + string("];\n");
        kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
        // Guarded strided stores for the partial last work group.
        kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n");
        for( i = 0; i < maxRadix; i++ )
        {
            kernelString += string(" if(jj < s ) {\n");
            formattedStore(kernelString, i, i*groupSize, dataFormat);
            kernelString += string(" }\n");
            if( i != maxRadix - 1)
                kernelString += string(" jj +=") + num2str(groupSize / N) + string(";\n");
        }
        kernelString += string("}\n");
        kernelString += string("else {\n");
        for( i = 0; i < maxRadix; i++ )
        {
            formattedStore(kernelString, i, i*groupSize, dataFormat);
        }
        kernelString += string("}\n");
        lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG;
    }
    return lMemSize;
}
/* Emit numIter calls to the radix-Nr butterfly, each operating on a
   consecutive Nr-element slice of the private array a[]. */
static void
insertfftKernel(string &kernelString, int Nr, int numIter)
{
    const string radix = num2str(Nr);
    for(int iter = 0; iter < numIter; iter++)
        kernelString += " fftKernel" + radix + "(a+" + num2str(iter*Nr) + ", dir);\n";
}
/* Emit the twiddle-factor multiplications that follow a radix-Nr pass:
   for every iteration block, compute the row factor angf, then rotate
   each of the Nr-1 non-trivial elements by dir * 2*pi*k/len * angf. */
static void
insertTwiddleKernel(string &kernelString, int Nr, int numIter, int Nprev, int len, int numWorkItemsPerXForm)
{
    const int logNPrev = (int) log2(Nprev);
    for(int block = 0; block < numIter; block++)
    {
        // angf = index of this work item's row within the current sub-transform.
        if(block == 0)
            kernelString += (Nprev > 1)
                ? string(" angf = (float) (ii >> ") + num2str(logNPrev) + string(");\n")
                : string(" angf = (float) ii;\n");
        else
            kernelString += (Nprev > 1)
                ? string(" angf = (float) ((") + num2str(block*numWorkItemsPerXForm) + string(" + ii) >>") + num2str(logNPrev) + string(");\n")
                : string(" angf = (float) (") + num2str(block*numWorkItemsPerXForm) + string(" + ii);\n");
        for(int k = 1; k < Nr; k++)
        {
            const string ind = num2str(block*Nr + k);
            kernelString += string(" ang = dir * ( 2.0f * M_PI * ") + num2str(k) + ".0f / " + num2str(len) + ".0f )" + " * angf;\n";
            kernelString += " w = (float2)(native_cos(ang), native_sin(ang));\n";
            kernelString += string(" a[") + ind + "] = complexMul(a[" + ind + "], w);\n";
        }
    }
}
/*
 * Computes the local-memory padding used to avoid bank conflicts during the
 * inter-pass transposes, and returns the resulting local-memory size (in
 * floats) for one component plane.
 *   *offset - extra columns appended to each row of numWorkItemsReq values
 *   *midPad - extra floats inserted between consecutive transforms' areas
 */
static int
getPadding(int numWorkItemsPerXForm, int Nprev, int numWorkItemsReq, int numXFormsPerWG, int Nr, int numBanks, int *offset, int *midPad)
{
    // Column padding: needed only when a row spans fewer banks than the
    // work items that access it simultaneously.
    int colPad = 0;
    if(!((numWorkItemsPerXForm <= Nprev) || (Nprev >= numBanks)))
    {
        const int rowsHit = ((numWorkItemsPerXForm < numBanks) ? numWorkItemsPerXForm : numBanks) / Nprev;
        const int cols = (rowsHit > Nr) ? (rowsHit / Nr) : 1;
        colPad = Nprev * cols;
    }
    *offset = colPad;

    // Inter-transform padding: realign each transform's block so that
    // concurrent accesses from different transforms land on distinct banks.
    int betweenPad = 0;
    if(!(numWorkItemsPerXForm >= numBanks || numXFormsPerWG == 1))
    {
        const int bank = ((numWorkItemsReq + colPad) * Nr) & (numBanks - 1);
        betweenPad = (bank >= numWorkItemsPerXForm) ? 0 : (numWorkItemsPerXForm - bank);
    }
    *midPad = betweenPad;

    return (numWorkItemsReq + colPad) * Nr * numXFormsPerWG + betweenPad * (numXFormsPerWG - 1);
}
/* Emit stores of one component (comp is "x" or "y") of all numIter*Nr
   private values into local memory at their padded positions, followed by
   a work-group barrier. */
static void
insertLocalStores(string &kernelString, int numIter, int Nr, int numWorkItemsPerXForm, int numWorkItemsReq, int offset, string &comp)
{
    const int rowPitch = numWorkItemsReq + offset; // padded row stride in sMem
    for(int blk = 0; blk < numIter; blk++)
    {
        for(int r = 0; r < Nr; r++)
        {
            const int dst = r * rowPitch + blk * numWorkItemsPerXForm;
            kernelString += string(" lMemStore[") + num2str(dst) + "] = a[" + num2str(blk*Nr + r) + "]." + comp + ";\n";
        }
    }
    kernelString += " barrier(CLK_LOCAL_MEM_FENCE);\n";
}
/*
 * Emits the local-memory reads that complete a transpose between two FFT
 * passes: each work item gathers its Nrn values per iteration (one component,
 * comp = "x" or "y") from lMemLoad into a[], followed by a barrier.
 * The flat iteration index is decomposed into vertical (kk), inter-block
 * horizontal (jj) and intra-block horizontal (ii) coordinates whose strides
 * account for the row padding ('offset') applied by getPadding.
 * NOTE(review): parameter Ncurr is unused here; kept for signature symmetry
 * with the other insertLocal* helpers.
 */
static void
insertLocalLoads(string &kernelString, int n, int Nr, int Nrn, int Nprev, int Ncurr, int numWorkItemsPerXForm, int numWorkItemsReq, int offset, string &comp)
{
    int numWorkItemsReqN = n / Nrn;                        // work items the *next* pass needs
    int interBlockHNum = max( Nprev / numWorkItemsPerXForm, 1 );
    int interBlockHStride = numWorkItemsPerXForm;
    int vertWidth = max(numWorkItemsPerXForm / Nprev, 1);
    vertWidth = min( vertWidth, Nr);
    int vertNum = Nr / vertWidth;
    int vertStride = ( n / Nr + offset ) * vertWidth;      // padded vertical step
    int iter = max( numWorkItemsReqN / numWorkItemsPerXForm, 1);
    int intraBlockHStride = (numWorkItemsPerXForm / (Nprev*Nr)) > 1 ? (numWorkItemsPerXForm / (Nprev*Nr)) : 1;
    intraBlockHStride *= Nprev;
    int stride = numWorkItemsReq / Nrn;                    // step between the Nrn reads of one iteration
    int i;
    for(i = 0; i < iter; i++) {
        // Decompose iteration i into (ii, jj, kk) block coordinates.
        int ii = i / (interBlockHNum * vertNum);
        int zz = i % (interBlockHNum * vertNum);
        int jj = zz % interBlockHNum;
        int kk = zz / interBlockHNum;
        int z;
        for(z = 0; z < Nrn; z++) {
            int st = kk * vertStride + jj * interBlockHStride + ii * intraBlockHStride + z * stride;
            kernelString += string(" a[") + num2str(i*Nrn + z) + string("].") + comp + string(" = lMemLoad[") + num2str(st) + string("];\n");
        }
    }
    kernelString += string(" barrier(CLK_LOCAL_MEM_FENCE);\n");
}
/* Emit the index arithmetic that positions lMemLoad for reading back the
   transposed data: the work-item index ii is split into a row index j and a
   column index i within the padded (numWorkItemsReq + offset)-wide layout,
   and for multi-transform work groups the transform index jj adds one full
   padded block per transform. */
static void
insertLocalLoadIndexArithmatic(string &kernelString, int Nprev, int Nr, int numWorkItemsReq, int numWorkItemsPerXForm, int numXFormsPerWG, int offset, int midPad)
{
    const int Ncurr = Nprev * Nr;
    const int logNcurr = (int) log2(Ncurr);
    const int logNprev = (int) log2(Nprev);
    const int xformPitch = (numWorkItemsReq + offset) * Nr + midPad;

    if(Ncurr < numWorkItemsPerXForm)
    {
        // Several Ncurr-sized sub-transforms per work item row.
        if(Nprev == 1)
        {
            kernelString += string(" j = ii & ") + num2str(Ncurr - 1) + ";\n";
            kernelString += string(" i = ii >> ") + num2str(logNcurr) + ";\n";
        }
        else
        {
            kernelString += string(" j = (ii & ") + num2str(Ncurr - 1) + ") >> " + num2str(logNprev) + ";\n";
            kernelString += string(" i = mad24(ii >> ") + num2str(logNcurr) + ", " + num2str(Nprev) + ", ii & " + num2str(Nprev - 1) + ");\n";
        }
    }
    else
    {
        // One sub-transform spans all work items of the transform.
        if(Nprev == 1)
        {
            kernelString += " j = ii;\n";
            kernelString += " i = 0;\n";
        }
        else
        {
            kernelString += string(" j = ii >> ") + num2str(logNprev) + ";\n";
            kernelString += string(" i = ii & ") + num2str(Nprev - 1) + ";\n";
        }
    }
    if(numXFormsPerWG > 1)
        kernelString += string(" i = mad24(jj, ") + num2str(xformPitch) + ", i);\n";
    kernelString += string(" lMemLoad = sMem + mad24(j, ") + num2str(numWorkItemsReq + offset) + ", i);\n";
}
/* Emit the index arithmetic that positions lMemStore for the local-memory
   transpose: with a single transform per work group the base is simply ii;
   otherwise each transform jj starts one padded block further into sMem. */
static void
insertLocalStoreIndexArithmatic(string &kernelString, int numWorkItemsReq, int numXFormsPerWG, int Nr, int offset, int midPad)
{
    if(numXFormsPerWG != 1)
    {
        const int xformPitch = (numWorkItemsReq + offset)*Nr + midPad;
        kernelString += string(" lMemStore = sMem + mad24(jj, ") + num2str(xformPitch) + ", ii);\n";
    }
    else
    {
        kernelString += " lMemStore = sMem + ii;\n";
    }
}
/*
 * Generates the complete OpenCL kernel source for a 1D FFT small enough to
 * run in a single kernel launch using local memory: chooses a radix
 * factorization of n, appends a new cl_fft_kernel_info node to the plan's
 * kernel list, and appends the generated kernel text to plan->kernel_string.
 * The kernel alternates in-register radix passes with twiddle multiplies
 * and local-memory transposes, tracking the maximum local memory needed.
 */
static void
createLocalMemfftKernelString(cl_fft_plan *plan)
{
    unsigned int radixArray[10];
    unsigned int numRadix;
    unsigned int n = plan->n.x;
    assert(n <= plan->max_work_item_per_workgroup * plan->max_radix && "signal lenght too big for local mem fft\n");
    // First try an unconstrained factorization; if it needs more work items
    // than a work group allows, refactor capped at max_radix.
    getRadixArray(n, radixArray, &numRadix, 0);
    assert(numRadix > 0 && "no radix array supplied\n");
    if(n/radixArray[0] > plan->max_work_item_per_workgroup)
        getRadixArray(n, radixArray, &numRadix, plan->max_radix);
    assert(radixArray[0] <= plan->max_radix && "max radix choosen is greater than allowed\n");
    assert(n/radixArray[0] <= plan->max_work_item_per_workgroup && "required work items per xform greater than maximum work items allowed per work group for local mem fft\n");
    // Sanity-check: every radix is a power of two and their product is n.
    unsigned int tmpLen = 1;
    unsigned int i;
    for(i = 0; i < numRadix; i++)
    {
        assert( radixArray[i] && !( (radixArray[i] - 1) & radixArray[i] ) );
        tmpLen *= radixArray[i];
    }
    assert(tmpLen == n && "product of radices choosen doesnt match the length of signal\n");
    int offset, midPad;
    string localString(""), kernelName("");
    clFFT_DataFormat dataFormat = plan->format;
    string *kernelString = plan->kernel_string;
    // Walk to the end of the plan's kernel list; kCount numbers the new kernel.
    cl_fft_kernel_info **kInfo = &plan->kernel_info;
    int kCount = 0;
    while(*kInfo)
    {
        kInfo = &(*kInfo)->next;
        kCount++;
    }
    kernelName = string("fft") + num2str(kCount);
    // NOTE(review): malloc result is not checked — OOM would crash on the
    // writes below.
    *kInfo = (cl_fft_kernel_info *) malloc(sizeof(cl_fft_kernel_info));
    (*kInfo)->kernel = 0;
    (*kInfo)->lmem_size = 0;
    (*kInfo)->num_workgroups = 0;
    (*kInfo)->num_workitems_per_workgroup = 0;
    (*kInfo)->dir = cl_fft_kernel_x;
    (*kInfo)->in_place_possible = 1;
    (*kInfo)->next = NULL;
    (*kInfo)->kernel_name = (char *) malloc(sizeof(char)*(kernelName.size()+1));
    strcpy((*kInfo)->kernel_name, kernelName.c_str());
    // Work-group shape: pad to at least 64 work items by batching multiple
    // transforms per work group.
    unsigned int numWorkItemsPerXForm = n / radixArray[0];
    unsigned int numWorkItemsPerWG = numWorkItemsPerXForm <= 64 ? 64 : numWorkItemsPerXForm;
    assert(numWorkItemsPerWG <= plan->max_work_item_per_workgroup);
    int numXFormsPerWG = numWorkItemsPerWG / numWorkItemsPerXForm;
    (*kInfo)->num_workgroups = 1;
    (*kInfo)->num_xforms_per_workgroup = numXFormsPerWG;
    (*kInfo)->num_workitems_per_workgroup = numWorkItemsPerWG;
    unsigned int *N = radixArray;
    unsigned int maxRadix = N[0];
    unsigned int lMemSize = 0;
    insertVariables(localString, maxRadix);
    lMemSize = insertGlobalLoadsAndTranspose(localString, n, numWorkItemsPerXForm, numXFormsPerWG, maxRadix, plan->min_mem_coalesce_width, dataFormat);
    (*kInfo)->lmem_size = (lMemSize > (*kInfo)->lmem_size) ? lMemSize : (*kInfo)->lmem_size;
    string xcomp = string("x");
    string ycomp = string("y");
    // One pass per radix: in-register FFT, then (except after the last pass)
    // twiddle and transpose through local memory for the next pass.
    unsigned int Nprev = 1;
    unsigned int len = n;
    unsigned int r;
    for(r = 0; r < numRadix; r++)
    {
        int numIter = N[0] / N[r];
        int numWorkItemsReq = n / N[r];
        int Ncurr = Nprev * N[r];
        insertfftKernel(localString, N[r], numIter);
        if(r < (numRadix - 1)) {
            insertTwiddleKernel(localString, N[r], numIter, Nprev, len, numWorkItemsPerXForm);
            lMemSize = getPadding(numWorkItemsPerXForm, Nprev, numWorkItemsReq, numXFormsPerWG, N[r], plan->num_local_mem_banks, &offset, &midPad);
            (*kInfo)->lmem_size = (lMemSize > (*kInfo)->lmem_size) ? lMemSize : (*kInfo)->lmem_size;
            insertLocalStoreIndexArithmatic(localString, numWorkItemsReq, numXFormsPerWG, N[r], offset, midPad);
            insertLocalLoadIndexArithmatic(localString, Nprev, N[r], numWorkItemsReq, numWorkItemsPerXForm, numXFormsPerWG, offset, midPad);
            // Transpose real and imaginary planes separately to halve the
            // local-memory requirement.
            insertLocalStores(localString, numIter, N[r], numWorkItemsPerXForm, numWorkItemsReq, offset, xcomp);
            insertLocalLoads(localString, n, N[r], N[r+1], Nprev, Ncurr, numWorkItemsPerXForm, numWorkItemsReq, offset, xcomp);
            insertLocalStores(localString, numIter, N[r], numWorkItemsPerXForm, numWorkItemsReq, offset, ycomp);
            insertLocalLoads(localString, n, N[r], N[r+1], Nprev, Ncurr, numWorkItemsPerXForm, numWorkItemsReq, offset, ycomp);
            Nprev = Ncurr;
            len = len / N[r];
        }
    }
    lMemSize = insertGlobalStoresAndTranspose(localString, n, maxRadix, N[numRadix - 1], numWorkItemsPerXForm, numXFormsPerWG, plan->min_mem_coalesce_width, dataFormat);
    (*kInfo)->lmem_size = (lMemSize > (*kInfo)->lmem_size) ? lMemSize : (*kInfo)->lmem_size;
    // Assemble: signature, local-memory declaration sized to the worst pass,
    // then the generated body.
    insertHeader(*kernelString, kernelName, dataFormat);
    *kernelString += string("{\n");
    if((*kInfo)->lmem_size)
        *kernelString += string(" __local float sMem[") + num2str((*kInfo)->lmem_size) + string("];\n");
    *kernelString += localString;
    *kernelString += string("}\n");
}
// For n larger than what can be computed using local memory fft, global transposes
// and multiple kernel launches are needed. For these sizes, n can be decomposed using
// much larger base radices, i.e. say n = 262144 = 128 x 64 x 32. Thus three kernel
// launches will be needed: first computing 64 x 32, length 128 ffts, second computing
// 128 x 32 length 64 ffts, and finally a kernel computing 128 x 64 length 32 ffts.
// Each of these base radices can further be divided into factors so that each of these
// base ffts can be computed within one kernel launch using in-register ffts and local
// memory transposes, i.e. for the first kernel above which computes 64 x 32 ffts of length
// 128, 128 can be decomposed into 128 = 16 x 8, i.e. 8 work items can compute 8 length
// 16 ffts followed by a transpose using local memory, followed by each of these eight
// work items computing 2 length 8 ffts, thus computing 16 length 8 ffts in total. This
// means only 8 work items are needed for computing one length 128 fft. If we choose a
// work group size of, say, 64, we can compute 64/8 = 8 length 128 ffts within one
// work group. Since we need to compute 64 x 32 length 128 ffts in the first kernel, this
// means we need to launch 64 x 32 / 8 = 256 work groups with 64 work items each,
// where each work group computes 8 length 128 ffts and each length
// 128 fft is computed by 8 work items. The same logic applies to the other two kernels
// in this example. Users can experiment with different base radices and different
// decompositions of base radices to generate different kernels and see which gives the
// best performance. The following function is fixed to use 128 as the base radix.
/*
 * Decomposes n into base radices for the multi-kernel (global) FFT:
 * radix[] receives as many factors of baseRadix (128, or n if smaller) as
 * fit, with the remainder as the final entry; *numRadices is the count.
 * Each radix B is further split as B = R1[i] * R2[i] with R1 >= R2, both
 * powers of two (R2 = 1 when B <= 8), for the in-register/local-memory
 * two-stage computation inside one kernel.
 */
void
getGlobalRadixInfo(int n, int *radix, int *R1, int *R2, int *numRadices)
{
    const int baseRadix = (n < 128) ? n : 128;

    // Peel off factors of baseRadix; the remainder becomes the last radix.
    int count = 0;
    int rem = n;
    while(rem > baseRadix)
    {
        rem /= baseRadix;
        count++;
    }
    for(int k = 0; k < count; k++)
        radix[k] = baseRadix;
    radix[count++] = rem;
    *numRadices = count;

    // Split each radix into a balanced power-of-two pair R1 >= R2.
    for(int k = 0; k < count; k++)
    {
        const int B = radix[k];
        if(B <= 8)
        {
            R1[k] = B;
            R2[k] = 1;
        }
        else
        {
            int hi = 2;
            int lo = B / hi;
            while(lo > hi)
            {
                hi *= 2;
                lo = B / hi;
            }
            R1[k] = hi;
            R2[k] = lo;
        }
    }
}
static void
createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir dir, int vertBS)
{
int i, j, k, t;
int radixArr[10] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
int R1Arr[10] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
int R2Arr[10] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
int radix, R1, R2;
int numRadices;
int maxThreadsPerBlock = plan->max_work_item_per_workgroup;
int maxArrayLen = plan->max_radix;
int batchSize = plan->min_mem_coalesce_width;
clFFT_DataFormat dataFormat = plan->format;
int vertical = (dir == cl_fft_kernel_x) ? 0 : 1;
getGlobalRadixInfo(n, radixArr, R1Arr, R2Arr, &numRadices);
int numPasses = numRadices;
string localString(""), kernelName("");
string *kernelString = plan->kernel_string;
cl_fft_kernel_info **kInfo = &plan->kernel_info;
int kCount = 0;
while(*kInfo)
{
kInfo = &(*kInfo)->next;
kCount++;
}
int N = n;
int m = (int)log2(n);
int Rinit = vertical ? BS : 1;
batchSize = vertical ? min(BS, batchSize) : batchSize;
int passNum;
for(passNum = 0; passNum < numPasses; passNum++)
{
localString.clear();
kernelName.clear();
radix = radixArr[passNum];
R1 = R1Arr[passNum];
R2 = R2Arr[passNum];
int strideI = Rinit;
for(i = 0; i < numPasses; i++)
if(i != passNum)
strideI *= radixArr[i];
int strideO = Rinit;
for(i = 0; i < passNum; i++)
strideO *= radixArr[i];
int threadsPerXForm = R2;
batchSize = R2 == 1 ? plan->max_work_item_per_workgroup : batchSize;
batchSize = min(batchSize, strideI);
int threadsPerBlock = batchSize * threadsPerXForm;
threadsPerBlock = min(threadsPerBlock, maxThreadsPerBlock);
batchSize = threadsPerBlock / threadsPerXForm;
assert(R2 <= R1);
assert(R1*R2 == radix);
assert(R1 <= maxArrayLen);
assert(threadsPerBlock <= maxThreadsPerBlock);
int numIter = R1 / R2;
int gInInc = threadsPerBlock / batchSize;
int lgStrideO = log2(strideO);
int numBlocksPerXForm = strideI / batchSize;
int numBlocks = numBlocksPerXForm;
if(!vertical)
numBlocks *= BS;
else
numBlocks *= vertBS;
kernelName = string("fft") + num2str(kCount);
*kInfo = (cl_fft_kernel_info *) malloc(sizeof(cl_fft_kernel_info));
(*kInfo)->kernel = 0;
if(R2 == 1)
(*kInfo)->lmem_size = 0;
else
{
if(strideO == 1)
(*kInfo)->lmem_size = (radix + 1)*batchSize;
else
(*kInfo)->lmem_size = threadsPerBlock*R1;
}
(*kInfo)->num_workgroups = numBlocks;
(*kInfo)->num_xforms_per_workgroup = 1;
(*kInfo)->num_workitems_per_workgroup = threadsPerBlock;
(*kInfo)->dir = dir;
if( (passNum == (numPasses - 1)) && (numPasses & 1) )
(*kInfo)->in_place_possible = 1;
else
(*kInfo)->in_place_possible = 0;
(*kInfo)->next = NULL;
(*kInfo)->kernel_name = (char *) malloc(sizeof(char)*(kernelName.size()+1));
strcpy((*kInfo)->kernel_name, kernelName.c_str());
insertVariables(localString, R1);
if(vertical)
{
localString += string("xNum = groupId >> ") + num2str((int)log2(numBlocksPerXForm)) + string(";\n");
localString += string("groupId = groupId & ") + num2str(numBlocksPerXForm - 1) + string(";\n");
localString += string("indexIn = mad24(groupId, ") + num2str(batchSize) + string(", xNum << ") + num2str((int)log2(n*BS)) + string(");\n");
localString += string("tid = mul24(groupId, ") + num2str(batchSize) + string(");\n");
localString += string("i = tid >> ") + num2str(lgStrideO) + string(";\n");
localString += string("j = tid & ") + num2str(strideO - 1) + string(";\n");
int stride = radix*Rinit;
for(i = 0; i < passNum; i++)
stride *= radixArr[i];
localString += string("indexOut = mad24(i, ") + num2str(stride) + string(", j + ") + string("(xNum << ") + num2str((int) log2(n*BS)) + string("));\n");
localString += string("bNum = groupId;\n");
}
else
{
int lgNumBlocksPerXForm = log2(numBlocksPerXForm);
localString += string("bNum = groupId & ") + num2str(numBlocksPerXForm - 1) + string(";\n");
localString += string("xNum = groupId >> ") + num2str(lgNumBlocksPerXForm) + string(";\n");
localString += string("indexIn = mul24(bNum, ") + num2str(batchSize) + string(");\n");
localString += string("tid = indexIn;\n");
localString += string("i = tid >> ") + num2str(lgStrideO) + string(";\n");
localString += string("j = tid & ") + num2str(strideO - 1) + string(";\n");
int stride = radix*Rinit;
for(i = 0; i < passNum; i++)
stride *= radixArr[i];
localString += string("indexOut = mad24(i, ") + num2str(stride) + string(", j);\n");
localString += string("indexIn += (xNum << ") + num2str(m) + string(");\n");
localString += string("indexOut += (xNum << ") + num2str(m) + string(");\n");
}
// Load Data
int lgBatchSize = log2(batchSize);
localString += string("tid = lId;\n");
localString += string("i = tid & ") + num2str(batchSize - 1) + string(";\n");
localString += string("j = tid >> ") + num2str(lgBatchSize) + string(";\n");
localString += string("indexIn += mad24(j, ") + num2str(strideI) + string(", i);\n");
if(dataFormat == clFFT_SplitComplexFormat)
{
localString += string("in_real += indexIn;\n");
localString += string("in_imag += indexIn;\n");
for(j = 0; j < R1; j++)
localString += string("a[") + num2str(j) + string("].x = in_real[") + num2str(j*gInInc*strideI) + string("];\n");
for(j = 0; j < R1; j++)
localString += string("a[") + num2str(j) + string("].y = in_imag[") + num2str(j*gInInc*strideI) + string("];\n");
}
else
{
localString += string("in += indexIn;\n");
for(j = 0; j < R1; j++)
localString += string("a[") + num2str(j) + string("] = in[") + num2str(j*gInInc*strideI) + string("];\n");
}
localString += string("fftKernel") + num2str(R1) + string("(a, dir);\n");
if(R2 > 1)
{
// twiddle
for(k = 1; k < R1; k++)
{
localString += string("ang = dir*(2.0f*M_PI*") + num2str(k) + string("/") + num2str(radix) + string(")*j;\n");
localString += string("w = (float2)(native_cos(ang), native_sin(ang));\n");
localString += string("a[") + num2str(k) + string("] = complexMul(a[") + num2str(k) + string("], w);\n");
}
// shuffle
numIter = R1 / R2;
localString += string("indexIn = mad24(j, ") + num2str(threadsPerBlock*numIter) + string(", i);\n");
localString += string("lMemStore = sMem + tid;\n");
localString += string("lMemLoad = sMem + indexIn;\n");
for(k = 0; k < R1; k++)
localString += string("lMemStore[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].x;\n");
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
for(k = 0; k < numIter; k++)
for(t = 0; t < R2; t++)
localString += string("a[") + num2str(k*R2+t) + string("].x = lMemLoad[") + num2str(t*batchSize + k*threadsPerBlock) + string("];\n");
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
for(k = 0; k < R1; k++)
localString += string("lMemStore[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].y;\n");
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
for(k = 0; k < numIter; k++)
for(t = 0; t < R2; t++)
localString += string("a[") + num2str(k*R2+t) + string("].y = lMemLoad[") + num2str(t*batchSize + k*threadsPerBlock) + string("];\n");
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
for(j = 0; j < numIter; j++)
localString += string("fftKernel") + num2str(R2) + string("(a + ") + num2str(j*R2) + string(", dir);\n");
}
// twiddle
if(passNum < (numPasses - 1))
{
localString += string("l = ((bNum << ") + num2str(lgBatchSize) + string(") + i) >> ") + num2str(lgStrideO) + string(";\n");
localString += string("k = j << ") + num2str((int)log2(R1/R2)) + string(";\n");
localString += string("ang1 = dir*(2.0f*M_PI/") + num2str(N) + string(")*l;\n");
for(t = 0; t < R1; t++)
{
localString += string("ang = ang1*(k + ") + num2str((t%R2)*R1 + (t/R2)) + string(");\n");
localString += string("w = (float2)(native_cos(ang), native_sin(ang));\n");
localString += string("a[") + num2str(t) + string("] = complexMul(a[") + num2str(t) + string("], w);\n");
}
}
// Store Data
if(strideO == 1)
{
localString += string("lMemStore = sMem + mad24(i, ") + num2str(radix + 1) + string(", j << ") + num2str((int)log2(R1/R2)) + string(");\n");
localString += string("lMemLoad = sMem + mad24(tid >> ") + num2str((int)log2(radix)) + string(", ") + num2str(radix+1) + string(", tid & ") + num2str(radix-1) + string(");\n");
for(int i = 0; i < R1/R2; i++)
for(int j = 0; j < R2; j++)
localString += string("lMemStore[ ") + num2str(i + j*R1) + string("] = a[") + num2str(i*R2+j) + string("].x;\n");
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
if(threadsPerBlock >= radix)
{
for(int i = 0; i < R1; i++)
localString += string("a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i*(radix+1)*(threadsPerBlock/radix)) + string("];\n");
}
else
{
int innerIter = radix/threadsPerBlock;
int outerIter = R1/innerIter;
for(int i = 0; i < outerIter; i++)
for(int j = 0; j < innerIter; j++)
localString += string("a[") + num2str(i*innerIter+j) + string("].x = lMemLoad[") + num2str(j*threadsPerBlock + i*(radix+1)) + string("];\n");
}
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
for(int i = 0; i < R1/R2; i++)
for(int j = 0; j < R2; j++)
localString += string("lMemStore[ ") + num2str(i + j*R1) + string("] = a[") + num2str(i*R2+j) + string("].y;\n");
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
if(threadsPerBlock >= radix)
{
for(int i = 0; i < R1; i++)
localString += string("a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i*(radix+1)*(threadsPerBlock/radix)) + string("];\n");
}
else
{
int innerIter = radix/threadsPerBlock;
int outerIter = R1/innerIter;
for(int i = 0; i < outerIter; i++)
for(int j = 0; j < innerIter; j++)
localString += string("a[") + num2str(i*innerIter+j) + string("].y = lMemLoad[") + num2str(j*threadsPerBlock + i*(radix+1)) + string("];\n");
}
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
localString += string("indexOut += tid;\n");
if(dataFormat == clFFT_SplitComplexFormat) {
localString += string("out_real += indexOut;\n");
localString += string("out_imag += indexOut;\n");
for(k = 0; k < R1; k++)
localString += string("out_real[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].x;\n");
for(k = 0; k < R1; k++)
localString += string("out_imag[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].y;\n");
}
else {
localString += string("out += indexOut;\n");
for(k = 0; k < R1; k++)
localString += string("out[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("];\n");
}
}
else
{
localString += string("indexOut += mad24(j, ") + num2str(numIter*strideO) + string(", i);\n");
if(dataFormat == clFFT_SplitComplexFormat) {
localString += string("out_real += indexOut;\n");
localString += string("out_imag += indexOut;\n");
for(k = 0; k < R1; k++)
localString += string("out_real[") + num2str(((k%R2)*R1 + (k/R2))*strideO) + string("] = a[") + num2str(k) + string("].x;\n");
for(k = 0; k < R1; k++)
localString += string("out_imag[") + num2str(((k%R2)*R1 + (k/R2))*strideO) + string("] = a[") + num2str(k) + string("].y;\n");
}
else {
localString += string("out += indexOut;\n");
for(k = 0; k < R1; k++)
localString += string("out[") + num2str(((k%R2)*R1 + (k/R2))*strideO) + string("] = a[") + num2str(k) + string("];\n");
}
}
insertHeader(*kernelString, kernelName, dataFormat);
*kernelString += string("{\n");
if((*kInfo)->lmem_size)
*kernelString += string(" __local float sMem[") + num2str((*kInfo)->lmem_size) + string("];\n");
*kernelString += localString;
*kernelString += string("}\n");
N /= radix;
kInfo = &(*kInfo)->next;
kCount++;
}
}
/*
 * Generate kernel source for a 1D FFT pass along one axis of the plan.
 *
 * x axis: prefer a local-memory kernel when the signal fits within
 * max_localmem_fft_size and a radix decomposition exists whose first
 * pass fits the work-group limit; otherwise fall back to the
 * global-memory kernel.  y and z axes always use global-memory kernels.
 */
void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir)
{
	unsigned int radixArray[10];
	unsigned int numRadix;

	switch(dir)
	{
		case cl_fft_kernel_x:
			if(plan->n.x > plan->max_localmem_fft_size)
			{
				createGlobalFFTKernelString(plan, plan->n.x, 1, cl_fft_kernel_x, 1);
			}
			else if(plan->n.x > 1)
			{
				getRadixArray(plan->n.x, radixArray, &numRadix, 0);
				if(plan->n.x / radixArray[0] <= plan->max_work_item_per_workgroup)
				{
					createLocalMemfftKernelString(plan);
				}
				else
				{
					/* Retry with the maximum-radix decomposition before
					 * giving up on the local-memory code path. */
					getRadixArray(plan->n.x, radixArray, &numRadix, plan->max_radix);
					if(plan->n.x / radixArray[0] <= plan->max_work_item_per_workgroup)
						createLocalMemfftKernelString(plan);
					else
						createGlobalFFTKernelString(plan, plan->n.x, 1, cl_fft_kernel_x, 1);
				}
			}
			break;

		case cl_fft_kernel_y:
			if(plan->n.y > 1)
				createGlobalFFTKernelString(plan, plan->n.y, plan->n.x, cl_fft_kernel_y, 1);
			break;

		case cl_fft_kernel_z:
			if(plan->n.z > 1)
				createGlobalFFTKernelString(plan, plan->n.z, plan->n.x*plan->n.y, cl_fft_kernel_z, 1);
			break;	/* BUGFIX: was an implicit fallthrough into default */

		default:
			return;
	}
}
#include "fft_internal.h"
#include "fft_base_kernels.h"
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <iostream>
#include <string>
#include <sstream>
using namespace std;
extern void getKernelWorkDimensions(cl_fft_plan *plan, cl_fft_kernel_info *kernelInfo, cl_int *batchSize, size_t *gWorkItems, size_t *lWorkItems);
/*
 * Assemble the complete OpenCL source for the plan into
 * plan->kernel_string and record whether a temporary buffer is needed
 * (i.e. at least one generated kernel cannot execute in place).
 */
static void
getBlockConfigAndKernelString(cl_fft_plan *plan)
{
	plan->temp_buffer_needed = 0;

	/* Shared helper kernels first, then the twist kernel matching the
	 * plan's data layout. */
	*plan->kernel_string += baseKernels;
	*plan->kernel_string += (plan->format == clFFT_SplitComplexFormat)
		? twistKernelPlannar
		: twistKernelInterleaved;

	/* One FFT pass per transform axis, always in x, y, z order. */
	switch(plan->dim)
	{
		case clFFT_1D:
			FFT1D(plan, cl_fft_kernel_x);
			break;
		case clFFT_2D:
			FFT1D(plan, cl_fft_kernel_x);
			FFT1D(plan, cl_fft_kernel_y);
			break;
		case clFFT_3D:
			FFT1D(plan, cl_fft_kernel_x);
			FFT1D(plan, cl_fft_kernel_y);
			FFT1D(plan, cl_fft_kernel_z);
			break;
		default:
			return;
	}

	/* A temp buffer is required as soon as any pass is not in-place. */
	plan->temp_buffer_needed = 0;
	for(cl_fft_kernel_info *info = plan->kernel_info; info; info = info->next)
		plan->temp_buffer_needed |= !info->in_place_possible;
}
/*
 * Release one kernel-info node: its OpenCL kernel object, its
 * heap-allocated name, and the node itself.  Safe to call with NULL.
 */
static void
deleteKernelInfo(cl_fft_kernel_info *kInfo)
{
	if(!kInfo)
		return;

	/* free(NULL) is a no-op, so no guard is needed for the name. */
	free(kInfo->kernel_name);
	if(kInfo->kernel)
		clReleaseKernel(kInfo->kernel);
	free(kInfo);
}
/* Release every resource owned by the plan and NULL the pointers so the
 * struct can be reused — clFFT_CreatePlan calls this and then re-enters
 * kernel generation when the work-group size has to be reduced. */
static void
destroy_plan(cl_fft_plan *Plan)
{
/* Free the singly linked list of generated-kernel descriptors. */
cl_fft_kernel_info *kernel_info = Plan->kernel_info;
while(kernel_info)
{
cl_fft_kernel_info *tmp = kernel_info->next;
deleteKernelInfo(kernel_info);
kernel_info = tmp;
}
Plan->kernel_info = NULL;
/* Generated OpenCL source (heap-allocated std::string, see new string("")
 * in clFFT_CreatePlan). */
if(Plan->kernel_string)
{
delete Plan->kernel_string;
Plan->kernel_string = NULL;
}
/* OpenCL objects: twist kernel, program, and the temporary buffers
 * (if any were ever created). */
if(Plan->twist_kernel)
{
clReleaseKernel(Plan->twist_kernel);
Plan->twist_kernel = NULL;
}
if(Plan->program)
{
clReleaseProgram(Plan->program);
Plan->program = NULL;
}
if(Plan->tempmemobj)
{
clReleaseMemObject(Plan->tempmemobj);
Plan->tempmemobj = NULL;
}
if(Plan->tempmemobj_real)
{
clReleaseMemObject(Plan->tempmemobj_real);
Plan->tempmemobj_real = NULL;
}
if(Plan->tempmemobj_imag)
{
clReleaseMemObject(Plan->tempmemobj_imag);
Plan->tempmemobj_imag = NULL;
}
}
/*
 * Create an OpenCL kernel object for every generated kernel in the plan,
 * plus the twist kernel matching the plan's data format.
 * Returns CL_SUCCESS, or the first OpenCL error encountered.
 */
static int
createKernelList(cl_fft_plan *plan)
{
	cl_int err;

	for(cl_fft_kernel_info *info = plan->kernel_info; info; info = info->next)
	{
		info->kernel = clCreateKernel(plan->program, info->kernel_name, &err);
		if(err != CL_SUCCESS || !info->kernel)
			return err;
	}

	const char *twistName = (plan->format == clFFT_SplitComplexFormat)
		? "clFFT_1DTwistSplit"
		: "clFFT_1DTwistInterleaved";
	plan->twist_kernel = clCreateKernel(plan->program, twistName, &err);
	if(err || !plan->twist_kernel)
		return err;

	return CL_SUCCESS;
}
/*
 * Query CL_KERNEL_WORK_GROUP_SIZE for every kernel on every device and
 * report the smallest value through max_wg_size.  Returns 1 when some
 * kernel cannot run with the work-group size it was generated for (so
 * the source must be regenerated and rebuilt), 0 when every kernel fits,
 * and -1 when the query itself fails.
 */
int getMaxKernelWorkGroupSize(cl_fft_plan *plan, unsigned int *max_wg_size, unsigned int num_devices, cl_device_id *devices)
{
	int needs_patch = 0;
	size_t wg_size;

	//*max_wg_size = INT_MAX;
	*max_wg_size = 0x7fffffff;

	for(unsigned int d = 0; d < num_devices; d++)
	{
		for(cl_fft_kernel_info *info = plan->kernel_info; info; info = info->next)
		{
			int err = clGetKernelWorkGroupInfo(info->kernel, devices[d], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wg_size, NULL);
			if(err != CL_SUCCESS)
				return -1;

			if(wg_size < info->num_workitems_per_workgroup)
				needs_patch = 1;
			if(wg_size < *max_wg_size)
				*max_wg_size = wg_size;
		}
	}
	return needs_patch;
}
/* Error bail-out used only inside clFFT_CreatePlan: when `err` is not
 * CL_SUCCESS it stores the code in the optional `error_code`
 * out-parameter, tears down the partially built plan, and returns NULL
 * from the enclosing function.  NOTE: relies on the locals `plan` and
 * `error_code` being in scope at every expansion site. */
#define ERR_MACRO(err) { \
if( err != CL_SUCCESS) \
{ \
if(error_code) \
*error_code = err; \
clFFT_DestroyPlan((clFFT_Plan) plan); \
return (clFFT_Plan) NULL; \
} \
}
/*
 * Create an FFT plan for a power-of-two 1D/2D/3D transform.
 *
 * Generates kernel source sized for a default work-group limit, builds
 * it for the first GPU device in the context, and — if some kernel
 * cannot actually run with that work-group size — regenerates the
 * source with the smaller limit and rebuilds (patch_kernel_source loop).
 *
 * On failure the partially built plan is destroyed, *error_code (when
 * non-NULL) receives the error, and NULL is returned.
 */
clFFT_Plan
clFFT_CreatePlan(cl_context context, clFFT_Dim3 n, clFFT_Dimension dim, clFFT_DataFormat dataFormat, cl_int *error_code )
{
	int i;
	cl_int err;
	int isPow2 = 1;
	cl_fft_plan *plan = NULL;
	ostringstream kString;
	int num_devices;
	int gpu_found = 0;
	cl_device_id devices[16];
	size_t ret_size;
	cl_device_type device_type;

	if(!context)
		ERR_MACRO(CL_INVALID_VALUE);

	/* Every dimension must be a nonzero power of two.
	 * BUGFIX: the original used |= with isPow2 initialised to 1, so this
	 * check could never reject anything; &= requires all three axes to
	 * pass. */
	isPow2 &= n.x && !( (n.x - 1) & n.x );
	isPow2 &= n.y && !( (n.y - 1) & n.y );
	isPow2 &= n.z && !( (n.z - 1) & n.z );

	if(!isPow2)
		ERR_MACRO(CL_INVALID_VALUE);

	if( (dim == clFFT_1D && (n.y != 1 || n.z != 1)) || (dim == clFFT_2D && n.z != 1) )
		ERR_MACRO(CL_INVALID_VALUE);

	plan = (cl_fft_plan *) malloc(sizeof(cl_fft_plan));
	if(!plan)
		ERR_MACRO(CL_OUT_OF_RESOURCES);

	plan->context = context;
	clRetainContext(context);
	plan->n = n;
	plan->dim = dim;
	plan->format = dataFormat;
	plan->kernel_info = 0;
	plan->num_kernels = 0;
	plan->twist_kernel = 0;
	plan->program = 0;
	plan->temp_buffer_needed = 0;
	plan->last_batch_size = 0;
	plan->tempmemobj = 0;
	plan->tempmemobj_real = 0;
	plan->tempmemobj_imag = 0;

	/* Heuristic hardware limits used while generating kernel source. */
	plan->max_localmem_fft_size = 2048;
	plan->max_work_item_per_workgroup = 256;
	plan->max_radix = 16;
	plan->min_mem_coalesce_width = 16;
	plan->num_local_mem_banks = 16;

patch_kernel_source:

	plan->kernel_string = new string("");
	if(!plan->kernel_string)
		ERR_MACRO(CL_OUT_OF_RESOURCES);

	getBlockConfigAndKernelString(plan);

	const char *source_str = plan->kernel_string->c_str();
	plan->program = clCreateProgramWithSource(context, 1, (const char**) &source_str, NULL, &err);
	ERR_MACRO(err);

	err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(devices), devices, &ret_size);
	ERR_MACRO(err);

	num_devices = ret_size / sizeof(cl_device_id);

	for(i = 0; i < num_devices; i++)
	{
		err = clGetDeviceInfo(devices[i], CL_DEVICE_TYPE, sizeof(device_type), &device_type, NULL);
		ERR_MACRO(err);

		if(device_type == CL_DEVICE_TYPE_GPU)
		{
			gpu_found = 1;
			err = clBuildProgram(plan->program, 1, &devices[i], "-cl-mad-enable", NULL, NULL);
			if (err != CL_SUCCESS)
			{
				/* BUGFIX: remember the build error — the info queries
				 * below overwrite err, and the original then failed to
				 * propagate the build failure at all. */
				cl_int build_err = err;
				char *build_log;
				char devicename[200];
				size_t log_size;

				err = clGetProgramBuildInfo(plan->program, devices[i], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
				printf ("E: %d\n", err);
				ERR_MACRO(err);

				build_log = (char *) malloc(log_size + 1);

				err = clGetProgramBuildInfo(plan->program, devices[i], CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL);
				printf ("E: %d\n", err);
				ERR_MACRO(err);

				err = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(devicename), devicename, NULL);
				printf ("E: %d\n", err);
				ERR_MACRO(err);

				fprintf(stdout, "FFT program build log on device %s\n", devicename);
				fprintf(stdout, "%s\n", build_log);
				free(build_log);

				ERR_MACRO(build_err);
			}
		}
	}

	if(!gpu_found)
		ERR_MACRO(CL_INVALID_CONTEXT);

	err = createKernelList(plan);
	ERR_MACRO(err);

	// we created program and kernels based on "some max work group size (default 256)" ... this work group size
	// may be larger than what kernel may execute with ... if thats the case we need to regenerate the kernel source
	// setting this as limit i.e max group size and rebuild.
	unsigned int max_kernel_wg_size;
	int patching_req = getMaxKernelWorkGroupSize(plan, &max_kernel_wg_size, num_devices, devices);
	if(patching_req == -1)
	{
		/* BUGFIX: the original passed err (CL_SUCCESS at this point) to
		 * ERR_MACRO, silently ignoring the query failure. */
		ERR_MACRO(CL_INVALID_OPERATION);
	}

	if(patching_req)
	{
		/* Regenerate all source with the smaller work-group limit. */
		destroy_plan(plan);
		plan->max_work_item_per_workgroup = max_kernel_wg_size;
		goto patch_kernel_source;
	}

	cl_fft_kernel_info *kInfo = plan->kernel_info;
	while(kInfo)
	{
		plan->num_kernels++;
		kInfo = kInfo->next;
	}

	if(error_code)
		*error_code = CL_SUCCESS;

	return (clFFT_Plan) plan;
}
/*
 * Public destructor: release every plan resource and drop the context
 * reference taken at creation.  A NULL plan is ignored.
 */
void
clFFT_DestroyPlan(clFFT_Plan plan)
{
	cl_fft_plan *p = (cl_fft_plan *) plan;
	if(!p)
		return;

	destroy_plan(p);
	clReleaseContext(p->context);
	free(p);
}
/*
 * Print each kernel's launch configuration and the generated OpenCL
 * source to `file` (stdout when file is NULL).
 */
void clFFT_DumpPlan( clFFT_Plan Plan, FILE *file)
{
	size_t gDim, lDim;
	FILE *out = file ? file : stdout;

	cl_fft_plan *plan = (cl_fft_plan *) Plan;
	for(cl_fft_kernel_info *kInfo = plan->kernel_info; kInfo; kInfo = kInfo->next)
	{
		cl_int s = 1;
		getKernelWorkDimensions(plan, kInfo, &s, &gDim, &lDim);
		/* BUGFIX: %zu is the conversion for size_t; %zd is for the
		 * signed counterpart and mismatches gDim/lDim. */
		fprintf(out, "Run kernel %s with global dim = {%zu*BatchSize}, local dim={%zu}\n", kInfo->kernel_name, gDim, lDim);
	}
	fprintf(out, "%s\n", plan->kernel_string->c_str());
}
main.cpp 0 → 100644
#include <string.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <CL/cl.h>
#include "clFFT.h"
#include <sys/types.h>
#include <sys/stat.h>
#include <stdint.h>
#include <float.h>
#define eps_avg 10.0
#define MAX( _a, _b) ((_a)>(_b)?(_a) : (_b))
// Whether the transform writes into a separate output buffer or
// overwrites its input buffers.
typedef enum {
clFFT_OUT_OF_PLACE,
clFFT_IN_PLACE,
}clFFT_TestType;
// Double-precision interleaved complex value (host-side reference data).
typedef struct
{
double real;
double imag;
}clFFT_ComplexDouble;
// Double-precision split (planar) complex arrays (host-side reference data).
typedef struct
{
double *real;
double *imag;
}clFFT_SplitComplexDouble;
cl_device_id device_id;
cl_context context;
cl_command_queue queue;
// ADDED
// Print an error message to stdout.  BUGFIX: the original ignored its
// variadic arguments and printed the raw format string (e.g. a literal
// "%d"); forward them through vprintf so placeholders are expanded.
void log_error (char* s, ...)
{
	va_list ap;
	printf ("ERROR: ");
	va_start (ap, s);
	vprintf (s, ap);
	va_end (ap);
}
// ADDED
// Print an informational message to stdout.  BUGFIX: same as log_error —
// expand variadic arguments instead of printing the raw format string.
void log_info (char* s, ...)
{
	va_list ap;
	printf ("INFO: ");
	va_start (ap, s);
	vprintf (s, ap);
	va_end (ap);
}
/*
 * Run one FFT configuration: allocate host buffers, load input data,
 * create the plan, execute numIter transforms on the global queue,
 * read the result back and print it.
 *
 * Split format: random input.  Interleaved format: real samples read
 * from the file "waveform" in the working directory.
 *
 * Returns 0 on success, a negative/OpenCL error code otherwise.  All
 * resources are released through the cleanup path on every exit.
 */
int runTest(clFFT_Dim3 n, int batchSize, clFFT_Direction dir, clFFT_Dimension dim,
			clFFT_DataFormat dataFormat, int numIter, clFFT_TestType testType)
{
	cl_int err = CL_SUCCESS;
	int iter;
	int i;
	int length = n.x * n.y * n.z * batchSize;

	/* Host buffers: single-precision working copies plus double-precision
	 * reference copies of the input.  (Timing/gflops locals from the
	 * original Apple sample were dead code and have been removed.) */
	clFFT_SplitComplex data_i_split = (clFFT_SplitComplex) { NULL, NULL };
	clFFT_SplitComplex data_cl_split = (clFFT_SplitComplex) { NULL, NULL };
	clFFT_Complex *data_i = NULL;
	clFFT_Complex *data_cl = NULL;
	clFFT_SplitComplexDouble data_iref = (clFFT_SplitComplexDouble) { NULL, NULL };
	clFFT_SplitComplexDouble data_oref = (clFFT_SplitComplexDouble) { NULL, NULL };

	clFFT_Plan plan = NULL;
	cl_mem data_in = NULL;
	cl_mem data_out = NULL;
	cl_mem data_in_real = NULL;
	cl_mem data_in_imag = NULL;
	cl_mem data_out_real = NULL;
	cl_mem data_out_imag = NULL;

	if(dataFormat == clFFT_SplitComplexFormat) {
		data_i_split.real = (float *) malloc(sizeof(float) * length);
		data_i_split.imag = (float *) malloc(sizeof(float) * length);
		data_cl_split.real = (float *) malloc(sizeof(float) * length);
		data_cl_split.imag = (float *) malloc(sizeof(float) * length);
		if(!data_i_split.real || !data_i_split.imag || !data_cl_split.real || !data_cl_split.imag)
		{
			err = -1;
			log_error((char*)"Out-of-Resources\n");
			goto cleanup;
		}
	}
	else {
		data_i = (clFFT_Complex *) malloc(sizeof(clFFT_Complex)*length);
		data_cl = (clFFT_Complex *) malloc(sizeof(clFFT_Complex)*length);
		if(!data_i || !data_cl)
		{
			err = -2;
			log_error((char*)"Out-of-Resouces\n");
			goto cleanup;
		}
	}

	data_iref.real = (double *) malloc(sizeof(double) * length);
	data_iref.imag = (double *) malloc(sizeof(double) * length);
	data_oref.real = (double *) malloc(sizeof(double) * length);
	data_oref.imag = (double *) malloc(sizeof(double) * length);
	if(!data_iref.real || !data_iref.imag || !data_oref.real || !data_oref.imag)
	{
		err = -3;
		log_error((char*)"Out-of-Resources\n");
		goto cleanup;
	}

	if(dataFormat == clFFT_SplitComplexFormat) {
		/* Random input in [-1, 1). */
		for(i = 0; i < length; i++)
		{
			data_i_split.real[i] = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f;
			data_i_split.imag[i] = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f;
			data_cl_split.real[i] = 0.0f;
			data_cl_split.imag[i] = 0.0f;
			data_iref.real[i] = data_i_split.real[i];
			data_iref.imag[i] = data_i_split.imag[i];
			data_oref.real[i] = data_iref.real[i];
			data_oref.imag[i] = data_iref.imag[i];
		}
	}
	else {
		// ADDED: real-valued samples from the "waveform" file.
		FILE* f = fopen ("waveform", "r");
		/* BUGFIX: the original never checked fopen, never checked fscanf,
		 * and never closed the file. */
		if(!f)
		{
			err = -4;
			log_error((char*)"Cannot open input file 'waveform'\n");
			goto cleanup;
		}
		for(i = 0; i < length; i++)
		{
			/* Pad with zeros if the file has fewer samples than length. */
			if(fscanf (f, "%f\n", &data_i[i].real) != 1)
				data_i[i].real = 0.0f;
			data_i[i].imag = 0;
			data_cl[i].real = 0.0f;
			data_cl[i].imag = 0.0f;
			data_iref.real[i] = data_i[i].real;
			data_iref.imag[i] = data_i[i].imag;
			data_oref.real[i] = data_iref.real[i];
			data_oref.imag[i] = data_iref.imag[i];
		}
		fclose (f);
	}

	plan = clFFT_CreatePlan( context, n, dim, dataFormat, &err );
	if(!plan || err)
	{
		log_error((char*)"clFFT_CreatePlan failed\n");
		goto cleanup;
	}

	/* Device buffers, initialised from the host input. */
	if(dataFormat == clFFT_SplitComplexFormat)
	{
		data_in_real = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_i_split.real, &err);
		if(!data_in_real || err)
		{
			log_error((char*)"clCreateBuffer failed\n");
			goto cleanup;
		}
		data_in_imag = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_i_split.imag, &err);
		if(!data_in_imag || err)
		{
			log_error((char*)"clCreateBuffer failed\n");
			goto cleanup;
		}
		if(testType == clFFT_OUT_OF_PLACE)
		{
			data_out_real = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_cl_split.real, &err);
			if(!data_out_real || err)
			{
				log_error((char*)"clCreateBuffer failed\n");
				goto cleanup;
			}
			data_out_imag = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_cl_split.imag, &err);
			if(!data_out_imag || err)
			{
				log_error((char*)"clCreateBuffer failed\n");
				goto cleanup;
			}
		}
		else
		{
			/* In-place: output aliases input (not released separately). */
			data_out_real = data_in_real;
			data_out_imag = data_in_imag;
		}
	}
	else
	{
		data_in = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float)*2, data_i, &err);
		if(!data_in)
		{
			log_error((char*)"clCreateBuffer failed\n");
			goto cleanup;
		}
		if(testType == clFFT_OUT_OF_PLACE)
		{
			data_out = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float)*2, data_cl, &err);
			if(!data_out)
			{
				log_error((char*)"clCreateBuffer failed\n");
				goto cleanup;
			}
		}
		else
			data_out = data_in;
	}

	/* Execute the transform numIter times; errors are OR-accumulated. */
	err = CL_SUCCESS;
	if(dataFormat == clFFT_SplitComplexFormat)
	{
		for(iter = 0; iter < numIter; iter++)
			err |= clFFT_ExecutePlannar(queue, plan, batchSize, dir, data_in_real, data_in_imag, data_out_real, data_out_imag, 0, NULL, NULL);
	}
	else
	{
		for(iter = 0; iter < numIter; iter++)
			err |= clFFT_ExecuteInterleaved(queue, plan, batchSize, dir, data_in, data_out, 0, NULL, NULL);
	}

	err |= clFinish(queue);
	if(err)
	{
		log_error((char*)"clFFT_Execute\n");
		goto cleanup;
	}

	/* Read back and print the transformed data. */
	if(dataFormat == clFFT_SplitComplexFormat)
	{
		err |= clEnqueueReadBuffer(queue, data_out_real, CL_TRUE, 0, length*sizeof(float), data_cl_split.real, 0, NULL, NULL);
		err |= clEnqueueReadBuffer(queue, data_out_imag, CL_TRUE, 0, length*sizeof(float), data_cl_split.imag, 0, NULL, NULL);
		for (i = 0; i < length; i++)
			printf ("%3d %7.3f %7.3f\n", i, data_cl_split.real[i], data_cl_split.imag[i]);
	}
	else
	{
		err |= clEnqueueReadBuffer(queue, data_out, CL_TRUE, 0, length*sizeof(float)*2, data_cl, 0, NULL, NULL);
		for (i = 0; i < length; i++)
			printf ("%3d %7.3f %7.3f\n", i, data_cl[i].real, data_cl[i].imag);
	}

	if(err)
	{
		log_error((char*)"clEnqueueReadBuffer failed\n");
		goto cleanup;
	}

cleanup:
	clFFT_DestroyPlan(plan);
	if(dataFormat == clFFT_SplitComplexFormat)
	{
		free(data_i_split.real);
		free(data_i_split.imag);
		free(data_cl_split.real);
		free(data_cl_split.imag);
		if(data_in_real)
			clReleaseMemObject(data_in_real);
		if(data_in_imag)
			clReleaseMemObject(data_in_imag);
		/* Release the output buffers only when they are distinct objects.
		 * BUGFIX: the original tested `clFFT_OUT_OF_PLACE` (enum value 0,
		 * always false) instead of comparing testType, so data_out_imag
		 * leaked on every out-of-place run. */
		if(data_out_real && testType == clFFT_OUT_OF_PLACE)
			clReleaseMemObject(data_out_real);
		if(data_out_imag && testType == clFFT_OUT_OF_PLACE)
			clReleaseMemObject(data_out_imag);
	}
	else
	{
		free(data_i);
		free(data_cl);
		if(data_in)
			clReleaseMemObject(data_in);
		if(data_out && testType == clFFT_OUT_OF_PLACE)
			clReleaseMemObject(data_out);
	}
	free(data_iref.real);
	free(data_iref.imag);
	free(data_oref.real);
	free(data_oref.imag);
	return err;
}
// Return true when the line contains a "//" comment marker anywhere.
bool ifLineCommented(const char *line) {
	return strstr(line, "//") != NULL;
}
/*
 * Map the CL_DEVICE_TYPE environment variable to an OpenCL device type.
 * Unset or unrecognised values default to the GPU.
 */
cl_device_type getGlobalDeviceType()
{
	const char *requested = getenv( "CL_DEVICE_TYPE" );

	if( requested == NULL )
		return CL_DEVICE_TYPE_GPU;   // default

	if( !strcmp( requested, "gpu" ) || !strcmp( requested, "CL_DEVICE_TYPE_GPU" ) )
		return CL_DEVICE_TYPE_GPU;
	if( !strcmp( requested, "cpu" ) || !strcmp( requested, "CL_DEVICE_TYPE_CPU" ) )
		return CL_DEVICE_TYPE_CPU;
	if( !strcmp( requested, "accelerator" ) || !strcmp( requested, "CL_DEVICE_TYPE_ACCELERATOR" ) )
		return CL_DEVICE_TYPE_ACCELERATOR;
	if( !strcmp( requested, "CL_DEVICE_TYPE_DEFAULT" ) )
		return CL_DEVICE_TYPE_DEFAULT;

	return CL_DEVICE_TYPE_GPU;   // unrecognised value: default
}
// Context-error callback registered with clCreateContext; forwards the
// error string to log_error.  The extra parameters are required by the
// OpenCL callback signature and are unused here.
void
notify_callback(const char *errinfo, const void *private_info, size_t cb, void *user_data)
{
log_error((char*) "%s\n", errinfo );
}
/*
 * Estimate device-memory demand in MiB: out-of-place needs input +
 * output + temp (3 buffers), in-place needs 2.  Returns 0 when the
 * demand fits within gMemSize (MiB), -1 otherwise.
 */
int
checkMemRequirements(clFFT_Dim3 n, int batchSize, clFFT_TestType testType, cl_ulong gMemSize)
{
	cl_ulong numBuffers = (testType == clFFT_OUT_OF_PLACE) ? 3 : 2;
	/* BUGFIX: promote to 64-bit before multiplying — n.x*n.y*n.z was
	 * previously evaluated in 32-bit unsigned arithmetic and could
	 * overflow for large 3D grids. */
	cl_ulong memReq = numBuffers * (cl_ulong)n.x * n.y * n.z * sizeof(clFFT_Complex) * (cl_ulong)batchSize;
	memReq = memReq/1024/1024;
	if(memReq >= gMemSize)
		return -1;
	return 0;
}
int main (int argc, char * const argv[]) {
cl_ulong gMemSize;
clFFT_Direction dir = clFFT_Forward;
int numIter = 1;
clFFT_Dim3 n = { 1024, 1, 1 };
int batchSize = 1;
clFFT_DataFormat dataFormat = clFFT_SplitComplexFormat;
clFFT_Dimension dim = clFFT_1D;
clFFT_TestType testType = clFFT_OUT_OF_PLACE;
cl_device_id device_ids[16];
FILE *paramFile;
cl_int err;
cl_platform_id cpPlatform;
err = clGetPlatformIDs(1, &cpPlatform, 0);
unsigned int num_devices;
cl_device_type device_type = getGlobalDeviceType();
if(device_type != CL_DEVICE_TYPE_GPU)
{
log_info((char*)"Test only supported on DEVICE_TYPE_GPU\n");
exit(0);
}
err = clGetDeviceIDs(cpPlatform, device_type, sizeof(device_ids), device_ids, &num_devices);
if(err)
{
log_error((char*)"clGetComputeDevice failed\n");
return -1;
}
device_id = NULL;
unsigned int i;
for(i = 0; i < num_devices; i++)
{
cl_bool available;
err = clGetDeviceInfo(device_ids[i], CL_DEVICE_AVAILABLE, sizeof(cl_bool), &available, NULL);
if(err)
{
log_error((char*)"Cannot check device availability of device # %d\n", i);
}
if(available)
{
device_id = device_ids[i];
break;
}
else
{
char name[200];
err = clGetDeviceInfo(device_ids[i], CL_DEVICE_NAME, sizeof(name), name, NULL);
if(err == CL_SUCCESS)
{
log_info((char*)"Device %s not available for compute\n", name);
}
else
{
log_info((char*)"Device # %d not available for compute\n", i);
}
}
}
if(!device_id)
{
log_error((char*)"None of the devices available for compute ... aborting test\n");
//test_finish();
return -1;
}
context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
if(!context || err)
{
log_error((char*)"clCreateContext failed\n");
//test_finish();
return -1;
}
queue = clCreateCommandQueue(context, device_id, 0, &err);
if(!queue || err)
{
log_error((char*)"clCreateCommandQueue() failed.\n");
clReleaseContext(context);
//test_finish();
return -1;
}
err = clGetDeviceInfo(device_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &gMemSize, NULL);
if(err)
{
log_error((char*)"Failed to get global mem size\n");
clReleaseContext(context);
clReleaseCommandQueue(queue);
//test_finish();
return -2;
}
gMemSize /= (1024*1024);
char delim[] = " \n";
char tmpStr[100];
char line[200];
char *param, *val;
int total_errors = 0;
if(argc == 1) {
log_error((char*)"Need file name with list of parameters to run the test\n");
//test_finish();
return -1;
}
if(argc == 2) { // arguments are supplied in a file with arguments for a single run are all on the same line
paramFile = fopen(argv[1], "r");
if(!paramFile) {
log_error((char*)"Cannot open the parameter file\n");
clReleaseContext(context);
clReleaseCommandQueue(queue);
//test_finish();
return -3;
}
while(fgets(line, 199, paramFile)) {
if(!strcmp(line, "") || !strcmp(line, "\n") || ifLineCommented(line))
continue;
param = strtok(line, delim);
while(param) {
val = strtok(NULL, delim);
if(!strcmp(param, "-n")) {
sscanf(val, "%d", &n.x);
val = strtok(NULL, delim);
sscanf(val, "%d", &n.y);
val = strtok(NULL, delim);
sscanf(val, "%d", &n.z);
}
else if(!strcmp(param, "-batchsize"))
sscanf(val, "%d", &batchSize);
else if(!strcmp(param, "-dir")) {
sscanf(val, "%s", tmpStr);
if(!strcmp(tmpStr, "forward"))
dir = clFFT_Forward;
else if(!strcmp(tmpStr, "inverse"))
dir = clFFT_Inverse;
}
else if(!strcmp(param, "-dim")) {
sscanf(val, "%s", tmpStr);
if(!strcmp(tmpStr, "1D"))
dim = clFFT_1D;
else if(!strcmp(tmpStr, "2D"))
dim = clFFT_2D;
else if(!strcmp(tmpStr, "3D"))
dim = clFFT_3D;
}
else if(!strcmp(param, "-format")) {
sscanf(val, "%s", tmpStr);
if(!strcmp(tmpStr, "plannar"))
dataFormat = clFFT_SplitComplexFormat;
else if(!strcmp(tmpStr, "interleaved"))
dataFormat = clFFT_InterleavedComplexFormat;
}
else if(!strcmp(param, "-numiter"))
sscanf(val, "%d", &numIter);
else if(!strcmp(param, "-testtype")) {
sscanf(val, "%s", tmpStr);
if(!strcmp(tmpStr, "out-of-place"))
testType = clFFT_OUT_OF_PLACE;
else if(!strcmp(tmpStr, "in-place"))
testType = clFFT_IN_PLACE;
}
param = strtok(NULL, delim);
}
if(checkMemRequirements(n, batchSize, testType, gMemSize)) {
log_info((char*)"This test cannot run because memory requirements canot be met by the available device\n");
continue;
}
err = runTest(n, batchSize, dir, dim, dataFormat, numIter, testType);
if (err)
total_errors++;
}
}
clReleaseContext(context);
clReleaseCommandQueue(queue);
return total_errors;
}
-n 64 1 1 -batchsize 8192 -dir forward -dim 1D -format plannar -numiter 1000 -testtype out-of-place
-n 1024 1 1 -batchsize 8192 -dir forward -dim 1D -format plannar -numiter 1000 -testtype out-of-place
-n 1048576 1 1 -batchsize 4 -dir inverse -dim 1D -format interleaved -numiter 1000 -testtype out-of-place
-n 1024 512 1 -batchsize 8 -dir forward -dim 2D -format interleaved -numiter 1000 -testtype out-of-place
-n 128 128 128 -batchsize 1 -dir inverse -dim 3D -format interleaved -numiter 1000 -testtype out-of-place
-n 16384 1 1 -batchsize 4 -dir forward -dim 1D -format interleaved -numiter 1 -testtype in-place
-n 32 2048 1 -batchsize 8 -dir forward -dim 2D -format interleaved -numiter 1 -testtype in-place
-n 4096 64 1 -batchsize 4 -dir inverse -dim 2D -format plannar -numiter 1 -testtype in-place
-n 64 32 16 -batchsize 1 -dir inverse -dim 3D -format interleaved -numiter 1 -testtype out-of-place
-n 256 1 1 -batchsize 1 -dir forward -dim 1D -format interleaved -numiter 1 -testtype in-place
waveform 0 → 100644
0.900000
0.825801
0.614719
0.299536
-0.070711
-0.438241
-0.745375
-0.943389
-1.000000
-0.904370
-0.668838
-0.327126
0.070711
0.465831
0.799495
1.021958
1.100000
1.021958
0.799495
0.465830
0.070710
-0.327127
-0.668839
-0.904370
-1.000000
-0.943389
-0.745375
-0.438240
-0.070710
0.299536
0.614719
0.825801
0.900000
0.825801
0.614718
0.299536
-0.070711
-0.438241
-0.745375
-0.943389
-1.000000
-0.904371
-0.668838
-0.327126
0.070712
0.465831
0.799495
1.021958
1.100000
1.021958
0.799494
0.465829
0.070710
-0.327126
-0.668839
-0.904371
-1.000000
-0.943388
-0.745375
-0.438240
-0.070711
0.299538
0.614719
0.825802
0.900000
0.825801
0.614718
0.299536
-0.070712
-0.438241
-0.745376
-0.943389
-1.000000
-0.904370
-0.668838
-0.327125
0.070711
0.465830
0.799495
1.021958
1.100000
1.021958
0.799495
0.465830
0.070709
-0.327126
-0.668839
-0.904371
-1.000000
-0.943389
-0.745374
-0.438239
-0.070711
0.299537
0.614720
0.825802
0.900000
0.825801
0.614717
0.299537
-0.070711
-0.438242
-0.745377
-0.943389
-1.000000
-0.904369
-0.668839
-0.327126
0.070713
0.465830
0.799495
1.021959
1.100000
1.021958
0.799494
0.465828
0.070711
-0.327127
-0.668840
-0.904372
-1.000000
-0.943388
-0.745374
-0.438241
-0.070710
0.299538
0.614721
0.825801
0.900000
0.825800
0.614719
0.299536
-0.070713
-0.438243
-0.745375
-0.943389
-1.000000
-0.904370
-0.668838
-0.327124
0.070714
0.465831
0.799496
1.021959
1.100000
1.021958
0.799493
0.465827
0.070710
-0.327128
-0.668841
-0.904371
-1.000000
-0.943388
-0.745375
-0.438240
-0.070709
0.299539
0.614719
0.825802
0.900000
0.825801
0.614718
0.299538
-0.070710
-0.438241
-0.745376
-0.943390
-1.000000
-0.904368
-0.668840
-0.327127
0.070712
0.465832
0.799497
1.021960
1.100000
1.021959
0.799495
0.465829
0.070709
-0.327130
-0.668842
-0.904373
-1.000000
-0.943389
-0.745374
-0.438238
-0.070707
0.299540
0.614723
0.825801
0.900000
0.825801
0.614717
0.299533
-0.070715
-0.438246
-0.745375
-0.943389
-1.000000
-0.904369
-0.668836
-0.327122
0.070717
0.465830
0.799495
1.021959
1.100000
1.021957
0.799491
0.465825
0.070711
-0.327127
-0.668840
-0.904372
-1.000000
-0.943387
-0.745376
-0.438241
-0.070710
0.299538
0.614721
0.825803
0.900000
0.825801
0.614719
0.299536
-0.070713
-0.438243
-0.745378
-0.943390
-1.000000
-0.904370
-0.668838
-0.327124
0.070714
0.465835
0.799499
1.021958
1.100000
1.021958
0.799493
0.465827
0.070706
-0.327132
-0.668838
-0.904371
-1.000000
-0.943388
-0.745373
-0.438236
-0.070705
0.299536
0.614719
0.825802
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment