Skip to content
Snippets Groups Projects
Commit 3f976760 authored by Bernd Machenschalk's avatar Bernd Machenschalk
Browse files

HSA version from Christophe Choquet

compile with:
g++ -fPIC -shared -o libclFFT.so -I /opt/hsa/include clFFT.cpp fft_kernelstring.cpp  -L/opt/hsa/lib -lhsa-runtime64
For each new FFT size, uncoment line 318 of clFFT.cpp, grab the OpenCL kernel,
save it, and create a file clFFT_SIZE.cl. Compile & run.
parent 9c5a4b48
No related branches found
No related tags found
No related merge requests found
/***************************************************************************
* Copyright (C) 2012 by Oliver Bock,Heinz-Bernd Eggenstein *
* oliver.bock[AT]aei.mpg.de *
* heinz-bernd.eggenstein[AT]aei.mpg.de *
* *
* This file is part of libclfft (originally for Einstein@Home) *
* Derived from clFFT, (C) Apple, see notice below. *
* *
* *
* libclfft is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See *
* notice below for more details. *
* *
***************************************************************************/
//
// File: fft_setup.cpp
//
// Version: <1.0>
//
// Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple")
// in consideration of your agreement to the following terms, and your use,
// installation, modification or redistribution of this Apple software
// constitutes acceptance of these terms. If you do not agree with these
// terms, please do not use, install, modify or redistribute this Apple
// software.
//
// In consideration of your agreement to abide by the following terms, and
// subject to these terms, Apple grants you a personal, non - exclusive
// license, under Apple's copyrights in this original Apple software ( the
// "Apple Software" ), to use, reproduce, modify and redistribute the Apple
// Software, with or without modifications, in source and / or binary forms;
// provided that if you redistribute the Apple Software in its entirety and
// without modifications, you must retain this notice and the following text
// and disclaimers in all such redistributions of the Apple Software. Neither
// the name, trademarks, service marks or logos of Apple Inc. may be used to
// endorse or promote products derived from the Apple Software without specific
// prior written permission from Apple. Except as expressly stated in this
// notice, no other rights or licenses, express or implied, are granted by
// Apple herein, including but not limited to any patent rights that may be
// infringed by your derivative works or by other works in which the Apple
// Software may be incorporated.
//
// The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO
// WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
// WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
// PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
// ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
//
// IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
// CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
// AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
// UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
// OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
//
////////////////////////////////////////////////////////////////////////////////////////////////////
#include "clFFT.h"
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include "fft_internal.h"
#include "fft_base_kernels.h"
// g++ -o clFFT -I /opt/hsa/include clFFT.cpp fft_kernelstring.cpp fft_setup.cpp -L/opt/hsa/lib -lhsa-runtime64
// g++ -fPIC -shared -o libclFFT.so -I /opt/hsa/include clFFT.cpp fft_kernelstring.cpp -L/opt/hsa/lib -lhsa-runtime64
using namespace std;
#define check(msg, status) \
if (status != HSA_STATUS_SUCCESS) { \
printf("%s failed. status=%x\n", #msg,status); \
return 1; \
} else { \
/*printf("%s succeeded.\n", #msg); */\
}
/**
* HSA HACK. Since hsa_signal_wait_acquire() sometimes waits for ever (don't know why. boinc blocking signals?)
*
* Use nanosleep instead of default timer. Since the kernels are quite fast, this does not impact CPU usage,
*/
hsa_signal_value_t myhsa_signal_wait_acquire(
hsa_signal_t signal,
hsa_signal_condition_t condition,
hsa_signal_value_t compare_value,
uint64_t timeout_hint,
hsa_wait_state_t wait_state_hint ) {
timespec t;
t.tv_sec = 0;
t.tv_nsec = 100*1000; // 0.1 ms
hsa_signal_value_t ret;
while (ret = hsa_signal_load_acquire(signal) == 1) {
nanosleep(&t,NULL);
}
return ret;
}
static fft_args_t* fft_kernarg_address = NULL;
extern hsa_signal_value_t myhsa_signal_wait_acquire(hsa_signal_t signal,hsa_signal_condition_t condition,hsa_signal_value_t compare_value,
uint64_t timeout_hint, hsa_wait_state_t wait_state_hint );
static void
getBlockConfigAndKernelString(cl_fft_plan *plan)
{
plan->temp_buffer_needed = 0;
*plan->kernel_string += baseKernels;
if(plan->format == clFFT_SplitComplexFormat)
*plan->kernel_string += twistKernelPlannar;
else
*plan->kernel_string += twistKernelInterleaved;
switch(plan->dim)
{
case clFFT_1D:
FFT1D(plan, cl_fft_kernel_x);
break;
case clFFT_2D:
FFT1D(plan, cl_fft_kernel_x);
FFT1D(plan, cl_fft_kernel_y);
break;
case clFFT_3D:
FFT1D(plan, cl_fft_kernel_x);
FFT1D(plan, cl_fft_kernel_y);
FFT1D(plan, cl_fft_kernel_z);
break;
default:
return;
}
plan->temp_buffer_needed = 0;
cl_fft_kernel_info *kInfo = plan->kernel_info;
while(kInfo)
{
plan->temp_buffer_needed |= !kInfo->in_place_possible;
kInfo = kInfo->next;
}
}
static
int precomputeSinCosLUTs(hsa_region_t global_region,cl_fft_plan * plan,int *error_code) {
size_t i=0;
cl_int err;
// find logN1,logN2, where
// n = 2^logN1 * 2^logN2 , and logN1=logN2 +/- 1
size_t N=plan->n.x*plan->n.y*plan->n.z;
plan->logN1=0;
plan->logN2=0;
plan->N1=1;
plan->N2=1;
switch (plan->twiddleMethod) {
case clFFT_native_trig: return 0;
case clFFT_sincosfunc : return 0;
case clFFT_TaylorLUT :
plan->logN1 = 0;
plan->logN2 = 8;
break;
case clFFT_BigLUT : {
size_t Nrem=N;
while(Nrem > 1) {
plan->logN1++;
Nrem >>= 1;
if(Nrem > 1) {
plan->logN2++;
Nrem >>= 1;
}
}}
break;
default: return 1;
}
plan->N1 = 1 << plan->logN1;
plan->N2 = 1 << plan->logN2;
double PI2 = 8.0*atan(1.0);
//plan->cossin_LUT_d1 = clCreateBuffer(plan->context,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, plan->N1*2*sizeof(float),tmpLUT_cossin1, &err);
err = hsa_memory_allocate(global_region, plan->N1*2*sizeof(float), (void**)&plan->cossin_LUT_d1);
check(Create cossin_LUT_d1, err);
for(i=0; i < plan->N1; i++) {
float * tmpLUT_cossin1 = plan->cossin_LUT_d1;
tmpLUT_cossin1[i*2] =(float)cos(PI2 * (float) i / (float)N);
tmpLUT_cossin1[i*2+1]=(float)sin(PI2 * (float) i / (float)N);
}
//plan->cossin_LUT_d2 = clCreateBuffer(plan->context,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, plan->N2*2*sizeof(float),tmpLUT_cossin2, &err);
err = hsa_memory_allocate(global_region, plan->N2*2*sizeof(float), (void**)&plan->cossin_LUT_d2);
check(Create cossin_LUT_d2, err);
for(i=0; i < plan->N2; i++) {
float * tmpLUT_cossin2 = plan->cossin_LUT_d2;
tmpLUT_cossin2[2*i] =(float)cos(PI2 * (float) i / (float) plan->N2);
tmpLUT_cossin2[2*i+1]=(float)sin(PI2 * (float) i / (float) plan->N2);
}
return 0;
}
static cl_int
allocateTemporaryBufferInterleaved(cl_fft_plan *plan, cl_uint batchSize)
{
cl_int err = CL_SUCCESS;
if(plan->temp_buffer_needed && plan->last_batch_size != batchSize)
{
plan->last_batch_size = batchSize;
size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * 2 * sizeof(cl_float);
if(plan->tempmemobj)
hsa_memory_free(plan->tempmemobj);
err = hsa_memory_allocate(plan->global_region, tmpLength, (void**)&plan->tempmemobj);
//plan->tempmemobj = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &err);
}
return err;
}
void
getKernelWorkDimensions(cl_fft_plan *plan, cl_fft_kernel_info *kernelInfo, cl_int *batchSize, size_t *gWorkItems, size_t *lWorkItems)
{
*lWorkItems = kernelInfo->num_workitems_per_workgroup;
int numWorkGroups = kernelInfo->num_workgroups;
int numXFormsPerWG = kernelInfo->num_xforms_per_workgroup;
switch(kernelInfo->dir)
{
case cl_fft_kernel_x:
*batchSize *= (plan->n.y * plan->n.z);
numWorkGroups = (*batchSize % numXFormsPerWG) ? (*batchSize/numXFormsPerWG + 1) : (*batchSize/numXFormsPerWG);
numWorkGroups *= kernelInfo->num_workgroups;
break;
case cl_fft_kernel_y:
*batchSize *= plan->n.z;
numWorkGroups *= *batchSize;
break;
case cl_fft_kernel_z:
numWorkGroups *= *batchSize;
break;
}
*gWorkItems = numWorkGroups * *lWorkItems;
}
clFFT_Plan
clFFT_CreatePlanAdvHSA(hsa_region_t global_region,hsa_executable_t executable,hsa_agent_t agent,hsa_region_t kernarg_region,hsa_queue_t* queue,
clFFT_Dim3 n, clFFT_Dimension dim, clFFT_DataFormat dataFormat, unsigned long flags)
{
int i;
int isPow2 = 1;
cl_fft_plan *plan = NULL;
ostringstream kString;
size_t ret_size;
cl_int error_code;
isPow2 |= n.x && !( (n.x - 1) & n.x );
isPow2 |= n.y && !( (n.y - 1) & n.y );
isPow2 |= n.z && !( (n.z - 1) & n.z );
plan = (cl_fft_plan *) malloc(sizeof(cl_fft_plan));
//plan->context = 0;
//clRetainContext(context);
plan->n = n;
plan->dim = dim;
plan->format = dataFormat;
plan->kernel_info = 0;
plan->num_kernels = 0;
plan->twist_kernel = 0;
plan->program = 0;
plan->temp_buffer_needed = 0;
plan->last_batch_size = 0;
plan->tempmemobj = NULL;
plan->tempmemobj_real = NULL;
plan->tempmemobj_imag = NULL;
plan->cossin_LUT_d1=NULL;
plan->cossin_LUT_d2=NULL;
plan->max_localmem_fft_size = 2048;
plan->max_work_item_per_workgroup = 256;
plan->max_radix = 16;
plan->min_mem_coalesce_width = 16;
plan->num_local_mem_banks = 16;
plan->global_region = global_region;
plan->executable = executable;
plan->agent = agent;
plan->kernarg_region = kernarg_region;
plan->queue = queue;
plan->twiddleMethod = (clFFT_TwiddleFactorMethod)(flags & 7);
precomputeSinCosLUTs(global_region,plan,&error_code);
plan->kernel_string = new string("");
getBlockConfigAndKernelString(plan);
const char *source_str = plan->kernel_string->c_str();
// Use this to create the initial kernel
// COPY THE SOURCE HERE TO a .cl file and then compile to BRIG.
//puts(source_str);
cl_fft_kernel_info *kInfo = plan->kernel_info;
while(kInfo)
{
plan->num_kernels++;
kInfo = kInfo->next;
}
return (clFFT_Plan) plan;
}
cl_int
clFFT_ExecuteInterleavedHSA( clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir,float *data_in, float *data_out)
{
int s;
cl_fft_plan *plan = (cl_fft_plan *) Plan;
if(plan->format != clFFT_InterleavedComplexFormat)
return CL_INVALID_VALUE;
cl_int err;
size_t gWorkItems, lWorkItems;
int inPlaceDone = -1;
cl_int isInPlace = data_in == data_out ? 1 : 0;
if((err = allocateTemporaryBufferInterleaved(plan, batchSize)) != CL_SUCCESS)
return err;
cl_mem memObj[3];
memObj[0] = data_in;
memObj[1] = data_out;
memObj[2] = plan->tempmemobj;
cl_fft_kernel_info *kernelInfo = plan->kernel_info;
int numKernels = plan->num_kernels;
int numKernelsOdd = numKernels & 1;
int currRead = 0;
int currWrite = 1;
// at least one external dram shuffle (transpose) required
if(plan->temp_buffer_needed)
{
// in-place transform
if(isInPlace)
{
inPlaceDone = 0;
currRead = 1;
currWrite = 2;
}
else
{
currWrite = (numKernels & 1) ? 1 : 2;
}
while(kernelInfo)
{
if( isInPlace && numKernelsOdd && !inPlaceDone && kernelInfo->in_place_possible)
{
currWrite = currRead;
inPlaceDone = 1;
}
s = batchSize;
getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems);
char name[1024];
sprintf(name,"&__OpenCL_%s_kernel",kernelInfo->kernel_name);
hsa_executable_symbol_t symbol;
hsa_status_t err = hsa_executable_get_symbol(plan->executable, NULL, name, plan->agent, 0, &symbol);
check(Extract the symbol from FFT kernel, err);
/*
* Extract dispatch information from the symbol
*/
uint64_t kernel_object;
uint32_t kernarg_segment_size;
uint32_t group_segment_size;
uint32_t private_segment_size;
err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel_object);
check(Extracting the symbol from the executable, err);
err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &kernarg_segment_size);
check(Extracting the kernarg segment size from the executable, err);
err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_segment_size);
check(Extracting the group segment size from the executable, err);
err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_segment_size);
check(Extracting the private segment from the executable, err);
if (fft_kernarg_address == NULL) {
err = hsa_memory_allocate(plan->kernarg_region, kernarg_segment_size, (void**)&fft_kernarg_address);
check(Allocating resampling kernel argument memory buffer, err);
//printf("*** Allocated %d args size=%d\n",kernarg_segment_size,sizeof(fft_args_t));
}
// Pass parameters
fft_kernarg_address->global_offset_0 = 0;
fft_kernarg_address->global_offset_1 = 0;
fft_kernarg_address->global_offset_2 = 0;
fft_kernarg_address->in = (float*)memObj[currRead];
fft_kernarg_address->out = (float*)memObj[currWrite];
fft_kernarg_address->dir = dir;
fft_kernarg_address->S = s;
fft_kernarg_address->cossinLUT1 = (float*)plan->cossin_LUT_d1;
fft_kernarg_address->cossinLUT2 = (float*)plan->cossin_LUT_d2;
uint64_t index = hsa_queue_load_write_index_relaxed(plan->queue);
hsa_signal_t signal;
err=hsa_signal_create(1, 0,NULL, &signal);
check(Creating a HSA signal, err);
const uint32_t queueMask = plan->queue->size - 1;
// create HSA packet
hsa_kernel_dispatch_packet_t* dispatch_packet = &(((hsa_kernel_dispatch_packet_t*)(plan->queue->base_address))[index&queueMask]);
dispatch_packet->header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
dispatch_packet->header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
dispatch_packet->setup |= 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
dispatch_packet->workgroup_size_x = (uint16_t)lWorkItems;
dispatch_packet->workgroup_size_y = (uint16_t)1;
dispatch_packet->workgroup_size_z = (uint16_t)1;
dispatch_packet->grid_size_x = (uint32_t) gWorkItems;
dispatch_packet->grid_size_y = 1;
dispatch_packet->grid_size_z = 1;
dispatch_packet->completion_signal = signal;
dispatch_packet->kernel_object = kernel_object;
dispatch_packet->kernarg_address = (void*) fft_kernarg_address;
dispatch_packet->private_segment_size = private_segment_size;
dispatch_packet->group_segment_size = group_segment_size;
__atomic_store_n((uint8_t*)(&dispatch_packet->header), (uint8_t)HSA_PACKET_TYPE_KERNEL_DISPATCH, __ATOMIC_RELEASE);
/*
* Increment the write index and ring the doorbell to dispatch the kernel.
*/
hsa_queue_store_write_index_relaxed(plan->queue, index+1);
hsa_signal_store_relaxed(plan->queue->doorbell_signal, index);
check(Dispatching the clFFT kernel, err);
/*
* Wait on the dispatch completion signal until the kernel is finished.
*/
hsa_signal_value_t value = myhsa_signal_wait_acquire(signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
err=hsa_signal_destroy(signal);
check(Destroying the signal, err);
/*
err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj[currRead]);
err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj[currWrite]);
err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_int), &dir);
err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_int), &s);
err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_mem), &(plan->cossin_LUT_d1));
err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_mem), &(plan->cossin_LUT_d2));
err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL);
*/
if(err)
return err;
currRead = (currWrite == 1) ? 1 : 2;
currWrite = (currWrite == 1) ? 2 : 1;
kernelInfo = kernelInfo->next;
}
}
// no dram shuffle (transpose required) transform
// all kernels can execute in-place.
else {
while(kernelInfo)
{
s = batchSize;
getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems);
printf("*** Call %s\n",kernelInfo->kernel_name);
char name[1024];
sprintf(name,"&__OpenCL_%s_kernel",kernelInfo->kernel_name);
hsa_executable_symbol_t symbol;
hsa_status_t err = hsa_executable_get_symbol(plan->executable, NULL, name, plan->agent, 0, &symbol);
check(Extract the symbol from FFT kernel, err);
/*
* Extract dispatch information from the symbol
*/
uint64_t kernel_object;
uint32_t kernarg_segment_size;
uint32_t group_segment_size;
uint32_t private_segment_size;
err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel_object);
check(Extracting the symbol from the executable, err);
err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &kernarg_segment_size);
check(Extracting the kernarg segment size from the executable, err);
err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_segment_size);
check(Extracting the group segment size from the executable, err);
err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_segment_size);
check(Extracting the private segment from the executable, err);
if (fft_kernarg_address == NULL) {
err = hsa_memory_allocate(plan->kernarg_region, kernarg_segment_size, (void**)&fft_kernarg_address);
check(Allocating resampling kernel argument memory buffer, err);
printf("*** Allocated %d args size=%d\n",kernarg_segment_size,sizeof(fft_args_t));
}
// Pass parameters
fft_kernarg_address->global_offset_0 = 0;
fft_kernarg_address->global_offset_1 = 0;
fft_kernarg_address->global_offset_2 = 0;
fft_kernarg_address->in = (float*)memObj[currRead];
fft_kernarg_address->out = (float*)memObj[currWrite];
fft_kernarg_address->dir = dir;
fft_kernarg_address->S = s;
fft_kernarg_address->cossinLUT1 = (float*)plan->cossin_LUT_d1;
fft_kernarg_address->cossinLUT2 = (float*)plan->cossin_LUT_d2;
uint64_t index = hsa_queue_load_write_index_relaxed(plan->queue);
hsa_signal_t signal;
err=hsa_signal_create(1, 0,NULL, &signal);
check(Creating a HSA signal, err);
const uint32_t queueMask = plan->queue->size - 1;
// create HSA packet
hsa_kernel_dispatch_packet_t* dispatch_packet = &(((hsa_kernel_dispatch_packet_t*)(plan->queue->base_address))[index&queueMask]);
dispatch_packet->header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
dispatch_packet->header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
dispatch_packet->setup |= 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
dispatch_packet->workgroup_size_x = (uint16_t)lWorkItems;
dispatch_packet->workgroup_size_y = (uint16_t)1;
dispatch_packet->workgroup_size_z = (uint16_t)1;
dispatch_packet->grid_size_x = (uint32_t) gWorkItems;
dispatch_packet->grid_size_y = 1;
dispatch_packet->grid_size_z = 1;
dispatch_packet->completion_signal = signal;
dispatch_packet->kernel_object = kernel_object;
dispatch_packet->kernarg_address = (void*) fft_kernarg_address;
dispatch_packet->private_segment_size = private_segment_size;
dispatch_packet->group_segment_size = group_segment_size;
__atomic_store_n((uint8_t*)(&dispatch_packet->header), (uint8_t)HSA_PACKET_TYPE_KERNEL_DISPATCH, __ATOMIC_RELEASE);
/*
* Increment the write index and ring the doorbell to dispatch the kernel.
*/
hsa_queue_store_write_index_relaxed(plan->queue, index+1);
hsa_signal_store_relaxed(plan->queue->doorbell_signal, index);
check(Dispatching the clFFT kernel, err);
/*
* Wait on the dispatch completion signal until the kernel is finished.
*/
hsa_signal_value_t value = myhsa_signal_wait_acquire(signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
err=hsa_signal_destroy(signal);
check(Destroying the signal, err);
/*
err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj[currRead]);
err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj[currWrite]);
err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_int), &dir);
err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_int), &s);
err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_mem), &(plan->cossin_LUT_d1));
err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_mem), &(plan->cossin_LUT_d2));
err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL);
*/
if(err)
return err;
currRead = 1;
currWrite = 1;
kernelInfo = kernelInfo->next;
}
}
return err;
}
static void
deleteKernelInfo(cl_fft_kernel_info *kInfo)
{
if(kInfo)
{
if(kInfo->kernel_name)
free(kInfo->kernel_name);
//if(kInfo->kernel)
// clReleaseKernel(kInfo->kernel);
free(kInfo);
}
}
static void
destroy_plan(cl_fft_plan *Plan)
{
cl_fft_kernel_info *kernel_info = Plan->kernel_info;
while(kernel_info)
{
cl_fft_kernel_info *tmp = kernel_info->next;
deleteKernelInfo(kernel_info);
kernel_info = tmp;
}
Plan->kernel_info = NULL;
if(Plan->kernel_string)
{
delete Plan->kernel_string;
Plan->kernel_string = NULL;
}
if(Plan->twist_kernel)
{
//clReleaseKernel(Plan->twist_kernel);
Plan->twist_kernel = 0;
}
if(Plan->program)
{
//clReleaseProgram(Plan->program);
Plan->program = 0;
}
if(Plan->tempmemobj)
{
hsa_memory_free(Plan->tempmemobj);
Plan->tempmemobj = NULL;
}
if(Plan->tempmemobj_real)
{
hsa_memory_free((void*)Plan->tempmemobj_real);
Plan->tempmemobj_real = NULL;
}
if(Plan->tempmemobj_imag)
{
hsa_memory_free((void*)Plan->tempmemobj_imag);
Plan->tempmemobj_imag = NULL;
}
if(Plan->cossin_LUT_d1)
{
hsa_memory_free((void*)Plan->cossin_LUT_d1);
}
if(Plan->cossin_LUT_d2)
{
hsa_memory_free((void*)Plan->cossin_LUT_d2);
}
}
void
clFFT_DestroyPlan(clFFT_Plan plan)
{
cl_fft_plan *Plan = (cl_fft_plan *) plan;
if(Plan)
{
destroy_plan(Plan);
//clReleaseContext(Plan->context);
free(Plan);
}
}
/***************************************************************************
* Copyright (C) 2012 by Oliver Bock,Heinz-Bernd Eggenstein *
* oliver.bock[AT]aei.mpg.de *
* heinz-bernd.eggenstein[AT]aei.mpg.de *
* *
* This file is part of libclfft (originally for Einstein@Home) *
* Derived from clFFT, (C) Apple, see notice below. *
* *
* *
* libclfft is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See *
* notice below for more details. *
* *
***************************************************************************/
//
// File: clFFT.h
//
// Version: <1.0>
//
// Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple")
// in consideration of your agreement to the following terms, and your use,
// installation, modification or redistribution of this Apple software
// constitutes acceptance of these terms. If you do not agree with these
// terms, please do not use, install, modify or redistribute this Apple
// software.
//
// In consideration of your agreement to abide by the following terms, and
// subject to these terms, Apple grants you a personal, non - exclusive
// license, under Apple's copyrights in this original Apple software ( the
// "Apple Software" ), to use, reproduce, modify and redistribute the Apple
// Software, with or without modifications, in source and / or binary forms;
// provided that if you redistribute the Apple Software in its entirety and
// without modifications, you must retain this notice and the following text
// and disclaimers in all such redistributions of the Apple Software. Neither
// the name, trademarks, service marks or logos of Apple Inc. may be used to
// endorse or promote products derived from the Apple Software without specific
// prior written permission from Apple. Except as expressly stated in this
// notice, no other rights or licenses, express or implied, are granted by
// Apple herein, including but not limited to any patent rights that may be
// infringed by your derivative works or by other works in which the Apple
// Software may be incorporated.
//
// The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO
// WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
// WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
// PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
// ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
//
// IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
// CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
// AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
// UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
// OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
//
////////////////////////////////////////////////////////////////////////////////////////////////////
#ifndef __CLFFT_H
#define __CLFFT_H
#include <stdio.h>
#ifdef __cplusplus
extern "C" {
#endif
#include "hsa.h"
#include "hsa_ext_finalize.h"
// Mini OpenCL
typedef int cl_int;
typedef unsigned int cl_uint;
typedef int cl_context;
typedef int cl_kernel;
typedef int cl_program;
typedef void * cl_mem;
typedef float cl_float;
#define CL_SUCCESS 0
#define CL_INVALID_VALUE -30
// XForm type
typedef enum
{
clFFT_Forward = -1,
clFFT_Inverse = 1
}clFFT_Direction;
// XForm dimension
typedef enum
{
clFFT_1D = 0,
clFFT_2D = 1,
clFFT_3D = 3
}clFFT_Dimension;
// XForm Data type
typedef enum
{
clFFT_SplitComplexFormat = 0,
clFFT_InterleavedComplexFormat = 1
}clFFT_DataFormat;
typedef enum
{
clFFT_native_trig = 0,
clFFT_sincosfunc = 1,
clFFT_BigLUT = 2,
clFFT_TaylorLUT = 3,
clFFT_RFU4 = 4,
clFFT_RFU5 = 5,
clFFT_RFU6 = 6,
clFFT_RFU7 = 7
} clFFT_TwiddleFactorMethod;
typedef struct
{
unsigned int x;
unsigned int y;
unsigned int z;
}clFFT_Dim3;
typedef struct
{
float *real;
float *imag;
} clFFT_SplitComplex;
typedef struct
{
float real;
float imag;
}clFFT_Complex;
struct __attribute__ ((aligned(16))) fft_args_t {
uint64_t global_offset_0;
uint64_t global_offset_1;
uint64_t global_offset_2;
uint64_t printf_buffer;
uint64_t vqueue_pointer;
uint64_t aqlwrap_pointer;
float *in;
float *out;
int dir;
int S;
float *cossinLUT1;
float *cossinLUT2;
};
typedef void* clFFT_Plan;
clFFT_Plan clFFT_CreatePlanAdvHSA(hsa_region_t global_region,hsa_executable_t executable,hsa_agent_t agent,hsa_region_t kernarg_region,hsa_queue_t* queue,
clFFT_Dim3 n, clFFT_Dimension dim, clFFT_DataFormat dataFormat, unsigned long flags);
cl_int
clFFT_ExecuteInterleavedHSA( clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir,float *data_in, float *data_out);
void clFFT_DestroyPlan( clFFT_Plan plan );
#if 0
clFFT_Plan clFFT_CreatePlan( cl_context context, clFFT_Dim3 n, clFFT_Dimension dim, clFFT_DataFormat dataFormat, cl_int *error_code );
clFFT_Plan clFFT_CreatePlanAdv( cl_context context, clFFT_Dim3 n, clFFT_Dimension dim, clFFT_DataFormat dataFormat, unsigned long flags, cl_int *error_code );
cl_int clFFT_ExecutePlannar( cl_command_queue queue, clFFT_Plan plan, cl_int batchSize, clFFT_Direction dir,
cl_mem data_in_real, cl_mem data_in_imag, cl_mem data_out_real, cl_mem data_out_imag,
cl_int num_events, cl_event *event_list, cl_event *event );
cl_int clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array,
size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir);
cl_int clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real, cl_mem array_imag,
size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir);
void clFFT_DumpPlan( clFFT_Plan plan, FILE *file);
#endif
#ifdef __cplusplus
}
#endif
#endif
#ifndef M_PI
#define M_PI 0x1.921fb54442d18p+1
#endif
#define complexMul(a,b) ((float2)(mad(-(a).y, (b).y, (a).x * (b).x), mad((a).y, (b).x, (a).x * (b).y)))
#define cos_sinLUT1(res,dir,i,cossinLUT)\
{\
(res)=(float2)((cossinLUT)[i].x , (dir)*(cossinLUT)[i].y);\
}
#define cos_sinLUT2(res,dir,_i,_k,cossinLUT1,cossinLUT2) \
{ float _sin_1= (cossinLUT1)[_i].y; \
float _sin_2= (cossinLUT2)[_k].y; \
float _cos_1= (cossinLUT1)[_i].x; \
float _cos_2= (cossinLUT2)[_k].x; \
float _cos_res = _cos_1 * _cos_2 - _sin_1 * _sin_2; \
float _sin_res = (dir) * (_sin_1 * _cos_2 + _cos_1 * _sin_2); \
(res)=(float2)(_cos_res,_sin_res); \
}
#define conj(a) ((float2)((a).x, -(a).y))
#define conjTransp(a) ((float2)(-(a).y, (a).x))
#define fftKernel2(a,dir) \
{ \
float2 c = (a)[0]; \
(a)[0] = c + (a)[1]; \
(a)[1] = c - (a)[1]; \
}
#define fftKernel2S(d1,d2,dir) \
{ \
float2 c = (d1); \
(d1) = c + (d2); \
(d2) = c - (d2); \
}
#define fftKernel4(a,dir) \
{ \
fftKernel2S((a)[0], (a)[2], dir); \
fftKernel2S((a)[1], (a)[3], dir); \
fftKernel2S((a)[0], (a)[1], dir); \
(a)[3] = (float2)(dir)*(conjTransp((a)[3])); \
fftKernel2S((a)[2], (a)[3], dir); \
float2 c = (a)[1]; \
(a)[1] = (a)[2]; \
(a)[2] = c; \
}
#define fftKernel4s(a0,a1,a2,a3,dir) \
{ \
fftKernel2S((a0), (a2), dir); \
fftKernel2S((a1), (a3), dir); \
fftKernel2S((a0), (a1), dir); \
(a3) = (float2)(dir)*(conjTransp((a3))); \
fftKernel2S((a2), (a3), dir); \
float2 c = (a1); \
(a1) = (a2); \
(a2) = c; \
}
#define bitreverse8(a) \
{ \
float2 c; \
c = (a)[1]; \
(a)[1] = (a)[4]; \
(a)[4] = c; \
c = (a)[3]; \
(a)[3] = (a)[6]; \
(a)[6] = c; \
}
#define fftKernel8(a,dir) \
{ \
const float2 w1 = (float2)(0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f); \
const float2 w3 = (float2)(-0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f); \
float2 c; \
fftKernel2S((a)[0], (a)[4], dir); \
fftKernel2S((a)[1], (a)[5], dir); \
fftKernel2S((a)[2], (a)[6], dir); \
fftKernel2S((a)[3], (a)[7], dir); \
(a)[5] = complexMul(w1, (a)[5]); \
(a)[6] = (float2)(dir)*(conjTransp((a)[6])); \
(a)[7] = complexMul(w3, (a)[7]); \
fftKernel2S((a)[0], (a)[2], dir); \
fftKernel2S((a)[1], (a)[3], dir); \
fftKernel2S((a)[4], (a)[6], dir); \
fftKernel2S((a)[5], (a)[7], dir); \
(a)[3] = (float2)(dir)*(conjTransp((a)[3])); \
(a)[7] = (float2)(dir)*(conjTransp((a)[7])); \
fftKernel2S((a)[0], (a)[1], dir); \
fftKernel2S((a)[2], (a)[3], dir); \
fftKernel2S((a)[4], (a)[5], dir); \
fftKernel2S((a)[6], (a)[7], dir); \
bitreverse8((a)); \
}
#define bitreverse4x4(a) \
{ \
float2 c; \
c = (a)[1]; (a)[1] = (a)[4]; (a)[4] = c; \
c = (a)[2]; (a)[2] = (a)[8]; (a)[8] = c; \
c = (a)[3]; (a)[3] = (a)[12]; (a)[12] = c; \
c = (a)[6]; (a)[6] = (a)[9]; (a)[9] = c; \
c = (a)[7]; (a)[7] = (a)[13]; (a)[13] = c; \
c = (a)[11]; (a)[11] = (a)[14]; (a)[14] = c; \
}
#define fftKernel16(a,dir) \
{ \
const float w0 = 0x1.d906bcp-1f; \
const float w1 = 0x1.87de2ap-2f; \
const float w2 = 0x1.6a09e6p-1f; \
fftKernel4s((a)[0], (a)[4], (a)[8], (a)[12], dir); \
fftKernel4s((a)[1], (a)[5], (a)[9], (a)[13], dir); \
fftKernel4s((a)[2], (a)[6], (a)[10], (a)[14], dir); \
fftKernel4s((a)[3], (a)[7], (a)[11], (a)[15], dir); \
(a)[5] = complexMul((a)[5], (float2)(w0, dir*w1)); \
(a)[6] = complexMul((a)[6], (float2)(w2, dir*w2)); \
(a)[7] = complexMul((a)[7], (float2)(w1, dir*w0)); \
(a)[9] = complexMul((a)[9], (float2)(w2, dir*w2)); \
(a)[10] = (float2)(dir)*(conjTransp((a)[10])); \
(a)[11] = complexMul((a)[11], (float2)(-w2, dir*w2)); \
(a)[13] = complexMul((a)[13], (float2)(w1, dir*w0)); \
(a)[14] = complexMul((a)[14], (float2)(-w2, dir*w2)); \
(a)[15] = complexMul((a)[15], (float2)(-w0, dir*-w1)); \
fftKernel4((a), dir); \
fftKernel4((a) + 4, dir); \
fftKernel4((a) + 8, dir); \
fftKernel4((a) + 12, dir); \
bitreverse4x4((a)); \
}
#define bitreverse32(a) \
{ \
float2 c1, c2; \
c1 = (a)[2]; (a)[2] = (a)[1]; c2 = (a)[4]; (a)[4] = c1; c1 = (a)[8]; (a)[8] = c2; c2 = (a)[16]; (a)[16] = c1; (a)[1] = c2; \
c1 = (a)[6]; (a)[6] = (a)[3]; c2 = (a)[12]; (a)[12] = c1; c1 = (a)[24]; (a)[24] = c2; c2 = (a)[17]; (a)[17] = c1; (a)[3] = c2; \
c1 = (a)[10]; (a)[10] = (a)[5]; c2 = (a)[20]; (a)[20] = c1; c1 = (a)[9]; (a)[9] = c2; c2 = (a)[18]; (a)[18] = c1; (a)[5] = c2; \
c1 = (a)[14]; (a)[14] = (a)[7]; c2 = (a)[28]; (a)[28] = c1; c1 = (a)[25]; (a)[25] = c2; c2 = (a)[19]; (a)[19] = c1; (a)[7] = c2; \
c1 = (a)[22]; (a)[22] = (a)[11]; c2 = (a)[13]; (a)[13] = c1; c1 = (a)[26]; (a)[26] = c2; c2 = (a)[21]; (a)[21] = c1; (a)[11] = c2; \
c1 = (a)[30]; (a)[30] = (a)[15]; c2 = (a)[29]; (a)[29] = c1; c1 = (a)[27]; (a)[27] = c2; c2 = (a)[23]; (a)[23] = c1; (a)[15] = c2; \
}
#define fftKernel32(a,dir) \
{ \
fftKernel2S((a)[0], (a)[16], dir); \
fftKernel2S((a)[1], (a)[17], dir); \
fftKernel2S((a)[2], (a)[18], dir); \
fftKernel2S((a)[3], (a)[19], dir); \
fftKernel2S((a)[4], (a)[20], dir); \
fftKernel2S((a)[5], (a)[21], dir); \
fftKernel2S((a)[6], (a)[22], dir); \
fftKernel2S((a)[7], (a)[23], dir); \
fftKernel2S((a)[8], (a)[24], dir); \
fftKernel2S((a)[9], (a)[25], dir); \
fftKernel2S((a)[10], (a)[26], dir); \
fftKernel2S((a)[11], (a)[27], dir); \
fftKernel2S((a)[12], (a)[28], dir); \
fftKernel2S((a)[13], (a)[29], dir); \
fftKernel2S((a)[14], (a)[30], dir); \
fftKernel2S((a)[15], (a)[31], dir); \
(a)[17] = complexMul((a)[17], (float2)(0x1.f6297cp-1f, dir*0x1.8f8b84p-3f)); \
(a)[18] = complexMul((a)[18], (float2)(0x1.d906bcp-1f, dir*0x1.87de2ap-2f)); \
(a)[19] = complexMul((a)[19], (float2)(0x1.a9b662p-1f, dir*0x1.1c73b4p-1f)); \
(a)[20] = complexMul((a)[20], (float2)(0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f)); \
(a)[21] = complexMul((a)[21], (float2)(0x1.1c73b4p-1f, dir*0x1.a9b662p-1f)); \
(a)[22] = complexMul((a)[22], (float2)(0x1.87de2ap-2f, dir*0x1.d906bcp-1f)); \
(a)[23] = complexMul((a)[23], (float2)(0x1.8f8b84p-3f, dir*0x1.f6297cp-1f)); \
(a)[24] = complexMul((a)[24], (float2)(0x0p+0f, dir*0x1p+0f)); \
(a)[25] = complexMul((a)[25], (float2)(-0x1.8f8b84p-3f, dir*0x1.f6297cp-1f)); \
(a)[26] = complexMul((a)[26], (float2)(-0x1.87de2ap-2f, dir*0x1.d906bcp-1f)); \
(a)[27] = complexMul((a)[27], (float2)(-0x1.1c73b4p-1f, dir*0x1.a9b662p-1f)); \
(a)[28] = complexMul((a)[28], (float2)(-0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f)); \
(a)[29] = complexMul((a)[29], (float2)(-0x1.a9b662p-1f, dir*0x1.1c73b4p-1f)); \
(a)[30] = complexMul((a)[30], (float2)(-0x1.d906bcp-1f, dir*0x1.87de2ap-2f)); \
(a)[31] = complexMul((a)[31], (float2)(-0x1.f6297cp-1f, dir*0x1.8f8b84p-3f)); \
fftKernel16((a), dir); \
fftKernel16((a) + 16, dir); \
bitreverse32((a)); \
}
__kernel void \
clFFT_1DTwistInterleaved(__global float2 *in, unsigned int startRow, unsigned int numCols, unsigned int N, unsigned int numRowsToProcess, int dir) \
{ \
float2 a, w; \
float ang; \
unsigned int j; \
unsigned int i = get_global_id(0); \
unsigned int startIndex = i; \
\
if(i < numCols) \
{ \
for(j = 0; j < numRowsToProcess; j++) \
{ \
a = in[startIndex]; \
ang = 2.0f * M_PI * dir * i * (startRow + j) / N; \
w = (float2)(native_cos(ang), native_sin(ang)); \
a = complexMul(a, w); \
in[startIndex] = a; \
startIndex += numCols; \
} \
} \
} \
__kernel void fft0(__global float2 *in, __global float2 *out, int dir, int S, __global float2 * cossinLUT1, __global float2 * cossinLUT2 )
{
__local float sMem[2064];
int i, j, r, indexIn, indexOut, index, tid, bNum, xNum, k, l;
int s, ii, jj, offset;
float2 w;
float ang, angf, ang1;
__local float *lMemStore, *lMemLoad;
float2 a[16];
int lId = get_local_id( 0 );
int groupId = get_group_id( 0 );
__local float2 cossin_T_LUT[256];
int lLUTind= lId;
cossin_T_LUT[lLUTind]=cossinLUT2[lLUTind];
lLUTind+=128;
cossin_T_LUT[lLUTind]=cossinLUT2[lLUTind];
barrier(CLK_LOCAL_MEM_FENCE);
bNum = groupId & 63;
xNum = groupId >> 6;
indexIn = mul24(bNum, 16);
tid = indexIn;
i = tid >> 0;
j = tid & 0;
indexOut = mad24(i, 128, j);
indexIn += (xNum << 17);
indexOut += (xNum << 17);
tid = lId;
i = tid & 15;
j = tid >> 4;
indexIn += mad24(j, 1024, i);
in += indexIn;
a[0] = in[0];
a[1] = in[8192];
a[2] = in[16384];
a[3] = in[24576];
a[4] = in[32768];
a[5] = in[40960];
a[6] = in[49152];
a[7] = in[57344];
a[8] = in[65536];
a[9] = in[73728];
a[10] = in[81920];
a[11] = in[90112];
a[12] = in[98304];
a[13] = in[106496];
a[14] = in[114688];
a[15] = in[122880];
fftKernel16(a, dir);
{ int ang_index = (1 * ( j)) & 127;
int k = ang_index * 2;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[1] = complexMul(a[1], w);
{ int ang_index = (1 * ( j)) & 63;
int k = ang_index * 4;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[2] = complexMul(a[2], w);
{ int ang_index = (3 * ( j)) & 127;
int k = ang_index * 2;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[3] = complexMul(a[3], w);
{ int ang_index = (1 * ( j)) & 31;
int k = ang_index * 8;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[4] = complexMul(a[4], w);
{ int ang_index = (5 * ( j)) & 127;
int k = ang_index * 2;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[5] = complexMul(a[5], w);
{ int ang_index = (3 * ( j)) & 63;
int k = ang_index * 4;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[6] = complexMul(a[6], w);
{ int ang_index = (7 * ( j)) & 127;
int k = ang_index * 2;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[7] = complexMul(a[7], w);
{ int ang_index = (1 * ( j)) & 15;
int k = ang_index * 16;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[8] = complexMul(a[8], w);
{ int ang_index = (9 * ( j)) & 127;
int k = ang_index * 2;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[9] = complexMul(a[9], w);
{ int ang_index = (5 * ( j)) & 63;
int k = ang_index * 4;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[10] = complexMul(a[10], w);
{ int ang_index = (11 * ( j)) & 127;
int k = ang_index * 2;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[11] = complexMul(a[11], w);
{ int ang_index = (3 * ( j)) & 31;
int k = ang_index * 8;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[12] = complexMul(a[12], w);
{ int ang_index = (13 * ( j)) & 127;
int k = ang_index * 2;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[13] = complexMul(a[13], w);
{ int ang_index = (7 * ( j)) & 63;
int k = ang_index * 4;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[14] = complexMul(a[14], w);
{ int ang_index = (15 * ( j)) & 127;
int k = ang_index * 2;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[15] = complexMul(a[15], w);
indexIn = mad24(j, 256, i);
lMemStore = sMem + tid;
lMemLoad = sMem + indexIn;
lMemStore[0] = a[0].x;
lMemStore[128] = a[1].x;
lMemStore[256] = a[2].x;
lMemStore[384] = a[3].x;
lMemStore[512] = a[4].x;
lMemStore[640] = a[5].x;
lMemStore[768] = a[6].x;
lMemStore[896] = a[7].x;
lMemStore[1024] = a[8].x;
lMemStore[1152] = a[9].x;
lMemStore[1280] = a[10].x;
lMemStore[1408] = a[11].x;
lMemStore[1536] = a[12].x;
lMemStore[1664] = a[13].x;
lMemStore[1792] = a[14].x;
lMemStore[1920] = a[15].x;
barrier(CLK_LOCAL_MEM_FENCE);
a[0].x = lMemLoad[0];
a[1].x = lMemLoad[16];
a[2].x = lMemLoad[32];
a[3].x = lMemLoad[48];
a[4].x = lMemLoad[64];
a[5].x = lMemLoad[80];
a[6].x = lMemLoad[96];
a[7].x = lMemLoad[112];
a[8].x = lMemLoad[128];
a[9].x = lMemLoad[144];
a[10].x = lMemLoad[160];
a[11].x = lMemLoad[176];
a[12].x = lMemLoad[192];
a[13].x = lMemLoad[208];
a[14].x = lMemLoad[224];
a[15].x = lMemLoad[240];
barrier(CLK_LOCAL_MEM_FENCE);
lMemStore[0] = a[0].y;
lMemStore[128] = a[1].y;
lMemStore[256] = a[2].y;
lMemStore[384] = a[3].y;
lMemStore[512] = a[4].y;
lMemStore[640] = a[5].y;
lMemStore[768] = a[6].y;
lMemStore[896] = a[7].y;
lMemStore[1024] = a[8].y;
lMemStore[1152] = a[9].y;
lMemStore[1280] = a[10].y;
lMemStore[1408] = a[11].y;
lMemStore[1536] = a[12].y;
lMemStore[1664] = a[13].y;
lMemStore[1792] = a[14].y;
lMemStore[1920] = a[15].y;
barrier(CLK_LOCAL_MEM_FENCE);
a[0].y = lMemLoad[0];
a[1].y = lMemLoad[16];
a[2].y = lMemLoad[32];
a[3].y = lMemLoad[48];
a[4].y = lMemLoad[64];
a[5].y = lMemLoad[80];
a[6].y = lMemLoad[96];
a[7].y = lMemLoad[112];
a[8].y = lMemLoad[128];
a[9].y = lMemLoad[144];
a[10].y = lMemLoad[160];
a[11].y = lMemLoad[176];
a[12].y = lMemLoad[192];
a[13].y = lMemLoad[208];
a[14].y = lMemLoad[224];
a[15].y = lMemLoad[240];
barrier(CLK_LOCAL_MEM_FENCE);
fftKernel8(a + 0, dir);
fftKernel8(a + 8, dir);
l = ((bNum << 4) + i) >> 0;
k = j << 1;
{ int ang_index = (1 * ( l * (k + 0))) & 131071;
int k = (ang_index + 256) >> 9;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 512 );
float mh=-4.7936900955e-05*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[0] = complexMul(a[0], w);
{ int ang_index = (1 * ( l * (k + 16))) & 131071;
int k = (ang_index + 256) >> 9;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 512 );
float mh=-4.7936900955e-05*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[1] = complexMul(a[1], w);
{ int ang_index = (1 * ( l * (k + 32))) & 131071;
int k = (ang_index + 256) >> 9;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 512 );
float mh=-4.7936900955e-05*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[2] = complexMul(a[2], w);
{ int ang_index = (1 * ( l * (k + 48))) & 131071;
int k = (ang_index + 256) >> 9;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 512 );
float mh=-4.7936900955e-05*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[3] = complexMul(a[3], w);
{ int ang_index = (1 * ( l * (k + 64))) & 131071;
int k = (ang_index + 256) >> 9;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 512 );
float mh=-4.7936900955e-05*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[4] = complexMul(a[4], w);
{ int ang_index = (1 * ( l * (k + 80))) & 131071;
int k = (ang_index + 256) >> 9;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 512 );
float mh=-4.7936900955e-05*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[5] = complexMul(a[5], w);
{ int ang_index = (1 * ( l * (k + 96))) & 131071;
int k = (ang_index + 256) >> 9;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 512 );
float mh=-4.7936900955e-05*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[6] = complexMul(a[6], w);
{ int ang_index = (1 * ( l * (k + 112))) & 131071;
int k = (ang_index + 256) >> 9;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 512 );
float mh=-4.7936900955e-05*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[7] = complexMul(a[7], w);
{ int ang_index = (1 * ( l * (k + 1))) & 131071;
int k = (ang_index + 256) >> 9;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 512 );
float mh=-4.7936900955e-05*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[8] = complexMul(a[8], w);
{ int ang_index = (1 * ( l * (k + 17))) & 131071;
int k = (ang_index + 256) >> 9;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 512 );
float mh=-4.7936900955e-05*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[9] = complexMul(a[9], w);
{ int ang_index = (1 * ( l * (k + 33))) & 131071;
int k = (ang_index + 256) >> 9;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 512 );
float mh=-4.7936900955e-05*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[10] = complexMul(a[10], w);
{ int ang_index = (1 * ( l * (k + 49))) & 131071;
int k = (ang_index + 256) >> 9;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 512 );
float mh=-4.7936900955e-05*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[11] = complexMul(a[11], w);
{ int ang_index = (1 * ( l * (k + 65))) & 131071;
int k = (ang_index + 256) >> 9;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 512 );
float mh=-4.7936900955e-05*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[12] = complexMul(a[12], w);
{ int ang_index = (1 * ( l * (k + 81))) & 131071;
int k = (ang_index + 256) >> 9;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 512 );
float mh=-4.7936900955e-05*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[13] = complexMul(a[13], w);
{ int ang_index = (1 * ( l * (k + 97))) & 131071;
int k = (ang_index + 256) >> 9;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 512 );
float mh=-4.7936900955e-05*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[14] = complexMul(a[14], w);
{ int ang_index = (1 * ( l * (k + 113))) & 131071;
int k = (ang_index + 256) >> 9;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 512 );
float mh=-4.7936900955e-05*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[15] = complexMul(a[15], w);
lMemStore = sMem + mad24(i, 129, j << 1);
lMemLoad = sMem + mad24(tid >> 7, 129, tid & 127);
lMemStore[ 0] = a[0].x;
lMemStore[ 16] = a[1].x;
lMemStore[ 32] = a[2].x;
lMemStore[ 48] = a[3].x;
lMemStore[ 64] = a[4].x;
lMemStore[ 80] = a[5].x;
lMemStore[ 96] = a[6].x;
lMemStore[ 112] = a[7].x;
lMemStore[ 1] = a[8].x;
lMemStore[ 17] = a[9].x;
lMemStore[ 33] = a[10].x;
lMemStore[ 49] = a[11].x;
lMemStore[ 65] = a[12].x;
lMemStore[ 81] = a[13].x;
lMemStore[ 97] = a[14].x;
lMemStore[ 113] = a[15].x;
barrier(CLK_LOCAL_MEM_FENCE);
a[0].x = lMemLoad[0];
a[1].x = lMemLoad[129];
a[2].x = lMemLoad[258];
a[3].x = lMemLoad[387];
a[4].x = lMemLoad[516];
a[5].x = lMemLoad[645];
a[6].x = lMemLoad[774];
a[7].x = lMemLoad[903];
a[8].x = lMemLoad[1032];
a[9].x = lMemLoad[1161];
a[10].x = lMemLoad[1290];
a[11].x = lMemLoad[1419];
a[12].x = lMemLoad[1548];
a[13].x = lMemLoad[1677];
a[14].x = lMemLoad[1806];
a[15].x = lMemLoad[1935];
barrier(CLK_LOCAL_MEM_FENCE);
lMemStore[ 0] = a[0].y;
lMemStore[ 16] = a[1].y;
lMemStore[ 32] = a[2].y;
lMemStore[ 48] = a[3].y;
lMemStore[ 64] = a[4].y;
lMemStore[ 80] = a[5].y;
lMemStore[ 96] = a[6].y;
lMemStore[ 112] = a[7].y;
lMemStore[ 1] = a[8].y;
lMemStore[ 17] = a[9].y;
lMemStore[ 33] = a[10].y;
lMemStore[ 49] = a[11].y;
lMemStore[ 65] = a[12].y;
lMemStore[ 81] = a[13].y;
lMemStore[ 97] = a[14].y;
lMemStore[ 113] = a[15].y;
barrier(CLK_LOCAL_MEM_FENCE);
a[0].y = lMemLoad[0];
a[1].y = lMemLoad[129];
a[2].y = lMemLoad[258];
a[3].y = lMemLoad[387];
a[4].y = lMemLoad[516];
a[5].y = lMemLoad[645];
a[6].y = lMemLoad[774];
a[7].y = lMemLoad[903];
a[8].y = lMemLoad[1032];
a[9].y = lMemLoad[1161];
a[10].y = lMemLoad[1290];
a[11].y = lMemLoad[1419];
a[12].y = lMemLoad[1548];
a[13].y = lMemLoad[1677];
a[14].y = lMemLoad[1806];
a[15].y = lMemLoad[1935];
barrier(CLK_LOCAL_MEM_FENCE);
indexOut += tid;
out += indexOut;
out[0] = a[0];
out[128] = a[1];
out[256] = a[2];
out[384] = a[3];
out[512] = a[4];
out[640] = a[5];
out[768] = a[6];
out[896] = a[7];
out[1024] = a[8];
out[1152] = a[9];
out[1280] = a[10];
out[1408] = a[11];
out[1536] = a[12];
out[1664] = a[13];
out[1792] = a[14];
out[1920] = a[15];
}
__kernel void fft1(__global float2 *in, __global float2 *out, int dir, int S, __global float2 * cossinLUT1, __global float2 * cossinLUT2 )
{
__local float sMem[2048];
int i, j, r, indexIn, indexOut, index, tid, bNum, xNum, k, l;
int s, ii, jj, offset;
float2 w;
float ang, angf, ang1;
__local float *lMemStore, *lMemLoad;
float2 a[16];
int lId = get_local_id( 0 );
int groupId = get_group_id( 0 );
__local float2 cossin_T_LUT[256];
int lLUTind= lId;
cossin_T_LUT[lLUTind]=cossinLUT2[lLUTind];
lLUTind+=128;
cossin_T_LUT[lLUTind]=cossinLUT2[lLUTind];
barrier(CLK_LOCAL_MEM_FENCE);
bNum = groupId & 63;
xNum = groupId >> 6;
indexIn = mul24(bNum, 16);
tid = indexIn;
i = tid >> 7;
j = tid & 127;
indexOut = mad24(i, 16384, j);
indexIn += (xNum << 17);
indexOut += (xNum << 17);
tid = lId;
i = tid & 15;
j = tid >> 4;
indexIn += mad24(j, 1024, i);
in += indexIn;
a[0] = in[0];
a[1] = in[8192];
a[2] = in[16384];
a[3] = in[24576];
a[4] = in[32768];
a[5] = in[40960];
a[6] = in[49152];
a[7] = in[57344];
a[8] = in[65536];
a[9] = in[73728];
a[10] = in[81920];
a[11] = in[90112];
a[12] = in[98304];
a[13] = in[106496];
a[14] = in[114688];
a[15] = in[122880];
fftKernel16(a, dir);
{ int ang_index = (1 * ( j)) & 127;
int k = ang_index * 2;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[1] = complexMul(a[1], w);
{ int ang_index = (1 * ( j)) & 63;
int k = ang_index * 4;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[2] = complexMul(a[2], w);
{ int ang_index = (3 * ( j)) & 127;
int k = ang_index * 2;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[3] = complexMul(a[3], w);
{ int ang_index = (1 * ( j)) & 31;
int k = ang_index * 8;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[4] = complexMul(a[4], w);
{ int ang_index = (5 * ( j)) & 127;
int k = ang_index * 2;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[5] = complexMul(a[5], w);
{ int ang_index = (3 * ( j)) & 63;
int k = ang_index * 4;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[6] = complexMul(a[6], w);
{ int ang_index = (7 * ( j)) & 127;
int k = ang_index * 2;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[7] = complexMul(a[7], w);
{ int ang_index = (1 * ( j)) & 15;
int k = ang_index * 16;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[8] = complexMul(a[8], w);
{ int ang_index = (9 * ( j)) & 127;
int k = ang_index * 2;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[9] = complexMul(a[9], w);
{ int ang_index = (5 * ( j)) & 63;
int k = ang_index * 4;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[10] = complexMul(a[10], w);
{ int ang_index = (11 * ( j)) & 127;
int k = ang_index * 2;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[11] = complexMul(a[11], w);
{ int ang_index = (3 * ( j)) & 31;
int k = ang_index * 8;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[12] = complexMul(a[12], w);
{ int ang_index = (13 * ( j)) & 127;
int k = ang_index * 2;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[13] = complexMul(a[13], w);
{ int ang_index = (7 * ( j)) & 63;
int k = ang_index * 4;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[14] = complexMul(a[14], w);
{ int ang_index = (15 * ( j)) & 127;
int k = ang_index * 2;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[15] = complexMul(a[15], w);
indexIn = mad24(j, 256, i);
lMemStore = sMem + tid;
lMemLoad = sMem + indexIn;
lMemStore[0] = a[0].x;
lMemStore[128] = a[1].x;
lMemStore[256] = a[2].x;
lMemStore[384] = a[3].x;
lMemStore[512] = a[4].x;
lMemStore[640] = a[5].x;
lMemStore[768] = a[6].x;
lMemStore[896] = a[7].x;
lMemStore[1024] = a[8].x;
lMemStore[1152] = a[9].x;
lMemStore[1280] = a[10].x;
lMemStore[1408] = a[11].x;
lMemStore[1536] = a[12].x;
lMemStore[1664] = a[13].x;
lMemStore[1792] = a[14].x;
lMemStore[1920] = a[15].x;
barrier(CLK_LOCAL_MEM_FENCE);
a[0].x = lMemLoad[0];
a[1].x = lMemLoad[16];
a[2].x = lMemLoad[32];
a[3].x = lMemLoad[48];
a[4].x = lMemLoad[64];
a[5].x = lMemLoad[80];
a[6].x = lMemLoad[96];
a[7].x = lMemLoad[112];
a[8].x = lMemLoad[128];
a[9].x = lMemLoad[144];
a[10].x = lMemLoad[160];
a[11].x = lMemLoad[176];
a[12].x = lMemLoad[192];
a[13].x = lMemLoad[208];
a[14].x = lMemLoad[224];
a[15].x = lMemLoad[240];
barrier(CLK_LOCAL_MEM_FENCE);
lMemStore[0] = a[0].y;
lMemStore[128] = a[1].y;
lMemStore[256] = a[2].y;
lMemStore[384] = a[3].y;
lMemStore[512] = a[4].y;
lMemStore[640] = a[5].y;
lMemStore[768] = a[6].y;
lMemStore[896] = a[7].y;
lMemStore[1024] = a[8].y;
lMemStore[1152] = a[9].y;
lMemStore[1280] = a[10].y;
lMemStore[1408] = a[11].y;
lMemStore[1536] = a[12].y;
lMemStore[1664] = a[13].y;
lMemStore[1792] = a[14].y;
lMemStore[1920] = a[15].y;
barrier(CLK_LOCAL_MEM_FENCE);
a[0].y = lMemLoad[0];
a[1].y = lMemLoad[16];
a[2].y = lMemLoad[32];
a[3].y = lMemLoad[48];
a[4].y = lMemLoad[64];
a[5].y = lMemLoad[80];
a[6].y = lMemLoad[96];
a[7].y = lMemLoad[112];
a[8].y = lMemLoad[128];
a[9].y = lMemLoad[144];
a[10].y = lMemLoad[160];
a[11].y = lMemLoad[176];
a[12].y = lMemLoad[192];
a[13].y = lMemLoad[208];
a[14].y = lMemLoad[224];
a[15].y = lMemLoad[240];
barrier(CLK_LOCAL_MEM_FENCE);
fftKernel8(a + 0, dir);
fftKernel8(a + 8, dir);
l = ((bNum << 4) + i) >> 7;
k = j << 1;
{ int ang_index = (1 * ( l * (k + 0))) & 1023;
int k = (ang_index + 2) >> 2;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 4 );
float mh=-6.1359233223e-03*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[0] = complexMul(a[0], w);
{ int ang_index = (1 * ( l * (k + 16))) & 1023;
int k = (ang_index + 2) >> 2;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 4 );
float mh=-6.1359233223e-03*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[1] = complexMul(a[1], w);
{ int ang_index = (1 * ( l * (k + 32))) & 1023;
int k = (ang_index + 2) >> 2;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 4 );
float mh=-6.1359233223e-03*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[2] = complexMul(a[2], w);
{ int ang_index = (1 * ( l * (k + 48))) & 1023;
int k = (ang_index + 2) >> 2;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 4 );
float mh=-6.1359233223e-03*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[3] = complexMul(a[3], w);
{ int ang_index = (1 * ( l * (k + 64))) & 1023;
int k = (ang_index + 2) >> 2;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 4 );
float mh=-6.1359233223e-03*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[4] = complexMul(a[4], w);
{ int ang_index = (1 * ( l * (k + 80))) & 1023;
int k = (ang_index + 2) >> 2;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 4 );
float mh=-6.1359233223e-03*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[5] = complexMul(a[5], w);
{ int ang_index = (1 * ( l * (k + 96))) & 1023;
int k = (ang_index + 2) >> 2;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 4 );
float mh=-6.1359233223e-03*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[6] = complexMul(a[6], w);
{ int ang_index = (1 * ( l * (k + 112))) & 1023;
int k = (ang_index + 2) >> 2;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 4 );
float mh=-6.1359233223e-03*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[7] = complexMul(a[7], w);
{ int ang_index = (1 * ( l * (k + 1))) & 1023;
int k = (ang_index + 2) >> 2;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 4 );
float mh=-6.1359233223e-03*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[8] = complexMul(a[8], w);
{ int ang_index = (1 * ( l * (k + 17))) & 1023;
int k = (ang_index + 2) >> 2;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 4 );
float mh=-6.1359233223e-03*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[9] = complexMul(a[9], w);
{ int ang_index = (1 * ( l * (k + 33))) & 1023;
int k = (ang_index + 2) >> 2;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 4 );
float mh=-6.1359233223e-03*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[10] = complexMul(a[10], w);
{ int ang_index = (1 * ( l * (k + 49))) & 1023;
int k = (ang_index + 2) >> 2;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 4 );
float mh=-6.1359233223e-03*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[11] = complexMul(a[11], w);
{ int ang_index = (1 * ( l * (k + 65))) & 1023;
int k = (ang_index + 2) >> 2;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 4 );
float mh=-6.1359233223e-03*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[12] = complexMul(a[12], w);
{ int ang_index = (1 * ( l * (k + 81))) & 1023;
int k = (ang_index + 2) >> 2;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 4 );
float mh=-6.1359233223e-03*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[13] = complexMul(a[13], w);
{ int ang_index = (1 * ( l * (k + 97))) & 1023;
int k = (ang_index + 2) >> 2;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 4 );
float mh=-6.1359233223e-03*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[14] = complexMul(a[14], w);
{ int ang_index = (1 * ( l * (k + 113))) & 1023;
int k = (ang_index + 2) >> 2;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 4 );
float mh=-6.1359233223e-03*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[15] = complexMul(a[15], w);
indexOut += mad24(j, 256, i);
out += indexOut;
out[0] = a[0];
out[2048] = a[1];
out[4096] = a[2];
out[6144] = a[3];
out[8192] = a[4];
out[10240] = a[5];
out[12288] = a[6];
out[14336] = a[7];
out[128] = a[8];
out[2176] = a[9];
out[4224] = a[10];
out[6272] = a[11];
out[8320] = a[12];
out[10368] = a[13];
out[12416] = a[14];
out[14464] = a[15];
}
__kernel void fft2(__global float2 *in, __global float2 *out, int dir, int S, __global float2 * cossinLUT1, __global float2 * cossinLUT2 )
{
int i, j, r, indexIn, indexOut, index, tid, bNum, xNum, k, l;
int s, ii, jj, offset;
float2 w;
float ang, angf, ang1;
__local float *lMemStore, *lMemLoad;
float2 a[8];
int lId = get_local_id( 0 );
int groupId = get_group_id( 0 );
bNum = groupId & 63;
xNum = groupId >> 6;
indexIn = mul24(bNum, 256);
tid = indexIn;
i = tid >> 14;
j = tid & 16383;
indexOut = mad24(i, 131072, j);
indexIn += (xNum << 17);
indexOut += (xNum << 17);
tid = lId;
i = tid & 255;
j = tid >> 8;
indexIn += mad24(j, 16384, i);
in += indexIn;
a[0] = in[0];
a[1] = in[16384];
a[2] = in[32768];
a[3] = in[49152];
a[4] = in[65536];
a[5] = in[81920];
a[6] = in[98304];
a[7] = in[114688];
fftKernel8(a, dir);
indexOut += mad24(j, 131072, i);
out += indexOut;
out[0] = a[0];
out[16384] = a[1];
out[32768] = a[2];
out[49152] = a[3];
out[65536] = a[4];
out[81920] = a[5];
out[98304] = a[6];
out[114688] = a[7];
}
#ifndef M_PI
#define M_PI 0x1.921fb54442d18p+1
#endif
#define complexMul(a,b) ((float2)(mad(-(a).y, (b).y, (a).x * (b).x), mad((a).y, (b).x, (a).x * (b).y)))
#define cos_sinLUT1(res,dir,i,cossinLUT)\
{\
(res)=(float2)((cossinLUT)[i].x , (dir)*(cossinLUT)[i].y);\
}
#define cos_sinLUT2(res,dir,_i,_k,cossinLUT1,cossinLUT2) \
{ float _sin_1= (cossinLUT1)[_i].y; \
float _sin_2= (cossinLUT2)[_k].y; \
float _cos_1= (cossinLUT1)[_i].x; \
float _cos_2= (cossinLUT2)[_k].x; \
float _cos_res = _cos_1 * _cos_2 - _sin_1 * _sin_2; \
float _sin_res = (dir) * (_sin_1 * _cos_2 + _cos_1 * _sin_2); \
(res)=(float2)(_cos_res,_sin_res); \
}
#define conj(a) ((float2)((a).x, -(a).y))
#define conjTransp(a) ((float2)(-(a).y, (a).x))
#define fftKernel2(a,dir) \
{ \
float2 c = (a)[0]; \
(a)[0] = c + (a)[1]; \
(a)[1] = c - (a)[1]; \
}
#define fftKernel2S(d1,d2,dir) \
{ \
float2 c = (d1); \
(d1) = c + (d2); \
(d2) = c - (d2); \
}
#define fftKernel4(a,dir) \
{ \
fftKernel2S((a)[0], (a)[2], dir); \
fftKernel2S((a)[1], (a)[3], dir); \
fftKernel2S((a)[0], (a)[1], dir); \
(a)[3] = (float2)(dir)*(conjTransp((a)[3])); \
fftKernel2S((a)[2], (a)[3], dir); \
float2 c = (a)[1]; \
(a)[1] = (a)[2]; \
(a)[2] = c; \
}
#define fftKernel4s(a0,a1,a2,a3,dir) \
{ \
fftKernel2S((a0), (a2), dir); \
fftKernel2S((a1), (a3), dir); \
fftKernel2S((a0), (a1), dir); \
(a3) = (float2)(dir)*(conjTransp((a3))); \
fftKernel2S((a2), (a3), dir); \
float2 c = (a1); \
(a1) = (a2); \
(a2) = c; \
}
#define bitreverse8(a) \
{ \
float2 c; \
c = (a)[1]; \
(a)[1] = (a)[4]; \
(a)[4] = c; \
c = (a)[3]; \
(a)[3] = (a)[6]; \
(a)[6] = c; \
}
#define fftKernel8(a,dir) \
{ \
const float2 w1 = (float2)(0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f); \
const float2 w3 = (float2)(-0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f); \
float2 c; \
fftKernel2S((a)[0], (a)[4], dir); \
fftKernel2S((a)[1], (a)[5], dir); \
fftKernel2S((a)[2], (a)[6], dir); \
fftKernel2S((a)[3], (a)[7], dir); \
(a)[5] = complexMul(w1, (a)[5]); \
(a)[6] = (float2)(dir)*(conjTransp((a)[6])); \
(a)[7] = complexMul(w3, (a)[7]); \
fftKernel2S((a)[0], (a)[2], dir); \
fftKernel2S((a)[1], (a)[3], dir); \
fftKernel2S((a)[4], (a)[6], dir); \
fftKernel2S((a)[5], (a)[7], dir); \
(a)[3] = (float2)(dir)*(conjTransp((a)[3])); \
(a)[7] = (float2)(dir)*(conjTransp((a)[7])); \
fftKernel2S((a)[0], (a)[1], dir); \
fftKernel2S((a)[2], (a)[3], dir); \
fftKernel2S((a)[4], (a)[5], dir); \
fftKernel2S((a)[6], (a)[7], dir); \
bitreverse8((a)); \
}
#define bitreverse4x4(a) \
{ \
float2 c; \
c = (a)[1]; (a)[1] = (a)[4]; (a)[4] = c; \
c = (a)[2]; (a)[2] = (a)[8]; (a)[8] = c; \
c = (a)[3]; (a)[3] = (a)[12]; (a)[12] = c; \
c = (a)[6]; (a)[6] = (a)[9]; (a)[9] = c; \
c = (a)[7]; (a)[7] = (a)[13]; (a)[13] = c; \
c = (a)[11]; (a)[11] = (a)[14]; (a)[14] = c; \
}
#define fftKernel16(a,dir) \
{ \
const float w0 = 0x1.d906bcp-1f; \
const float w1 = 0x1.87de2ap-2f; \
const float w2 = 0x1.6a09e6p-1f; \
fftKernel4s((a)[0], (a)[4], (a)[8], (a)[12], dir); \
fftKernel4s((a)[1], (a)[5], (a)[9], (a)[13], dir); \
fftKernel4s((a)[2], (a)[6], (a)[10], (a)[14], dir); \
fftKernel4s((a)[3], (a)[7], (a)[11], (a)[15], dir); \
(a)[5] = complexMul((a)[5], (float2)(w0, dir*w1)); \
(a)[6] = complexMul((a)[6], (float2)(w2, dir*w2)); \
(a)[7] = complexMul((a)[7], (float2)(w1, dir*w0)); \
(a)[9] = complexMul((a)[9], (float2)(w2, dir*w2)); \
(a)[10] = (float2)(dir)*(conjTransp((a)[10])); \
(a)[11] = complexMul((a)[11], (float2)(-w2, dir*w2)); \
(a)[13] = complexMul((a)[13], (float2)(w1, dir*w0)); \
(a)[14] = complexMul((a)[14], (float2)(-w2, dir*w2)); \
(a)[15] = complexMul((a)[15], (float2)(-w0, dir*-w1)); \
fftKernel4((a), dir); \
fftKernel4((a) + 4, dir); \
fftKernel4((a) + 8, dir); \
fftKernel4((a) + 12, dir); \
bitreverse4x4((a)); \
}
#define bitreverse32(a) \
{ \
float2 c1, c2; \
c1 = (a)[2]; (a)[2] = (a)[1]; c2 = (a)[4]; (a)[4] = c1; c1 = (a)[8]; (a)[8] = c2; c2 = (a)[16]; (a)[16] = c1; (a)[1] = c2; \
c1 = (a)[6]; (a)[6] = (a)[3]; c2 = (a)[12]; (a)[12] = c1; c1 = (a)[24]; (a)[24] = c2; c2 = (a)[17]; (a)[17] = c1; (a)[3] = c2; \
c1 = (a)[10]; (a)[10] = (a)[5]; c2 = (a)[20]; (a)[20] = c1; c1 = (a)[9]; (a)[9] = c2; c2 = (a)[18]; (a)[18] = c1; (a)[5] = c2; \
c1 = (a)[14]; (a)[14] = (a)[7]; c2 = (a)[28]; (a)[28] = c1; c1 = (a)[25]; (a)[25] = c2; c2 = (a)[19]; (a)[19] = c1; (a)[7] = c2; \
c1 = (a)[22]; (a)[22] = (a)[11]; c2 = (a)[13]; (a)[13] = c1; c1 = (a)[26]; (a)[26] = c2; c2 = (a)[21]; (a)[21] = c1; (a)[11] = c2; \
c1 = (a)[30]; (a)[30] = (a)[15]; c2 = (a)[29]; (a)[29] = c1; c1 = (a)[27]; (a)[27] = c2; c2 = (a)[23]; (a)[23] = c1; (a)[15] = c2; \
}
#define fftKernel32(a,dir) \
{ \
fftKernel2S((a)[0], (a)[16], dir); \
fftKernel2S((a)[1], (a)[17], dir); \
fftKernel2S((a)[2], (a)[18], dir); \
fftKernel2S((a)[3], (a)[19], dir); \
fftKernel2S((a)[4], (a)[20], dir); \
fftKernel2S((a)[5], (a)[21], dir); \
fftKernel2S((a)[6], (a)[22], dir); \
fftKernel2S((a)[7], (a)[23], dir); \
fftKernel2S((a)[8], (a)[24], dir); \
fftKernel2S((a)[9], (a)[25], dir); \
fftKernel2S((a)[10], (a)[26], dir); \
fftKernel2S((a)[11], (a)[27], dir); \
fftKernel2S((a)[12], (a)[28], dir); \
fftKernel2S((a)[13], (a)[29], dir); \
fftKernel2S((a)[14], (a)[30], dir); \
fftKernel2S((a)[15], (a)[31], dir); \
(a)[17] = complexMul((a)[17], (float2)(0x1.f6297cp-1f, dir*0x1.8f8b84p-3f)); \
(a)[18] = complexMul((a)[18], (float2)(0x1.d906bcp-1f, dir*0x1.87de2ap-2f)); \
(a)[19] = complexMul((a)[19], (float2)(0x1.a9b662p-1f, dir*0x1.1c73b4p-1f)); \
(a)[20] = complexMul((a)[20], (float2)(0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f)); \
(a)[21] = complexMul((a)[21], (float2)(0x1.1c73b4p-1f, dir*0x1.a9b662p-1f)); \
(a)[22] = complexMul((a)[22], (float2)(0x1.87de2ap-2f, dir*0x1.d906bcp-1f)); \
(a)[23] = complexMul((a)[23], (float2)(0x1.8f8b84p-3f, dir*0x1.f6297cp-1f)); \
(a)[24] = complexMul((a)[24], (float2)(0x0p+0f, dir*0x1p+0f)); \
(a)[25] = complexMul((a)[25], (float2)(-0x1.8f8b84p-3f, dir*0x1.f6297cp-1f)); \
(a)[26] = complexMul((a)[26], (float2)(-0x1.87de2ap-2f, dir*0x1.d906bcp-1f)); \
(a)[27] = complexMul((a)[27], (float2)(-0x1.1c73b4p-1f, dir*0x1.a9b662p-1f)); \
(a)[28] = complexMul((a)[28], (float2)(-0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f)); \
(a)[29] = complexMul((a)[29], (float2)(-0x1.a9b662p-1f, dir*0x1.1c73b4p-1f)); \
(a)[30] = complexMul((a)[30], (float2)(-0x1.d906bcp-1f, dir*0x1.87de2ap-2f)); \
(a)[31] = complexMul((a)[31], (float2)(-0x1.f6297cp-1f, dir*0x1.8f8b84p-3f)); \
fftKernel16((a), dir); \
fftKernel16((a) + 16, dir); \
bitreverse32((a)); \
}
__kernel void \
clFFT_1DTwistInterleaved(__global float2 *in, unsigned int startRow, unsigned int numCols, unsigned int N, unsigned int numRowsToProcess, int dir) \
{ \
float2 a, w; \
float ang; \
unsigned int j; \
unsigned int i = get_global_id(0); \
unsigned int startIndex = i; \
\
if(i < numCols) \
{ \
for(j = 0; j < numRowsToProcess; j++) \
{ \
a = in[startIndex]; \
ang = 2.0f * M_PI * dir * i * (startRow + j) / N; \
w = (float2)(native_cos(ang), native_sin(ang)); \
a = complexMul(a, w); \
in[startIndex] = a; \
startIndex += numCols; \
} \
} \
} \
__kernel void fft0(__global float2 *in, __global float2 *out, int dir, int S, __global float2 * cossinLUT1, __global float2 * cossinLUT2 )
{
__local float sMem[2064];
int i, j, r, indexIn, indexOut, index, tid, bNum, xNum, k, l;
int s, ii, jj, offset;
float2 w;
float ang, angf, ang1;
__local float *lMemStore, *lMemLoad;
float2 a[16];
int lId = get_local_id( 0 );
int groupId = get_group_id( 0 );
__local float2 cossin_T_LUT[256];
int lLUTind= lId;
cossin_T_LUT[lLUTind]=cossinLUT2[lLUTind];
lLUTind+=128;
cossin_T_LUT[lLUTind]=cossinLUT2[lLUTind];
barrier(CLK_LOCAL_MEM_FENCE);
bNum = groupId & 127;
xNum = groupId >> 7;
indexIn = mul24(bNum, 16);
tid = indexIn;
i = tid >> 0;
j = tid & 0;
indexOut = mad24(i, 128, j);
indexIn += (xNum << 18);
indexOut += (xNum << 18);
tid = lId;
i = tid & 15;
j = tid >> 4;
indexIn += mad24(j, 2048, i);
in += indexIn;
a[0] = in[0];
a[1] = in[16384];
a[2] = in[32768];
a[3] = in[49152];
a[4] = in[65536];
a[5] = in[81920];
a[6] = in[98304];
a[7] = in[114688];
a[8] = in[131072];
a[9] = in[147456];
a[10] = in[163840];
a[11] = in[180224];
a[12] = in[196608];
a[13] = in[212992];
a[14] = in[229376];
a[15] = in[245760];
fftKernel16(a, dir);
{ int ang_index = (1 * ( j)) & 127;
int k = ang_index * 2;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[1] = complexMul(a[1], w);
{ int ang_index = (1 * ( j)) & 63;
int k = ang_index * 4;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[2] = complexMul(a[2], w);
{ int ang_index = (3 * ( j)) & 127;
int k = ang_index * 2;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[3] = complexMul(a[3], w);
{ int ang_index = (1 * ( j)) & 31;
int k = ang_index * 8;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[4] = complexMul(a[4], w);
{ int ang_index = (5 * ( j)) & 127;
int k = ang_index * 2;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[5] = complexMul(a[5], w);
{ int ang_index = (3 * ( j)) & 63;
int k = ang_index * 4;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[6] = complexMul(a[6], w);
{ int ang_index = (7 * ( j)) & 127;
int k = ang_index * 2;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[7] = complexMul(a[7], w);
{ int ang_index = (1 * ( j)) & 15;
int k = ang_index * 16;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[8] = complexMul(a[8], w);
{ int ang_index = (9 * ( j)) & 127;
int k = ang_index * 2;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[9] = complexMul(a[9], w);
{ int ang_index = (5 * ( j)) & 63;
int k = ang_index * 4;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[10] = complexMul(a[10], w);
{ int ang_index = (11 * ( j)) & 127;
int k = ang_index * 2;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[11] = complexMul(a[11], w);
{ int ang_index = (3 * ( j)) & 31;
int k = ang_index * 8;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[12] = complexMul(a[12], w);
{ int ang_index = (13 * ( j)) & 127;
int k = ang_index * 2;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[13] = complexMul(a[13], w);
{ int ang_index = (7 * ( j)) & 63;
int k = ang_index * 4;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[14] = complexMul(a[14], w);
{ int ang_index = (15 * ( j)) & 127;
int k = ang_index * 2;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[15] = complexMul(a[15], w);
indexIn = mad24(j, 256, i);
lMemStore = sMem + tid;
lMemLoad = sMem + indexIn;
lMemStore[0] = a[0].x;
lMemStore[128] = a[1].x;
lMemStore[256] = a[2].x;
lMemStore[384] = a[3].x;
lMemStore[512] = a[4].x;
lMemStore[640] = a[5].x;
lMemStore[768] = a[6].x;
lMemStore[896] = a[7].x;
lMemStore[1024] = a[8].x;
lMemStore[1152] = a[9].x;
lMemStore[1280] = a[10].x;
lMemStore[1408] = a[11].x;
lMemStore[1536] = a[12].x;
lMemStore[1664] = a[13].x;
lMemStore[1792] = a[14].x;
lMemStore[1920] = a[15].x;
barrier(CLK_LOCAL_MEM_FENCE);
a[0].x = lMemLoad[0];
a[1].x = lMemLoad[16];
a[2].x = lMemLoad[32];
a[3].x = lMemLoad[48];
a[4].x = lMemLoad[64];
a[5].x = lMemLoad[80];
a[6].x = lMemLoad[96];
a[7].x = lMemLoad[112];
a[8].x = lMemLoad[128];
a[9].x = lMemLoad[144];
a[10].x = lMemLoad[160];
a[11].x = lMemLoad[176];
a[12].x = lMemLoad[192];
a[13].x = lMemLoad[208];
a[14].x = lMemLoad[224];
a[15].x = lMemLoad[240];
barrier(CLK_LOCAL_MEM_FENCE);
lMemStore[0] = a[0].y;
lMemStore[128] = a[1].y;
lMemStore[256] = a[2].y;
lMemStore[384] = a[3].y;
lMemStore[512] = a[4].y;
lMemStore[640] = a[5].y;
lMemStore[768] = a[6].y;
lMemStore[896] = a[7].y;
lMemStore[1024] = a[8].y;
lMemStore[1152] = a[9].y;
lMemStore[1280] = a[10].y;
lMemStore[1408] = a[11].y;
lMemStore[1536] = a[12].y;
lMemStore[1664] = a[13].y;
lMemStore[1792] = a[14].y;
lMemStore[1920] = a[15].y;
barrier(CLK_LOCAL_MEM_FENCE);
a[0].y = lMemLoad[0];
a[1].y = lMemLoad[16];
a[2].y = lMemLoad[32];
a[3].y = lMemLoad[48];
a[4].y = lMemLoad[64];
a[5].y = lMemLoad[80];
a[6].y = lMemLoad[96];
a[7].y = lMemLoad[112];
a[8].y = lMemLoad[128];
a[9].y = lMemLoad[144];
a[10].y = lMemLoad[160];
a[11].y = lMemLoad[176];
a[12].y = lMemLoad[192];
a[13].y = lMemLoad[208];
a[14].y = lMemLoad[224];
a[15].y = lMemLoad[240];
barrier(CLK_LOCAL_MEM_FENCE);
fftKernel8(a + 0, dir);
fftKernel8(a + 8, dir);
l = ((bNum << 4) + i) >> 0;
k = j << 1;
{ int ang_index = (1 * ( l * (k + 0))) & 262143;
int k = (ang_index + 512) >> 10;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 1024 );
float mh=-2.3968450478e-05*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[0] = complexMul(a[0], w);
{ int ang_index = (1 * ( l * (k + 16))) & 262143;
int k = (ang_index + 512) >> 10;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 1024 );
float mh=-2.3968450478e-05*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[1] = complexMul(a[1], w);
{ int ang_index = (1 * ( l * (k + 32))) & 262143;
int k = (ang_index + 512) >> 10;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 1024 );
float mh=-2.3968450478e-05*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[2] = complexMul(a[2], w);
{ int ang_index = (1 * ( l * (k + 48))) & 262143;
int k = (ang_index + 512) >> 10;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 1024 );
float mh=-2.3968450478e-05*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[3] = complexMul(a[3], w);
{ int ang_index = (1 * ( l * (k + 64))) & 262143;
int k = (ang_index + 512) >> 10;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 1024 );
float mh=-2.3968450478e-05*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[4] = complexMul(a[4], w);
{ int ang_index = (1 * ( l * (k + 80))) & 262143;
int k = (ang_index + 512) >> 10;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 1024 );
float mh=-2.3968450478e-05*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[5] = complexMul(a[5], w);
{ int ang_index = (1 * ( l * (k + 96))) & 262143;
int k = (ang_index + 512) >> 10;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 1024 );
float mh=-2.3968450478e-05*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[6] = complexMul(a[6], w);
{ int ang_index = (1 * ( l * (k + 112))) & 262143;
int k = (ang_index + 512) >> 10;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 1024 );
float mh=-2.3968450478e-05*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[7] = complexMul(a[7], w);
{ int ang_index = (1 * ( l * (k + 1))) & 262143;
int k = (ang_index + 512) >> 10;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 1024 );
float mh=-2.3968450478e-05*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[8] = complexMul(a[8], w);
{ int ang_index = (1 * ( l * (k + 17))) & 262143;
int k = (ang_index + 512) >> 10;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 1024 );
float mh=-2.3968450478e-05*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[9] = complexMul(a[9], w);
{ int ang_index = (1 * ( l * (k + 33))) & 262143;
int k = (ang_index + 512) >> 10;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 1024 );
float mh=-2.3968450478e-05*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[10] = complexMul(a[10], w);
{ int ang_index = (1 * ( l * (k + 49))) & 262143;
int k = (ang_index + 512) >> 10;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 1024 );
float mh=-2.3968450478e-05*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[11] = complexMul(a[11], w);
{ int ang_index = (1 * ( l * (k + 65))) & 262143;
int k = (ang_index + 512) >> 10;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 1024 );
float mh=-2.3968450478e-05*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[12] = complexMul(a[12], w);
{ int ang_index = (1 * ( l * (k + 81))) & 262143;
int k = (ang_index + 512) >> 10;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 1024 );
float mh=-2.3968450478e-05*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[13] = complexMul(a[13], w);
{ int ang_index = (1 * ( l * (k + 97))) & 262143;
int k = (ang_index + 512) >> 10;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 1024 );
float mh=-2.3968450478e-05*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[14] = complexMul(a[14], w);
{ int ang_index = (1 * ( l * (k + 113))) & 262143;
int k = (ang_index + 512) >> 10;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 1024 );
float mh=-2.3968450478e-05*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[15] = complexMul(a[15], w);
lMemStore = sMem + mad24(i, 129, j << 1);
lMemLoad = sMem + mad24(tid >> 7, 129, tid & 127);
lMemStore[ 0] = a[0].x;
lMemStore[ 16] = a[1].x;
lMemStore[ 32] = a[2].x;
lMemStore[ 48] = a[3].x;
lMemStore[ 64] = a[4].x;
lMemStore[ 80] = a[5].x;
lMemStore[ 96] = a[6].x;
lMemStore[ 112] = a[7].x;
lMemStore[ 1] = a[8].x;
lMemStore[ 17] = a[9].x;
lMemStore[ 33] = a[10].x;
lMemStore[ 49] = a[11].x;
lMemStore[ 65] = a[12].x;
lMemStore[ 81] = a[13].x;
lMemStore[ 97] = a[14].x;
lMemStore[ 113] = a[15].x;
barrier(CLK_LOCAL_MEM_FENCE);
a[0].x = lMemLoad[0];
a[1].x = lMemLoad[129];
a[2].x = lMemLoad[258];
a[3].x = lMemLoad[387];
a[4].x = lMemLoad[516];
a[5].x = lMemLoad[645];
a[6].x = lMemLoad[774];
a[7].x = lMemLoad[903];
a[8].x = lMemLoad[1032];
a[9].x = lMemLoad[1161];
a[10].x = lMemLoad[1290];
a[11].x = lMemLoad[1419];
a[12].x = lMemLoad[1548];
a[13].x = lMemLoad[1677];
a[14].x = lMemLoad[1806];
a[15].x = lMemLoad[1935];
barrier(CLK_LOCAL_MEM_FENCE);
lMemStore[ 0] = a[0].y;
lMemStore[ 16] = a[1].y;
lMemStore[ 32] = a[2].y;
lMemStore[ 48] = a[3].y;
lMemStore[ 64] = a[4].y;
lMemStore[ 80] = a[5].y;
lMemStore[ 96] = a[6].y;
lMemStore[ 112] = a[7].y;
lMemStore[ 1] = a[8].y;
lMemStore[ 17] = a[9].y;
lMemStore[ 33] = a[10].y;
lMemStore[ 49] = a[11].y;
lMemStore[ 65] = a[12].y;
lMemStore[ 81] = a[13].y;
lMemStore[ 97] = a[14].y;
lMemStore[ 113] = a[15].y;
barrier(CLK_LOCAL_MEM_FENCE);
a[0].y = lMemLoad[0];
a[1].y = lMemLoad[129];
a[2].y = lMemLoad[258];
a[3].y = lMemLoad[387];
a[4].y = lMemLoad[516];
a[5].y = lMemLoad[645];
a[6].y = lMemLoad[774];
a[7].y = lMemLoad[903];
a[8].y = lMemLoad[1032];
a[9].y = lMemLoad[1161];
a[10].y = lMemLoad[1290];
a[11].y = lMemLoad[1419];
a[12].y = lMemLoad[1548];
a[13].y = lMemLoad[1677];
a[14].y = lMemLoad[1806];
a[15].y = lMemLoad[1935];
barrier(CLK_LOCAL_MEM_FENCE);
indexOut += tid;
out += indexOut;
out[0] = a[0];
out[128] = a[1];
out[256] = a[2];
out[384] = a[3];
out[512] = a[4];
out[640] = a[5];
out[768] = a[6];
out[896] = a[7];
out[1024] = a[8];
out[1152] = a[9];
out[1280] = a[10];
out[1408] = a[11];
out[1536] = a[12];
out[1664] = a[13];
out[1792] = a[14];
out[1920] = a[15];
}
__kernel void fft1(__global float2 *in, __global float2 *out, int dir, int S, __global float2 * cossinLUT1, __global float2 * cossinLUT2 )
{
__local float sMem[2048];
int i, j, r, indexIn, indexOut, index, tid, bNum, xNum, k, l;
int s, ii, jj, offset;
float2 w;
float ang, angf, ang1;
__local float *lMemStore, *lMemLoad;
float2 a[16];
int lId = get_local_id( 0 );
int groupId = get_group_id( 0 );
__local float2 cossin_T_LUT[256];
int lLUTind= lId;
cossin_T_LUT[lLUTind]=cossinLUT2[lLUTind];
lLUTind+=128;
cossin_T_LUT[lLUTind]=cossinLUT2[lLUTind];
barrier(CLK_LOCAL_MEM_FENCE);
bNum = groupId & 127;
xNum = groupId >> 7;
indexIn = mul24(bNum, 16);
tid = indexIn;
i = tid >> 7;
j = tid & 127;
indexOut = mad24(i, 16384, j);
indexIn += (xNum << 18);
indexOut += (xNum << 18);
tid = lId;
i = tid & 15;
j = tid >> 4;
indexIn += mad24(j, 2048, i);
in += indexIn;
a[0] = in[0];
a[1] = in[16384];
a[2] = in[32768];
a[3] = in[49152];
a[4] = in[65536];
a[5] = in[81920];
a[6] = in[98304];
a[7] = in[114688];
a[8] = in[131072];
a[9] = in[147456];
a[10] = in[163840];
a[11] = in[180224];
a[12] = in[196608];
a[13] = in[212992];
a[14] = in[229376];
a[15] = in[245760];
fftKernel16(a, dir);
{ int ang_index = (1 * ( j)) & 127;
int k = ang_index * 2;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[1] = complexMul(a[1], w);
{ int ang_index = (1 * ( j)) & 63;
int k = ang_index * 4;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[2] = complexMul(a[2], w);
{ int ang_index = (3 * ( j)) & 127;
int k = ang_index * 2;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[3] = complexMul(a[3], w);
{ int ang_index = (1 * ( j)) & 31;
int k = ang_index * 8;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[4] = complexMul(a[4], w);
{ int ang_index = (5 * ( j)) & 127;
int k = ang_index * 2;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[5] = complexMul(a[5], w);
{ int ang_index = (3 * ( j)) & 63;
int k = ang_index * 4;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[6] = complexMul(a[6], w);
{ int ang_index = (7 * ( j)) & 127;
int k = ang_index * 2;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[7] = complexMul(a[7], w);
{ int ang_index = (1 * ( j)) & 15;
int k = ang_index * 16;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[8] = complexMul(a[8], w);
{ int ang_index = (9 * ( j)) & 127;
int k = ang_index * 2;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[9] = complexMul(a[9], w);
{ int ang_index = (5 * ( j)) & 63;
int k = ang_index * 4;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[10] = complexMul(a[10], w);
{ int ang_index = (11 * ( j)) & 127;
int k = ang_index * 2;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[11] = complexMul(a[11], w);
{ int ang_index = (3 * ( j)) & 31;
int k = ang_index * 8;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[12] = complexMul(a[12], w);
{ int ang_index = (13 * ( j)) & 127;
int k = ang_index * 2;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[13] = complexMul(a[13], w);
{ int ang_index = (7 * ( j)) & 63;
int k = ang_index * 4;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[14] = complexMul(a[14], w);
{ int ang_index = (15 * ( j)) & 127;
int k = ang_index * 2;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[15] = complexMul(a[15], w);
indexIn = mad24(j, 256, i);
lMemStore = sMem + tid;
lMemLoad = sMem + indexIn;
lMemStore[0] = a[0].x;
lMemStore[128] = a[1].x;
lMemStore[256] = a[2].x;
lMemStore[384] = a[3].x;
lMemStore[512] = a[4].x;
lMemStore[640] = a[5].x;
lMemStore[768] = a[6].x;
lMemStore[896] = a[7].x;
lMemStore[1024] = a[8].x;
lMemStore[1152] = a[9].x;
lMemStore[1280] = a[10].x;
lMemStore[1408] = a[11].x;
lMemStore[1536] = a[12].x;
lMemStore[1664] = a[13].x;
lMemStore[1792] = a[14].x;
lMemStore[1920] = a[15].x;
barrier(CLK_LOCAL_MEM_FENCE);
a[0].x = lMemLoad[0];
a[1].x = lMemLoad[16];
a[2].x = lMemLoad[32];
a[3].x = lMemLoad[48];
a[4].x = lMemLoad[64];
a[5].x = lMemLoad[80];
a[6].x = lMemLoad[96];
a[7].x = lMemLoad[112];
a[8].x = lMemLoad[128];
a[9].x = lMemLoad[144];
a[10].x = lMemLoad[160];
a[11].x = lMemLoad[176];
a[12].x = lMemLoad[192];
a[13].x = lMemLoad[208];
a[14].x = lMemLoad[224];
a[15].x = lMemLoad[240];
barrier(CLK_LOCAL_MEM_FENCE);
lMemStore[0] = a[0].y;
lMemStore[128] = a[1].y;
lMemStore[256] = a[2].y;
lMemStore[384] = a[3].y;
lMemStore[512] = a[4].y;
lMemStore[640] = a[5].y;
lMemStore[768] = a[6].y;
lMemStore[896] = a[7].y;
lMemStore[1024] = a[8].y;
lMemStore[1152] = a[9].y;
lMemStore[1280] = a[10].y;
lMemStore[1408] = a[11].y;
lMemStore[1536] = a[12].y;
lMemStore[1664] = a[13].y;
lMemStore[1792] = a[14].y;
lMemStore[1920] = a[15].y;
barrier(CLK_LOCAL_MEM_FENCE);
a[0].y = lMemLoad[0];
a[1].y = lMemLoad[16];
a[2].y = lMemLoad[32];
a[3].y = lMemLoad[48];
a[4].y = lMemLoad[64];
a[5].y = lMemLoad[80];
a[6].y = lMemLoad[96];
a[7].y = lMemLoad[112];
a[8].y = lMemLoad[128];
a[9].y = lMemLoad[144];
a[10].y = lMemLoad[160];
a[11].y = lMemLoad[176];
a[12].y = lMemLoad[192];
a[13].y = lMemLoad[208];
a[14].y = lMemLoad[224];
a[15].y = lMemLoad[240];
barrier(CLK_LOCAL_MEM_FENCE);
fftKernel8(a + 0, dir);
fftKernel8(a + 8, dir);
l = ((bNum << 4) + i) >> 7;
k = j << 1;
{ int ang_index = (1 * ( l * (k + 0))) & 2047;
int k = (ang_index + 4) >> 3;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 8 );
float mh=-3.0679616611e-03*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[0] = complexMul(a[0], w);
{ int ang_index = (1 * ( l * (k + 16))) & 2047;
int k = (ang_index + 4) >> 3;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 8 );
float mh=-3.0679616611e-03*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[1] = complexMul(a[1], w);
{ int ang_index = (1 * ( l * (k + 32))) & 2047;
int k = (ang_index + 4) >> 3;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 8 );
float mh=-3.0679616611e-03*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[2] = complexMul(a[2], w);
{ int ang_index = (1 * ( l * (k + 48))) & 2047;
int k = (ang_index + 4) >> 3;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 8 );
float mh=-3.0679616611e-03*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[3] = complexMul(a[3], w);
{ int ang_index = (1 * ( l * (k + 64))) & 2047;
int k = (ang_index + 4) >> 3;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 8 );
float mh=-3.0679616611e-03*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[4] = complexMul(a[4], w);
{ int ang_index = (1 * ( l * (k + 80))) & 2047;
int k = (ang_index + 4) >> 3;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 8 );
float mh=-3.0679616611e-03*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[5] = complexMul(a[5], w);
{ int ang_index = (1 * ( l * (k + 96))) & 2047;
int k = (ang_index + 4) >> 3;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 8 );
float mh=-3.0679616611e-03*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[6] = complexMul(a[6], w);
{ int ang_index = (1 * ( l * (k + 112))) & 2047;
int k = (ang_index + 4) >> 3;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 8 );
float mh=-3.0679616611e-03*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[7] = complexMul(a[7], w);
{ int ang_index = (1 * ( l * (k + 1))) & 2047;
int k = (ang_index + 4) >> 3;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 8 );
float mh=-3.0679616611e-03*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[8] = complexMul(a[8], w);
{ int ang_index = (1 * ( l * (k + 17))) & 2047;
int k = (ang_index + 4) >> 3;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 8 );
float mh=-3.0679616611e-03*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[9] = complexMul(a[9], w);
{ int ang_index = (1 * ( l * (k + 33))) & 2047;
int k = (ang_index + 4) >> 3;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 8 );
float mh=-3.0679616611e-03*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[10] = complexMul(a[10], w);
{ int ang_index = (1 * ( l * (k + 49))) & 2047;
int k = (ang_index + 4) >> 3;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 8 );
float mh=-3.0679616611e-03*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[11] = complexMul(a[11], w);
{ int ang_index = (1 * ( l * (k + 65))) & 2047;
int k = (ang_index + 4) >> 3;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 8 );
float mh=-3.0679616611e-03*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[12] = complexMul(a[12], w);
{ int ang_index = (1 * ( l * (k + 81))) & 2047;
int k = (ang_index + 4) >> 3;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 8 );
float mh=-3.0679616611e-03*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[13] = complexMul(a[13], w);
{ int ang_index = (1 * ( l * (k + 97))) & 2047;
int k = (ang_index + 4) >> 3;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 8 );
float mh=-3.0679616611e-03*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[14] = complexMul(a[14], w);
{ int ang_index = (1 * ( l * (k + 113))) & 2047;
int k = (ang_index + 4) >> 3;
float2 csx0 =cossin_T_LUT[k];
float2 csx0Transp= (float2)(csx0.y,-csx0.x);
int r=ang_index - k * ( 8 );
float mh=-3.0679616611e-03*(float)r;
float mhsqr2= mh*mh*(-0.5f);
float hqub6= mhsqr2*mh*(1.0f/3.0f);
float2 cs;
cs= hqub6 * csx0Transp;
cs += mhsqr2*csx0;
cs += mh*csx0Transp;
cs += csx0;
cs.y *=dir;
w = cs;
}
a[15] = complexMul(a[15], w);
indexOut += mad24(j, 256, i);
out += indexOut;
out[0] = a[0];
out[2048] = a[1];
out[4096] = a[2];
out[6144] = a[3];
out[8192] = a[4];
out[10240] = a[5];
out[12288] = a[6];
out[14336] = a[7];
out[128] = a[8];
out[2176] = a[9];
out[4224] = a[10];
out[6272] = a[11];
out[8320] = a[12];
out[10368] = a[13];
out[12416] = a[14];
out[14464] = a[15];
}
__kernel void fft2(__global float2 *in, __global float2 *out, int dir, int S, __global float2 * cossinLUT1, __global float2 * cossinLUT2 )
{
__local float sMem[256];
int i, j, r, indexIn, indexOut, index, tid, bNum, xNum, k, l;
int s, ii, jj, offset;
float2 w;
float ang, angf, ang1;
__local float *lMemStore, *lMemLoad;
float2 a[4];
int lId = get_local_id( 0 );
int groupId = get_group_id( 0 );
__local float2 cossin_T_LUT[256];
int lLUTind= lId;
cossin_T_LUT[lLUTind]=cossinLUT2[lLUTind];
lLUTind+=64;
cossin_T_LUT[lLUTind]=cossinLUT2[lLUTind];
lLUTind+=64;
cossin_T_LUT[lLUTind]=cossinLUT2[lLUTind];
lLUTind+=64;
cossin_T_LUT[lLUTind]=cossinLUT2[lLUTind];
barrier(CLK_LOCAL_MEM_FENCE);
bNum = groupId & 1023;
xNum = groupId >> 10;
indexIn = mul24(bNum, 16);
tid = indexIn;
i = tid >> 14;
j = tid & 16383;
indexOut = mad24(i, 262144, j);
indexIn += (xNum << 18);
indexOut += (xNum << 18);
tid = lId;
i = tid & 15;
j = tid >> 4;
indexIn += mad24(j, 16384, i);
in += indexIn;
a[0] = in[0];
a[1] = in[65536];
a[2] = in[131072];
a[3] = in[196608];
fftKernel4(a, dir);
{ int ang_index = (1 * ( j)) & 15;
int k = ang_index * 16;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[1] = complexMul(a[1], w);
{ int ang_index = (1 * ( j)) & 7;
int k = ang_index * 32;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[2] = complexMul(a[2], w);
{ int ang_index = (3 * ( j)) & 15;
int k = ang_index * 16;
float2 cs =cossin_T_LUT[k];
cs.y *=dir;
w = cs;
}
a[3] = complexMul(a[3], w);
indexIn = mad24(j, 64, i);
lMemStore = sMem + tid;
lMemLoad = sMem + indexIn;
lMemStore[0] = a[0].x;
lMemStore[64] = a[1].x;
lMemStore[128] = a[2].x;
lMemStore[192] = a[3].x;
barrier(CLK_LOCAL_MEM_FENCE);
a[0].x = lMemLoad[0];
a[1].x = lMemLoad[16];
a[2].x = lMemLoad[32];
a[3].x = lMemLoad[48];
barrier(CLK_LOCAL_MEM_FENCE);
lMemStore[0] = a[0].y;
lMemStore[64] = a[1].y;
lMemStore[128] = a[2].y;
lMemStore[192] = a[3].y;
barrier(CLK_LOCAL_MEM_FENCE);
a[0].y = lMemLoad[0];
a[1].y = lMemLoad[16];
a[2].y = lMemLoad[32];
a[3].y = lMemLoad[48];
barrier(CLK_LOCAL_MEM_FENCE);
fftKernel4(a + 0, dir);
indexOut += mad24(j, 16384, i);
out += indexOut;
out[0] = a[0];
out[65536] = a[1];
out[131072] = a[2];
out[196608] = a[3];
}
...@@ -95,6 +95,21 @@ typedef struct ...@@ -95,6 +95,21 @@ typedef struct
// context in which fft resources are created and kernels are executed // context in which fft resources are created and kernels are executed
cl_context context; cl_context context;
// HSA global region
hsa_region_t global_region;
// HSA executable
hsa_executable_t executable;
// HSA agent
hsa_agent_t agent;
// HSA kernel argument region
hsa_region_t kernarg_region;
// HSA current execution queue
hsa_queue_t* queue;
// size of signal // size of signal
clFFT_Dim3 n; clFFT_Dim3 n;
...@@ -141,19 +156,19 @@ typedef struct ...@@ -141,19 +156,19 @@ typedef struct
size_t last_batch_size; size_t last_batch_size;
// temporary buffer for interleaved plan // temporary buffer for interleaved plan
cl_mem tempmemobj; float* tempmemobj;
// temporary buffer for planner plan. Only one of tempmemobj or // temporary buffer for planner plan. Only one of tempmemobj or
// (tempmemobj_real, tempmemobj_imag) pair is valid (allocated) depending // (tempmemobj_real, tempmemobj_imag) pair is valid (allocated) depending
// data format of plan (plannar or interleaved) // data format of plan (plannar or interleaved)
cl_mem tempmemobj_real, tempmemobj_imag; float *tempmemobj_real, *tempmemobj_imag;
// precomputed lookup tables for sin,cos calculations, each of size // precomputed lookup tables for sin,cos calculations, each of size
// sqrt(n) or 2*sqrt(n), n is size of signal; // sqrt(n) or 2*sqrt(n), n is size of signal;
cl_mem cossin_LUT_d1; float* cossin_LUT_d1;
cl_mem cossin_LUT_d2; float* cossin_LUT_d2;
int logN1; int logN1;
int logN2; int logN2;
size_t N1; size_t N1;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment