diff --git a/src/clFFT.cpp b/src/clFFT.cpp new file mode 100644 index 0000000000000000000000000000000000000000..44df6497e6c1b79b3318cbd2440cc50bc885d733 --- /dev/null +++ b/src/clFFT.cpp @@ -0,0 +1,664 @@ +/*************************************************************************** + * Copyright (C) 2012 by Oliver Bock,Heinz-Bernd Eggenstein * + * oliver.bock[AT]aei.mpg.de * + * heinz-bernd.eggenstein[AT]aei.mpg.de * + * * + * This file is part of libclfft (originally for Einstein@Home) * + * Derived from clFFT, (C) Apple, see notice below. * + * * + * * + * libclfft is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See * + * notice below for more details. * + * * + ***************************************************************************/ +// +// File: fft_setup.cpp +// +// Version: <1.0> +// +// Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple") +// in consideration of your agreement to the following terms, and your use, +// installation, modification or redistribution of this Apple software +// constitutes acceptance of these terms. If you do not agree with these +// terms, please do not use, install, modify or redistribute this Apple +// software. +// +// In consideration of your agreement to abide by the following terms, and +// subject to these terms, Apple grants you a personal, non - exclusive +// license, under Apple's copyrights in this original Apple software ( the +// "Apple Software" ), to use, reproduce, modify and redistribute the Apple +// Software, with or without modifications, in source and / or binary forms; +// provided that if you redistribute the Apple Software in its entirety and +// without modifications, you must retain this notice and the following text +// and disclaimers in all such redistributions of the Apple Software. Neither +// the name, trademarks, service marks or logos of Apple Inc. may be used to +// endorse or promote products derived from the Apple Software without specific +// prior written permission from Apple. Except as expressly stated in this +// notice, no other rights or licenses, express or implied, are granted by +// Apple herein, including but not limited to any patent rights that may be +// infringed by your derivative works or by other works in which the Apple +// Software may be incorporated. +// +// The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO +// WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED +// WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION +// ALONE OR IN COMBINATION WITH YOUR PRODUCTS. +// +// IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR +// CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION +// AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER +// UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR +// OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Copyright ( C ) 2008 Apple Inc. All Rights Reserved. +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +#include "clFFT.h" +#include <stdlib.h> +#include <stdio.h> +#include <math.h> +#include "fft_internal.h" +#include "fft_base_kernels.h" + +// g++ -o clFFT -I /opt/hsa/include clFFT.cpp fft_kernelstring.cpp fft_setup.cpp -L/opt/hsa/lib -lhsa-runtime64 +// g++ -fPIC -shared -o libclFFT.so -I /opt/hsa/include clFFT.cpp fft_kernelstring.cpp -L/opt/hsa/lib -lhsa-runtime64 + +using namespace std; + +#define check(msg, status) \ +if (status != HSA_STATUS_SUCCESS) { \ + printf("%s failed. status=%x\n", #msg,status); \ + return 1; \ +} else { \ + /*printf("%s succeeded.\n", #msg); */\ +} + +/** + * HSA HACK. Since hsa_signal_wait_acquire() sometimes waits for ever (don't know why. boinc blocking signals?) + * + * Use nanosleep instead of default timer. Since the kernels are quite fast, this does not impact CPU usage, + */ +hsa_signal_value_t myhsa_signal_wait_acquire( + hsa_signal_t signal, + hsa_signal_condition_t condition, + hsa_signal_value_t compare_value, + uint64_t timeout_hint, + hsa_wait_state_t wait_state_hint ) { + + timespec t; + t.tv_sec = 0; + t.tv_nsec = 100*1000; // 0.1 ms + hsa_signal_value_t ret; + while (ret = hsa_signal_load_acquire(signal) == 1) { + nanosleep(&t,NULL); + } + return ret; +} + + +static fft_args_t* fft_kernarg_address = NULL; +extern hsa_signal_value_t myhsa_signal_wait_acquire(hsa_signal_t signal,hsa_signal_condition_t condition,hsa_signal_value_t compare_value, + uint64_t timeout_hint, hsa_wait_state_t wait_state_hint ); + +static void +getBlockConfigAndKernelString(cl_fft_plan *plan) +{ + plan->temp_buffer_needed = 0; + *plan->kernel_string += baseKernels; + + if(plan->format == clFFT_SplitComplexFormat) + *plan->kernel_string += twistKernelPlannar; + else + *plan->kernel_string += twistKernelInterleaved; + + switch(plan->dim) + { + case clFFT_1D: + FFT1D(plan, cl_fft_kernel_x); + break; + + case clFFT_2D: + FFT1D(plan, cl_fft_kernel_x); + FFT1D(plan, cl_fft_kernel_y); + break; + + case clFFT_3D: + FFT1D(plan, cl_fft_kernel_x); + FFT1D(plan, cl_fft_kernel_y); + FFT1D(plan, cl_fft_kernel_z); + break; + + default: + return; + } + + plan->temp_buffer_needed = 0; + cl_fft_kernel_info *kInfo = plan->kernel_info; + while(kInfo) + { + plan->temp_buffer_needed |= !kInfo->in_place_possible; + kInfo = kInfo->next; + } +} + +static +int precomputeSinCosLUTs(hsa_region_t global_region,cl_fft_plan * plan,int *error_code) { + + size_t i=0; + cl_int err; + + // find logN1,logN2, where + // n = 2^logN1 * 2^logN2 , and logN1=logN2 +/- 1 + + size_t N=plan->n.x*plan->n.y*plan->n.z; + + plan->logN1=0; + plan->logN2=0; + plan->N1=1; + plan->N2=1; + + switch (plan->twiddleMethod) { + case clFFT_native_trig: return 0; + case clFFT_sincosfunc : return 0; + case clFFT_TaylorLUT : + plan->logN1 = 0; + plan->logN2 = 8; + break; + case clFFT_BigLUT : { + + size_t Nrem=N; + + while(Nrem > 1) { + plan->logN1++; + Nrem >>= 1; + + if(Nrem > 1) { + plan->logN2++; + Nrem >>= 1; + } + }} + break; + default: return 1; + } + + plan->N1 = 1 << plan->logN1; + + plan->N2 = 1 << plan->logN2; + + + double PI2 = 8.0*atan(1.0); + + //plan->cossin_LUT_d1 = clCreateBuffer(plan->context,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, plan->N1*2*sizeof(float),tmpLUT_cossin1, &err); + err = hsa_memory_allocate(global_region, plan->N1*2*sizeof(float), (void**)&plan->cossin_LUT_d1); + check(Create cossin_LUT_d1, err); + for(i=0; i < plan->N1; i++) { + float * tmpLUT_cossin1 = plan->cossin_LUT_d1; + tmpLUT_cossin1[i*2] =(float)cos(PI2 * (float) i / (float)N); + tmpLUT_cossin1[i*2+1]=(float)sin(PI2 * (float) i / (float)N); + } + + + //plan->cossin_LUT_d2 = clCreateBuffer(plan->context,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, plan->N2*2*sizeof(float),tmpLUT_cossin2, &err); + err = hsa_memory_allocate(global_region, plan->N2*2*sizeof(float), (void**)&plan->cossin_LUT_d2); + check(Create cossin_LUT_d2, err); + for(i=0; i < plan->N2; i++) { + float * tmpLUT_cossin2 = plan->cossin_LUT_d2; + tmpLUT_cossin2[2*i] =(float)cos(PI2 * (float) i / (float) plan->N2); + tmpLUT_cossin2[2*i+1]=(float)sin(PI2 * (float) i / (float) plan->N2); + } + return 0; +} + +static cl_int +allocateTemporaryBufferInterleaved(cl_fft_plan *plan, cl_uint batchSize) +{ + cl_int err = CL_SUCCESS; + if(plan->temp_buffer_needed && plan->last_batch_size != batchSize) + { + plan->last_batch_size = batchSize; + size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * 2 * sizeof(cl_float); + + if(plan->tempmemobj) + hsa_memory_free(plan->tempmemobj); + + err = hsa_memory_allocate(plan->global_region, tmpLength, (void**)&plan->tempmemobj); + //plan->tempmemobj = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &err); + } + return err; +} + +void +getKernelWorkDimensions(cl_fft_plan *plan, cl_fft_kernel_info *kernelInfo, cl_int *batchSize, size_t *gWorkItems, size_t *lWorkItems) +{ + *lWorkItems = kernelInfo->num_workitems_per_workgroup; + int numWorkGroups = kernelInfo->num_workgroups; + int numXFormsPerWG = kernelInfo->num_xforms_per_workgroup; + + switch(kernelInfo->dir) + { + case cl_fft_kernel_x: + *batchSize *= (plan->n.y * plan->n.z); + numWorkGroups = (*batchSize % numXFormsPerWG) ? (*batchSize/numXFormsPerWG + 1) : (*batchSize/numXFormsPerWG); + numWorkGroups *= kernelInfo->num_workgroups; + break; + case cl_fft_kernel_y: + *batchSize *= plan->n.z; + numWorkGroups *= *batchSize; + break; + case cl_fft_kernel_z: + numWorkGroups *= *batchSize; + break; + } + + *gWorkItems = numWorkGroups * *lWorkItems; +} +clFFT_Plan +clFFT_CreatePlanAdvHSA(hsa_region_t global_region,hsa_executable_t executable,hsa_agent_t agent,hsa_region_t kernarg_region,hsa_queue_t* queue, + clFFT_Dim3 n, clFFT_Dimension dim, clFFT_DataFormat dataFormat, unsigned long flags) +{ + int i; + int isPow2 = 1; + cl_fft_plan *plan = NULL; + ostringstream kString; + size_t ret_size; + cl_int error_code; + + isPow2 |= n.x && !( (n.x - 1) & n.x ); + isPow2 |= n.y && !( (n.y - 1) & n.y ); + isPow2 |= n.z && !( (n.z - 1) & n.z ); + + plan = (cl_fft_plan *) malloc(sizeof(cl_fft_plan)); + + //plan->context = 0; + //clRetainContext(context); + plan->n = n; + plan->dim = dim; + plan->format = dataFormat; + plan->kernel_info = 0; + plan->num_kernels = 0; + plan->twist_kernel = 0; + plan->program = 0; + plan->temp_buffer_needed = 0; + plan->last_batch_size = 0; + plan->tempmemobj = NULL; + plan->tempmemobj_real = NULL; + plan->tempmemobj_imag = NULL; + plan->cossin_LUT_d1=NULL; + plan->cossin_LUT_d2=NULL; + plan->max_localmem_fft_size = 2048; + plan->max_work_item_per_workgroup = 256; + plan->max_radix = 16; + plan->min_mem_coalesce_width = 16; + plan->num_local_mem_banks = 16; + plan->global_region = global_region; + plan->executable = executable; + plan->agent = agent; + plan->kernarg_region = kernarg_region; + plan->queue = queue; + + plan->twiddleMethod = (clFFT_TwiddleFactorMethod)(flags & 7); + + precomputeSinCosLUTs(global_region,plan,&error_code); + + plan->kernel_string = new string(""); + + getBlockConfigAndKernelString(plan); + + const char *source_str = plan->kernel_string->c_str(); + + // Use this to create the initial kernel + // COPY THE SOURCE HERE TO a .cl file and then compile to BRIG. + //puts(source_str); + + cl_fft_kernel_info *kInfo = plan->kernel_info; + while(kInfo) + { + plan->num_kernels++; + kInfo = kInfo->next; + } + + return (clFFT_Plan) plan; +} + + +cl_int +clFFT_ExecuteInterleavedHSA( clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir,float *data_in, float *data_out) +{ + int s; + cl_fft_plan *plan = (cl_fft_plan *) Plan; + if(plan->format != clFFT_InterleavedComplexFormat) + return CL_INVALID_VALUE; + cl_int err; + size_t gWorkItems, lWorkItems; + int inPlaceDone = -1; + + cl_int isInPlace = data_in == data_out ? 1 : 0; + + if((err = allocateTemporaryBufferInterleaved(plan, batchSize)) != CL_SUCCESS) + return err; + + cl_mem memObj[3]; + memObj[0] = data_in; + memObj[1] = data_out; + memObj[2] = plan->tempmemobj; + cl_fft_kernel_info *kernelInfo = plan->kernel_info; + int numKernels = plan->num_kernels; + + int numKernelsOdd = numKernels & 1; + int currRead = 0; + int currWrite = 1; + + // at least one external dram shuffle (transpose) required + if(plan->temp_buffer_needed) + { + // in-place transform + if(isInPlace) + { + inPlaceDone = 0; + currRead = 1; + currWrite = 2; + } + else + { + currWrite = (numKernels & 1) ? 1 : 2; + } + + while(kernelInfo) + { + if( isInPlace && numKernelsOdd && !inPlaceDone && kernelInfo->in_place_possible) + { + currWrite = currRead; + inPlaceDone = 1; + } + + s = batchSize; + getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems); + + char name[1024]; + sprintf(name,"&__OpenCL_%s_kernel",kernelInfo->kernel_name); + + hsa_executable_symbol_t symbol; + hsa_status_t err = hsa_executable_get_symbol(plan->executable, NULL, name, plan->agent, 0, &symbol); + check(Extract the symbol from FFT kernel, err); + + /* + * Extract dispatch information from the symbol + */ + uint64_t kernel_object; + uint32_t kernarg_segment_size; + uint32_t group_segment_size; + uint32_t private_segment_size; + err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel_object); + check(Extracting the symbol from the executable, err); + err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &kernarg_segment_size); + check(Extracting the kernarg segment size from the executable, err); + err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_segment_size); + check(Extracting the group segment size from the executable, err); + err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_segment_size); + check(Extracting the private segment from the executable, err); + if (fft_kernarg_address == NULL) { + err = hsa_memory_allocate(plan->kernarg_region, kernarg_segment_size, (void**)&fft_kernarg_address); + check(Allocating resampling kernel argument memory buffer, err); + //printf("*** Allocated %d args size=%d\n",kernarg_segment_size,sizeof(fft_args_t)); + } + // Pass parameters + fft_kernarg_address->global_offset_0 = 0; + fft_kernarg_address->global_offset_1 = 0; + fft_kernarg_address->global_offset_2 = 0; + fft_kernarg_address->in = (float*)memObj[currRead]; + fft_kernarg_address->out = (float*)memObj[currWrite]; + fft_kernarg_address->dir = dir; + fft_kernarg_address->S = s; + fft_kernarg_address->cossinLUT1 = (float*)plan->cossin_LUT_d1; + fft_kernarg_address->cossinLUT2 = (float*)plan->cossin_LUT_d2; + + uint64_t index = hsa_queue_load_write_index_relaxed(plan->queue); + hsa_signal_t signal; + err=hsa_signal_create(1, 0,NULL, &signal); + check(Creating a HSA signal, err); + const uint32_t queueMask = plan->queue->size - 1; + + // create HSA packet + hsa_kernel_dispatch_packet_t* dispatch_packet = &(((hsa_kernel_dispatch_packet_t*)(plan->queue->base_address))[index&queueMask]); + dispatch_packet->header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; + dispatch_packet->header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; + dispatch_packet->setup |= 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; + dispatch_packet->workgroup_size_x = (uint16_t)lWorkItems; + dispatch_packet->workgroup_size_y = (uint16_t)1; + dispatch_packet->workgroup_size_z = (uint16_t)1; + dispatch_packet->grid_size_x = (uint32_t) gWorkItems; + dispatch_packet->grid_size_y = 1; + dispatch_packet->grid_size_z = 1; + dispatch_packet->completion_signal = signal; + dispatch_packet->kernel_object = kernel_object; + dispatch_packet->kernarg_address = (void*) fft_kernarg_address; + dispatch_packet->private_segment_size = private_segment_size; + dispatch_packet->group_segment_size = group_segment_size; + __atomic_store_n((uint8_t*)(&dispatch_packet->header), (uint8_t)HSA_PACKET_TYPE_KERNEL_DISPATCH, __ATOMIC_RELEASE); + + /* + * Increment the write index and ring the doorbell to dispatch the kernel. + */ + hsa_queue_store_write_index_relaxed(plan->queue, index+1); + hsa_signal_store_relaxed(plan->queue->doorbell_signal, index); + check(Dispatching the clFFT kernel, err); + + /* + * Wait on the dispatch completion signal until the kernel is finished. + */ + hsa_signal_value_t value = myhsa_signal_wait_acquire(signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_BLOCKED); + + err=hsa_signal_destroy(signal); + check(Destroying the signal, err); + +/* + err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj[currRead]); + err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj[currWrite]); + err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_int), &dir); + err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_int), &s); + err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_mem), &(plan->cossin_LUT_d1)); + err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_mem), &(plan->cossin_LUT_d2)); + + err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL); +*/ + if(err) + return err; + currRead = (currWrite == 1) ? 1 : 2; + currWrite = (currWrite == 1) ? 2 : 1; + + kernelInfo = kernelInfo->next; + } + } + // no dram shuffle (transpose required) transform + // all kernels can execute in-place. + else { + while(kernelInfo) + { + s = batchSize; + getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems); + printf("*** Call %s\n",kernelInfo->kernel_name); + char name[1024]; + sprintf(name,"&__OpenCL_%s_kernel",kernelInfo->kernel_name); + hsa_executable_symbol_t symbol; + hsa_status_t err = hsa_executable_get_symbol(plan->executable, NULL, name, plan->agent, 0, &symbol); + check(Extract the symbol from FFT kernel, err); + + /* + * Extract dispatch information from the symbol + */ + uint64_t kernel_object; + uint32_t kernarg_segment_size; + uint32_t group_segment_size; + uint32_t private_segment_size; + err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel_object); + check(Extracting the symbol from the executable, err); + err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &kernarg_segment_size); + check(Extracting the kernarg segment size from the executable, err); + err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_segment_size); + check(Extracting the group segment size from the executable, err); + err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_segment_size); + check(Extracting the private segment from the executable, err); + if (fft_kernarg_address == NULL) { + err = hsa_memory_allocate(plan->kernarg_region, kernarg_segment_size, (void**)&fft_kernarg_address); + check(Allocating resampling kernel argument memory buffer, err); + printf("*** Allocated %d args size=%d\n",kernarg_segment_size,sizeof(fft_args_t)); + } + // Pass parameters + fft_kernarg_address->global_offset_0 = 0; + fft_kernarg_address->global_offset_1 = 0; + fft_kernarg_address->global_offset_2 = 0; + fft_kernarg_address->in = (float*)memObj[currRead]; + fft_kernarg_address->out = (float*)memObj[currWrite]; + fft_kernarg_address->dir = dir; + fft_kernarg_address->S = s; + fft_kernarg_address->cossinLUT1 = (float*)plan->cossin_LUT_d1; + fft_kernarg_address->cossinLUT2 = (float*)plan->cossin_LUT_d2; + + uint64_t index = hsa_queue_load_write_index_relaxed(plan->queue); + hsa_signal_t signal; + err=hsa_signal_create(1, 0,NULL, &signal); + check(Creating a HSA signal, err); + const uint32_t queueMask = plan->queue->size - 1; + + // create HSA packet + hsa_kernel_dispatch_packet_t* dispatch_packet = &(((hsa_kernel_dispatch_packet_t*)(plan->queue->base_address))[index&queueMask]); + dispatch_packet->header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; + dispatch_packet->header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; + dispatch_packet->setup |= 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; + dispatch_packet->workgroup_size_x = (uint16_t)lWorkItems; + dispatch_packet->workgroup_size_y = (uint16_t)1; + dispatch_packet->workgroup_size_z = (uint16_t)1; + dispatch_packet->grid_size_x = (uint32_t) gWorkItems; + dispatch_packet->grid_size_y = 1; + dispatch_packet->grid_size_z = 1; + dispatch_packet->completion_signal = signal; + dispatch_packet->kernel_object = kernel_object; + dispatch_packet->kernarg_address = (void*) fft_kernarg_address; + dispatch_packet->private_segment_size = private_segment_size; + dispatch_packet->group_segment_size = group_segment_size; + __atomic_store_n((uint8_t*)(&dispatch_packet->header), (uint8_t)HSA_PACKET_TYPE_KERNEL_DISPATCH, __ATOMIC_RELEASE); + + /* + * Increment the write index and ring the doorbell to dispatch the kernel. + */ + hsa_queue_store_write_index_relaxed(plan->queue, index+1); + hsa_signal_store_relaxed(plan->queue->doorbell_signal, index); + check(Dispatching the clFFT kernel, err); + + /* + * Wait on the dispatch completion signal until the kernel is finished. + */ + hsa_signal_value_t value = myhsa_signal_wait_acquire(signal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_BLOCKED); + + err=hsa_signal_destroy(signal); + check(Destroying the signal, err); +/* + err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj[currRead]); + err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj[currWrite]); + err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_int), &dir); + err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_int), &s); + err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_mem), &(plan->cossin_LUT_d1)); + err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_mem), &(plan->cossin_LUT_d2)); + + err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL); +*/ + if(err) + return err; + + currRead = 1; + currWrite = 1; + + kernelInfo = kernelInfo->next; + } + } + + return err; +} + +static void +deleteKernelInfo(cl_fft_kernel_info *kInfo) +{ + if(kInfo) + { + if(kInfo->kernel_name) + free(kInfo->kernel_name); + //if(kInfo->kernel) + // clReleaseKernel(kInfo->kernel); + free(kInfo); + } +} + +static void +destroy_plan(cl_fft_plan *Plan) +{ + cl_fft_kernel_info *kernel_info = Plan->kernel_info; + + while(kernel_info) + { + cl_fft_kernel_info *tmp = kernel_info->next; + deleteKernelInfo(kernel_info); + kernel_info = tmp; + } + + Plan->kernel_info = NULL; + + if(Plan->kernel_string) + { + delete Plan->kernel_string; + Plan->kernel_string = NULL; + } + if(Plan->twist_kernel) + { + //clReleaseKernel(Plan->twist_kernel); + Plan->twist_kernel = 0; + } + if(Plan->program) + { + //clReleaseProgram(Plan->program); + Plan->program = 0; + } + if(Plan->tempmemobj) + { + hsa_memory_free(Plan->tempmemobj); + Plan->tempmemobj = NULL; + } + if(Plan->tempmemobj_real) + { + hsa_memory_free((void*)Plan->tempmemobj_real); + Plan->tempmemobj_real = NULL; + } + if(Plan->tempmemobj_imag) + { + hsa_memory_free((void*)Plan->tempmemobj_imag); + Plan->tempmemobj_imag = NULL; + } + + if(Plan->cossin_LUT_d1) + { + hsa_memory_free((void*)Plan->cossin_LUT_d1); + } + + if(Plan->cossin_LUT_d2) + { + hsa_memory_free((void*)Plan->cossin_LUT_d2); + } + +} +void +clFFT_DestroyPlan(clFFT_Plan plan) +{ + cl_fft_plan *Plan = (cl_fft_plan *) plan; + if(Plan) + { + destroy_plan(Plan); + //clReleaseContext(Plan->context); + free(Plan); + } +} diff --git a/src/clFFT.h b/src/clFFT.h new file mode 100644 index 0000000000000000000000000000000000000000..9f6384976c5f8449432aaede5ca0d942da9ea1a5 --- /dev/null +++ b/src/clFFT.h @@ -0,0 +1,190 @@ +/*************************************************************************** + * Copyright (C) 2012 by Oliver Bock,Heinz-Bernd Eggenstein * + * oliver.bock[AT]aei.mpg.de * + * heinz-bernd.eggenstein[AT]aei.mpg.de * + * * + * This file is part of libclfft (originally for Einstein@Home) * + * Derived from clFFT, (C) Apple, see notice below. * + * * + * * + * libclfft is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See * + * notice below for more details. * + * * + ***************************************************************************/ +// +// File: clFFT.h +// +// Version: <1.0> +// +// Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple") +// in consideration of your agreement to the following terms, and your use, +// installation, modification or redistribution of this Apple software +// constitutes acceptance of these terms. If you do not agree with these +// terms, please do not use, install, modify or redistribute this Apple +// software. +// +// In consideration of your agreement to abide by the following terms, and +// subject to these terms, Apple grants you a personal, non - exclusive +// license, under Apple's copyrights in this original Apple software ( the +// "Apple Software" ), to use, reproduce, modify and redistribute the Apple +// Software, with or without modifications, in source and / or binary forms; +// provided that if you redistribute the Apple Software in its entirety and +// without modifications, you must retain this notice and the following text +// and disclaimers in all such redistributions of the Apple Software. Neither +// the name, trademarks, service marks or logos of Apple Inc. may be used to +// endorse or promote products derived from the Apple Software without specific +// prior written permission from Apple. Except as expressly stated in this +// notice, no other rights or licenses, express or implied, are granted by +// Apple herein, including but not limited to any patent rights that may be +// infringed by your derivative works or by other works in which the Apple +// Software may be incorporated. +// +// The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO +// WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED +// WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION +// ALONE OR IN COMBINATION WITH YOUR PRODUCTS. +// +// IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR +// CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION +// AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER +// UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR +// OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Copyright ( C ) 2008 Apple Inc. All Rights Reserved. +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +#ifndef __CLFFT_H +#define __CLFFT_H + +#include <stdio.h> +#ifdef __cplusplus +extern "C" { +#endif + +#include "hsa.h" +#include "hsa_ext_finalize.h" + +// Mini OpenCL +typedef int cl_int; +typedef unsigned int cl_uint; +typedef int cl_context; +typedef int cl_kernel; +typedef int cl_program; +typedef void * cl_mem; +typedef float cl_float; +#define CL_SUCCESS 0 +#define CL_INVALID_VALUE -30 + + +// XForm type +typedef enum +{ + clFFT_Forward = -1, + clFFT_Inverse = 1 + +}clFFT_Direction; + +// XForm dimension +typedef enum +{ + clFFT_1D = 0, + clFFT_2D = 1, + clFFT_3D = 3 + +}clFFT_Dimension; + +// XForm Data type +typedef enum +{ + clFFT_SplitComplexFormat = 0, + clFFT_InterleavedComplexFormat = 1 +}clFFT_DataFormat; + +typedef enum +{ + clFFT_native_trig = 0, + clFFT_sincosfunc = 1, + clFFT_BigLUT = 2, + clFFT_TaylorLUT = 3, + clFFT_RFU4 = 4, + clFFT_RFU5 = 5, + clFFT_RFU6 = 6, + clFFT_RFU7 = 7 +} clFFT_TwiddleFactorMethod; + +typedef struct +{ + unsigned int x; + unsigned int y; + unsigned int z; +}clFFT_Dim3; + +typedef struct +{ + float *real; + float *imag; +} clFFT_SplitComplex; + +typedef struct +{ + float real; + float imag; +}clFFT_Complex; + +struct __attribute__ ((aligned(16))) fft_args_t { + uint64_t global_offset_0; + uint64_t global_offset_1; + uint64_t global_offset_2; + uint64_t printf_buffer; + uint64_t vqueue_pointer; + uint64_t aqlwrap_pointer; + float *in; + float *out; + int dir; + int S; + float *cossinLUT1; + float *cossinLUT2; +}; + +typedef void* clFFT_Plan; + +clFFT_Plan clFFT_CreatePlanAdvHSA(hsa_region_t global_region,hsa_executable_t executable,hsa_agent_t agent,hsa_region_t kernarg_region,hsa_queue_t* queue, + clFFT_Dim3 n, clFFT_Dimension dim, clFFT_DataFormat dataFormat, unsigned long flags); +cl_int +clFFT_ExecuteInterleavedHSA( clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir,float *data_in, float *data_out); + +void clFFT_DestroyPlan( clFFT_Plan plan ); + +#if 0 +clFFT_Plan clFFT_CreatePlan( cl_context context, clFFT_Dim3 n, clFFT_Dimension dim, clFFT_DataFormat dataFormat, cl_int *error_code ); + +clFFT_Plan clFFT_CreatePlanAdv( cl_context context, clFFT_Dim3 n, clFFT_Dimension dim, clFFT_DataFormat dataFormat, unsigned long flags, cl_int *error_code ); + + + + +cl_int clFFT_ExecutePlannar( cl_command_queue queue, clFFT_Plan plan, cl_int batchSize, clFFT_Direction dir, + cl_mem data_in_real, cl_mem data_in_imag, cl_mem data_out_real, cl_mem data_out_imag, + cl_int num_events, cl_event *event_list, cl_event *event ); + +cl_int clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array, + size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir); + + +cl_int clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real, cl_mem array_imag, + size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir); + +void clFFT_DumpPlan( clFFT_Plan plan, FILE *file); +#endif +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/clFFT_131072.cl b/src/clFFT_131072.cl new file mode 100644 index 0000000000000000000000000000000000000000..e7cffad8d5daa7063b921f4a3a9c3382edfea675 --- /dev/null +++ b/src/clFFT_131072.cl @@ -0,0 +1,1356 @@ +#ifndef M_PI +#define M_PI 0x1.921fb54442d18p+1 +#endif +#define complexMul(a,b) ((float2)(mad(-(a).y, (b).y, (a).x * (b).x), mad((a).y, (b).x, (a).x * (b).y))) + +#define cos_sinLUT1(res,dir,i,cossinLUT)\ +{\ +(res)=(float2)((cossinLUT)[i].x , (dir)*(cossinLUT)[i].y);\ +} + +#define cos_sinLUT2(res,dir,_i,_k,cossinLUT1,cossinLUT2) \ +{ float _sin_1= (cossinLUT1)[_i].y; \ + float _sin_2= (cossinLUT2)[_k].y; \ + float _cos_1= (cossinLUT1)[_i].x; \ + float _cos_2= (cossinLUT2)[_k].x; \ + float _cos_res = _cos_1 * _cos_2 - _sin_1 * _sin_2; \ + float _sin_res = (dir) * (_sin_1 * _cos_2 + _cos_1 * _sin_2); \ + (res)=(float2)(_cos_res,_sin_res); \ +} + +#define conj(a) ((float2)((a).x, -(a).y)) +#define conjTransp(a) ((float2)(-(a).y, (a).x)) + +#define fftKernel2(a,dir) \ +{ \ + float2 c = (a)[0]; \ + (a)[0] = c + (a)[1]; \ + (a)[1] = c - (a)[1]; \ +} + +#define fftKernel2S(d1,d2,dir) \ +{ \ + float2 c = (d1); \ + (d1) = c + (d2); \ + (d2) = c - (d2); \ +} + +#define fftKernel4(a,dir) \ +{ \ + fftKernel2S((a)[0], (a)[2], dir); \ + fftKernel2S((a)[1], (a)[3], dir); \ + fftKernel2S((a)[0], (a)[1], dir); \ + (a)[3] = (float2)(dir)*(conjTransp((a)[3])); \ + fftKernel2S((a)[2], (a)[3], dir); \ + float2 c = (a)[1]; \ + (a)[1] = (a)[2]; \ + (a)[2] = c; \ +} + +#define fftKernel4s(a0,a1,a2,a3,dir) \ +{ \ + fftKernel2S((a0), (a2), dir); \ + fftKernel2S((a1), (a3), dir); \ + fftKernel2S((a0), (a1), dir); \ + (a3) = (float2)(dir)*(conjTransp((a3))); \ + fftKernel2S((a2), (a3), dir); \ + float2 c = (a1); \ + (a1) = (a2); \ + (a2) = c; \ +} + +#define bitreverse8(a) \ +{ \ + float2 c; \ + c = (a)[1]; \ + (a)[1] = (a)[4]; \ + (a)[4] = c; \ + c = (a)[3]; \ + (a)[3] = (a)[6]; \ + (a)[6] = c; \ +} + +#define fftKernel8(a,dir) \ +{ \ + const float2 w1 = (float2)(0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f); \ + const float2 w3 = (float2)(-0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f); \ + float2 c; \ + fftKernel2S((a)[0], (a)[4], dir); \ + fftKernel2S((a)[1], (a)[5], dir); \ + fftKernel2S((a)[2], (a)[6], dir); \ + fftKernel2S((a)[3], (a)[7], dir); \ + (a)[5] = complexMul(w1, (a)[5]); \ + (a)[6] = (float2)(dir)*(conjTransp((a)[6])); \ + (a)[7] = complexMul(w3, (a)[7]); \ + fftKernel2S((a)[0], (a)[2], dir); \ + fftKernel2S((a)[1], (a)[3], dir); \ + fftKernel2S((a)[4], (a)[6], dir); \ + fftKernel2S((a)[5], (a)[7], dir); \ + (a)[3] = (float2)(dir)*(conjTransp((a)[3])); \ + (a)[7] = (float2)(dir)*(conjTransp((a)[7])); \ + fftKernel2S((a)[0], (a)[1], dir); \ + fftKernel2S((a)[2], (a)[3], dir); \ + fftKernel2S((a)[4], (a)[5], dir); \ + fftKernel2S((a)[6], (a)[7], dir); \ + bitreverse8((a)); \ +} + +#define bitreverse4x4(a) \ +{ \ + float2 c; \ + c = (a)[1]; (a)[1] = (a)[4]; (a)[4] = c; \ + c = (a)[2]; (a)[2] = (a)[8]; (a)[8] = c; \ + c = (a)[3]; (a)[3] = (a)[12]; (a)[12] = c; \ + c = (a)[6]; (a)[6] = (a)[9]; (a)[9] = c; \ + c = (a)[7]; (a)[7] = (a)[13]; (a)[13] = c; \ + c = (a)[11]; (a)[11] = (a)[14]; (a)[14] = c; \ +} + +#define fftKernel16(a,dir) \ +{ \ + const float w0 = 0x1.d906bcp-1f; \ + const float w1 = 0x1.87de2ap-2f; \ + const float w2 = 0x1.6a09e6p-1f; \ + fftKernel4s((a)[0], (a)[4], (a)[8], (a)[12], dir); \ + fftKernel4s((a)[1], (a)[5], (a)[9], (a)[13], dir); \ + fftKernel4s((a)[2], (a)[6], (a)[10], (a)[14], dir); \ + fftKernel4s((a)[3], (a)[7], (a)[11], (a)[15], dir); \ + (a)[5] = complexMul((a)[5], (float2)(w0, dir*w1)); \ + (a)[6] = complexMul((a)[6], (float2)(w2, dir*w2)); \ + (a)[7] = complexMul((a)[7], (float2)(w1, dir*w0)); \ + (a)[9] = complexMul((a)[9], (float2)(w2, dir*w2)); \ + (a)[10] = (float2)(dir)*(conjTransp((a)[10])); \ + (a)[11] = complexMul((a)[11], (float2)(-w2, dir*w2)); \ + (a)[13] = complexMul((a)[13], (float2)(w1, dir*w0)); \ + (a)[14] = complexMul((a)[14], (float2)(-w2, dir*w2)); \ + (a)[15] = complexMul((a)[15], (float2)(-w0, dir*-w1)); \ + fftKernel4((a), dir); \ + fftKernel4((a) + 4, dir); \ + fftKernel4((a) + 8, dir); \ + fftKernel4((a) + 12, dir); \ + bitreverse4x4((a)); \ +} + +#define bitreverse32(a) \ +{ \ + float2 c1, c2; \ + c1 = (a)[2]; (a)[2] = (a)[1]; c2 = (a)[4]; (a)[4] = c1; c1 = (a)[8]; (a)[8] = c2; c2 = (a)[16]; (a)[16] = c1; (a)[1] = c2; \ + c1 = (a)[6]; (a)[6] = (a)[3]; c2 = (a)[12]; (a)[12] = c1; c1 = (a)[24]; (a)[24] = c2; c2 = (a)[17]; (a)[17] = c1; (a)[3] = c2; \ + c1 = (a)[10]; (a)[10] = (a)[5]; c2 = (a)[20]; (a)[20] = c1; c1 = (a)[9]; (a)[9] = c2; c2 = (a)[18]; (a)[18] = c1; (a)[5] = c2; \ + c1 = (a)[14]; (a)[14] = (a)[7]; c2 = (a)[28]; (a)[28] = c1; c1 = (a)[25]; (a)[25] = c2; c2 = (a)[19]; (a)[19] = c1; (a)[7] = c2; \ + c1 = (a)[22]; (a)[22] = (a)[11]; c2 = (a)[13]; (a)[13] = c1; c1 = (a)[26]; (a)[26] = c2; c2 = (a)[21]; (a)[21] = c1; (a)[11] = c2; \ + c1 = (a)[30]; (a)[30] = (a)[15]; c2 = (a)[29]; (a)[29] = c1; c1 = (a)[27]; (a)[27] = c2; c2 = (a)[23]; (a)[23] = c1; (a)[15] = c2; \ +} + +#define fftKernel32(a,dir) \ +{ \ + fftKernel2S((a)[0], (a)[16], dir); \ + fftKernel2S((a)[1], (a)[17], dir); \ + fftKernel2S((a)[2], (a)[18], dir); \ + fftKernel2S((a)[3], (a)[19], dir); \ + fftKernel2S((a)[4], (a)[20], dir); \ + fftKernel2S((a)[5], (a)[21], dir); \ + fftKernel2S((a)[6], (a)[22], dir); \ + fftKernel2S((a)[7], (a)[23], dir); \ + fftKernel2S((a)[8], (a)[24], dir); \ + fftKernel2S((a)[9], (a)[25], dir); \ + fftKernel2S((a)[10], (a)[26], dir); \ + fftKernel2S((a)[11], (a)[27], dir); \ + fftKernel2S((a)[12], (a)[28], dir); \ + fftKernel2S((a)[13], (a)[29], dir); \ + fftKernel2S((a)[14], (a)[30], dir); \ + fftKernel2S((a)[15], (a)[31], dir); \ + (a)[17] = complexMul((a)[17], (float2)(0x1.f6297cp-1f, dir*0x1.8f8b84p-3f)); \ + (a)[18] = complexMul((a)[18], (float2)(0x1.d906bcp-1f, dir*0x1.87de2ap-2f)); \ + (a)[19] = complexMul((a)[19], (float2)(0x1.a9b662p-1f, dir*0x1.1c73b4p-1f)); \ + (a)[20] = complexMul((a)[20], (float2)(0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f)); \ + (a)[21] = complexMul((a)[21], (float2)(0x1.1c73b4p-1f, dir*0x1.a9b662p-1f)); \ + (a)[22] = complexMul((a)[22], (float2)(0x1.87de2ap-2f, dir*0x1.d906bcp-1f)); \ + (a)[23] = complexMul((a)[23], (float2)(0x1.8f8b84p-3f, dir*0x1.f6297cp-1f)); \ + (a)[24] = complexMul((a)[24], (float2)(0x0p+0f, dir*0x1p+0f)); \ + (a)[25] = complexMul((a)[25], (float2)(-0x1.8f8b84p-3f, dir*0x1.f6297cp-1f)); \ + (a)[26] = complexMul((a)[26], (float2)(-0x1.87de2ap-2f, dir*0x1.d906bcp-1f)); \ + (a)[27] = complexMul((a)[27], (float2)(-0x1.1c73b4p-1f, dir*0x1.a9b662p-1f)); \ + (a)[28] = complexMul((a)[28], (float2)(-0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f)); \ + (a)[29] = complexMul((a)[29], (float2)(-0x1.a9b662p-1f, dir*0x1.1c73b4p-1f)); \ + (a)[30] = complexMul((a)[30], (float2)(-0x1.d906bcp-1f, dir*0x1.87de2ap-2f)); \ + (a)[31] = complexMul((a)[31], (float2)(-0x1.f6297cp-1f, dir*0x1.8f8b84p-3f)); \ + fftKernel16((a), dir); \ + fftKernel16((a) + 16, dir); \ + bitreverse32((a)); \ +} + +__kernel void \ +clFFT_1DTwistInterleaved(__global float2 *in, unsigned int startRow, unsigned int numCols, unsigned int N, unsigned int numRowsToProcess, int dir) \ +{ \ + float2 a, w; \ + float ang; \ + unsigned int j; \ + unsigned int i = get_global_id(0); \ + unsigned int startIndex = i; \ + \ + if(i < numCols) \ + { \ + for(j = 0; j < numRowsToProcess; j++) \ + { \ + a = in[startIndex]; \ + ang = 2.0f * M_PI * dir * i * (startRow + j) / N; \ + w = (float2)(native_cos(ang), native_sin(ang)); \ + a = complexMul(a, w); \ + in[startIndex] = a; \ + startIndex += numCols; \ + } \ + } \ +} \ +__kernel void fft0(__global float2 *in, __global float2 *out, int dir, int S, __global float2 * cossinLUT1, __global float2 * cossinLUT2 ) +{ + __local float sMem[2064]; + int i, j, r, indexIn, indexOut, index, tid, bNum, xNum, k, l; + int s, ii, jj, offset; + float2 w; + float ang, angf, ang1; + __local float *lMemStore, *lMemLoad; + float2 a[16]; + int lId = get_local_id( 0 ); + int groupId = get_group_id( 0 ); + __local float2 cossin_T_LUT[256]; + int lLUTind= lId; + cossin_T_LUT[lLUTind]=cossinLUT2[lLUTind]; + lLUTind+=128; + cossin_T_LUT[lLUTind]=cossinLUT2[lLUTind]; + barrier(CLK_LOCAL_MEM_FENCE); +bNum = groupId & 63; +xNum = groupId >> 6; +indexIn = mul24(bNum, 16); +tid = indexIn; +i = tid >> 0; +j = tid & 0; +indexOut = mad24(i, 128, j); +indexIn += (xNum << 17); +indexOut += (xNum << 17); +tid = lId; +i = tid & 15; +j = tid >> 4; +indexIn += mad24(j, 1024, i); +in += indexIn; +a[0] = in[0]; +a[1] = in[8192]; +a[2] = in[16384]; +a[3] = in[24576]; +a[4] = in[32768]; +a[5] = in[40960]; +a[6] = in[49152]; +a[7] = in[57344]; +a[8] = in[65536]; +a[9] = in[73728]; +a[10] = in[81920]; +a[11] = in[90112]; +a[12] = in[98304]; +a[13] = in[106496]; +a[14] = in[114688]; +a[15] = in[122880]; +fftKernel16(a, dir); +{ int ang_index = (1 * ( j)) & 127; + int k = ang_index * 2; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[1] = complexMul(a[1], w); +{ int ang_index = (1 * ( j)) & 63; + int k = ang_index * 4; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[2] = complexMul(a[2], w); +{ int ang_index = (3 * ( j)) & 127; + int k = ang_index * 2; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[3] = complexMul(a[3], w); +{ int ang_index = (1 * ( j)) & 31; + int k = ang_index * 8; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[4] = complexMul(a[4], w); +{ int ang_index = (5 * ( j)) & 127; + int k = ang_index * 2; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[5] = complexMul(a[5], w); +{ int ang_index = (3 * ( j)) & 63; + int k = ang_index * 4; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[6] = complexMul(a[6], w); +{ int ang_index = (7 * ( j)) & 127; + int k = ang_index * 2; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[7] = complexMul(a[7], w); +{ int ang_index = (1 * ( j)) & 15; + int k = ang_index * 16; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[8] = complexMul(a[8], w); +{ int ang_index = (9 * ( j)) & 127; + int k = ang_index * 2; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[9] = complexMul(a[9], w); +{ int ang_index = (5 * ( j)) & 63; + int k = ang_index * 4; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[10] = complexMul(a[10], w); +{ int ang_index = (11 * ( j)) & 127; + int k = ang_index * 2; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[11] = complexMul(a[11], w); +{ int ang_index = (3 * ( j)) & 31; + int k = ang_index * 8; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[12] = complexMul(a[12], w); +{ int ang_index = (13 * ( j)) & 127; + int k = ang_index * 2; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[13] = complexMul(a[13], w); +{ int ang_index = (7 * ( j)) & 63; + int k = ang_index * 4; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[14] = complexMul(a[14], w); +{ int ang_index = (15 * ( j)) & 127; + int k = ang_index * 2; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[15] = complexMul(a[15], w); +indexIn = mad24(j, 256, i); +lMemStore = sMem + tid; +lMemLoad = sMem + indexIn; +lMemStore[0] = a[0].x; +lMemStore[128] = a[1].x; +lMemStore[256] = a[2].x; +lMemStore[384] = a[3].x; +lMemStore[512] = a[4].x; +lMemStore[640] = a[5].x; +lMemStore[768] = a[6].x; +lMemStore[896] = a[7].x; +lMemStore[1024] = a[8].x; +lMemStore[1152] = a[9].x; +lMemStore[1280] = a[10].x; +lMemStore[1408] = a[11].x; +lMemStore[1536] = a[12].x; +lMemStore[1664] = a[13].x; +lMemStore[1792] = a[14].x; +lMemStore[1920] = a[15].x; +barrier(CLK_LOCAL_MEM_FENCE); +a[0].x = lMemLoad[0]; +a[1].x = lMemLoad[16]; +a[2].x = lMemLoad[32]; +a[3].x = lMemLoad[48]; +a[4].x = lMemLoad[64]; +a[5].x = lMemLoad[80]; +a[6].x = lMemLoad[96]; +a[7].x = lMemLoad[112]; +a[8].x = lMemLoad[128]; +a[9].x = lMemLoad[144]; +a[10].x = lMemLoad[160]; +a[11].x = lMemLoad[176]; +a[12].x = lMemLoad[192]; +a[13].x = lMemLoad[208]; +a[14].x = lMemLoad[224]; +a[15].x = lMemLoad[240]; +barrier(CLK_LOCAL_MEM_FENCE); +lMemStore[0] = a[0].y; +lMemStore[128] = a[1].y; +lMemStore[256] = a[2].y; +lMemStore[384] = a[3].y; +lMemStore[512] = a[4].y; +lMemStore[640] = a[5].y; +lMemStore[768] = a[6].y; +lMemStore[896] = a[7].y; +lMemStore[1024] = a[8].y; +lMemStore[1152] = a[9].y; +lMemStore[1280] = a[10].y; +lMemStore[1408] = a[11].y; +lMemStore[1536] = a[12].y; +lMemStore[1664] = a[13].y; +lMemStore[1792] = a[14].y; +lMemStore[1920] = a[15].y; +barrier(CLK_LOCAL_MEM_FENCE); +a[0].y = lMemLoad[0]; +a[1].y = lMemLoad[16]; +a[2].y = lMemLoad[32]; +a[3].y = lMemLoad[48]; +a[4].y = lMemLoad[64]; +a[5].y = lMemLoad[80]; +a[6].y = lMemLoad[96]; +a[7].y = lMemLoad[112]; +a[8].y = lMemLoad[128]; +a[9].y = lMemLoad[144]; +a[10].y = lMemLoad[160]; +a[11].y = lMemLoad[176]; +a[12].y = lMemLoad[192]; +a[13].y = lMemLoad[208]; +a[14].y = lMemLoad[224]; +a[15].y = lMemLoad[240]; +barrier(CLK_LOCAL_MEM_FENCE); +fftKernel8(a + 0, dir); +fftKernel8(a + 8, dir); +l = ((bNum << 4) + i) >> 0; +k = j << 1; +{ int ang_index = (1 * ( l * (k + 0))) & 131071; + int k = (ang_index + 256) >> 9; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 512 ); + float mh=-4.7936900955e-05*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[0] = complexMul(a[0], w); +{ int ang_index = (1 * ( l * (k + 16))) & 131071; + int k = (ang_index + 256) >> 9; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 512 ); + float mh=-4.7936900955e-05*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[1] = complexMul(a[1], w); +{ int ang_index = (1 * ( l * (k + 32))) & 131071; + int k = (ang_index + 256) >> 9; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 512 ); + float mh=-4.7936900955e-05*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[2] = complexMul(a[2], w); +{ int ang_index = (1 * ( l * (k + 48))) & 131071; + int k = (ang_index + 256) >> 9; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 512 ); + float mh=-4.7936900955e-05*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[3] = complexMul(a[3], w); +{ int ang_index = (1 * ( l * (k + 64))) & 131071; + int k = (ang_index + 256) >> 9; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 512 ); + float mh=-4.7936900955e-05*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[4] = complexMul(a[4], w); +{ int ang_index = (1 * ( l * (k + 80))) & 131071; + int k = (ang_index + 256) >> 9; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 512 ); + float mh=-4.7936900955e-05*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[5] = complexMul(a[5], w); +{ int ang_index = (1 * ( l * (k + 96))) & 131071; + int k = (ang_index + 256) >> 9; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 512 ); + float mh=-4.7936900955e-05*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[6] = complexMul(a[6], w); +{ int ang_index = (1 * ( l * (k + 112))) & 131071; + int k = (ang_index + 256) >> 9; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 512 ); + float mh=-4.7936900955e-05*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[7] = complexMul(a[7], w); +{ int ang_index = (1 * ( l * (k + 1))) & 131071; + int k = (ang_index + 256) >> 9; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 512 ); + float mh=-4.7936900955e-05*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[8] = complexMul(a[8], w); +{ int ang_index = (1 * ( l * (k + 17))) & 131071; + int k = (ang_index + 256) >> 9; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 512 ); + float mh=-4.7936900955e-05*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[9] = complexMul(a[9], w); +{ int ang_index = (1 * ( l * (k + 33))) & 131071; + int k = (ang_index + 256) >> 9; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 512 ); + float mh=-4.7936900955e-05*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[10] = complexMul(a[10], w); +{ int ang_index = (1 * ( l * (k + 49))) & 131071; + int k = (ang_index + 256) >> 9; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 512 ); + float mh=-4.7936900955e-05*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[11] = complexMul(a[11], w); +{ int ang_index = (1 * ( l * (k + 65))) & 131071; + int k = (ang_index + 256) >> 9; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 512 ); + float mh=-4.7936900955e-05*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[12] = complexMul(a[12], w); +{ int ang_index = (1 * ( l * (k + 81))) & 131071; + int k = (ang_index + 256) >> 9; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 512 ); + float mh=-4.7936900955e-05*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[13] = complexMul(a[13], w); +{ int ang_index = (1 * ( l * (k + 97))) & 131071; + int k = (ang_index + 256) >> 9; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 512 ); + float mh=-4.7936900955e-05*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[14] = complexMul(a[14], w); +{ int ang_index = (1 * ( l * (k + 113))) & 131071; + int k = (ang_index + 256) >> 9; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 512 ); + float mh=-4.7936900955e-05*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[15] = complexMul(a[15], w); +lMemStore = sMem + mad24(i, 129, j << 1); +lMemLoad = sMem + mad24(tid >> 7, 129, tid & 127); +lMemStore[ 0] = a[0].x; +lMemStore[ 16] = a[1].x; +lMemStore[ 32] = a[2].x; +lMemStore[ 48] = a[3].x; +lMemStore[ 64] = a[4].x; +lMemStore[ 80] = a[5].x; +lMemStore[ 96] = a[6].x; +lMemStore[ 112] = a[7].x; +lMemStore[ 1] = a[8].x; +lMemStore[ 17] = a[9].x; +lMemStore[ 33] = a[10].x; +lMemStore[ 49] = a[11].x; +lMemStore[ 65] = a[12].x; +lMemStore[ 81] = a[13].x; +lMemStore[ 97] = a[14].x; +lMemStore[ 113] = a[15].x; +barrier(CLK_LOCAL_MEM_FENCE); +a[0].x = lMemLoad[0]; +a[1].x = lMemLoad[129]; +a[2].x = lMemLoad[258]; +a[3].x = lMemLoad[387]; +a[4].x = lMemLoad[516]; +a[5].x = lMemLoad[645]; +a[6].x = lMemLoad[774]; +a[7].x = lMemLoad[903]; +a[8].x = lMemLoad[1032]; +a[9].x = lMemLoad[1161]; +a[10].x = lMemLoad[1290]; +a[11].x = lMemLoad[1419]; +a[12].x = lMemLoad[1548]; +a[13].x = lMemLoad[1677]; +a[14].x = lMemLoad[1806]; +a[15].x = lMemLoad[1935]; +barrier(CLK_LOCAL_MEM_FENCE); +lMemStore[ 0] = a[0].y; +lMemStore[ 16] = a[1].y; +lMemStore[ 32] = a[2].y; +lMemStore[ 48] = a[3].y; +lMemStore[ 64] = a[4].y; +lMemStore[ 80] = a[5].y; +lMemStore[ 96] = a[6].y; +lMemStore[ 112] = a[7].y; +lMemStore[ 1] = a[8].y; +lMemStore[ 17] = a[9].y; +lMemStore[ 33] = a[10].y; +lMemStore[ 49] = a[11].y; +lMemStore[ 65] = a[12].y; +lMemStore[ 81] = a[13].y; +lMemStore[ 97] = a[14].y; +lMemStore[ 113] = a[15].y; +barrier(CLK_LOCAL_MEM_FENCE); +a[0].y = lMemLoad[0]; +a[1].y = lMemLoad[129]; +a[2].y = lMemLoad[258]; +a[3].y = lMemLoad[387]; +a[4].y = lMemLoad[516]; +a[5].y = lMemLoad[645]; +a[6].y = lMemLoad[774]; +a[7].y = lMemLoad[903]; +a[8].y = lMemLoad[1032]; +a[9].y = lMemLoad[1161]; +a[10].y = lMemLoad[1290]; +a[11].y = lMemLoad[1419]; +a[12].y = lMemLoad[1548]; +a[13].y = lMemLoad[1677]; +a[14].y = lMemLoad[1806]; +a[15].y = lMemLoad[1935]; +barrier(CLK_LOCAL_MEM_FENCE); +indexOut += tid; +out += indexOut; +out[0] = a[0]; +out[128] = a[1]; +out[256] = a[2]; +out[384] = a[3]; +out[512] = a[4]; +out[640] = a[5]; +out[768] = a[6]; +out[896] = a[7]; +out[1024] = a[8]; +out[1152] = a[9]; +out[1280] = a[10]; +out[1408] = a[11]; +out[1536] = a[12]; +out[1664] = a[13]; +out[1792] = a[14]; +out[1920] = a[15]; +} +__kernel void fft1(__global float2 *in, __global float2 *out, int dir, int S, __global float2 * cossinLUT1, __global float2 * cossinLUT2 ) +{ + __local float sMem[2048]; + int i, j, r, indexIn, indexOut, index, tid, bNum, xNum, k, l; + int s, ii, jj, offset; + float2 w; + float ang, angf, ang1; + __local float *lMemStore, *lMemLoad; + float2 a[16]; + int lId = get_local_id( 0 ); + int groupId = get_group_id( 0 ); + __local float2 cossin_T_LUT[256]; + int lLUTind= lId; + cossin_T_LUT[lLUTind]=cossinLUT2[lLUTind]; + lLUTind+=128; + cossin_T_LUT[lLUTind]=cossinLUT2[lLUTind]; + barrier(CLK_LOCAL_MEM_FENCE); +bNum = groupId & 63; +xNum = groupId >> 6; +indexIn = mul24(bNum, 16); +tid = indexIn; +i = tid >> 7; +j = tid & 127; +indexOut = mad24(i, 16384, j); +indexIn += (xNum << 17); +indexOut += (xNum << 17); +tid = lId; +i = tid & 15; +j = tid >> 4; +indexIn += mad24(j, 1024, i); +in += indexIn; +a[0] = in[0]; +a[1] = in[8192]; +a[2] = in[16384]; +a[3] = in[24576]; +a[4] = in[32768]; +a[5] = in[40960]; +a[6] = in[49152]; +a[7] = in[57344]; +a[8] = in[65536]; +a[9] = in[73728]; +a[10] = in[81920]; +a[11] = in[90112]; +a[12] = in[98304]; +a[13] = in[106496]; +a[14] = in[114688]; +a[15] = in[122880]; +fftKernel16(a, dir); +{ int ang_index = (1 * ( j)) & 127; + int k = ang_index * 2; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[1] = complexMul(a[1], w); +{ int ang_index = (1 * ( j)) & 63; + int k = ang_index * 4; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[2] = complexMul(a[2], w); +{ int ang_index = (3 * ( j)) & 127; + int k = ang_index * 2; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[3] = complexMul(a[3], w); +{ int ang_index = (1 * ( j)) & 31; + int k = ang_index * 8; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[4] = complexMul(a[4], w); +{ int ang_index = (5 * ( j)) & 127; + int k = ang_index * 2; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[5] = complexMul(a[5], w); +{ int ang_index = (3 * ( j)) & 63; + int k = ang_index * 4; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[6] = complexMul(a[6], w); +{ int ang_index = (7 * ( j)) & 127; + int k = ang_index * 2; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[7] = complexMul(a[7], w); +{ int ang_index = (1 * ( j)) & 15; + int k = ang_index * 16; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[8] = complexMul(a[8], w); +{ int ang_index = (9 * ( j)) & 127; + int k = ang_index * 2; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[9] = complexMul(a[9], w); +{ int ang_index = (5 * ( j)) & 63; + int k = ang_index * 4; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[10] = complexMul(a[10], w); +{ int ang_index = (11 * ( j)) & 127; + int k = ang_index * 2; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[11] = complexMul(a[11], w); +{ int ang_index = (3 * ( j)) & 31; + int k = ang_index * 8; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[12] = complexMul(a[12], w); +{ int ang_index = (13 * ( j)) & 127; + int k = ang_index * 2; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[13] = complexMul(a[13], w); +{ int ang_index = (7 * ( j)) & 63; + int k = ang_index * 4; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[14] = complexMul(a[14], w); +{ int ang_index = (15 * ( j)) & 127; + int k = ang_index * 2; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[15] = complexMul(a[15], w); +indexIn = mad24(j, 256, i); +lMemStore = sMem + tid; +lMemLoad = sMem + indexIn; +lMemStore[0] = a[0].x; +lMemStore[128] = a[1].x; +lMemStore[256] = a[2].x; +lMemStore[384] = a[3].x; +lMemStore[512] = a[4].x; +lMemStore[640] = a[5].x; +lMemStore[768] = a[6].x; +lMemStore[896] = a[7].x; +lMemStore[1024] = a[8].x; +lMemStore[1152] = a[9].x; +lMemStore[1280] = a[10].x; +lMemStore[1408] = a[11].x; +lMemStore[1536] = a[12].x; +lMemStore[1664] = a[13].x; +lMemStore[1792] = a[14].x; +lMemStore[1920] = a[15].x; +barrier(CLK_LOCAL_MEM_FENCE); +a[0].x = lMemLoad[0]; +a[1].x = lMemLoad[16]; +a[2].x = lMemLoad[32]; +a[3].x = lMemLoad[48]; +a[4].x = lMemLoad[64]; +a[5].x = lMemLoad[80]; +a[6].x = lMemLoad[96]; +a[7].x = lMemLoad[112]; +a[8].x = lMemLoad[128]; +a[9].x = lMemLoad[144]; +a[10].x = lMemLoad[160]; +a[11].x = lMemLoad[176]; +a[12].x = lMemLoad[192]; +a[13].x = lMemLoad[208]; +a[14].x = lMemLoad[224]; +a[15].x = lMemLoad[240]; +barrier(CLK_LOCAL_MEM_FENCE); +lMemStore[0] = a[0].y; +lMemStore[128] = a[1].y; +lMemStore[256] = a[2].y; +lMemStore[384] = a[3].y; +lMemStore[512] = a[4].y; +lMemStore[640] = a[5].y; +lMemStore[768] = a[6].y; +lMemStore[896] = a[7].y; +lMemStore[1024] = a[8].y; +lMemStore[1152] = a[9].y; +lMemStore[1280] = a[10].y; +lMemStore[1408] = a[11].y; +lMemStore[1536] = a[12].y; +lMemStore[1664] = a[13].y; +lMemStore[1792] = a[14].y; +lMemStore[1920] = a[15].y; +barrier(CLK_LOCAL_MEM_FENCE); +a[0].y = lMemLoad[0]; +a[1].y = lMemLoad[16]; +a[2].y = lMemLoad[32]; +a[3].y = lMemLoad[48]; +a[4].y = lMemLoad[64]; +a[5].y = lMemLoad[80]; +a[6].y = lMemLoad[96]; +a[7].y = lMemLoad[112]; +a[8].y = lMemLoad[128]; +a[9].y = lMemLoad[144]; +a[10].y = lMemLoad[160]; +a[11].y = lMemLoad[176]; +a[12].y = lMemLoad[192]; +a[13].y = lMemLoad[208]; +a[14].y = lMemLoad[224]; +a[15].y = lMemLoad[240]; +barrier(CLK_LOCAL_MEM_FENCE); +fftKernel8(a + 0, dir); +fftKernel8(a + 8, dir); +l = ((bNum << 4) + i) >> 7; +k = j << 1; +{ int ang_index = (1 * ( l * (k + 0))) & 1023; + int k = (ang_index + 2) >> 2; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 4 ); + float mh=-6.1359233223e-03*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[0] = complexMul(a[0], w); +{ int ang_index = (1 * ( l * (k + 16))) & 1023; + int k = (ang_index + 2) >> 2; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 4 ); + float mh=-6.1359233223e-03*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[1] = complexMul(a[1], w); +{ int ang_index = (1 * ( l * (k + 32))) & 1023; + int k = (ang_index + 2) >> 2; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 4 ); + float mh=-6.1359233223e-03*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[2] = complexMul(a[2], w); +{ int ang_index = (1 * ( l * (k + 48))) & 1023; + int k = (ang_index + 2) >> 2; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 4 ); + float mh=-6.1359233223e-03*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[3] = complexMul(a[3], w); +{ int ang_index = (1 * ( l * (k + 64))) & 1023; + int k = (ang_index + 2) >> 2; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 4 ); + float mh=-6.1359233223e-03*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[4] = complexMul(a[4], w); +{ int ang_index = (1 * ( l * (k + 80))) & 1023; + int k = (ang_index + 2) >> 2; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 4 ); + float mh=-6.1359233223e-03*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[5] = complexMul(a[5], w); +{ int ang_index = (1 * ( l * (k + 96))) & 1023; + int k = (ang_index + 2) >> 2; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 4 ); + float mh=-6.1359233223e-03*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[6] = complexMul(a[6], w); +{ int ang_index = (1 * ( l * (k + 112))) & 1023; + int k = (ang_index + 2) >> 2; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 4 ); + float mh=-6.1359233223e-03*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[7] = complexMul(a[7], w); +{ int ang_index = (1 * ( l * (k + 1))) & 1023; + int k = (ang_index + 2) >> 2; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 4 ); + float mh=-6.1359233223e-03*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[8] = complexMul(a[8], w); +{ int ang_index = (1 * ( l * (k + 17))) & 1023; + int k = (ang_index + 2) >> 2; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 4 ); + float mh=-6.1359233223e-03*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[9] = complexMul(a[9], w); +{ int ang_index = (1 * ( l * (k + 33))) & 1023; + int k = (ang_index + 2) >> 2; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 4 ); + float mh=-6.1359233223e-03*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[10] = complexMul(a[10], w); +{ int ang_index = (1 * ( l * (k + 49))) & 1023; + int k = (ang_index + 2) >> 2; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 4 ); + float mh=-6.1359233223e-03*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[11] = complexMul(a[11], w); +{ int ang_index = (1 * ( l * (k + 65))) & 1023; + int k = (ang_index + 2) >> 2; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 4 ); + float mh=-6.1359233223e-03*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[12] = complexMul(a[12], w); +{ int ang_index = (1 * ( l * (k + 81))) & 1023; + int k = (ang_index + 2) >> 2; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 4 ); + float mh=-6.1359233223e-03*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[13] = complexMul(a[13], w); +{ int ang_index = (1 * ( l * (k + 97))) & 1023; + int k = (ang_index + 2) >> 2; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 4 ); + float mh=-6.1359233223e-03*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[14] = complexMul(a[14], w); +{ int ang_index = (1 * ( l * (k + 113))) & 1023; + int k = (ang_index + 2) >> 2; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 4 ); + float mh=-6.1359233223e-03*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[15] = complexMul(a[15], w); +indexOut += mad24(j, 256, i); +out += indexOut; +out[0] = a[0]; +out[2048] = a[1]; +out[4096] = a[2]; +out[6144] = a[3]; +out[8192] = a[4]; +out[10240] = a[5]; +out[12288] = a[6]; +out[14336] = a[7]; +out[128] = a[8]; +out[2176] = a[9]; +out[4224] = a[10]; +out[6272] = a[11]; +out[8320] = a[12]; +out[10368] = a[13]; +out[12416] = a[14]; +out[14464] = a[15]; +} +__kernel void fft2(__global float2 *in, __global float2 *out, int dir, int S, __global float2 * cossinLUT1, __global float2 * cossinLUT2 ) +{ + int i, j, r, indexIn, indexOut, index, tid, bNum, xNum, k, l; + int s, ii, jj, offset; + float2 w; + float ang, angf, ang1; + __local float *lMemStore, *lMemLoad; + float2 a[8]; + int lId = get_local_id( 0 ); + int groupId = get_group_id( 0 ); +bNum = groupId & 63; +xNum = groupId >> 6; +indexIn = mul24(bNum, 256); +tid = indexIn; +i = tid >> 14; +j = tid & 16383; +indexOut = mad24(i, 131072, j); +indexIn += (xNum << 17); +indexOut += (xNum << 17); +tid = lId; +i = tid & 255; +j = tid >> 8; +indexIn += mad24(j, 16384, i); +in += indexIn; +a[0] = in[0]; +a[1] = in[16384]; +a[2] = in[32768]; +a[3] = in[49152]; +a[4] = in[65536]; +a[5] = in[81920]; +a[6] = in[98304]; +a[7] = in[114688]; +fftKernel8(a, dir); +indexOut += mad24(j, 131072, i); +out += indexOut; +out[0] = a[0]; +out[16384] = a[1]; +out[32768] = a[2]; +out[49152] = a[3]; +out[65536] = a[4]; +out[81920] = a[5]; +out[98304] = a[6]; +out[114688] = a[7]; +} diff --git a/src/clFFT_262144.cl b/src/clFFT_262144.cl new file mode 100644 index 0000000000000000000000000000000000000000..4591874c9fe26bacc7b0b84f5dea285ccedad2b0 --- /dev/null +++ b/src/clFFT_262144.cl @@ -0,0 +1,1404 @@ +#ifndef M_PI +#define M_PI 0x1.921fb54442d18p+1 +#endif +#define complexMul(a,b) ((float2)(mad(-(a).y, (b).y, (a).x * (b).x), mad((a).y, (b).x, (a).x * (b).y))) + +#define cos_sinLUT1(res,dir,i,cossinLUT)\ +{\ +(res)=(float2)((cossinLUT)[i].x , (dir)*(cossinLUT)[i].y);\ +} + +#define cos_sinLUT2(res,dir,_i,_k,cossinLUT1,cossinLUT2) \ +{ float _sin_1= (cossinLUT1)[_i].y; \ + float _sin_2= (cossinLUT2)[_k].y; \ + float _cos_1= (cossinLUT1)[_i].x; \ + float _cos_2= (cossinLUT2)[_k].x; \ + float _cos_res = _cos_1 * _cos_2 - _sin_1 * _sin_2; \ + float _sin_res = (dir) * (_sin_1 * _cos_2 + _cos_1 * _sin_2); \ + (res)=(float2)(_cos_res,_sin_res); \ +} + +#define conj(a) ((float2)((a).x, -(a).y)) +#define conjTransp(a) ((float2)(-(a).y, (a).x)) + +#define fftKernel2(a,dir) \ +{ \ + float2 c = (a)[0]; \ + (a)[0] = c + (a)[1]; \ + (a)[1] = c - (a)[1]; \ +} + +#define fftKernel2S(d1,d2,dir) \ +{ \ + float2 c = (d1); \ + (d1) = c + (d2); \ + (d2) = c - (d2); \ +} + +#define fftKernel4(a,dir) \ +{ \ + fftKernel2S((a)[0], (a)[2], dir); \ + fftKernel2S((a)[1], (a)[3], dir); \ + fftKernel2S((a)[0], (a)[1], dir); \ + (a)[3] = (float2)(dir)*(conjTransp((a)[3])); \ + fftKernel2S((a)[2], (a)[3], dir); \ + float2 c = (a)[1]; \ + (a)[1] = (a)[2]; \ + (a)[2] = c; \ +} + +#define fftKernel4s(a0,a1,a2,a3,dir) \ +{ \ + fftKernel2S((a0), (a2), dir); \ + fftKernel2S((a1), (a3), dir); \ + fftKernel2S((a0), (a1), dir); \ + (a3) = (float2)(dir)*(conjTransp((a3))); \ + fftKernel2S((a2), (a3), dir); \ + float2 c = (a1); \ + (a1) = (a2); \ + (a2) = c; \ +} + +#define bitreverse8(a) \ +{ \ + float2 c; \ + c = (a)[1]; \ + (a)[1] = (a)[4]; \ + (a)[4] = c; \ + c = (a)[3]; \ + (a)[3] = (a)[6]; \ + (a)[6] = c; \ +} + +#define fftKernel8(a,dir) \ +{ \ + const float2 w1 = (float2)(0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f); \ + const float2 w3 = (float2)(-0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f); \ + float2 c; \ + fftKernel2S((a)[0], (a)[4], dir); \ + fftKernel2S((a)[1], (a)[5], dir); \ + fftKernel2S((a)[2], (a)[6], dir); \ + fftKernel2S((a)[3], (a)[7], dir); \ + (a)[5] = complexMul(w1, (a)[5]); \ + (a)[6] = (float2)(dir)*(conjTransp((a)[6])); \ + (a)[7] = complexMul(w3, (a)[7]); \ + fftKernel2S((a)[0], (a)[2], dir); \ + fftKernel2S((a)[1], (a)[3], dir); \ + fftKernel2S((a)[4], (a)[6], dir); \ + fftKernel2S((a)[5], (a)[7], dir); \ + (a)[3] = (float2)(dir)*(conjTransp((a)[3])); \ + (a)[7] = (float2)(dir)*(conjTransp((a)[7])); \ + fftKernel2S((a)[0], (a)[1], dir); \ + fftKernel2S((a)[2], (a)[3], dir); \ + fftKernel2S((a)[4], (a)[5], dir); \ + fftKernel2S((a)[6], (a)[7], dir); \ + bitreverse8((a)); \ +} + +#define bitreverse4x4(a) \ +{ \ + float2 c; \ + c = (a)[1]; (a)[1] = (a)[4]; (a)[4] = c; \ + c = (a)[2]; (a)[2] = (a)[8]; (a)[8] = c; \ + c = (a)[3]; (a)[3] = (a)[12]; (a)[12] = c; \ + c = (a)[6]; (a)[6] = (a)[9]; (a)[9] = c; \ + c = (a)[7]; (a)[7] = (a)[13]; (a)[13] = c; \ + c = (a)[11]; (a)[11] = (a)[14]; (a)[14] = c; \ +} + +#define fftKernel16(a,dir) \ +{ \ + const float w0 = 0x1.d906bcp-1f; \ + const float w1 = 0x1.87de2ap-2f; \ + const float w2 = 0x1.6a09e6p-1f; \ + fftKernel4s((a)[0], (a)[4], (a)[8], (a)[12], dir); \ + fftKernel4s((a)[1], (a)[5], (a)[9], (a)[13], dir); \ + fftKernel4s((a)[2], (a)[6], (a)[10], (a)[14], dir); \ + fftKernel4s((a)[3], (a)[7], (a)[11], (a)[15], dir); \ + (a)[5] = complexMul((a)[5], (float2)(w0, dir*w1)); \ + (a)[6] = complexMul((a)[6], (float2)(w2, dir*w2)); \ + (a)[7] = complexMul((a)[7], (float2)(w1, dir*w0)); \ + (a)[9] = complexMul((a)[9], (float2)(w2, dir*w2)); \ + (a)[10] = (float2)(dir)*(conjTransp((a)[10])); \ + (a)[11] = complexMul((a)[11], (float2)(-w2, dir*w2)); \ + (a)[13] = complexMul((a)[13], (float2)(w1, dir*w0)); \ + (a)[14] = complexMul((a)[14], (float2)(-w2, dir*w2)); \ + (a)[15] = complexMul((a)[15], (float2)(-w0, dir*-w1)); \ + fftKernel4((a), dir); \ + fftKernel4((a) + 4, dir); \ + fftKernel4((a) + 8, dir); \ + fftKernel4((a) + 12, dir); \ + bitreverse4x4((a)); \ +} + +#define bitreverse32(a) \ +{ \ + float2 c1, c2; \ + c1 = (a)[2]; (a)[2] = (a)[1]; c2 = (a)[4]; (a)[4] = c1; c1 = (a)[8]; (a)[8] = c2; c2 = (a)[16]; (a)[16] = c1; (a)[1] = c2; \ + c1 = (a)[6]; (a)[6] = (a)[3]; c2 = (a)[12]; (a)[12] = c1; c1 = (a)[24]; (a)[24] = c2; c2 = (a)[17]; (a)[17] = c1; (a)[3] = c2; \ + c1 = (a)[10]; (a)[10] = (a)[5]; c2 = (a)[20]; (a)[20] = c1; c1 = (a)[9]; (a)[9] = c2; c2 = (a)[18]; (a)[18] = c1; (a)[5] = c2; \ + c1 = (a)[14]; (a)[14] = (a)[7]; c2 = (a)[28]; (a)[28] = c1; c1 = (a)[25]; (a)[25] = c2; c2 = (a)[19]; (a)[19] = c1; (a)[7] = c2; \ + c1 = (a)[22]; (a)[22] = (a)[11]; c2 = (a)[13]; (a)[13] = c1; c1 = (a)[26]; (a)[26] = c2; c2 = (a)[21]; (a)[21] = c1; (a)[11] = c2; \ + c1 = (a)[30]; (a)[30] = (a)[15]; c2 = (a)[29]; (a)[29] = c1; c1 = (a)[27]; (a)[27] = c2; c2 = (a)[23]; (a)[23] = c1; (a)[15] = c2; \ +} + +#define fftKernel32(a,dir) \ +{ \ + fftKernel2S((a)[0], (a)[16], dir); \ + fftKernel2S((a)[1], (a)[17], dir); \ + fftKernel2S((a)[2], (a)[18], dir); \ + fftKernel2S((a)[3], (a)[19], dir); \ + fftKernel2S((a)[4], (a)[20], dir); \ + fftKernel2S((a)[5], (a)[21], dir); \ + fftKernel2S((a)[6], (a)[22], dir); \ + fftKernel2S((a)[7], (a)[23], dir); \ + fftKernel2S((a)[8], (a)[24], dir); \ + fftKernel2S((a)[9], (a)[25], dir); \ + fftKernel2S((a)[10], (a)[26], dir); \ + fftKernel2S((a)[11], (a)[27], dir); \ + fftKernel2S((a)[12], (a)[28], dir); \ + fftKernel2S((a)[13], (a)[29], dir); \ + fftKernel2S((a)[14], (a)[30], dir); \ + fftKernel2S((a)[15], (a)[31], dir); \ + (a)[17] = complexMul((a)[17], (float2)(0x1.f6297cp-1f, dir*0x1.8f8b84p-3f)); \ + (a)[18] = complexMul((a)[18], (float2)(0x1.d906bcp-1f, dir*0x1.87de2ap-2f)); \ + (a)[19] = complexMul((a)[19], (float2)(0x1.a9b662p-1f, dir*0x1.1c73b4p-1f)); \ + (a)[20] = complexMul((a)[20], (float2)(0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f)); \ + (a)[21] = complexMul((a)[21], (float2)(0x1.1c73b4p-1f, dir*0x1.a9b662p-1f)); \ + (a)[22] = complexMul((a)[22], (float2)(0x1.87de2ap-2f, dir*0x1.d906bcp-1f)); \ + (a)[23] = complexMul((a)[23], (float2)(0x1.8f8b84p-3f, dir*0x1.f6297cp-1f)); \ + (a)[24] = complexMul((a)[24], (float2)(0x0p+0f, dir*0x1p+0f)); \ + (a)[25] = complexMul((a)[25], (float2)(-0x1.8f8b84p-3f, dir*0x1.f6297cp-1f)); \ + (a)[26] = complexMul((a)[26], (float2)(-0x1.87de2ap-2f, dir*0x1.d906bcp-1f)); \ + (a)[27] = complexMul((a)[27], (float2)(-0x1.1c73b4p-1f, dir*0x1.a9b662p-1f)); \ + (a)[28] = complexMul((a)[28], (float2)(-0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f)); \ + (a)[29] = complexMul((a)[29], (float2)(-0x1.a9b662p-1f, dir*0x1.1c73b4p-1f)); \ + (a)[30] = complexMul((a)[30], (float2)(-0x1.d906bcp-1f, dir*0x1.87de2ap-2f)); \ + (a)[31] = complexMul((a)[31], (float2)(-0x1.f6297cp-1f, dir*0x1.8f8b84p-3f)); \ + fftKernel16((a), dir); \ + fftKernel16((a) + 16, dir); \ + bitreverse32((a)); \ +} + +__kernel void \ +clFFT_1DTwistInterleaved(__global float2 *in, unsigned int startRow, unsigned int numCols, unsigned int N, unsigned int numRowsToProcess, int dir) \ +{ \ + float2 a, w; \ + float ang; \ + unsigned int j; \ + unsigned int i = get_global_id(0); \ + unsigned int startIndex = i; \ + \ + if(i < numCols) \ + { \ + for(j = 0; j < numRowsToProcess; j++) \ + { \ + a = in[startIndex]; \ + ang = 2.0f * M_PI * dir * i * (startRow + j) / N; \ + w = (float2)(native_cos(ang), native_sin(ang)); \ + a = complexMul(a, w); \ + in[startIndex] = a; \ + startIndex += numCols; \ + } \ + } \ +} \ +__kernel void fft0(__global float2 *in, __global float2 *out, int dir, int S, __global float2 * cossinLUT1, __global float2 * cossinLUT2 ) +{ + __local float sMem[2064]; + int i, j, r, indexIn, indexOut, index, tid, bNum, xNum, k, l; + int s, ii, jj, offset; + float2 w; + float ang, angf, ang1; + __local float *lMemStore, *lMemLoad; + float2 a[16]; + int lId = get_local_id( 0 ); + int groupId = get_group_id( 0 ); + __local float2 cossin_T_LUT[256]; + int lLUTind= lId; + cossin_T_LUT[lLUTind]=cossinLUT2[lLUTind]; + lLUTind+=128; + cossin_T_LUT[lLUTind]=cossinLUT2[lLUTind]; + barrier(CLK_LOCAL_MEM_FENCE); +bNum = groupId & 127; +xNum = groupId >> 7; +indexIn = mul24(bNum, 16); +tid = indexIn; +i = tid >> 0; +j = tid & 0; +indexOut = mad24(i, 128, j); +indexIn += (xNum << 18); +indexOut += (xNum << 18); +tid = lId; +i = tid & 15; +j = tid >> 4; +indexIn += mad24(j, 2048, i); +in += indexIn; +a[0] = in[0]; +a[1] = in[16384]; +a[2] = in[32768]; +a[3] = in[49152]; +a[4] = in[65536]; +a[5] = in[81920]; +a[6] = in[98304]; +a[7] = in[114688]; +a[8] = in[131072]; +a[9] = in[147456]; +a[10] = in[163840]; +a[11] = in[180224]; +a[12] = in[196608]; +a[13] = in[212992]; +a[14] = in[229376]; +a[15] = in[245760]; +fftKernel16(a, dir); +{ int ang_index = (1 * ( j)) & 127; + int k = ang_index * 2; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[1] = complexMul(a[1], w); +{ int ang_index = (1 * ( j)) & 63; + int k = ang_index * 4; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[2] = complexMul(a[2], w); +{ int ang_index = (3 * ( j)) & 127; + int k = ang_index * 2; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[3] = complexMul(a[3], w); +{ int ang_index = (1 * ( j)) & 31; + int k = ang_index * 8; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[4] = complexMul(a[4], w); +{ int ang_index = (5 * ( j)) & 127; + int k = ang_index * 2; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[5] = complexMul(a[5], w); +{ int ang_index = (3 * ( j)) & 63; + int k = ang_index * 4; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[6] = complexMul(a[6], w); +{ int ang_index = (7 * ( j)) & 127; + int k = ang_index * 2; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[7] = complexMul(a[7], w); +{ int ang_index = (1 * ( j)) & 15; + int k = ang_index * 16; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[8] = complexMul(a[8], w); +{ int ang_index = (9 * ( j)) & 127; + int k = ang_index * 2; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[9] = complexMul(a[9], w); +{ int ang_index = (5 * ( j)) & 63; + int k = ang_index * 4; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[10] = complexMul(a[10], w); +{ int ang_index = (11 * ( j)) & 127; + int k = ang_index * 2; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[11] = complexMul(a[11], w); +{ int ang_index = (3 * ( j)) & 31; + int k = ang_index * 8; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[12] = complexMul(a[12], w); +{ int ang_index = (13 * ( j)) & 127; + int k = ang_index * 2; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[13] = complexMul(a[13], w); +{ int ang_index = (7 * ( j)) & 63; + int k = ang_index * 4; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[14] = complexMul(a[14], w); +{ int ang_index = (15 * ( j)) & 127; + int k = ang_index * 2; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[15] = complexMul(a[15], w); +indexIn = mad24(j, 256, i); +lMemStore = sMem + tid; +lMemLoad = sMem + indexIn; +lMemStore[0] = a[0].x; +lMemStore[128] = a[1].x; +lMemStore[256] = a[2].x; +lMemStore[384] = a[3].x; +lMemStore[512] = a[4].x; +lMemStore[640] = a[5].x; +lMemStore[768] = a[6].x; +lMemStore[896] = a[7].x; +lMemStore[1024] = a[8].x; +lMemStore[1152] = a[9].x; +lMemStore[1280] = a[10].x; +lMemStore[1408] = a[11].x; +lMemStore[1536] = a[12].x; +lMemStore[1664] = a[13].x; +lMemStore[1792] = a[14].x; +lMemStore[1920] = a[15].x; +barrier(CLK_LOCAL_MEM_FENCE); +a[0].x = lMemLoad[0]; +a[1].x = lMemLoad[16]; +a[2].x = lMemLoad[32]; +a[3].x = lMemLoad[48]; +a[4].x = lMemLoad[64]; +a[5].x = lMemLoad[80]; +a[6].x = lMemLoad[96]; +a[7].x = lMemLoad[112]; +a[8].x = lMemLoad[128]; +a[9].x = lMemLoad[144]; +a[10].x = lMemLoad[160]; +a[11].x = lMemLoad[176]; +a[12].x = lMemLoad[192]; +a[13].x = lMemLoad[208]; +a[14].x = lMemLoad[224]; +a[15].x = lMemLoad[240]; +barrier(CLK_LOCAL_MEM_FENCE); +lMemStore[0] = a[0].y; +lMemStore[128] = a[1].y; +lMemStore[256] = a[2].y; +lMemStore[384] = a[3].y; +lMemStore[512] = a[4].y; +lMemStore[640] = a[5].y; +lMemStore[768] = a[6].y; +lMemStore[896] = a[7].y; +lMemStore[1024] = a[8].y; +lMemStore[1152] = a[9].y; +lMemStore[1280] = a[10].y; +lMemStore[1408] = a[11].y; +lMemStore[1536] = a[12].y; +lMemStore[1664] = a[13].y; +lMemStore[1792] = a[14].y; +lMemStore[1920] = a[15].y; +barrier(CLK_LOCAL_MEM_FENCE); +a[0].y = lMemLoad[0]; +a[1].y = lMemLoad[16]; +a[2].y = lMemLoad[32]; +a[3].y = lMemLoad[48]; +a[4].y = lMemLoad[64]; +a[5].y = lMemLoad[80]; +a[6].y = lMemLoad[96]; +a[7].y = lMemLoad[112]; +a[8].y = lMemLoad[128]; +a[9].y = lMemLoad[144]; +a[10].y = lMemLoad[160]; +a[11].y = lMemLoad[176]; +a[12].y = lMemLoad[192]; +a[13].y = lMemLoad[208]; +a[14].y = lMemLoad[224]; +a[15].y = lMemLoad[240]; +barrier(CLK_LOCAL_MEM_FENCE); +fftKernel8(a + 0, dir); +fftKernel8(a + 8, dir); +l = ((bNum << 4) + i) >> 0; +k = j << 1; +{ int ang_index = (1 * ( l * (k + 0))) & 262143; + int k = (ang_index + 512) >> 10; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 1024 ); + float mh=-2.3968450478e-05*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[0] = complexMul(a[0], w); +{ int ang_index = (1 * ( l * (k + 16))) & 262143; + int k = (ang_index + 512) >> 10; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 1024 ); + float mh=-2.3968450478e-05*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[1] = complexMul(a[1], w); +{ int ang_index = (1 * ( l * (k + 32))) & 262143; + int k = (ang_index + 512) >> 10; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 1024 ); + float mh=-2.3968450478e-05*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[2] = complexMul(a[2], w); +{ int ang_index = (1 * ( l * (k + 48))) & 262143; + int k = (ang_index + 512) >> 10; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 1024 ); + float mh=-2.3968450478e-05*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[3] = complexMul(a[3], w); +{ int ang_index = (1 * ( l * (k + 64))) & 262143; + int k = (ang_index + 512) >> 10; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 1024 ); + float mh=-2.3968450478e-05*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[4] = complexMul(a[4], w); +{ int ang_index = (1 * ( l * (k + 80))) & 262143; + int k = (ang_index + 512) >> 10; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 1024 ); + float mh=-2.3968450478e-05*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[5] = complexMul(a[5], w); +{ int ang_index = (1 * ( l * (k + 96))) & 262143; + int k = (ang_index + 512) >> 10; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 1024 ); + float mh=-2.3968450478e-05*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[6] = complexMul(a[6], w); +{ int ang_index = (1 * ( l * (k + 112))) & 262143; + int k = (ang_index + 512) >> 10; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 1024 ); + float mh=-2.3968450478e-05*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[7] = complexMul(a[7], w); +{ int ang_index = (1 * ( l * (k + 1))) & 262143; + int k = (ang_index + 512) >> 10; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 1024 ); + float mh=-2.3968450478e-05*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[8] = complexMul(a[8], w); +{ int ang_index = (1 * ( l * (k + 17))) & 262143; + int k = (ang_index + 512) >> 10; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 1024 ); + float mh=-2.3968450478e-05*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[9] = complexMul(a[9], w); +{ int ang_index = (1 * ( l * (k + 33))) & 262143; + int k = (ang_index + 512) >> 10; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 1024 ); + float mh=-2.3968450478e-05*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[10] = complexMul(a[10], w); +{ int ang_index = (1 * ( l * (k + 49))) & 262143; + int k = (ang_index + 512) >> 10; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 1024 ); + float mh=-2.3968450478e-05*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[11] = complexMul(a[11], w); +{ int ang_index = (1 * ( l * (k + 65))) & 262143; + int k = (ang_index + 512) >> 10; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 1024 ); + float mh=-2.3968450478e-05*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[12] = complexMul(a[12], w); +{ int ang_index = (1 * ( l * (k + 81))) & 262143; + int k = (ang_index + 512) >> 10; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 1024 ); + float mh=-2.3968450478e-05*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[13] = complexMul(a[13], w); +{ int ang_index = (1 * ( l * (k + 97))) & 262143; + int k = (ang_index + 512) >> 10; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 1024 ); + float mh=-2.3968450478e-05*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[14] = complexMul(a[14], w); +{ int ang_index = (1 * ( l * (k + 113))) & 262143; + int k = (ang_index + 512) >> 10; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 1024 ); + float mh=-2.3968450478e-05*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[15] = complexMul(a[15], w); +lMemStore = sMem + mad24(i, 129, j << 1); +lMemLoad = sMem + mad24(tid >> 7, 129, tid & 127); +lMemStore[ 0] = a[0].x; +lMemStore[ 16] = a[1].x; +lMemStore[ 32] = a[2].x; +lMemStore[ 48] = a[3].x; +lMemStore[ 64] = a[4].x; +lMemStore[ 80] = a[5].x; +lMemStore[ 96] = a[6].x; +lMemStore[ 112] = a[7].x; +lMemStore[ 1] = a[8].x; +lMemStore[ 17] = a[9].x; +lMemStore[ 33] = a[10].x; +lMemStore[ 49] = a[11].x; +lMemStore[ 65] = a[12].x; +lMemStore[ 81] = a[13].x; +lMemStore[ 97] = a[14].x; +lMemStore[ 113] = a[15].x; +barrier(CLK_LOCAL_MEM_FENCE); +a[0].x = lMemLoad[0]; +a[1].x = lMemLoad[129]; +a[2].x = lMemLoad[258]; +a[3].x = lMemLoad[387]; +a[4].x = lMemLoad[516]; +a[5].x = lMemLoad[645]; +a[6].x = lMemLoad[774]; +a[7].x = lMemLoad[903]; +a[8].x = lMemLoad[1032]; +a[9].x = lMemLoad[1161]; +a[10].x = lMemLoad[1290]; +a[11].x = lMemLoad[1419]; +a[12].x = lMemLoad[1548]; +a[13].x = lMemLoad[1677]; +a[14].x = lMemLoad[1806]; +a[15].x = lMemLoad[1935]; +barrier(CLK_LOCAL_MEM_FENCE); +lMemStore[ 0] = a[0].y; +lMemStore[ 16] = a[1].y; +lMemStore[ 32] = a[2].y; +lMemStore[ 48] = a[3].y; +lMemStore[ 64] = a[4].y; +lMemStore[ 80] = a[5].y; +lMemStore[ 96] = a[6].y; +lMemStore[ 112] = a[7].y; +lMemStore[ 1] = a[8].y; +lMemStore[ 17] = a[9].y; +lMemStore[ 33] = a[10].y; +lMemStore[ 49] = a[11].y; +lMemStore[ 65] = a[12].y; +lMemStore[ 81] = a[13].y; +lMemStore[ 97] = a[14].y; +lMemStore[ 113] = a[15].y; +barrier(CLK_LOCAL_MEM_FENCE); +a[0].y = lMemLoad[0]; +a[1].y = lMemLoad[129]; +a[2].y = lMemLoad[258]; +a[3].y = lMemLoad[387]; +a[4].y = lMemLoad[516]; +a[5].y = lMemLoad[645]; +a[6].y = lMemLoad[774]; +a[7].y = lMemLoad[903]; +a[8].y = lMemLoad[1032]; +a[9].y = lMemLoad[1161]; +a[10].y = lMemLoad[1290]; +a[11].y = lMemLoad[1419]; +a[12].y = lMemLoad[1548]; +a[13].y = lMemLoad[1677]; +a[14].y = lMemLoad[1806]; +a[15].y = lMemLoad[1935]; +barrier(CLK_LOCAL_MEM_FENCE); +indexOut += tid; +out += indexOut; +out[0] = a[0]; +out[128] = a[1]; +out[256] = a[2]; +out[384] = a[3]; +out[512] = a[4]; +out[640] = a[5]; +out[768] = a[6]; +out[896] = a[7]; +out[1024] = a[8]; +out[1152] = a[9]; +out[1280] = a[10]; +out[1408] = a[11]; +out[1536] = a[12]; +out[1664] = a[13]; +out[1792] = a[14]; +out[1920] = a[15]; +} +__kernel void fft1(__global float2 *in, __global float2 *out, int dir, int S, __global float2 * cossinLUT1, __global float2 * cossinLUT2 ) +{ + __local float sMem[2048]; + int i, j, r, indexIn, indexOut, index, tid, bNum, xNum, k, l; + int s, ii, jj, offset; + float2 w; + float ang, angf, ang1; + __local float *lMemStore, *lMemLoad; + float2 a[16]; + int lId = get_local_id( 0 ); + int groupId = get_group_id( 0 ); + __local float2 cossin_T_LUT[256]; + int lLUTind= lId; + cossin_T_LUT[lLUTind]=cossinLUT2[lLUTind]; + lLUTind+=128; + cossin_T_LUT[lLUTind]=cossinLUT2[lLUTind]; + barrier(CLK_LOCAL_MEM_FENCE); +bNum = groupId & 127; +xNum = groupId >> 7; +indexIn = mul24(bNum, 16); +tid = indexIn; +i = tid >> 7; +j = tid & 127; +indexOut = mad24(i, 16384, j); +indexIn += (xNum << 18); +indexOut += (xNum << 18); +tid = lId; +i = tid & 15; +j = tid >> 4; +indexIn += mad24(j, 2048, i); +in += indexIn; +a[0] = in[0]; +a[1] = in[16384]; +a[2] = in[32768]; +a[3] = in[49152]; +a[4] = in[65536]; +a[5] = in[81920]; +a[6] = in[98304]; +a[7] = in[114688]; +a[8] = in[131072]; +a[9] = in[147456]; +a[10] = in[163840]; +a[11] = in[180224]; +a[12] = in[196608]; +a[13] = in[212992]; +a[14] = in[229376]; +a[15] = in[245760]; +fftKernel16(a, dir); +{ int ang_index = (1 * ( j)) & 127; + int k = ang_index * 2; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[1] = complexMul(a[1], w); +{ int ang_index = (1 * ( j)) & 63; + int k = ang_index * 4; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[2] = complexMul(a[2], w); +{ int ang_index = (3 * ( j)) & 127; + int k = ang_index * 2; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[3] = complexMul(a[3], w); +{ int ang_index = (1 * ( j)) & 31; + int k = ang_index * 8; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[4] = complexMul(a[4], w); +{ int ang_index = (5 * ( j)) & 127; + int k = ang_index * 2; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[5] = complexMul(a[5], w); +{ int ang_index = (3 * ( j)) & 63; + int k = ang_index * 4; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[6] = complexMul(a[6], w); +{ int ang_index = (7 * ( j)) & 127; + int k = ang_index * 2; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[7] = complexMul(a[7], w); +{ int ang_index = (1 * ( j)) & 15; + int k = ang_index * 16; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[8] = complexMul(a[8], w); +{ int ang_index = (9 * ( j)) & 127; + int k = ang_index * 2; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[9] = complexMul(a[9], w); +{ int ang_index = (5 * ( j)) & 63; + int k = ang_index * 4; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[10] = complexMul(a[10], w); +{ int ang_index = (11 * ( j)) & 127; + int k = ang_index * 2; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[11] = complexMul(a[11], w); +{ int ang_index = (3 * ( j)) & 31; + int k = ang_index * 8; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[12] = complexMul(a[12], w); +{ int ang_index = (13 * ( j)) & 127; + int k = ang_index * 2; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[13] = complexMul(a[13], w); +{ int ang_index = (7 * ( j)) & 63; + int k = ang_index * 4; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[14] = complexMul(a[14], w); +{ int ang_index = (15 * ( j)) & 127; + int k = ang_index * 2; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[15] = complexMul(a[15], w); +indexIn = mad24(j, 256, i); +lMemStore = sMem + tid; +lMemLoad = sMem + indexIn; +lMemStore[0] = a[0].x; +lMemStore[128] = a[1].x; +lMemStore[256] = a[2].x; +lMemStore[384] = a[3].x; +lMemStore[512] = a[4].x; +lMemStore[640] = a[5].x; +lMemStore[768] = a[6].x; +lMemStore[896] = a[7].x; +lMemStore[1024] = a[8].x; +lMemStore[1152] = a[9].x; +lMemStore[1280] = a[10].x; +lMemStore[1408] = a[11].x; +lMemStore[1536] = a[12].x; +lMemStore[1664] = a[13].x; +lMemStore[1792] = a[14].x; +lMemStore[1920] = a[15].x; +barrier(CLK_LOCAL_MEM_FENCE); +a[0].x = lMemLoad[0]; +a[1].x = lMemLoad[16]; +a[2].x = lMemLoad[32]; +a[3].x = lMemLoad[48]; +a[4].x = lMemLoad[64]; +a[5].x = lMemLoad[80]; +a[6].x = lMemLoad[96]; +a[7].x = lMemLoad[112]; +a[8].x = lMemLoad[128]; +a[9].x = lMemLoad[144]; +a[10].x = lMemLoad[160]; +a[11].x = lMemLoad[176]; +a[12].x = lMemLoad[192]; +a[13].x = lMemLoad[208]; +a[14].x = lMemLoad[224]; +a[15].x = lMemLoad[240]; +barrier(CLK_LOCAL_MEM_FENCE); +lMemStore[0] = a[0].y; +lMemStore[128] = a[1].y; +lMemStore[256] = a[2].y; +lMemStore[384] = a[3].y; +lMemStore[512] = a[4].y; +lMemStore[640] = a[5].y; +lMemStore[768] = a[6].y; +lMemStore[896] = a[7].y; +lMemStore[1024] = a[8].y; +lMemStore[1152] = a[9].y; +lMemStore[1280] = a[10].y; +lMemStore[1408] = a[11].y; +lMemStore[1536] = a[12].y; +lMemStore[1664] = a[13].y; +lMemStore[1792] = a[14].y; +lMemStore[1920] = a[15].y; +barrier(CLK_LOCAL_MEM_FENCE); +a[0].y = lMemLoad[0]; +a[1].y = lMemLoad[16]; +a[2].y = lMemLoad[32]; +a[3].y = lMemLoad[48]; +a[4].y = lMemLoad[64]; +a[5].y = lMemLoad[80]; +a[6].y = lMemLoad[96]; +a[7].y = lMemLoad[112]; +a[8].y = lMemLoad[128]; +a[9].y = lMemLoad[144]; +a[10].y = lMemLoad[160]; +a[11].y = lMemLoad[176]; +a[12].y = lMemLoad[192]; +a[13].y = lMemLoad[208]; +a[14].y = lMemLoad[224]; +a[15].y = lMemLoad[240]; +barrier(CLK_LOCAL_MEM_FENCE); +fftKernel8(a + 0, dir); +fftKernel8(a + 8, dir); +l = ((bNum << 4) + i) >> 7; +k = j << 1; +{ int ang_index = (1 * ( l * (k + 0))) & 2047; + int k = (ang_index + 4) >> 3; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 8 ); + float mh=-3.0679616611e-03*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[0] = complexMul(a[0], w); +{ int ang_index = (1 * ( l * (k + 16))) & 2047; + int k = (ang_index + 4) >> 3; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 8 ); + float mh=-3.0679616611e-03*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[1] = complexMul(a[1], w); +{ int ang_index = (1 * ( l * (k + 32))) & 2047; + int k = (ang_index + 4) >> 3; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 8 ); + float mh=-3.0679616611e-03*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[2] = complexMul(a[2], w); +{ int ang_index = (1 * ( l * (k + 48))) & 2047; + int k = (ang_index + 4) >> 3; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 8 ); + float mh=-3.0679616611e-03*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[3] = complexMul(a[3], w); +{ int ang_index = (1 * ( l * (k + 64))) & 2047; + int k = (ang_index + 4) >> 3; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 8 ); + float mh=-3.0679616611e-03*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[4] = complexMul(a[4], w); +{ int ang_index = (1 * ( l * (k + 80))) & 2047; + int k = (ang_index + 4) >> 3; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 8 ); + float mh=-3.0679616611e-03*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[5] = complexMul(a[5], w); +{ int ang_index = (1 * ( l * (k + 96))) & 2047; + int k = (ang_index + 4) >> 3; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 8 ); + float mh=-3.0679616611e-03*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[6] = complexMul(a[6], w); +{ int ang_index = (1 * ( l * (k + 112))) & 2047; + int k = (ang_index + 4) >> 3; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 8 ); + float mh=-3.0679616611e-03*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[7] = complexMul(a[7], w); +{ int ang_index = (1 * ( l * (k + 1))) & 2047; + int k = (ang_index + 4) >> 3; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 8 ); + float mh=-3.0679616611e-03*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[8] = complexMul(a[8], w); +{ int ang_index = (1 * ( l * (k + 17))) & 2047; + int k = (ang_index + 4) >> 3; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 8 ); + float mh=-3.0679616611e-03*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[9] = complexMul(a[9], w); +{ int ang_index = (1 * ( l * (k + 33))) & 2047; + int k = (ang_index + 4) >> 3; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 8 ); + float mh=-3.0679616611e-03*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[10] = complexMul(a[10], w); +{ int ang_index = (1 * ( l * (k + 49))) & 2047; + int k = (ang_index + 4) >> 3; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 8 ); + float mh=-3.0679616611e-03*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[11] = complexMul(a[11], w); +{ int ang_index = (1 * ( l * (k + 65))) & 2047; + int k = (ang_index + 4) >> 3; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 8 ); + float mh=-3.0679616611e-03*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[12] = complexMul(a[12], w); +{ int ang_index = (1 * ( l * (k + 81))) & 2047; + int k = (ang_index + 4) >> 3; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 8 ); + float mh=-3.0679616611e-03*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[13] = complexMul(a[13], w); +{ int ang_index = (1 * ( l * (k + 97))) & 2047; + int k = (ang_index + 4) >> 3; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 8 ); + float mh=-3.0679616611e-03*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[14] = complexMul(a[14], w); +{ int ang_index = (1 * ( l * (k + 113))) & 2047; + int k = (ang_index + 4) >> 3; + float2 csx0 =cossin_T_LUT[k]; + float2 csx0Transp= (float2)(csx0.y,-csx0.x); + int r=ang_index - k * ( 8 ); + float mh=-3.0679616611e-03*(float)r; + float mhsqr2= mh*mh*(-0.5f); + float hqub6= mhsqr2*mh*(1.0f/3.0f); + float2 cs; + cs= hqub6 * csx0Transp; + cs += mhsqr2*csx0; + cs += mh*csx0Transp; + cs += csx0; + cs.y *=dir; +w = cs; +} +a[15] = complexMul(a[15], w); +indexOut += mad24(j, 256, i); +out += indexOut; +out[0] = a[0]; +out[2048] = a[1]; +out[4096] = a[2]; +out[6144] = a[3]; +out[8192] = a[4]; +out[10240] = a[5]; +out[12288] = a[6]; +out[14336] = a[7]; +out[128] = a[8]; +out[2176] = a[9]; +out[4224] = a[10]; +out[6272] = a[11]; +out[8320] = a[12]; +out[10368] = a[13]; +out[12416] = a[14]; +out[14464] = a[15]; +} +__kernel void fft2(__global float2 *in, __global float2 *out, int dir, int S, __global float2 * cossinLUT1, __global float2 * cossinLUT2 ) +{ + __local float sMem[256]; + int i, j, r, indexIn, indexOut, index, tid, bNum, xNum, k, l; + int s, ii, jj, offset; + float2 w; + float ang, angf, ang1; + __local float *lMemStore, *lMemLoad; + float2 a[4]; + int lId = get_local_id( 0 ); + int groupId = get_group_id( 0 ); + __local float2 cossin_T_LUT[256]; + int lLUTind= lId; + cossin_T_LUT[lLUTind]=cossinLUT2[lLUTind]; + lLUTind+=64; + cossin_T_LUT[lLUTind]=cossinLUT2[lLUTind]; + lLUTind+=64; + cossin_T_LUT[lLUTind]=cossinLUT2[lLUTind]; + lLUTind+=64; + cossin_T_LUT[lLUTind]=cossinLUT2[lLUTind]; + barrier(CLK_LOCAL_MEM_FENCE); +bNum = groupId & 1023; +xNum = groupId >> 10; +indexIn = mul24(bNum, 16); +tid = indexIn; +i = tid >> 14; +j = tid & 16383; +indexOut = mad24(i, 262144, j); +indexIn += (xNum << 18); +indexOut += (xNum << 18); +tid = lId; +i = tid & 15; +j = tid >> 4; +indexIn += mad24(j, 16384, i); +in += indexIn; +a[0] = in[0]; +a[1] = in[65536]; +a[2] = in[131072]; +a[3] = in[196608]; +fftKernel4(a, dir); +{ int ang_index = (1 * ( j)) & 15; + int k = ang_index * 16; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[1] = complexMul(a[1], w); +{ int ang_index = (1 * ( j)) & 7; + int k = ang_index * 32; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[2] = complexMul(a[2], w); +{ int ang_index = (3 * ( j)) & 15; + int k = ang_index * 16; + float2 cs =cossin_T_LUT[k]; + cs.y *=dir; +w = cs; +} +a[3] = complexMul(a[3], w); +indexIn = mad24(j, 64, i); +lMemStore = sMem + tid; +lMemLoad = sMem + indexIn; +lMemStore[0] = a[0].x; +lMemStore[64] = a[1].x; +lMemStore[128] = a[2].x; +lMemStore[192] = a[3].x; +barrier(CLK_LOCAL_MEM_FENCE); +a[0].x = lMemLoad[0]; +a[1].x = lMemLoad[16]; +a[2].x = lMemLoad[32]; +a[3].x = lMemLoad[48]; +barrier(CLK_LOCAL_MEM_FENCE); +lMemStore[0] = a[0].y; +lMemStore[64] = a[1].y; +lMemStore[128] = a[2].y; +lMemStore[192] = a[3].y; +barrier(CLK_LOCAL_MEM_FENCE); +a[0].y = lMemLoad[0]; +a[1].y = lMemLoad[16]; +a[2].y = lMemLoad[32]; +a[3].y = lMemLoad[48]; +barrier(CLK_LOCAL_MEM_FENCE); +fftKernel4(a + 0, dir); +indexOut += mad24(j, 16384, i); +out += indexOut; +out[0] = a[0]; +out[65536] = a[1]; +out[131072] = a[2]; +out[196608] = a[3]; +} diff --git a/src/fft_internal.h b/src/fft_internal.h index 8178374ecefca39c127d65051825182242026e75..2c91ff7d08ea7164ac52e762a1b0929fc02097c3 100644 --- a/src/fft_internal.h +++ b/src/fft_internal.h @@ -95,6 +95,21 @@ typedef struct // context in which fft resources are created and kernels are executed cl_context context; + // HSA global region + hsa_region_t global_region; + + // HSA executable + hsa_executable_t executable; + + // HSA agent + hsa_agent_t agent; + + // HSA kernel argument region + hsa_region_t kernarg_region; + + // HSA current execution queue + hsa_queue_t* queue; + // size of signal clFFT_Dim3 n; @@ -141,25 +156,25 @@ typedef struct size_t last_batch_size; // temporary buffer for interleaved plan - cl_mem tempmemobj; + float* tempmemobj; // temporary buffer for planner plan. Only one of tempmemobj or // (tempmemobj_real, tempmemobj_imag) pair is valid (allocated) depending // data format of plan (plannar or interleaved) - cl_mem tempmemobj_real, tempmemobj_imag; + float *tempmemobj_real, *tempmemobj_imag; - // precomputed lookup tables for sin,cos calculations, each of size + // precomputed lookup tables for sin,cos calculations, each of size // sqrt(n) or 2*sqrt(n), n is size of signal; - - cl_mem cossin_LUT_d1; - cl_mem cossin_LUT_d2; + + float* cossin_LUT_d1; + float* cossin_LUT_d2; int logN1; int logN2; - size_t N1; + size_t N1; size_t N2; clFFT_TwiddleFactorMethod twiddleMethod; - + // Maximum size of signal for which local memory transposed based // fft is sufficient i.e. no global mem transpose (communication) // is needed