Skip to content
Snippets Groups Projects
Select Git revision
  • 47d231bec58a7c01779e9aa8f915c855bb80ee45
  • master default protected
  • improve_Makefile
  • HSA
  • clmathfft
  • longer_dft_support
  • current_fgrp_apps
  • current_brp_apps
8 results

fft_internal.h

Blame
  • Forked from einsteinathome / libclfft
    63 commits behind the upstream repository.
    fft_internal.h 6.91 KiB
    
    //
    // File:       fft_internal.h
    //
    // Version:    <1.0>
    //
    // Disclaimer: IMPORTANT:  This Apple software is supplied to you by Apple Inc. ("Apple")
    //             in consideration of your agreement to the following terms, and your use,
    //             installation, modification or redistribution of this Apple software
    //             constitutes acceptance of these terms.  If you do not agree with these
    //             terms, please do not use, install, modify or redistribute this Apple
    //             software.
    //
    //             In consideration of your agreement to abide by the following terms, and
    //             subject to these terms, Apple grants you a personal, non - exclusive
    //             license, under Apple's copyrights in this original Apple software ( the
    //             "Apple Software" ), to use, reproduce, modify and redistribute the Apple
    //             Software, with or without modifications, in source and / or binary forms;
    //             provided that if you redistribute the Apple Software in its entirety and
    //             without modifications, you must retain this notice and the following text
    //             and disclaimers in all such redistributions of the Apple Software. Neither
    //             the name, trademarks, service marks or logos of Apple Inc. may be used to
    //             endorse or promote products derived from the Apple Software without specific
    //             prior written permission from Apple.  Except as expressly stated in this
    //             notice, no other rights or licenses, express or implied, are granted by
    //             Apple herein, including but not limited to any patent rights that may be
    //             infringed by your derivative works or by other works in which the Apple
    //             Software may be incorporated.
    //
    //             The Apple Software is provided by Apple on an "AS IS" basis.  APPLE MAKES NO
    //             WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
    //             WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
    //             PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
    //             ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
    //
    //             IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
    //             CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
    //             SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
    //             INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
    //             AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
    //             UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
    //             OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    //
    // Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
    //
    ////////////////////////////////////////////////////////////////////////////////////////////////////
    
    
    #ifndef __CLFFT_INTERNAL_H
    #define __CLFFT_INTERNAL_H
    
    #include "clFFT.h"
    #include <iostream>
    #include <string>
    #include <sstream>
    
    using namespace std;
    
    // Direction tag for an FFT kernel: one of x, y or z. Each kernel record
    // (cl_fft_kernel_info) stores one of these in its `dir` member.
    // NOTE(review): presumably the axis of the signal the kernel transforms
    // along -- confirm against the code that consumes `dir`.
    //
    // The enumerator values are written out explicitly; they are the same
    // 0, 1, 2 the compiler would assign implicitly.
    typedef enum kernel_dir_t
    {
    	cl_fft_kernel_x = 0,
    	cl_fft_kernel_y = 1,
    	cl_fft_kernel_z = 2
    } cl_fft_kernel_dir;
    
    // One node of the singly linked list of kernels attached to a plan
    // (cl_fft_plan::kernel_info points at the head of such a list).
    // Field notes marked NOTE(review) are inferred from the member names only
    // and should be confirmed against the code that fills this struct in.
    typedef struct kernel_info_t
    {
    	// OpenCL kernel object for this entry
    	cl_kernel kernel;
    	// name of the kernel as a C string
    	// NOTE(review): ownership (who allocates/frees) is not visible here
    	char *kernel_name;
    	// NOTE(review): presumably the local memory size required by the kernel;
    	// units (bytes vs. elements) are not visible here -- confirm
    	size_t lmem_size;
    	// NOTE(review): presumably the number of work-groups to launch
    	size_t num_workgroups;
    	// NOTE(review): presumably how many transforms one work-group processes
        size_t num_xforms_per_workgroup;
    	// NOTE(review): presumably the work-group size (work-items per work-group)
    	size_t num_workitems_per_workgroup;
    	// direction tag (x, y or z) of this kernel
    	cl_fft_kernel_dir dir;
    	// int used as a flag
    	// NOTE(review): presumably non-zero when the kernel may run in place -- confirm
    	int in_place_possible;
    	// next kernel in the list
    	// NOTE(review): presumably NULL at the tail -- confirm
    	kernel_info_t *next;
    }cl_fft_kernel_info;
    
    // Internal representation of an FFT plan: the OpenCL objects, the signal
    // description, the kernels to run, the temporary buffers and the tuning
    // parameters that belong to one plan.
    typedef struct 
    {
    	// context in which fft resources are created and kernels are executed
    	cl_context              context;
    	
    	// size of signal
    	clFFT_Dim3              n;
    	
    	// dimension of transform ... must be either 1D, 2D or 3D
    	clFFT_Dimension			dim;
    	
    	// data format ... must be either interleaved or planar
    	clFFT_DataFormat		format;
    	
    	// string containing kernel source. Generated at runtime based on
    	// n, dim, format and other parameters
    	string                  *kernel_string;
    	
    	// CL program containing the source and kernels for this particular
    	// n, dim, data format
    	cl_program				program;
    	
    	// linked list of kernels which need to be executed for this fft
    	cl_fft_kernel_info		*kernel_info;
    	
    	// number of kernels
    	int                     num_kernels;
    	
    	// twist kernel for virtualizing fft of very large sizes that do not
    	// fit in GPU global memory
    	cl_kernel				twist_kernel;
    	
    	// flag indicating whether a temporary intermediate buffer is needed.
    	// This depends on the fft kernels being executed and on whether the
    	// transform is in-place or out-of-place. E.g. a local memory fft (say
    	// 1D 1024), i.e. one that does not require a global transpose, does not
    	// need a temporary buffer; a 2D 1024x1024 out-of-place fft, however,
    	// does require an intermediate buffer.
    	// If a temp buffer is needed, its allocation is lazy, i.e. it is not
    	// allocated until it is needed
    	cl_int                  temp_buffer_needed;
    	
    	// Batch size is a runtime parameter, and the size of the temporary buffer
    	// (if needed) depends on the batch size. Allocation of the temporary
    	// buffer is lazy, i.e. it is only created when needed. last_batch_size
    	// caches the last batch size with which this plan was used, so that we
    	// don't keep allocating/deallocating the temp buffer if the same batch
    	// size is used again and again.
    	// NOTE(review): presumably the buffer is re-created when clFFT_Executexxx
    	// is called with a batch size different from the cached one -- confirm
    	// in the execute path.
    	size_t                  last_batch_size;
    	
    	// temporary buffer for interleaved plan
    	cl_mem   				tempmemobj;
    	
    	// temporary buffers for planar plan. Only one of tempmemobj or the
    	// (tempmemobj_real, tempmemobj_imag) pair is valid (allocated), depending
    	// on the data format of the plan (planar or interleaved)
    	cl_mem                  tempmemobj_real, tempmemobj_imag;
    	
    	// Maximum size of signal for which a local memory transpose based
    	// fft is sufficient, i.e. no global mem transpose (communication)
    	// is needed
    	size_t					max_localmem_fft_size;
    	
    	// Maximum work items per work group allowed. This, along with max_radix below, controls
    	// the maximum local memory used by the fft kernels of this plan. Set to 256 by default
    	// (the default is assigned outside this header)
    	size_t                  max_work_item_per_workgroup;
    	
    	// Maximum base radix for local memory fft ... this controls the maximum register
    	// space used by work items. Currently defaults to 16 (the default is assigned
    	// outside this header)
    	size_t                  max_radix;
    	
    	// Device-dependent parameter that tells how many work-items need to read consecutive
    	// values to make sure global memory accesses by the work-items of a work-group result in
    	// coalesced memory access that utilizes full bandwidth, e.g. on NVidia Tesla this is 16
    	size_t                  min_mem_coalesce_width;
    	
    	// Number of local memory banks. This is used to generate kernels with local memory
    	// transposes with appropriate padding to avoid bank conflicts in local memory,
    	// e.g. on NVidia it is 16.
    	size_t                  num_local_mem_banks;
    }cl_fft_plan;
    
    // Declaration only; FFT1D is not defined in this header.
    // Takes the plan to operate on and a kernel direction (x, y or z).
    // NOTE(review): presumably generates the 1D FFT kernel(s) for `plan` along
    // `dir` -- confirm against the definition.
    void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir);
    
    #endif