Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found
Select Git revision
  • Add-PKGBUILD
  • HSA
  • add-clFFT_GetSize
  • add-dylib-target
  • clmathfft
  • counting-mallocs
  • longer_dft_support
  • master
  • remove-CPU-constraint
  • current_brp_apps
  • current_fgrp_apps
11 results

Target

Select target project
  • einsteinathome/libclfft
  • maxBensch/libclfft
  • brevilo/libclfft
3 results
Select Git revision
  • BRP_build_fixes
  • CLFFT_NO_MAD_ENABLE
  • HSA
  • clmathfft
  • improve_Makefile
  • longer_dft_support
  • master
  • override_cl_compile_options
  • current_brp_apps
  • current_fgrp_apps
10 results
Show changes
Commits on Source (2)
...@@ -10,6 +10,9 @@ static: ...@@ -10,6 +10,9 @@ static:
shared: shared:
$(MAKE) -C src shared $(MAKE) -C src shared
dylib:
$(MAKE) -C src dylib
sample: sample:
$(MAKE) -C example $(MAKE) -C example
......
...@@ -5,6 +5,7 @@ AR ?= ar ...@@ -5,6 +5,7 @@ AR ?= ar
TARGET = libeclfft.a TARGET = libeclfft.a
TARGET2 = libeclfft.so TARGET2 = libeclfft.so
TARGET3 = libeclfft.dylib
ifndef OPENCL_INCLUDE ifndef OPENCL_INCLUDE
ifdef CUDA_INSTALL_PATH ifdef CUDA_INSTALL_PATH
...@@ -35,6 +36,8 @@ static: $(TARGET) ...@@ -35,6 +36,8 @@ static: $(TARGET)
shared: $(TARGET2) shared: $(TARGET2)
dylib: $(TARGET3)
$(TARGET): $(OBJECTS) $(TARGET): $(OBJECTS)
$(AR) rcs $(TARGET) $(OBJECTS) $(AR) rcs $(TARGET) $(OBJECTS)
mkdir -p ../lib mkdir -p ../lib
...@@ -43,6 +46,9 @@ $(TARGET): $(OBJECTS) ...@@ -43,6 +46,9 @@ $(TARGET): $(OBJECTS)
$(TARGET2): $(OBJECTS) $(TARGET2): $(OBJECTS)
$(CXX) $(LDFLAGS) -shared -o $(TARGET2) $(OBJECTS) $(CXX) $(LDFLAGS) -shared -o $(TARGET2) $(OBJECTS)
$(TARGET3): $(OBJECTS)
$(CXX) $(LDFLAGS) -dynamiclib -o $(TARGET3) $(OBJECTS)
fft_setup.o: fft_setup.cpp fft_internal.h fft_base_kernels.h fft_setup.o: fft_setup.cpp fft_internal.h fft_base_kernels.h
$(CXX) $(CXXFLAGS) -c fft_setup.cpp $(CXX) $(CXXFLAGS) -c fft_setup.cpp
...@@ -58,4 +64,4 @@ install: ../include/clFFT.h libeclfft.* ...@@ -58,4 +64,4 @@ install: ../include/clFFT.h libeclfft.*
cp libeclfft.* $(PREFIX)/lib/ cp libeclfft.* $(PREFIX)/lib/
clean: clean:
rm -f *.o *.a *.so rm -f *.o *.a *.so *.dylib
...@@ -348,7 +348,7 @@ clFFT_ExecutePlannar( cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, ...@@ -348,7 +348,7 @@ clFFT_ExecutePlannar( cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize,
cl_int cl_int
clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array, clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array,
size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir) unsigned numRows, unsigned numCols, unsigned startRow, unsigned rowsToProcess, clFFT_Direction dir)
{ {
cl_fft_plan *plan = (cl_fft_plan *) Plan; cl_fft_plan *plan = (cl_fft_plan *) Plan;
...@@ -387,7 +387,7 @@ clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array, ...@@ -387,7 +387,7 @@ clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array,
cl_int cl_int
clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real, cl_mem array_imag, clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real, cl_mem array_imag,
size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir) unsigned numRows, unsigned numCols, unsigned startRow, unsigned rowsToProcess, clFFT_Direction dir)
{ {
cl_fft_plan *plan = (cl_fft_plan *) Plan; cl_fft_plan *plan = (cl_fft_plan *) Plan;
......
...@@ -81,10 +81,10 @@ typedef struct kernel_info_t ...@@ -81,10 +81,10 @@ typedef struct kernel_info_t
{ {
cl_kernel kernel; cl_kernel kernel;
char *kernel_name; char *kernel_name;
size_t lmem_size; unsigned lmem_size;
size_t num_workgroups; unsigned num_workgroups;
size_t num_xforms_per_workgroup; unsigned num_xforms_per_workgroup;
size_t num_workitems_per_workgroup; unsigned num_workitems_per_workgroup;
cl_fft_kernel_dir dir; cl_fft_kernel_dir dir;
int in_place_possible; int in_place_possible;
kernel_info_t *next; kernel_info_t *next;
...@@ -138,7 +138,7 @@ typedef struct ...@@ -138,7 +138,7 @@ typedef struct
// batch size different than the first call. last_batch_size caches the last // batch size different than the first call. last_batch_size caches the last
// batch size with which this plan is used so that we don't keep allocating/deallocating // batch size with which this plan is used so that we don't keep allocating/deallocating
// temp buffer if same batch size is used again and again. // temp buffer if same batch size is used again and again.
size_t last_batch_size; unsigned last_batch_size;
// temporary buffer for interleaved plan // temporary buffer for interleaved plan
cl_mem tempmemobj; cl_mem tempmemobj;
...@@ -163,25 +163,25 @@ typedef struct ...@@ -163,25 +163,25 @@ typedef struct
// Maximum size of signal for which local memory transposed based // Maximum size of signal for which local memory transposed based
// fft is sufficient i.e. no global mem transpose (communication) // fft is sufficient i.e. no global mem transpose (communication)
// is needed // is needed
size_t max_localmem_fft_size; unsigned max_localmem_fft_size;
// Maximum work items per work group allowed. This, along with max_radix below controls // Maximum work items per work group allowed. This, along with max_radix below controls
// maximum local memory being used by fft kernels of this plan. Set to 256 by default // maximum local memory being used by fft kernels of this plan. Set to 256 by default
size_t max_work_item_per_workgroup; unsigned max_work_item_per_workgroup;
// Maximum base radix for local memory fft ... this controls the maximum register // Maximum base radix for local memory fft ... this controls the maximum register
// space used by work items. Currently defaults to 16 // space used by work items. Currently defaults to 16
size_t max_radix; unsigned max_radix;
// Device-dependent parameter that tells how many work-items need to read consecutive // Device-dependent parameter that tells how many work-items need to read consecutive
// values to make sure global memory access by work-items of a work-group result in // values to make sure global memory access by work-items of a work-group result in
// coalesced memory access to utilize full bandwidth e.g. on NVidia tesla, this is 16 // coalesced memory access to utilize full bandwidth e.g. on NVidia tesla, this is 16
size_t min_mem_coalesce_width; unsigned min_mem_coalesce_width;
// Number of local memory banks. This is used to generate kernels with local memory // Number of local memory banks. This is used to generate kernels with local memory
// transposes with appropriate padding to avoid bank conflicts to local memory // transposes with appropriate padding to avoid bank conflicts to local memory
// e.g. on NVidia it is 16. // e.g. on NVidia it is 16.
size_t num_local_mem_banks; unsigned num_local_mem_banks;
}cl_fft_plan; }cl_fft_plan;
void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir); void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir);
......
...@@ -605,7 +605,7 @@ static void ...@@ -605,7 +605,7 @@ static void
insertTwiddleKernel(string &kernelString, int Nr, int numIter, int Nprev, int len, int numWorkItemsPerXForm) insertTwiddleKernel(string &kernelString, int Nr, int numIter, int Nprev, int len, int numWorkItemsPerXForm)
{ {
int z, k; int z, k;
int logNPrev = log2(Nprev); int logNPrev = (int)log2(Nprev);
for(z = 0; z < numIter; z++) for(z = 0; z < numIter; z++)
{ {
...@@ -716,8 +716,8 @@ static void ...@@ -716,8 +716,8 @@ static void
insertLocalLoadIndexArithmatic(string &kernelString, int Nprev, int Nr, int numWorkItemsReq, int numWorkItemsPerXForm, int numXFormsPerWG, int offset, int midPad) insertLocalLoadIndexArithmatic(string &kernelString, int Nprev, int Nr, int numWorkItemsReq, int numWorkItemsPerXForm, int numXFormsPerWG, int offset, int midPad)
{ {
int Ncurr = Nprev * Nr; int Ncurr = Nprev * Nr;
int logNcurr = log2(Ncurr); int logNcurr = (int)log2(Ncurr);
int logNprev = log2(Nprev); int logNprev = (int)log2(Nprev);
int incr = (numWorkItemsReq + offset) * Nr + midPad; int incr = (numWorkItemsReq + offset) * Nr + midPad;
if(Ncurr < numWorkItemsPerXForm) if(Ncurr < numWorkItemsPerXForm)
...@@ -1280,7 +1280,7 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir ...@@ -1280,7 +1280,7 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
int gInInc = threadsPerBlock / batchSize; int gInInc = threadsPerBlock / batchSize;
int lgStrideO = log2(strideO); int lgStrideO = (int)log2(strideO);
int numBlocksPerXForm = strideI / batchSize; int numBlocksPerXForm = strideI / batchSize;
int numBlocks = numBlocksPerXForm; int numBlocks = numBlocksPerXForm;
if(!vertical) if(!vertical)
...@@ -1333,7 +1333,7 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir ...@@ -1333,7 +1333,7 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
} }
else else
{ {
int lgNumBlocksPerXForm = log2(numBlocksPerXForm); int lgNumBlocksPerXForm = (int)log2(numBlocksPerXForm);
localString += string("bNum = groupId & ") + num2str(numBlocksPerXForm - 1) + string(";\n"); localString += string("bNum = groupId & ") + num2str(numBlocksPerXForm - 1) + string(";\n");
localString += string("xNum = groupId >> ") + num2str(lgNumBlocksPerXForm) + string(";\n"); localString += string("xNum = groupId >> ") + num2str(lgNumBlocksPerXForm) + string(";\n");
localString += string("indexIn = mul24(bNum, ") + num2str(batchSize) + string(");\n"); localString += string("indexIn = mul24(bNum, ") + num2str(batchSize) + string(");\n");
...@@ -1349,7 +1349,7 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir ...@@ -1349,7 +1349,7 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
} }
// Load Data // Load Data
int lgBatchSize = log2(batchSize); int lgBatchSize = (int)log2(batchSize);
localString += string("tid = lId;\n"); localString += string("tid = lId;\n");
localString += string("i = tid & ") + num2str(batchSize - 1) + string(";\n"); localString += string("i = tid & ") + num2str(batchSize - 1) + string(";\n");
localString += string("j = tid >> ") + num2str(lgBatchSize) + string(";\n"); localString += string("j = tid >> ") + num2str(lgBatchSize) + string(";\n");
...@@ -1442,40 +1442,40 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir ...@@ -1442,40 +1442,40 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
localString += string("lMemStore = sMem + mad24(i, ") + num2str(radix + 1) + string(", j << ") + num2str((int)log2(R1/R2)) + string(");\n"); localString += string("lMemStore = sMem + mad24(i, ") + num2str(radix + 1) + string(", j << ") + num2str((int)log2(R1/R2)) + string(");\n");
localString += string("lMemLoad = sMem + mad24(tid >> ") + num2str((int)log2(radix)) + string(", ") + num2str(radix+1) + string(", tid & ") + num2str(radix-1) + string(");\n"); localString += string("lMemLoad = sMem + mad24(tid >> ") + num2str((int)log2(radix)) + string(", ") + num2str(radix+1) + string(", tid & ") + num2str(radix-1) + string(");\n");
for(int i = 0; i < R1/R2; i++) for(i = 0; i < R1/R2; i++)
for(int j = 0; j < R2; j++) for(j = 0; j < R2; j++)
localString += string("lMemStore[ ") + num2str(i + j*R1) + string("] = a[") + num2str(i*R2+j) + string("].x;\n"); localString += string("lMemStore[ ") + num2str(i + j*R1) + string("] = a[") + num2str(i*R2+j) + string("].x;\n");
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
if(threadsPerBlock >= radix) if(threadsPerBlock >= radix)
{ {
for(int i = 0; i < R1; i++) for(i = 0; i < R1; i++)
localString += string("a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i*(radix+1)*(threadsPerBlock/radix)) + string("];\n"); localString += string("a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i*(radix+1)*(threadsPerBlock/radix)) + string("];\n");
} }
else else
{ {
int innerIter = radix/threadsPerBlock; int innerIter = radix/threadsPerBlock;
int outerIter = R1/innerIter; int outerIter = R1/innerIter;
for(int i = 0; i < outerIter; i++) for(i = 0; i < outerIter; i++)
for(int j = 0; j < innerIter; j++) for(j = 0; j < innerIter; j++)
localString += string("a[") + num2str(i*innerIter+j) + string("].x = lMemLoad[") + num2str(j*threadsPerBlock + i*(radix+1)) + string("];\n"); localString += string("a[") + num2str(i*innerIter+j) + string("].x = lMemLoad[") + num2str(j*threadsPerBlock + i*(radix+1)) + string("];\n");
} }
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
for(int i = 0; i < R1/R2; i++) for(i = 0; i < R1/R2; i++)
for(int j = 0; j < R2; j++) for(j = 0; j < R2; j++)
localString += string("lMemStore[ ") + num2str(i + j*R1) + string("] = a[") + num2str(i*R2+j) + string("].y;\n"); localString += string("lMemStore[ ") + num2str(i + j*R1) + string("] = a[") + num2str(i*R2+j) + string("].y;\n");
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
if(threadsPerBlock >= radix) if(threadsPerBlock >= radix)
{ {
for(int i = 0; i < R1; i++) for(i = 0; i < R1; i++)
localString += string("a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i*(radix+1)*(threadsPerBlock/radix)) + string("];\n"); localString += string("a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i*(radix+1)*(threadsPerBlock/radix)) + string("];\n");
} }
else else
{ {
int innerIter = radix/threadsPerBlock; int innerIter = radix/threadsPerBlock;
int outerIter = R1/innerIter; int outerIter = R1/innerIter;
for(int i = 0; i < outerIter; i++) for(i = 0; i < outerIter; i++)
for(int j = 0; j < innerIter; j++) for(j = 0; j < innerIter; j++)
localString += string("a[") + num2str(i*innerIter+j) + string("].y = lMemLoad[") + num2str(j*threadsPerBlock + i*(radix+1)) + string("];\n"); localString += string("a[") + num2str(i*innerIter+j) + string("].y = lMemLoad[") + num2str(j*threadsPerBlock + i*(radix+1)) + string("];\n");
} }
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
......
...@@ -219,7 +219,7 @@ int getMaxKernelWorkGroupSize(cl_fft_plan *plan, unsigned int *max_wg_size, unsi ...@@ -219,7 +219,7 @@ int getMaxKernelWorkGroupSize(cl_fft_plan *plan, unsigned int *max_wg_size, unsi
int reg_needed = 0; int reg_needed = 0;
*max_wg_size = INT_MAX; *max_wg_size = INT_MAX;
int err; int err;
size_t wg_size; unsigned wg_size;
unsigned int i; unsigned int i;
for(i = 0; i < num_devices; i++) for(i = 0; i < num_devices; i++)
...@@ -427,7 +427,7 @@ patch_kernel_source: ...@@ -427,7 +427,7 @@ patch_kernel_source:
err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(devices), devices, &ret_size); err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(devices), devices, &ret_size);
ERR_MACRO(err); ERR_MACRO(err);
num_devices = ret_size / sizeof(cl_device_id); num_devices = (int)(ret_size / sizeof(cl_device_id));
for(i = 0; i < num_devices; i++) for(i = 0; i < num_devices; i++)
{ {
......