Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found
Select Git revision
  • Add-PKGBUILD
  • HSA
  • add-clFFT_GetSize
  • add-dylib-target
  • clmathfft
  • counting-mallocs
  • longer_dft_support
  • master
  • remove-CPU-constraint
  • current_brp_apps
  • current_fgrp_apps
11 results

Target

Select target project
  • einsteinathome/libclfft
  • maxBensch/libclfft
  • brevilo/libclfft
3 results
Select Git revision
  • BRP_build_fixes
  • CLFFT_NO_MAD_ENABLE
  • HSA
  • clmathfft
  • improve_Makefile
  • longer_dft_support
  • master
  • override_cl_compile_options
  • current_brp_apps
  • current_fgrp_apps
10 results
Show changes
Commits on Source (2)
......@@ -10,6 +10,9 @@ static:
shared:
$(MAKE) -C src shared
dylib:
$(MAKE) -C src dylib
sample:
$(MAKE) -C example
......
......@@ -5,6 +5,7 @@ AR ?= ar
TARGET = libeclfft.a
TARGET2 = libeclfft.so
TARGET3 = libeclfft.dylib
ifndef OPENCL_INCLUDE
ifdef CUDA_INSTALL_PATH
......@@ -35,6 +36,8 @@ static: $(TARGET)
shared: $(TARGET2)
dylib: $(TARGET3)
$(TARGET): $(OBJECTS)
$(AR) rcs $(TARGET) $(OBJECTS)
mkdir -p ../lib
......@@ -43,6 +46,9 @@ $(TARGET): $(OBJECTS)
$(TARGET2): $(OBJECTS)
$(CXX) $(LDFLAGS) -shared -o $(TARGET2) $(OBJECTS)
$(TARGET3): $(OBJECTS)
$(CXX) $(LDFLAGS) -dynamiclib -o $(TARGET3) $(OBJECTS)
fft_setup.o: fft_setup.cpp fft_internal.h fft_base_kernels.h
$(CXX) $(CXXFLAGS) -c fft_setup.cpp
......@@ -58,4 +64,4 @@ install: ../include/clFFT.h libeclfft.*
cp libeclfft.* $(PREFIX)/lib/
clean:
rm -f *.o *.a *.so
rm -f *.o *.a *.so *.dylib
......@@ -348,7 +348,7 @@ clFFT_ExecutePlannar( cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize,
cl_int
clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array,
size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir)
unsigned numRows, unsigned numCols, unsigned startRow, unsigned rowsToProcess, clFFT_Direction dir)
{
cl_fft_plan *plan = (cl_fft_plan *) Plan;
......@@ -387,7 +387,7 @@ clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array,
cl_int
clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real, cl_mem array_imag,
size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir)
unsigned numRows, unsigned numCols, unsigned startRow, unsigned rowsToProcess, clFFT_Direction dir)
{
cl_fft_plan *plan = (cl_fft_plan *) Plan;
......
......@@ -81,10 +81,10 @@ typedef struct kernel_info_t
{
cl_kernel kernel;
char *kernel_name;
size_t lmem_size;
size_t num_workgroups;
size_t num_xforms_per_workgroup;
size_t num_workitems_per_workgroup;
unsigned lmem_size;
unsigned num_workgroups;
unsigned num_xforms_per_workgroup;
unsigned num_workitems_per_workgroup;
cl_fft_kernel_dir dir;
int in_place_possible;
kernel_info_t *next;
......@@ -138,7 +138,7 @@ typedef struct
// batch size different than the first call. last_batch_size caches the last
// batch size with which this plan is used so that we don't keep allocating/deallocating
// temp buffer if same batch size is used again and again.
size_t last_batch_size;
unsigned last_batch_size;
// temporary buffer for interleaved plan
cl_mem tempmemobj;
......@@ -163,25 +163,25 @@ typedef struct
// Maximum signal size for which a local-memory-transpose-based
// FFT is sufficient, i.e. no global memory transpose (communication)
// is needed
size_t max_localmem_fft_size;
unsigned max_localmem_fft_size;
// Maximum work items per work group allowed. This, along with max_radix below controls
// maximum local memory being used by fft kernels of this plan. Set to 256 by default
size_t max_work_item_per_workgroup;
unsigned max_work_item_per_workgroup;
// Maximum base radix for local memory fft ... this controls the maximum register
// space used by work items. Currently defaults to 16
size_t max_radix;
unsigned max_radix;
// Device-dependent parameter that tells how many work-items need to read consecutive
// values to make sure global memory accesses by work-items of a work-group result in
// coalesced memory access to utilize full bandwidth, e.g. on NVIDIA Tesla this is 16
size_t min_mem_coalesce_width;
unsigned min_mem_coalesce_width;
// Number of local memory banks. This is used to generate kernels with local memory
// transposes with appropriate padding to avoid bank conflicts to local memory
// e.g. on NVidia it is 16.
size_t num_local_mem_banks;
unsigned num_local_mem_banks;
}cl_fft_plan;
void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir);
......
......@@ -605,7 +605,7 @@ static void
insertTwiddleKernel(string &kernelString, int Nr, int numIter, int Nprev, int len, int numWorkItemsPerXForm)
{
int z, k;
int logNPrev = log2(Nprev);
int logNPrev = (int)log2(Nprev);
for(z = 0; z < numIter; z++)
{
......@@ -716,8 +716,8 @@ static void
insertLocalLoadIndexArithmatic(string &kernelString, int Nprev, int Nr, int numWorkItemsReq, int numWorkItemsPerXForm, int numXFormsPerWG, int offset, int midPad)
{
int Ncurr = Nprev * Nr;
int logNcurr = log2(Ncurr);
int logNprev = log2(Nprev);
int logNcurr = (int)log2(Ncurr);
int logNprev = (int)log2(Nprev);
int incr = (numWorkItemsReq + offset) * Nr + midPad;
if(Ncurr < numWorkItemsPerXForm)
......@@ -1280,7 +1280,7 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
int gInInc = threadsPerBlock / batchSize;
int lgStrideO = log2(strideO);
int lgStrideO = (int)log2(strideO);
int numBlocksPerXForm = strideI / batchSize;
int numBlocks = numBlocksPerXForm;
if(!vertical)
......@@ -1333,7 +1333,7 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
}
else
{
int lgNumBlocksPerXForm = log2(numBlocksPerXForm);
int lgNumBlocksPerXForm = (int)log2(numBlocksPerXForm);
localString += string("bNum = groupId & ") + num2str(numBlocksPerXForm - 1) + string(";\n");
localString += string("xNum = groupId >> ") + num2str(lgNumBlocksPerXForm) + string(";\n");
localString += string("indexIn = mul24(bNum, ") + num2str(batchSize) + string(");\n");
......@@ -1349,7 +1349,7 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
}
// Load Data
int lgBatchSize = log2(batchSize);
int lgBatchSize = (int)log2(batchSize);
localString += string("tid = lId;\n");
localString += string("i = tid & ") + num2str(batchSize - 1) + string(";\n");
localString += string("j = tid >> ") + num2str(lgBatchSize) + string(";\n");
......@@ -1442,40 +1442,40 @@ createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir
localString += string("lMemStore = sMem + mad24(i, ") + num2str(radix + 1) + string(", j << ") + num2str((int)log2(R1/R2)) + string(");\n");
localString += string("lMemLoad = sMem + mad24(tid >> ") + num2str((int)log2(radix)) + string(", ") + num2str(radix+1) + string(", tid & ") + num2str(radix-1) + string(");\n");
for(int i = 0; i < R1/R2; i++)
for(int j = 0; j < R2; j++)
for(i = 0; i < R1/R2; i++)
for(j = 0; j < R2; j++)
localString += string("lMemStore[ ") + num2str(i + j*R1) + string("] = a[") + num2str(i*R2+j) + string("].x;\n");
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
if(threadsPerBlock >= radix)
{
for(int i = 0; i < R1; i++)
for(i = 0; i < R1; i++)
localString += string("a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i*(radix+1)*(threadsPerBlock/radix)) + string("];\n");
}
else
{
int innerIter = radix/threadsPerBlock;
int outerIter = R1/innerIter;
for(int i = 0; i < outerIter; i++)
for(int j = 0; j < innerIter; j++)
for(i = 0; i < outerIter; i++)
for(j = 0; j < innerIter; j++)
localString += string("a[") + num2str(i*innerIter+j) + string("].x = lMemLoad[") + num2str(j*threadsPerBlock + i*(radix+1)) + string("];\n");
}
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
for(int i = 0; i < R1/R2; i++)
for(int j = 0; j < R2; j++)
for(i = 0; i < R1/R2; i++)
for(j = 0; j < R2; j++)
localString += string("lMemStore[ ") + num2str(i + j*R1) + string("] = a[") + num2str(i*R2+j) + string("].y;\n");
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
if(threadsPerBlock >= radix)
{
for(int i = 0; i < R1; i++)
for(i = 0; i < R1; i++)
localString += string("a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i*(radix+1)*(threadsPerBlock/radix)) + string("];\n");
}
else
{
int innerIter = radix/threadsPerBlock;
int outerIter = R1/innerIter;
for(int i = 0; i < outerIter; i++)
for(int j = 0; j < innerIter; j++)
for(i = 0; i < outerIter; i++)
for(j = 0; j < innerIter; j++)
localString += string("a[") + num2str(i*innerIter+j) + string("].y = lMemLoad[") + num2str(j*threadsPerBlock + i*(radix+1)) + string("];\n");
}
localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
......
......@@ -219,7 +219,7 @@ int getMaxKernelWorkGroupSize(cl_fft_plan *plan, unsigned int *max_wg_size, unsi
int reg_needed = 0;
*max_wg_size = INT_MAX;
int err;
size_t wg_size;
unsigned wg_size;
unsigned int i;
for(i = 0; i < num_devices; i++)
......@@ -427,7 +427,7 @@ patch_kernel_source:
err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(devices), devices, &ret_size);
ERR_MACRO(err);
num_devices = ret_size / sizeof(cl_device_id);
num_devices = (int)(ret_size / sizeof(cl_device_id));
for(i = 0; i < num_devices; i++)
{
......