//
// File:       main.cpp
//
// Version:    <1.0>
//
// Disclaimer: IMPORTANT:  This Apple software is supplied to you by Apple Inc. ("Apple")
//             in consideration of your agreement to the following terms, and your use,
//             installation, modification or redistribution of this Apple software
//             constitutes acceptance of these terms.  If you do not agree with these
//             terms, please do not use, install, modify or redistribute this Apple
//             software.
//
//             In consideration of your agreement to abide by the following terms, and
//             subject to these terms, Apple grants you a personal, non - exclusive
//             license, under Apple's copyrights in this original Apple software ( the
//             "Apple Software" ), to use, reproduce, modify and redistribute the Apple
//             Software, with or without modifications, in source and / or binary forms;
//             provided that if you redistribute the Apple Software in its entirety and
//             without modifications, you must retain this notice and the following text
//             and disclaimers in all such redistributions of the Apple Software. Neither
//             the name, trademarks, service marks or logos of Apple Inc. may be used to
//             endorse or promote products derived from the Apple Software without specific
//             prior written permission from Apple.  Except as expressly stated in this
//             notice, no other rights or licenses, express or implied, are granted by
//             Apple herein, including but not limited to any patent rights that may be
//             infringed by your derivative works or by other works in which the Apple
//             Software may be incorporated.
//
//             The Apple Software is provided by Apple on an "AS IS" basis.  APPLE MAKES NO
//             WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
//             WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
//             PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
//             ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
//
//             IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
//             CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
//             SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
//             INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
//             AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
//             UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
//             OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
//
////////////////////////////////////////////////////////////////////////////////////////////////////


#include <string.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#ifdef __APPLE__
    #include <OpenCL/cl.h>
    #include <mach/mach_time.h>
    #include <Accelerate/Accelerate.h>
#else
    #include <CL/cl.h>
#endif
#include <clFFT.h>
#include "procs.h"
#include <sys/types.h>
#include <sys/stat.h>
#include <stdint.h>
#include <float.h>

#define eps_avg 10.0

#define MAX( _a, _b)	((_a)>(_b)?(_a) : (_b))

typedef enum {
	clFFT_OUT_OF_PLACE,
	clFFT_IN_PLACE,
}clFFT_TestType;

typedef struct
{
	double real;
	double imag;
}clFFT_ComplexDouble;

typedef struct
{
	double *real;
	double *imag;
}clFFT_SplitComplexDouble;

cl_device_id     device_id;
cl_context       context;
cl_command_queue queue;

typedef unsigned long long ulong;

#ifdef __APPLE__
double subtractTimes( uint64_t endTime, uint64_t startTime )
{
    uint64_t difference = endTime - startTime;
    static double conversion = 0.0;
    
    if( conversion == 0.0 )
    {
        mach_timebase_info_data_t info;
        kern_return_t err = mach_timebase_info( &info );
        
		//Convert the timebase into seconds
        if( err == 0  )
			conversion = 1e-9 * (double) info.numer / (double) info.denom;
    }
    
    return conversion * (double) difference;
}
#endif

#ifdef __APPLE__
void computeReferenceF(clFFT_SplitComplex *out, clFFT_Dim3 n, 
					  unsigned int batchSize, clFFT_Dimension dim, clFFT_Direction dir)
{
	FFTSetup plan_vdsp;
	DSPSplitComplex out_vdsp;
	FFTDirection dir_vdsp = dir == clFFT_Forward ? FFT_FORWARD : FFT_INVERSE;
	
	unsigned int i, j, k;
	unsigned int stride;
	unsigned int log2Nx = (unsigned int) log2(n.x);
	unsigned int log2Ny = (unsigned int) log2(n.y);
	unsigned int log2Nz = (unsigned int) log2(n.z);
	unsigned int log2N;
	
	log2N = log2Nx;
	log2N = log2N > log2Ny ? log2N : log2Ny;
	log2N = log2N > log2Nz ? log2N : log2Nz;
	
	plan_vdsp = vDSP_create_fftsetup(log2N, 2);
	
	switch(dim)
	{
		case clFFT_1D:
			
			for(i = 0; i < batchSize; i++)
			{
				stride = i * n.x;
				out_vdsp.realp  = out->real  + stride;
				out_vdsp.imagp  = out->imag  + stride;
				
			    vDSP_fft_zip(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp);
			}
			break;
			
		case clFFT_2D:
			
			for(i = 0; i < batchSize; i++)
			{
				for(j = 0; j < n.y; j++)
				{
					stride = j * n.x + i * n.x * n.y;
					out_vdsp.realp = out->real + stride;
					out_vdsp.imagp = out->imag + stride;
					
					vDSP_fft_zip(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp);
				}
			}
			for(i = 0; i < batchSize; i++)
			{
				for(j = 0; j < n.x; j++)
				{
					stride = j + i * n.x  * n.y;
					out_vdsp.realp = out->real + stride;
					out_vdsp.imagp = out->imag + stride;
					
					vDSP_fft_zip(plan_vdsp, &out_vdsp, n.x, log2Ny, dir_vdsp);
				}
			}
			break;
			
		case clFFT_3D:
			
			for(i = 0; i < batchSize; i++)
			{
				for(j = 0; j < n.z; j++)
				{
					for(k = 0; k < n.y; k++)
					{
						stride = k * n.x + j * n.x * n.y + i * n.x * n.y * n.z;
						out_vdsp.realp = out->real + stride;
						out_vdsp.imagp = out->imag + stride;
						
						vDSP_fft_zip(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp);
					}
				}
			}
			for(i = 0; i < batchSize; i++)
			{
				for(j = 0; j < n.z; j++)
				{
					for(k = 0; k < n.x; k++)
					{
						stride = k + j * n.x * n.y + i * n.x * n.y * n.z;
						out_vdsp.realp = out->real + stride;
						out_vdsp.imagp = out->imag + stride;
						
						vDSP_fft_zip(plan_vdsp, &out_vdsp, n.x, log2Ny, dir_vdsp);
					}
				}
			}
			for(i = 0; i < batchSize; i++)
			{
				for(j = 0; j < n.y; j++)
				{
					for(k = 0; k < n.x; k++)
					{
						stride = k + j * n.x + i * n.x * n.y * n.z;
						out_vdsp.realp = out->real + stride;
						out_vdsp.imagp = out->imag + stride;
						
						vDSP_fft_zip(plan_vdsp, &out_vdsp, n.x*n.y, log2Nz, dir_vdsp);
					}
				}
			}
			break;
	}
	
	vDSP_destroy_fftsetup(plan_vdsp);
}
#endif

#ifdef __APPLE__
void computeReferenceD(clFFT_SplitComplexDouble *out, clFFT_Dim3 n, 
					  unsigned int batchSize, clFFT_Dimension dim, clFFT_Direction dir)
{
	FFTSetupD plan_vdsp;
	DSPDoubleSplitComplex out_vdsp;
	FFTDirection dir_vdsp = dir == clFFT_Forward ? FFT_FORWARD : FFT_INVERSE;
	
	unsigned int i, j, k;
	unsigned int stride;
	unsigned int log2Nx = (int) log2(n.x);
	unsigned int log2Ny = (int) log2(n.y);
	unsigned int log2Nz = (int) log2(n.z);
	unsigned int log2N;
	
	log2N = log2Nx;
	log2N = log2N > log2Ny ? log2N : log2Ny;
	log2N = log2N > log2Nz ? log2N : log2Nz;
	
	plan_vdsp = vDSP_create_fftsetupD(log2N, 2);
	
	switch(dim)
	{
		case clFFT_1D:
			
			for(i = 0; i < batchSize; i++)
			{
				stride = i * n.x;
				out_vdsp.realp  = out->real  + stride;
				out_vdsp.imagp  = out->imag  + stride;
				
			    vDSP_fft_zipD(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp);
			}
			break;
			
		case clFFT_2D:
			
			for(i = 0; i < batchSize; i++)
			{
				for(j = 0; j < n.y; j++)
				{
					stride = j * n.x + i * n.x * n.y;
					out_vdsp.realp = out->real + stride;
					out_vdsp.imagp = out->imag + stride;
					
					vDSP_fft_zipD(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp);
				}
			}
			for(i = 0; i < batchSize; i++)
			{
				for(j = 0; j < n.x; j++)
				{
					stride = j + i * n.x  * n.y;
					out_vdsp.realp = out->real + stride;
					out_vdsp.imagp = out->imag + stride;
					
					vDSP_fft_zipD(plan_vdsp, &out_vdsp, n.x, log2Ny, dir_vdsp);
				}
			}
			break;
			
		case clFFT_3D:
			
			for(i = 0; i < batchSize; i++)
			{
				for(j = 0; j < n.z; j++)
				{
					for(k = 0; k < n.y; k++)
					{
						stride = k * n.x + j * n.x * n.y + i * n.x * n.y * n.z;
						out_vdsp.realp = out->real + stride;
						out_vdsp.imagp = out->imag + stride;
						
						vDSP_fft_zipD(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp);
					}
				}
			}
			for(i = 0; i < batchSize; i++)
			{
				for(j = 0; j < n.z; j++)
				{
					for(k = 0; k < n.x; k++)
					{
						stride = k + j * n.x * n.y + i * n.x * n.y * n.z;
						out_vdsp.realp = out->real + stride;
						out_vdsp.imagp = out->imag + stride;
						
						vDSP_fft_zipD(plan_vdsp, &out_vdsp, n.x, log2Ny, dir_vdsp);
					}
				}
			}
			for(i = 0; i < batchSize; i++)
			{
				for(j = 0; j < n.y; j++)
				{
					for(k = 0; k < n.x; k++)
					{
						stride = k + j * n.x + i * n.x * n.y * n.z;
						out_vdsp.realp = out->real + stride;
						out_vdsp.imagp = out->imag + stride;
						
						vDSP_fft_zipD(plan_vdsp, &out_vdsp, n.x*n.y, log2Nz, dir_vdsp);
					}
				}
			}
			break;
	}
	
	vDSP_destroy_fftsetupD(plan_vdsp);
}
#endif

double complexNormSq(clFFT_ComplexDouble a)
{
	return (a.real * a.real + a.imag * a.imag);
}

double computeL2Error(clFFT_SplitComplex *data, clFFT_SplitComplexDouble *data_ref, int n, int batchSize, double *max_diff, double *min_diff)
{
	int i, j;
	double avg_norm = 0.0;
	*max_diff = 0.0;
	*min_diff = 0x1.0p1000;
	
	for(j = 0; j < batchSize; j++)
	{
		double norm_ref = 0.0;
		double norm = 0.0;
	    for(i = 0; i < n; i++) 
		{
			int index = j * n + i;
		    clFFT_ComplexDouble diff = (clFFT_ComplexDouble) { data_ref->real[index] - data->real[index], data_ref->imag[index] - data->imag[index] };
		    double norm_tmp = complexNormSq(diff);
		    norm += norm_tmp;
		    norm_ref += (data_ref->real[index] * data_ref->real[index] + data_ref->imag[index] * data_ref->imag[index]);
	    }
	    double curr_norm = sqrt( norm / norm_ref ) / FLT_EPSILON;
		avg_norm += curr_norm;
		*max_diff = *max_diff < curr_norm ? curr_norm : *max_diff;
		*min_diff = *min_diff > curr_norm ? curr_norm : *min_diff;
	}
	
	return avg_norm / batchSize;
}

void convertInterleavedToSplit(clFFT_SplitComplex *result_split, clFFT_Complex *data_cl, int length)
{
	int i;
	for(i = 0; i < length; i++) {
		result_split->real[i] = data_cl[i].real;
		result_split->imag[i] = data_cl[i].imag;
	}
}

int runTest(clFFT_Dim3 n, int batchSize, clFFT_Direction dir, clFFT_Dimension dim, 
			clFFT_DataFormat dataFormat, int numIter, clFFT_TestType testType)
{	
	cl_int err = CL_SUCCESS;
	int iter;
	double t;
	
	uint64_t t0, t1;
	int mx = log2(n.x);
	int my = log2(n.y);
	int mz = log2(n.z);

	int length = n.x * n.y * n.z * batchSize;
		
	double gflops = 5e-9 * ((double)mx + (double)my + (double)mz) * (double)n.x * (double)n.y * (double)n.z * (double)batchSize * (double)numIter;
	
	clFFT_SplitComplex data_i_split = (clFFT_SplitComplex) { NULL, NULL };
	clFFT_SplitComplex data_cl_split = (clFFT_SplitComplex) { NULL, NULL };
	clFFT_Complex *data_i = NULL;
	clFFT_Complex *data_cl = NULL;
	clFFT_SplitComplexDouble data_iref = (clFFT_SplitComplexDouble) { NULL, NULL }; 
	clFFT_SplitComplexDouble data_oref = (clFFT_SplitComplexDouble) { NULL, NULL };
	
	clFFT_Plan plan = NULL;
	cl_mem data_in = NULL;
	cl_mem data_out = NULL;
	cl_mem data_in_real = NULL;
	cl_mem data_in_imag = NULL;
	cl_mem data_out_real = NULL;
	cl_mem data_out_imag = NULL;
	
	if(dataFormat == clFFT_SplitComplexFormat) {
		data_i_split.real     = (float *) malloc(sizeof(float) * length);
		data_i_split.imag     = (float *) malloc(sizeof(float) * length);
		data_cl_split.real    = (float *) malloc(sizeof(float) * length);
		data_cl_split.imag    = (float *) malloc(sizeof(float) * length);
		if(!data_i_split.real || !data_i_split.imag || !data_cl_split.real || !data_cl_split.imag)
		{
			err = -1;
			log_error("Out-of-Resources\n");
			goto cleanup;
		}
	}
	else {
		data_i  = (clFFT_Complex *) malloc(sizeof(clFFT_Complex)*length);
		data_cl = (clFFT_Complex *) malloc(sizeof(clFFT_Complex)*length);
		if(!data_i || !data_cl)
		{
			err = -2;
			log_error("Out-of-Resouces\n");
			goto cleanup;
		}
	}
	
	data_iref.real   = (double *) malloc(sizeof(double) * length);
	data_iref.imag   = (double *) malloc(sizeof(double) * length);
	data_oref.real   = (double *) malloc(sizeof(double) * length);
	data_oref.imag   = (double *) malloc(sizeof(double) * length);	
	if(!data_iref.real || !data_iref.imag || !data_oref.real || !data_oref.imag)
	{
		err = -3;
		log_error("Out-of-Resources\n");
		goto cleanup;
	}

	int i;
	if(dataFormat == clFFT_SplitComplexFormat) {
		for(i = 0; i < length; i++)
		{
			data_i_split.real[i] = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f;
			data_i_split.imag[i] = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f;
			data_cl_split.real[i] = 0.0f;
			data_cl_split.imag[i] = 0.0f;			
			data_iref.real[i] = data_i_split.real[i];
			data_iref.imag[i] = data_i_split.imag[i];
			data_oref.real[i] = data_iref.real[i];
			data_oref.imag[i] = data_iref.imag[i];	
		}
	}
	else {
		for(i = 0; i < length; i++)
		{
			data_i[i].real = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f;
			data_i[i].imag = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f;
			data_cl[i].real = 0.0f;
			data_cl[i].imag = 0.0f;			
			data_iref.real[i] = data_i[i].real;
			data_iref.imag[i] = data_i[i].imag;
			data_oref.real[i] = data_iref.real[i];
			data_oref.imag[i] = data_iref.imag[i];	
		}		
	}
	
	plan = clFFT_CreatePlan( context, n, dim, dataFormat, &err );
	if(!plan || err) 
	{
		log_error("clFFT_CreatePlan failed\n");
		goto cleanup;
	}
	
	//clFFT_DumpPlan(plan, stdout);
	
	if(dataFormat == clFFT_SplitComplexFormat)
	{
		data_in_real = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_i_split.real, &err);
	    if(!data_in_real || err) 
	    {
			log_error("clCreateBuffer failed\n");
			goto cleanup;
	    }
		
		data_in_imag = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_i_split.imag, &err);
	    if(!data_in_imag || err) 
	    {
			log_error("clCreateBuffer failed\n");
			goto cleanup;
	    }
		
		if(testType == clFFT_OUT_OF_PLACE)
		{
			data_out_real = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_cl_split.real, &err);
			if(!data_out_real || err) 
			{
				log_error("clCreateBuffer failed\n");
				goto cleanup;
			}
			
			data_out_imag = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_cl_split.imag, &err);
			if(!data_out_imag || err) 
			{
				log_error("clCreateBuffer failed\n");
				goto cleanup;
			}			
		}
		else
		{
			data_out_real = data_in_real;
			data_out_imag = data_in_imag;
		}
	}
	else
	{
	    data_in = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float)*2, data_i, &err);
	    if(!data_in) 
	    {
			log_error("clCreateBuffer failed\n");
			goto cleanup;
	    }
		if(testType == clFFT_OUT_OF_PLACE)
		{
			data_out = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float)*2, data_cl, &err);
			if(!data_out) 
			{
				log_error("clCreateBuffer failed\n");
				goto cleanup;
			}			
		}
		else
			data_out = data_in;
	}
		
			
	err = CL_SUCCESS;
	
	t0 = mach_absolute_time();
	if(dataFormat == clFFT_SplitComplexFormat)
	{
		for(iter = 0; iter < numIter; iter++)
		    err |= clFFT_ExecutePlannar(queue, plan, batchSize, dir, data_in_real, data_in_imag, data_out_real, data_out_imag, 0, NULL, NULL);
	}
	else
	{
	    for(iter = 0; iter < numIter; iter++) 
			err |= clFFT_ExecuteInterleaved(queue, plan, batchSize, dir, data_in, data_out, 0, NULL, NULL);
	}
	
	err |= clFinish(queue);
	
	if(err) 
	{
		log_error("clFFT_Execute\n");
		goto cleanup;	
	}

#ifdef __APPLE__
	t1 = mach_absolute_time(); 
	t = subtractTimes(t1, t0);
	char temp[100];
	sprintf(temp, "GFlops achieved for n = (%d, %d, %d), batchsize = %d", n.x, n.y, n.z, batchSize);
	log_perf(gflops / (float) t, 1, "GFlops/s", "%s", temp);
#endif

	if(dataFormat == clFFT_SplitComplexFormat)
	{	
		err |= clEnqueueReadBuffer(queue, data_out_real, CL_TRUE, 0, length*sizeof(float), data_cl_split.real, 0, NULL, NULL);
		err |= clEnqueueReadBuffer(queue, data_out_imag, CL_TRUE, 0, length*sizeof(float), data_cl_split.imag, 0, NULL, NULL);
	}
	else
	{
		err |= clEnqueueReadBuffer(queue, data_out, CL_TRUE, 0, length*sizeof(float)*2, data_cl, 0, NULL, NULL);
	}
	
	if(err) 
	{
		log_error("clEnqueueReadBuffer failed\n");
        goto cleanup;
	}	

#ifdef __APPLE__
	computeReferenceD(&data_oref, n, batchSize, dim, dir);
	
	double diff_avg, diff_max, diff_min;
	if(dataFormat == clFFT_SplitComplexFormat) {
		diff_avg = computeL2Error(&data_cl_split, &data_oref, n.x*n.y*n.z, batchSize, &diff_max, &diff_min);
		if(diff_avg > eps_avg)
			log_error("Test failed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min);
		else
			log_info("Test passed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min);			
	}
	else {
		clFFT_SplitComplex result_split;
		result_split.real = (float *) malloc(length*sizeof(float));
		result_split.imag = (float *) malloc(length*sizeof(float));
		convertInterleavedToSplit(&result_split, data_cl, length);
		diff_avg = computeL2Error(&result_split, &data_oref, n.x*n.y*n.z, batchSize, &diff_max, &diff_min);
		
		if(diff_avg > eps_avg)
			log_error("Test failed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min);
		else
			log_info("Test passed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min);	
		free(result_split.real);
		free(result_split.imag);
	}
#endif

cleanup:
	clFFT_DestroyPlan(plan);	
	if(dataFormat == clFFT_SplitComplexFormat) 
	{
		if(data_i_split.real)
			free(data_i_split.real);
		if(data_i_split.imag)
			free(data_i_split.imag);
		if(data_cl_split.real)
			free(data_cl_split.real);
		if(data_cl_split.imag)
			free(data_cl_split.imag);
		
		if(data_in_real)
			clReleaseMemObject(data_in_real);
		if(data_in_imag)
			clReleaseMemObject(data_in_imag);
		if(data_out_real && testType == clFFT_OUT_OF_PLACE)
			clReleaseMemObject(data_out_real);
		if(data_out_imag && clFFT_OUT_OF_PLACE)
			clReleaseMemObject(data_out_imag);
	}
	else 
	{
		if(data_i)
			free(data_i);
		if(data_cl)
			free(data_cl);
		
		if(data_in)
			clReleaseMemObject(data_in);
		if(data_out && testType == clFFT_OUT_OF_PLACE)
			clReleaseMemObject(data_out);
	}
	
	if(data_iref.real)
		free(data_iref.real);
	if(data_iref.imag)
		free(data_iref.imag);		
	if(data_oref.real)
		free(data_oref.real);
	if(data_oref.imag)
		free(data_oref.imag);
	
	return err;
}

bool ifLineCommented(const char *line) {
	const char *Line = line;
	while(*Line != '\0')
		if((*Line == '/') && (*(Line + 1) == '/'))
			return true;
		else
			Line++;
	return false;
}

cl_device_type getGlobalDeviceType()
{
	char *force_cpu = getenv( "CL_DEVICE_TYPE" );
	if( force_cpu != NULL )
	{
		if( strcmp( force_cpu, "gpu" ) == 0 || strcmp( force_cpu, "CL_DEVICE_TYPE_GPU" ) == 0 )
			return CL_DEVICE_TYPE_GPU;
		else if( strcmp( force_cpu, "cpu" ) == 0 || strcmp( force_cpu, "CL_DEVICE_TYPE_CPU" ) == 0 )
			return CL_DEVICE_TYPE_CPU;
		else if( strcmp( force_cpu, "accelerator" ) == 0 || strcmp( force_cpu, "CL_DEVICE_TYPE_ACCELERATOR" ) == 0 )
			return CL_DEVICE_TYPE_ACCELERATOR;
		else if( strcmp( force_cpu, "CL_DEVICE_TYPE_DEFAULT" ) == 0 )
			return CL_DEVICE_TYPE_DEFAULT;
	}
	// default
	return CL_DEVICE_TYPE_GPU;
}

void 
notify_callback(const char *errinfo, const void *private_info, size_t cb, void *user_data)
{
    log_error( "%s\n", errinfo );
}

int
checkMemRequirements(clFFT_Dim3 n, int batchSize, clFFT_TestType testType, cl_ulong gMemSize)
{
	cl_ulong memReq = (testType == clFFT_OUT_OF_PLACE) ? 3 : 2;
	memReq *= n.x*n.y*n.z*sizeof(clFFT_Complex)*batchSize;
	memReq = memReq/1024/1024;
	if(memReq >= gMemSize)
		return -1;
	return 0;
}

int main (int argc, char * const argv[]) {
	
	test_start();
	
	cl_ulong gMemSize;
	clFFT_Direction dir = clFFT_Forward;
	int numIter = 1;
	clFFT_Dim3 n = { 1024, 1, 1 };
	int batchSize = 1;
	clFFT_DataFormat dataFormat = clFFT_SplitComplexFormat;
	clFFT_Dimension dim = clFFT_1D;
	clFFT_TestType testType = clFFT_OUT_OF_PLACE;
	cl_device_id device_ids[16];
	
	FILE *paramFile;
			
	cl_int err;
	unsigned int num_devices;
	
	cl_device_type device_type = getGlobalDeviceType();	
	if(device_type != CL_DEVICE_TYPE_GPU) 
	{
		log_info("Test only supported on DEVICE_TYPE_GPU\n");
		test_finish();
		exit(0);
	}
	
	err = clGetDeviceIDs(NULL, device_type, sizeof(device_ids), device_ids, &num_devices);
	if(err) 
	{		
		log_error("clGetComputeDevice failed\n");
		test_finish();
		return -1;
	}
	
	device_id = NULL;

	unsigned int i;
	for(i = 0; i < num_devices; i++)
	{
	    cl_bool available;
	    err = clGetDeviceInfo(device_ids[i], CL_DEVICE_AVAILABLE, sizeof(cl_bool), &available, NULL);
	    if(err)
	    {
	         log_error("Cannot check device availability of device # %d\n", i);
	    }
	    
	    if(available)
	    {
	        device_id = device_ids[i];
	        break;
	    }
	    else
	    {
	        char name[200];
	        err = clGetDeviceInfo(device_ids[i], CL_DEVICE_NAME, sizeof(name), name, NULL);
	        if(err == CL_SUCCESS)
	        {
	             log_info("Device %s not available for compute\n", name);
	        }
	        else
	        {
	             log_info("Device # %d not available for compute\n", i);
	        }
	    }
	}
	
	if(!device_id)
	{
	    log_error("None of the devices available for compute ... aborting test\n");
	    test_finish();
	    return -1;
	}

	context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
	if(!context || err) 
	{
		log_error("clCreateContext failed\n");
		test_finish();
		return -1;
	}
	
    queue = clCreateCommandQueue(context, device_id, 0, &err);
    if(!queue || err)
	{
        log_error("clCreateCommandQueue() failed.\n");
		clReleaseContext(context);
        test_finish();
        return -1;
    }  
	
	err = clGetDeviceInfo(device_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &gMemSize, NULL);
	if(err)
	{
		log_error("Failed to get global mem size\n");
		clReleaseContext(context);
		clReleaseCommandQueue(queue);
		test_finish();
		return -2;
	}
	
	gMemSize /= (1024*1024);
			
	char delim[] = " \n";
	char tmpStr[100];
	char line[200];
	char *param, *val;	
	int total_errors = 0;
	if(argc == 1) {
		log_error("Need file name with list of parameters to run the test\n");
		test_finish();
		return -1;
	}
	
	if(argc == 2) {	// arguments are supplied in a file with arguments for a single run are all on the same line
		paramFile = fopen(argv[1], "r");
		if(!paramFile) {
			log_error("Cannot open the parameter file\n");
			clReleaseContext(context);
			clReleaseCommandQueue(queue);			
			test_finish();
			return -3;
		}
		while(fgets(line, 199, paramFile)) {
			if(!strcmp(line, "") || !strcmp(line, "\n") || ifLineCommented(line))
				continue;
			param = strtok(line, delim);
			while(param) {
				val = strtok(NULL, delim);
				if(!strcmp(param, "-n")) {
					sscanf(val, "%d", &n.x);
					val = strtok(NULL, delim);
					sscanf(val, "%d", &n.y);
					val = strtok(NULL, delim);
					sscanf(val, "%d", &n.z);					
				}
				else if(!strcmp(param, "-batchsize")) 
					sscanf(val, "%d", &batchSize);
				else if(!strcmp(param, "-dir")) {
					sscanf(val, "%s", tmpStr);
					if(!strcmp(tmpStr, "forward"))
						dir = clFFT_Forward;
					else if(!strcmp(tmpStr, "inverse"))
						dir = clFFT_Inverse;
				}
				else if(!strcmp(param, "-dim")) {
					sscanf(val, "%s", tmpStr);
					if(!strcmp(tmpStr, "1D"))
						dim = clFFT_1D;
					else if(!strcmp(tmpStr, "2D"))
						dim = clFFT_2D; 
					else if(!strcmp(tmpStr, "3D"))
						dim = clFFT_3D;					
				}
				else if(!strcmp(param, "-format")) {
					sscanf(val, "%s", tmpStr);
					if(!strcmp(tmpStr, "plannar"))
						dataFormat = clFFT_SplitComplexFormat;
					else if(!strcmp(tmpStr, "interleaved"))
						dataFormat = clFFT_InterleavedComplexFormat;					
				}
				else if(!strcmp(param, "-numiter"))
					sscanf(val, "%d", &numIter);
				else if(!strcmp(param, "-testtype")) {
					sscanf(val, "%s", tmpStr);
					if(!strcmp(tmpStr, "out-of-place"))
						testType = clFFT_OUT_OF_PLACE;
					else if(!strcmp(tmpStr, "in-place"))
						testType = clFFT_IN_PLACE;										
				}
				param = strtok(NULL, delim);
			}
			
			if(checkMemRequirements(n, batchSize, testType, gMemSize)) {
				log_info("This test cannot run because memory requirements canot be met by the available device\n");
				continue;
			}
				
			err = runTest(n, batchSize, dir, dim, dataFormat, numIter, testType);
			if (err)
				total_errors++;
		}
	}
	
	clReleaseContext(context);
	clReleaseCommandQueue(queue);
	
	test_finish();
	return total_errors;		
}