diff --git a/pyfstat/grid_based_searches.py b/pyfstat/grid_based_searches.py index 091b2ae2a977f15832ddc23cd0bd8432220ec75a..feeb97a8c7710ba9ebbdb67a6d87b35e82d9f670 100644 --- a/pyfstat/grid_based_searches.py +++ b/pyfstat/grid_based_searches.py @@ -438,6 +438,10 @@ class TransientGridSearch(GridSearch): FstatMap = getattr(self.search, 'FstatMap', None) thisCand = list(vals) + [detstat] if getattr(self, 'transientWindowType', None): + if self.tCWFstatMapVersion == 'lal': + F_mn = FstatMap.F_mn.data + else: + F_mn = FstatMap.F_mn if self.outputTransientFstatMap: tCWfile = os.path.splitext(self.out_file)[0]+'_tCW_%.16f_%.16f_%.16f_%.16g_%.16g.dat' % (vals[2],vals[5],vals[6],vals[3],vals[4]) # freq alpha delta f1dot f2dot if self.tCWFstatMapVersion == 'lal': @@ -445,9 +449,8 @@ class TransientGridSearch(GridSearch): lalpulsar.write_transientFstatMap_to_fp ( fo, FstatMap, windowRange, None ) del fo # instead of lal.FileClose() which is not SWIG-exported else: - np.savetxt(tCWfile, 2.0*FstatMap.F_mn, delimiter=' ') - Fmn = FstatMap.F_mn.data - maxidx = np.unravel_index(Fmn.argmax(), Fmn.shape) + np.savetxt(tCWfile, 2.0*F_mn, delimiter=' ') + maxidx = np.unravel_index(F_mn.argmax(), F_mn.shape) thisCand += [windowRange.t0+maxidx[0]*windowRange.dt0, windowRange.tau+maxidx[1]*windowRange.dtau] data.append(thisCand) diff --git a/pyfstat/pyCUDAkernels/cudaTransientFstatExpWindow.cu b/pyfstat/pyCUDAkernels/cudaTransientFstatExpWindow.cu new file mode 100644 index 0000000000000000000000000000000000000000..28b9d8e288f142a8d4222adb56709ef561599ef4 --- /dev/null +++ b/pyfstat/pyCUDAkernels/cudaTransientFstatExpWindow.cu @@ -0,0 +1,122 @@ +__global__ void cudaTransientFstatExpWindow ( float *input, + unsigned int numAtoms, + unsigned int TAtom, + unsigned int t0_data, + unsigned int win_t0, + unsigned int win_dt0, + unsigned int win_tau, + unsigned int win_dtau, + unsigned int Fmn_rows, + unsigned int Fmn_cols, + float *Fmn + ) +{ + + /* match CUDA thread indexing and high-level (t0,tau) indexing */ + unsigned int m = blockDim.x * blockIdx.x + threadIdx.x; // t0: row + unsigned int n = blockDim.y * blockIdx.y + threadIdx.y; // tau: column + /* unraveled 1D index for 2D output array */ + unsigned int outidx = Fmn_cols * m + n; + + /* hardcoded copy from lalpulsar */ + unsigned int TRANSIENT_EXP_EFOLDING = 3; + + if ( (m < Fmn_rows) && (n < Fmn_cols) ) { + + /* compute Fstat-atom index i_t0 in [0, numAtoms) */ + unsigned int TAtomHalf = TAtom/2; // integer division + unsigned int t0 = win_t0 + m * win_dt0; + int i_tmp = ( t0 - t0_data + TAtomHalf ) / TAtom; // integer round: floor(x+0.5) + if ( i_tmp < 0 ) { + i_tmp = 0; + } + unsigned int i_t0 = (unsigned int)i_tmp; + if ( i_t0 >= numAtoms ) { + i_t0 = numAtoms - 1; + } + + /* translate n into an atoms end-index for this search interval [t0, t0+Tcoh], + * giving the index range of atoms to sum over + */ + unsigned int tau = win_tau + n * win_dtau; + + /* get end-time t1 of this transient-window search + * for given tau, what Tcoh should the exponential window cover? + * for speed reasons we want to truncate Tcoh = tau * TRANSIENT_EXP_EFOLDING + * with the e-folding factor chosen such that the window-value + * is practically negligible after that, where it will be set to 0 + */ +// unsigned int t1 = lround( win_t0 + TRANSIENT_EXP_EFOLDING * win_tau); + unsigned int t1 = t0 + TRANSIENT_EXP_EFOLDING * tau; + + /* compute window end-time Fstat-atom index i_t1 in [0, numAtoms) */ + i_tmp = ( t1 - t0_data + TAtomHalf ) / TAtom - 1; // integer round: floor(x+0.5) + if ( i_tmp < 0 ) { + i_tmp = 0; + } + unsigned int i_t1 = (unsigned int)i_tmp; + if ( i_t1 >= numAtoms ) { + i_t1 = numAtoms - 1; + } + + /* now we have two valid atoms-indices [i_t0, i_t1] + * spanning our Fstat-window to sum over + */ + + float Ad = 0.0f; + float Bd = 0.0f; + float Cd = 0.0f; + float Fa_re = 0.0f; + float Fa_im = 0.0f; + float Fb_re = 0.0f; + float Fb_im = 0.0f; + + unsigned short input_cols = 7; // must match input matrix! + + /* sum up atoms */ + for ( unsigned int i=i_t0; i<=i_t1; i++ ) { + + unsigned int t_i = t0_data + i * TAtom; + + float win_i = 0.0; + if ( t_i >= t0 && t_i <= t1 ) { + float x = 1.0 * ( t_i - t0 ) / tau; + win_i = exp ( -x ); + } + + float win2_i = win_i * win_i; + + Ad += input[i*input_cols+0] * win2_i; // a2_alpha + Bd += input[i*input_cols+1] * win2_i; // b2_alpha + Cd += input[i*input_cols+2] * win2_i; // ab_alpha + Fa_re += input[i*input_cols+3] * win_i; // Fa_alpha_re + Fa_im += input[i*input_cols+4] * win_i; // Fa_alpha_im + Fb_re += input[i*input_cols+5] * win_i; // Fb_alpha_re + Fb_im += input[i*input_cols+6] * win_i; // Fb_alpha_im + + } + + /* get determinant */ + float Dd = ( Ad * Bd - Cd * Cd ); + float DdInv = 0.0f; + /* safety catch as in XLALWeightMultiAMCoeffs(): + * make it so that in the end F=0 instead of -nan + */ + if ( Dd > 0.0 ) { + DdInv = 1.0 / Dd; + } + + /* from XLALComputeFstatFromFaFb */ + float F = DdInv * ( Bd * ( Fa_re*Fa_re + Fa_im*Fa_im ) + + Ad * ( Fb_re*Fb_re + Fb_im*Fb_im ) + - 2.0 * Cd * ( Fa_re * Fb_re + Fa_im * Fb_im ) + ); + + /* store result in Fstat-matrix + * at unraveled index of element {m,n} + */ + Fmn[outidx] = F; + + } // ( (m < Fmn_rows) && (n < Fmn_cols) ) + +} // cudaTransientFstatExpWindow() diff --git a/pyfstat/pyCUDAkernels/cudaTransientFstatRectWindow.cu b/pyfstat/pyCUDAkernels/cudaTransientFstatRectWindow.cu new file mode 100644 index 0000000000000000000000000000000000000000..276cc2e27cb78ae008e8d74e235675f7510a1fde --- /dev/null +++ b/pyfstat/pyCUDAkernels/cudaTransientFstatRectWindow.cu @@ -0,0 +1,114 @@ +__global__ void cudaTransientFstatRectWindow ( float *input, + unsigned int numAtoms, + unsigned int TAtom, + unsigned int t0_data, + unsigned int win_t0, + unsigned int win_dt0, + unsigned int win_tau, + unsigned int win_dtau, + unsigned int N_tauRange, + float *Fmn + ) +{ + + /* match CUDA thread indexing and high-level (t0,tau) indexing */ + // assume 1D block, grid setup + unsigned int m = blockDim.x * blockIdx.x + threadIdx.x; // t0: row + + unsigned short input_cols = 7; // must match input matrix! + + /* compute Fstat-atom index i_t0 in [0, numAtoms) */ + unsigned int TAtomHalf = TAtom/2; // integer division + unsigned int t0 = win_t0 + m * win_dt0; + int i_tmp = ( t0 - t0_data + TAtomHalf ) / TAtom; // integer round: floor(x+0.5) + if ( i_tmp < 0 ) { + i_tmp = 0; + } + unsigned int i_t0 = (unsigned int)i_tmp; + if ( i_t0 >= numAtoms ) { + i_t0 = numAtoms - 1; + } + + float Ad = 0.0f; + float Bd = 0.0f; + float Cd = 0.0f; + float Fa_re = 0.0f; + float Fa_im = 0.0f; + float Fb_re = 0.0f; + float Fb_im = 0.0f; + unsigned int i_t1_last = i_t0; + + /* INNER loop over timescale-parameter tau + * NOT parallelized so that we can still use the i_t1_last trick + * (empirically seems to be faster than 2D CUDA version) + */ + for ( unsigned int n = 0; n < N_tauRange; n ++ ) { + + if ( (m < N_tauRange) && (n < N_tauRange) ) { + + /* translate n into an atoms end-index for this search interval [t0, t0+Tcoh], + * giving the index range of atoms to sum over + */ + unsigned int tau = win_tau + n * win_dtau; + + /* get end-time t1 of this transient-window search */ + unsigned int t1 = t0 + tau; + + /* compute window end-time Fstat-atom index i_t1 in [0, numAtoms) */ + i_tmp = ( t1 - t0_data + TAtomHalf ) / TAtom - 1; // integer round: floor(x+0.5) + if ( i_tmp < 0 ) { + i_tmp = 0; + } + unsigned int i_t1 = (unsigned int)i_tmp; + if ( i_t1 >= numAtoms ) { + i_t1 = numAtoms - 1; + } + + /* now we have two valid atoms-indices [i_t0, i_t1] + * spanning our Fstat-window to sum over + */ + + for ( unsigned int i = i_t1_last; i <= i_t1; i ++ ) { + /* sum up atoms, + * special optimiziation in the rectangular-window case: + * just add on to previous tau values, + * ie re-use the sum over [i_t0, i_t1_last] from the pevious tau-loop iteration + */ + Ad += input[i*input_cols+0]; // a2_alpha + Bd += input[i*input_cols+1]; // b2_alpha + Cd += input[i*input_cols+2]; // ab_alpha + Fa_re += input[i*input_cols+3]; // Fa_alpha_re + Fa_im += input[i*input_cols+4]; // Fa_alpha_im + Fb_re += input[i*input_cols+5]; // Fb_alpha_re + Fb_im += input[i*input_cols+6]; // Fb_alpha_im + /* keep track of up to where we summed for the next iteration */ + i_t1_last = i_t1 + 1; + } + + /* get determinant */ + float Dd = ( Ad * Bd - Cd * Cd ); + float DdInv = 0.0f; + /* safety catch as in XLALWeightMultiAMCoeffs(): + * make it so that in the end F=0 instead of -nan + */ + if ( Dd > 0.0 ) { + DdInv = 1.0 / Dd; + } + + /* from XLALComputeFstatFromFaFb */ + float F = DdInv * ( Bd * ( Fa_re*Fa_re + Fa_im*Fa_im ) + + Ad * ( Fb_re*Fb_re + Fb_im*Fb_im ) + - 2.0 * Cd * ( Fa_re * Fb_re + Fa_im * Fb_im ) + ); + + /* store result in Fstat-matrix + * at unraveled index of element {m,n} + */ + unsigned int outidx = m * N_tauRange + n; + Fmn[outidx] = F; + + } // if ( (m < N_tauRange) && (n < N_tauRange) ) + + } // for ( unsigned int n = 0; n < N_tauRange; n ++ ) + +} // cudaTransientFstatRectWindow() diff --git a/pyfstat/tcw_fstat_map_funcs.py b/pyfstat/tcw_fstat_map_funcs.py index 7294e0a6ced84f8776d1bafb7fad8b5766420ebd..fd43b289dd660a1ea2a13e6cfb8ac46b92f5eb7c 100644 --- a/pyfstat/tcw_fstat_map_funcs.py +++ b/pyfstat/tcw_fstat_map_funcs.py @@ -1,5 +1,7 @@ """ Additional helper functions dealing with transient-CW F(t0,tau) maps """ +import numpy as np +import os import logging # optional imports @@ -31,15 +33,34 @@ def optional_import ( modulename, shorthand=None ): return success +class pyTransientFstatMap(object): + ''' + simplified object class for a F(t0,tau) F-stat map (not 2F!) + based on LALSuite's transientFstatMap_t type + replacing the gsl matrix with a numpy array + + F_mn: 2D array of 2F values + maxF: maximum of F (not 2F!) + t0_ML: maximum likelihood transient start time t0 estimate + tau_ML: maximum likelihood transient duration tau estimate + ''' + + def __init__(self, N_t0Range, N_tauRange): + self.F_mn = np.zeros((N_t0Range, N_tauRange), dtype=np.float32) + self.maxF = float(-1.0) # Initializing to a negative value ensures that we always update at least once and hence return sane t0_d_ML, tau_d_ML even if there is only a single bin where F=0 happens. + self.t0_ML = float(0.0) + self.tau_ML = float(0.0) + + # dictionary of the actual callable F-stat map functions we support, # if the corresponding modules are available. fstatmap_versions = { 'lal': lambda multiFstatAtoms, windowRange: getattr(lalpulsar,'ComputeTransientFstatMap') ( multiFstatAtoms, windowRange, False ), - #'pycuda': lambda multiFstatAtoms, windowRange: - #pycuda_compute_transient_fstat_map - #( multiFstatAtoms, windowRange ) + 'pycuda': lambda multiFstatAtoms, windowRange: + pycuda_compute_transient_fstat_map + ( multiFstatAtoms, windowRange ) } @@ -51,13 +72,24 @@ def init_transient_fstat_map_features ( ): each key's value set to True only if all required modules are importable on this system. ''' + features = {} + have_lal = optional_import('lal') have_lalpulsar = optional_import('lalpulsar') features['lal'] = have_lal and have_lalpulsar - features['pycuda'] = False + + # import GPU features + have_pycuda_drv = optional_import('pycuda.driver', 'drv') + have_pycuda_init = optional_import('pycuda.autoinit', 'autoinit') + have_pycuda_gpuarray = optional_import('pycuda.gpuarray', 'gpuarray') + have_pycuda_tools = optional_import('pycuda.tools', 'cudatools') + have_pycuda_compiler = optional_import('pycuda.compiler', 'cudacomp') + features['pycuda'] = have_pycuda_drv and have_pycuda_init and have_pycuda_gpuarray and have_pycuda_tools and have_pycuda_compiler + logging.debug('Got the following features for transient F-stat maps:') logging.debug(features) + return features @@ -72,3 +104,222 @@ def call_compute_transient_fstat_map ( version, features, multiFstatAtoms=None, else: raise Exception('Transient F-stat map method "{}" not implemented!'.format(version)) return FstatMap + + +def reshape_FstatAtomsVector ( atomsVector ): + ''' + Make a dictionary of ndarrays out of a atoms "vector" structure. + + The input is a "vector"-like structure with times as the higher hierarchical + level and a set of "atoms" quantities defined at each timestamp. + The output is a dictionary with an entry for each quantity, + which is a 1D ndarray over timestamps for that one quantity. + ''' + + numAtoms = atomsVector.length + atomsDict = {} + atom_fieldnames = ['timestamp', 'Fa_alpha', 'Fb_alpha', + 'a2_alpha', 'ab_alpha', 'b2_alpha'] + atom_dtypes = [np.uint32, complex, complex, + np.float32, np.float32, np.float32] + for f, field in enumerate(atom_fieldnames): + atomsDict[field] = np.ndarray(numAtoms,dtype=atom_dtypes[f]) + + for n,atom in enumerate(atomsVector.data): + for field in atom_fieldnames: + atomsDict[field][n] = atom.__getattribute__(field) + + atomsDict['Fa_alpha_re'] = np.float32(atomsDict['Fa_alpha'].real) + atomsDict['Fa_alpha_im'] = np.float32(atomsDict['Fa_alpha'].imag) + atomsDict['Fb_alpha_re'] = np.float32(atomsDict['Fb_alpha'].real) + atomsDict['Fb_alpha_im'] = np.float32(atomsDict['Fb_alpha'].imag) + + return atomsDict + + +def get_absolute_kernel_path ( kernel ): + pyfstatdir = os.path.dirname(os.path.abspath(os.path.realpath(__file__))) + kernelfile = kernel + '.cu' + return os.path.join(pyfstatdir,'pyCUDAkernels',kernelfile) + + +def pycuda_compute_transient_fstat_map ( multiFstatAtoms, windowRange ): + ''' + GPU version of the function to compute transient-window "F-statistic map" + over start-time and timescale {t0, tau}. + Based on XLALComputeTransientFstatMap from LALSuite, + (C) 2009 Reinhard Prix, licensed under GPL + + Returns a 2D matrix F_mn, + with m = index over start-times t0, + and n = index over timescales tau, + in steps of dt0 in [t0, t0+t0Band], + and dtau in [tau, tau+tauBand] + as defined in windowRange input. + ''' + + if ( windowRange.type >= lalpulsar.TRANSIENT_LAST ): + raise ValueError ('Unknown window-type ({}) passed as input. Allowed are [0,{}].'.format(windowRange.type, lalpulsar.TRANSIENT_LAST-1)) + + # internal dict for search/setup parameters + tCWparams = {} + + # first combine all multi-atoms + # into a single atoms-vector with *unique* timestamps + tCWparams['TAtom'] = multiFstatAtoms.data[0].TAtom + TAtomHalf = int(tCWparams['TAtom']/2) # integer division + atoms = lalpulsar.mergeMultiFstatAtomsBinned ( multiFstatAtoms, + tCWparams['TAtom'] ) + + # make a combined input matrix of all atoms vectors, for transfer to GPU + tCWparams['numAtoms'] = atoms.length + atomsDict = reshape_FstatAtomsVector(atoms) + atomsInputMatrix = np.column_stack ( (atomsDict['a2_alpha'], + atomsDict['b2_alpha'], + atomsDict['ab_alpha'], + atomsDict['Fa_alpha_re'], + atomsDict['Fa_alpha_im'], + atomsDict['Fb_alpha_re'], + atomsDict['Fb_alpha_im']) + ) + + # actual data spans [t0_data, t0_data + tCWparams['numAtoms'] * TAtom] + # in steps of TAtom + tCWparams['t0_data'] = int(atoms.data[0].timestamp) + tCWparams['t1_data'] = int(atoms.data[tCWparams['numAtoms']-1].timestamp + tCWparams['TAtom']) + + logging.debug('t0_data={:d}, t1_data={:d}'.format(tCWparams['t0_data'], + tCWparams['t1_data'])) + logging.debug('numAtoms={:d}, TAtom={:d}, TAtomHalf={:d}'.format(tCWparams['numAtoms'], + tCWparams['TAtom'], + TAtomHalf)) + + # special treatment of window_type = none + # ==> replace by rectangular window spanning all the data + if ( windowRange.type == lalpulsar.TRANSIENT_NONE ): + windowRange.type = lalpulsar.TRANSIENT_RECTANGULAR + windowRange.t0 = tCWparams['t0_data'] + windowRange.t0Band = 0 + windowRange.dt0 = tCWparams['TAtom'] # irrelevant + windowRange.tau = tCWparams['numAtoms'] * tCWparams['TAtom'] + windowRange.tauBand = 0; + windowRange.dtau = tCWparams['TAtom'] # irrelevant + + """ NOTE: indices {i,j} enumerate *actual* atoms and their timestamps t_i, while the + * indices {m,n} enumerate the full grid of values in [t0_min, t0_max]x[Tcoh_min, Tcoh_max] in + * steps of deltaT. This allows us to deal with gaps in the data in a transparent way. + * + * NOTE2: we operate on the 'binned' atoms returned from XLALmergeMultiFstatAtomsBinned(), + * which means we can safely assume all atoms to be lined up perfectly on a 'deltaT' binned grid. + * + * The mapping used will therefore be {i,j} -> {m,n}: + * m = offs_i / deltaT = start-time offset from t0_min measured in deltaT + * n = Tcoh_ij / deltaT = duration Tcoh_ij measured in deltaT, + * + * where + * offs_i = t_i - t0_min + * Tcoh_ij = t_j - t_i + deltaT + * + """ + + # We allocate a matrix {m x n} = t0Range * TcohRange elements + # covering the full timerange the transient window-range [t0,t0+t0Band]x[tau,tau+tauBand] + tCWparams['N_t0Range'] = int(np.floor( 1.0*windowRange.t0Band / windowRange.dt0 ) + 1) + tCWparams['N_tauRange'] = int(np.floor( 1.0*windowRange.tauBand / windowRange.dtau ) + 1) + FstatMap = pyTransientFstatMap ( tCWparams['N_t0Range'], tCWparams['N_tauRange'] ) + + logging.debug('N_t0Range={:d}, N_tauRange={:d}'.format(tCWparams['N_t0Range'], + tCWparams['N_tauRange'])) + + if ( windowRange.type == lalpulsar.TRANSIENT_RECTANGULAR ): + FstatMap.F_mn = pycuda_compute_transient_fstat_map_rect ( atomsInputMatrix, + windowRange, + tCWparams ) + elif ( windowRange.type == lalpulsar.TRANSIENT_EXPONENTIAL ): + FstatMap.F_mn = pycuda_compute_transient_fstat_map_exp ( atomsInputMatrix, + windowRange, + tCWparams ) + else: + raise ValueError('Invalid transient window type {} not in [{}, {}].'.format(windowRange.type, lalpulsar.TRANSIENT_NONE, lalpulsar.TRANSIENT_LAST-1)) + + # out of loop: get max2F and ML estimates over the m x n matrix + FstatMap.maxF = FstatMap.F_mn.max() + maxidx = np.unravel_index ( FstatMap.F_mn.argmax(), + (tCWparams['N_t0Range'], + tCWparams['N_tauRange'])) + FstatMap.t0_ML = windowRange.t0 + maxidx[0] * windowRange.dt0 + FstatMap.tau_ML = windowRange.tau + maxidx[1] * windowRange.dtau + + logging.debug('Done computing transient F-stat map. maxF={}, t0_ML={}, tau_ML={}'.format(FstatMap.maxF , FstatMap.t0_ML, FstatMap.tau_ML)) + + return FstatMap + + +def pycuda_compute_transient_fstat_map_rect ( atomsInputMatrix, windowRange, tCWparams ): + '''only GPU-parallizing outer loop, keeping partial sums with memory in kernel''' + + # gpu data setup and transfer + logging.debug('Initial GPU memory: {}/{} MB free'.format(drv.mem_get_info()[0]/(2.**20),drv.mem_get_info()[1]/(2.**20))) + input_gpu = gpuarray.to_gpu ( atomsInputMatrix ) + Fmn_gpu = gpuarray.GPUArray ( (tCWparams['N_t0Range'],tCWparams['N_tauRange']), dtype=np.float32 ) + logging.debug('GPU memory with input+output allocated: {}/{} MB free'.format(drv.mem_get_info()[0]/(2.**20), drv.mem_get_info()[1]/(2.**20))) + + # GPU kernel + kernel = 'cudaTransientFstatRectWindow' + kernelfile = get_absolute_kernel_path ( kernel ) + partial_Fstat_cuda_code = cudacomp.SourceModule(open(kernelfile,'r').read()) + partial_Fstat_cuda = partial_Fstat_cuda_code.get_function(kernel) + partial_Fstat_cuda.prepare('PIIIIIIIIP') + + # GPU grid setup + blockRows = min(1024,tCWparams['N_t0Range']) + blockCols = 1 + gridRows = int(np.ceil(1.0*tCWparams['N_t0Range']/blockRows)) + gridCols = 1 + logging.debug('Looking to compute transient F-stat map over a N={}*{}={} (t0,tau) grid...'.format(tCWparams['N_t0Range'],tCWparams['N_tauRange'],tCWparams['N_t0Range']*tCWparams['N_tauRange'])) + + # running the kernel + logging.debug('Calling kernel with a grid of {}*{}={} blocks of {}*{}={} threads each: {} total threads...'.format(gridRows, gridCols, gridRows*gridCols, blockRows, blockCols, blockRows*blockCols, gridRows*gridCols*blockRows*blockCols)) + partial_Fstat_cuda.prepared_call ( (gridRows,gridCols), (blockRows,blockCols,1), input_gpu.gpudata, tCWparams['numAtoms'], tCWparams['TAtom'], tCWparams['t0_data'], windowRange.t0, windowRange.dt0, windowRange.tau, windowRange.dtau, tCWparams['N_tauRange'], Fmn_gpu.gpudata ) + + # return results to host + F_mn = Fmn_gpu.get() + + logging.debug('Final GPU memory: {}/{} MB free'.format(drv.mem_get_info()[0]/(2.**20),drv.mem_get_info()[1]/(2.**20))) + + return F_mn + + +def pycuda_compute_transient_fstat_map_exp ( atomsInputMatrix, windowRange, tCWparams ): + '''exponential window, inner and outer loop GPU-parallelized''' + + # gpu data setup and transfer + logging.debug('Initial GPU memory: {}/{} MB free'.format(drv.mem_get_info()[0]/(2.**20),drv.mem_get_info()[1]/(2.**20))) + input_gpu = gpuarray.to_gpu ( atomsInputMatrix ) + Fmn_gpu = gpuarray.GPUArray ( (tCWparams['N_t0Range'],tCWparams['N_tauRange']), dtype=np.float32 ) + logging.debug('GPU memory with input+output allocated: {}/{} MB free'.format(drv.mem_get_info()[0]/(2.**20), drv.mem_get_info()[1]/(2.**20))) + + # GPU kernel + kernel = 'cudaTransientFstatExpWindow' + kernelfile = get_absolute_kernel_path ( kernel ) + partial_Fstat_cuda_code = cudacomp.SourceModule(open(kernelfile,'r').read()) + partial_Fstat_cuda = partial_Fstat_cuda_code.get_function(kernel) + partial_Fstat_cuda.prepare('PIIIIIIIIIP') + + # GPU grid setup + blockRows = min(32,tCWparams['N_t0Range']) + blockCols = min(32,tCWparams['N_tauRange']) + gridRows = int(np.ceil(1.0*tCWparams['N_t0Range']/blockRows)) + gridCols = int(np.ceil(1.0*tCWparams['N_tauRange']/blockCols)) + logging.debug('Looking to compute transient F-stat map over a N={}*{}={} (t0,tau) grid...'.format(tCWparams['N_t0Range'], tCWparams['N_tauRange'], tCWparams['N_t0Range']*tCWparams['N_tauRange'])) + + # running the kernel + logging.debug('Calling kernel with a grid of {}*{}={} blocks of {}*{}={} threads each: {} total threads...'.format(gridRows, gridCols, gridRows*gridCols, blockRows, blockCols, blockRows*blockCols, gridRows*gridCols*blockRows*blockCols)) + partial_Fstat_cuda.prepared_call ( (gridRows,gridCols), (blockRows,blockCols,1), input_gpu.gpudata, tCWparams['numAtoms'], tCWparams['TAtom'], tCWparams['t0_data'], windowRange.t0, windowRange.dt0, windowRange.tau, windowRange.dtau, tCWparams['N_t0Range'], tCWparams['N_tauRange'], Fmn_gpu.gpudata ) + + # return results to host + F_mn = Fmn_gpu.get() + + logging.debug('Final GPU memory: {}/{} MB free'.format(drv.mem_get_info()[0]/(2.**20),drv.mem_get_info()[1]/(2.**20))) + + return F_mn diff --git a/setup.py b/setup.py index a9d69f304d33b723a3894c2f4ab88f2adfc2f737..2f03d835110d223f056e4798043888d9b241d60d 100644 --- a/setup.py +++ b/setup.py @@ -7,4 +7,7 @@ setup(name='PyFstat', author='Gregory Ashton', author_email='gregory.ashton@ligo.org', packages=['pyfstat'], + include_package_data=True, + package_data={'pyfstat': ['pyCUDAkernels/cudaTransientFstatExpWindow.cu', + 'pyCUDAkernels/cudaTransientFstatRectWindow.cu']}, )