diff --git a/pyfstat/core.py b/pyfstat/core.py
index 26fcf4e9a15a15f0f7a4793d94ab7818cf21bf14..50705e73f48d8ceb6c0ff0db3314a49c3426b7d2 100755
--- a/pyfstat/core.py
+++ b/pyfstat/core.py
@@ -632,8 +632,8 @@ class ComputeFstat(BaseSearchClass):
             self.windowRange.dt0 = self.Tsft
             self.windowRange.dtau = self.Tsft
 
-            # special treatment of window_type = none ==> replace by
-            # rectangular window spanning all the data
+            # special treatment of window_type = none
+            # ==> replace by rectangular window spanning all the data
             if self.windowRange.type == lalpulsar.TRANSIENT_NONE:
                 self.windowRange.t0 = int(self.minStartTime)
                 self.windowRange.t0Band = 0
@@ -661,6 +661,7 @@ class ComputeFstat(BaseSearchClass):
                 if self.dtau:
                     self.windowRange.dtau = self.dtau
 
+            logging.info('Initialising transient FstatMap features...')
            self.tCWFstatMapFeatures, self.gpu_context = (
                tcw.init_transient_fstat_map_features(
                    self.tCWFstatMapVersion == 'pycuda', self.cudaDeviceName))
@@ -707,15 +708,6 @@ class ComputeFstat(BaseSearchClass):
             # F-stat computation
             self.windowRange.tau = int(2*self.Tsft)
 
-        # logging.debug(
-        #     'Calling "%s" version of ComputeTransientFstatMap() with\
-        #     windowRange: (type=%d (%s), t0=%f, t0Band=%f, dt0=%f, tau=%f,\
-        #     tauBand=%f, dtau=%f)...' % (
-        #     self.tCWFstatMapVersion, self.windowRange.type,
-        #     self.transientWindowType, self.windowRange.t0,
-        #     self.windowRange.t0Band, self.windowRange.dt0,
-        #     self.windowRange.tau, self.windowRange.tauBand,
-        #     self.windowRange.dtau))
         self.FstatMap = tcw.call_compute_transient_fstat_map(
             self.tCWFstatMapVersion, self.tCWFstatMapFeatures,
             self.FstatResults.multiFatoms[0], self.windowRange)
@@ -724,13 +716,6 @@ class ComputeFstat(BaseSearchClass):
         else:
             F_mn = self.FstatMap.F_mn
 
-        # logging.debug('maxF: {}'.format(self.FstatMap.maxF))
-        # logging.debug('t0_ML: %ds=T0+%fd' % (
-        #     self.FstatMap.t0_ML, (self.FstatMap.t0_ML-tstart)/(3600.*24.)))
-        # logging.debug('tau_ML: %ds=%fd' % (
-        #     self.FstatMap.tau_ML, self.FstatMap.tau_ML/(3600.*24.)))
-        # logging.debug('F_mn: {}'.format(F_mn))
-
         twoF = 2*np.max(F_mn)
         if self.BSGL is False:
             if np.isnan(twoF):
diff --git a/pyfstat/grid_based_searches.py b/pyfstat/grid_based_searches.py
index 1dc24f2d5ae71e6f90ec1229c1312e74e088b29c..8d830c291da203b1c05f8f8008f7cd96cb675c2e 100644
--- a/pyfstat/grid_based_searches.py
+++ b/pyfstat/grid_based_searches.py
@@ -435,6 +435,10 @@ class TransientGridSearch(GridSearch):
         self.inititate_search_object()
 
         data = []
+        if self.outputTransientFstatMap:
+            tCWfilebase = os.path.splitext(self.out_file)[0] + '_tCW_'
+            logging.info('Will save per-Doppler Fstatmap' \
+                         ' results to {}*.dat'.format(tCWfilebase))
         for vals in tqdm(self.input_data):
             detstat = self.search.get_det_stat(*vals)
             windowRange = getattr(self.search, 'windowRange', None)
@@ -446,11 +450,18 @@ class TransientGridSearch(GridSearch):
                 else:
                     F_mn = FstatMap.F_mn
                 if self.outputTransientFstatMap:
-                    tCWfile = os.path.splitext(self.out_file)[0]+'_tCW_%.16f_%.16f_%.16f_%.16g_%.16g.dat' % (vals[2],vals[5],vals[6],vals[3],vals[4]) # freq alpha delta f1dot f2dot
+                    # per-Doppler filename convention:
+                    # freq alpha delta f1dot f2dot
+                    tCWfile = ( tCWfilebase +
+                                '%.16f_%.16f_%.16f_%.16g_%.16g.dat' %
+                                (vals[2],vals[5],vals[6],vals[3],vals[4]) )
                     if self.tCWFstatMapVersion == 'lal':
                         fo = lal.FileOpen(tCWfile, 'w')
-                        lalpulsar.write_transientFstatMap_to_fp ( fo, FstatMap, windowRange, None )
-                        del fo # instead of lal.FileClose() which is not SWIG-exported
+                        lalpulsar.write_transientFstatMap_to_fp (
+                            fo, FstatMap, windowRange, None )
+                        # instead of lal.FileClose(),
+                        # which is not SWIG-exported:
+                        del fo
                     else:
                         self.write_F_mn ( tCWfile, F_mn, windowRange)
                 maxidx = np.unravel_index(F_mn.argmax(), F_mn.shape)
diff --git a/pyfstat/pyCUDAkernels/cudaTransientFstatExpWindow.cu b/pyfstat/pyCUDAkernels/cudaTransientFstatExpWindow.cu
index 28b9d8e288f142a8d4222adb56709ef561599ef4..85d9972c5d6cee5c6e8c4443e8c425b8cb335b8d 100644
--- a/pyfstat/pyCUDAkernels/cudaTransientFstatExpWindow.cu
+++ b/pyfstat/pyCUDAkernels/cudaTransientFstatExpWindow.cu
@@ -26,7 +26,8 @@ __global__ void cudaTransientFstatExpWindow ( float *input,
   /* compute Fstat-atom index i_t0 in [0, numAtoms) */
   unsigned int TAtomHalf = TAtom/2; // integer division
   unsigned int t0 = win_t0 + m * win_dt0;
-  int i_tmp = ( t0 - t0_data + TAtomHalf ) / TAtom; // integer round: floor(x+0.5)
+  /* integer round: floor(x+0.5) */
+  int i_tmp = ( t0 - t0_data + TAtomHalf ) / TAtom;
   if ( i_tmp < 0 ) {
     i_tmp = 0;
   }
@@ -35,22 +36,26 @@ __global__ void cudaTransientFstatExpWindow ( float *input,
     i_t0 = numAtoms - 1;
   }
 
-  /* translate n into an atoms end-index for this search interval [t0, t0+Tcoh],
+  /* translate n into an atoms end-index
+   * for this search interval [t0, t0+Tcoh],
   * giving the index range of atoms to sum over
   */
   unsigned int tau = win_tau + n * win_dtau;
 
   /* get end-time t1 of this transient-window search
   * for given tau, what Tcoh should the exponential window cover?
-  * for speed reasons we want to truncate Tcoh = tau * TRANSIENT_EXP_EFOLDING
+  * for speed reasons we want to truncate
+  * Tcoh = tau * TRANSIENT_EXP_EFOLDING
  * with the e-folding factor chosen such that the window-value
  * is practically negligible after that, where it will be set to 0
  */
  // unsigned int t1 = lround( win_t0 + TRANSIENT_EXP_EFOLDING * win_tau);
  unsigned int t1 = t0 + TRANSIENT_EXP_EFOLDING * tau;
 
-  /* compute window end-time Fstat-atom index i_t1 in [0, numAtoms) */
-  i_tmp = ( t1 - t0_data + TAtomHalf ) / TAtom - 1; // integer round: floor(x+0.5)
+  /* compute window end-time Fstat-atom index i_t1 in [0, numAtoms)
+   * using integer round: floor(x+0.5)
+   */
+  i_tmp = ( t1 - t0_data + TAtomHalf ) / TAtom - 1;
  if ( i_tmp < 0 ) {
    i_tmp = 0;
  }
diff --git a/pyfstat/pyCUDAkernels/cudaTransientFstatRectWindow.cu b/pyfstat/pyCUDAkernels/cudaTransientFstatRectWindow.cu
index 276cc2e27cb78ae008e8d74e235675f7510a1fde..dd353bbcf7fb52d2ff88e8b59d114057754a5f8c 100644
--- a/pyfstat/pyCUDAkernels/cudaTransientFstatRectWindow.cu
+++ b/pyfstat/pyCUDAkernels/cudaTransientFstatRectWindow.cu
@@ -20,7 +20,8 @@ __global__ void cudaTransientFstatRectWindow ( float *input,
   /* compute Fstat-atom index i_t0 in [0, numAtoms) */
   unsigned int TAtomHalf = TAtom/2; // integer division
   unsigned int t0 = win_t0 + m * win_dt0;
-  int i_tmp = ( t0 - t0_data + TAtomHalf ) / TAtom; // integer round: floor(x+0.5)
+  /* integer round: floor(x+0.5) */
+  int i_tmp = ( t0 - t0_data + TAtomHalf ) / TAtom;
   if ( i_tmp < 0 ) {
     i_tmp = 0;
   }
@@ -46,7 +47,8 @@ __global__ void cudaTransientFstatRectWindow ( float *input,
   if ( (m < N_tauRange) && (n < N_tauRange) ) {
 
-    /* translate n into an atoms end-index for this search interval [t0, t0+Tcoh],
+    /* translate n into an atoms end-index
+     * for this search interval [t0, t0+Tcoh],
     * giving the index range of atoms to sum over
     */
    unsigned int tau = win_tau + n * win_dtau;
 
@@ -54,8 +56,10 @@ __global__ void cudaTransientFstatRectWindow ( float *input,
    /* get end-time t1 of this transient-window search */
    unsigned int t1 = t0 + tau;
 
-    /* compute window end-time Fstat-atom index i_t1 in [0, numAtoms) */
-    i_tmp = ( t1 - t0_data + TAtomHalf ) / TAtom - 1; // integer round: floor(x+0.5)
+    /* compute window end-time Fstat-atom index i_t1 in [0, numAtoms)
+     * using integer round: floor(x+0.5)
+     */
+    i_tmp = ( t1 - t0_data + TAtomHalf ) / TAtom - 1;
    if ( i_tmp < 0 ) {
      i_tmp = 0;
    }
@@ -72,7 +76,8 @@ __global__ void cudaTransientFstatRectWindow ( float *input,
      /* sum up atoms,
       * special optimiziation in the rectangular-window case:
       * just add on to previous tau values,
-       * ie re-use the sum over [i_t0, i_t1_last] from the pevious tau-loop iteration
+       * ie re-use the sum over [i_t0, i_t1_last]
+       * from the previous tau-loop iteration
       */
      Ad += input[i*input_cols+0]; // a2_alpha
      Bd += input[i*input_cols+1]; // b2_alpha
diff --git a/pyfstat/tcw_fstat_map_funcs.py b/pyfstat/tcw_fstat_map_funcs.py
index c95509a1b97e376f310e1f77dbbaaafb7f139974..f84d71da5d7dd83ff455b43cd54b2880986483f7 100644
--- a/pyfstat/tcw_fstat_map_funcs.py
+++ b/pyfstat/tcw_fstat_map_funcs.py
@@ -9,7 +9,7 @@
 import logging
 import importlib as imp
 
 
-def optional_import ( modulename, shorthand=None ):
+def _optional_import ( modulename, shorthand=None ):
     '''
     Import a module/submodule only if it's available.
@@ -29,12 +29,16 @@ def optional_import ( modulename, shorthand=None ):
     if('pycuda' in sys.modules):
         try:
             globals()[shorthand] = imp.import_module(modulename)
-            logging.debug('Successfully imported module %s%s.' % (modulename, shorthandbit))
+            logging.debug('Successfully imported module %s%s.'
+                          % (modulename, shorthandbit))
             success = True
         except pycuda._driver.LogicError, e:
             if e.message == 'cuDeviceGet failed: invalid device ordinal':
                 devn = int(os.environ['CUDA_DEVICE'])
-                raise RuntimeError('Requested CUDA device number {} exceeds number of available devices! Please change through environment variable $CUDA_DEVICE.'.format(devn))
+                raise RuntimeError('Requested CUDA device number {} exceeds' \
+                                   ' number of available devices!' \
+                                   ' Please change through environment' \
+                                   ' variable $CUDA_DEVICE.'.format(devn))
             else:
                 raise pycuda._driver.LogicError(e.message)
         except ImportError, e:
@@ -46,7 +50,8 @@ def optional_import ( modulename, shorthand=None ):
     else:
         try:
             globals()[shorthand] = imp.import_module(modulename)
-            logging.debug('Successfully imported module %s%s.' % (modulename, shorthandbit))
+            logging.debug('Successfully imported module %s%s.'
+                          % (modulename, shorthandbit))
             success = True
         except ImportError, e:
             if e.message == 'No module named '+modulename:
@@ -72,7 +77,11 @@ class pyTransientFstatMap(object):
 
     def __init__(self, N_t0Range, N_tauRange):
         self.F_mn = np.zeros((N_t0Range, N_tauRange), dtype=np.float32)
-        self.maxF   = float(-1.0) # Initializing to a negative value ensures that we always update at least once and hence return sane t0_d_ML, tau_d_ML even if there is only a single bin where F=0 happens.
+        # Initializing maxF to a negative value ensures
+        # that we always update at least once and hence return
+        # sane t0_d_ML, tau_d_ML
+        # even if there is only a single bin where F=0 happens.
+        self.maxF   = float(-1.0)
         self.t0_ML  = float(0.0)
         self.tau_ML = float(0.0)
 
@@ -100,26 +109,28 @@ def init_transient_fstat_map_features ( wantCuda=False, cudaDeviceName=None ):
 
     features = {}
 
-    have_lal = optional_import('lal')
-    have_lalpulsar = optional_import('lalpulsar')
+    have_lal = _optional_import('lal')
+    have_lalpulsar = _optional_import('lalpulsar')
     features['lal'] = have_lal and have_lalpulsar
 
     # import GPU features
-    have_pycuda = optional_import('pycuda')
-    have_pycuda_drv = optional_import('pycuda.driver', 'drv')
-    have_pycuda_gpuarray = optional_import('pycuda.gpuarray', 'gpuarray')
-    have_pycuda_tools = optional_import('pycuda.tools', 'cudatools')
-    have_pycuda_compiler = optional_import('pycuda.compiler', 'cudacomp')
-    features['pycuda'] = have_pycuda_drv and have_pycuda_gpuarray and have_pycuda_tools and have_pycuda_compiler
+    have_pycuda = _optional_import('pycuda')
+    have_pycuda_drv = _optional_import('pycuda.driver', 'drv')
+    have_pycuda_gpuarray = _optional_import('pycuda.gpuarray', 'gpuarray')
+    have_pycuda_tools = _optional_import('pycuda.tools', 'cudatools')
+    have_pycuda_compiler = _optional_import('pycuda.compiler', 'cudacomp')
+    features['pycuda'] = ( have_pycuda_drv and have_pycuda_gpuarray and
+                           have_pycuda_tools and have_pycuda_compiler )
 
     logging.debug('Got the following features for transient F-stat maps:')
     logging.debug(features)
 
     if wantCuda and features['pycuda']:
-        logging.debug('CUDA version: {}'.format(drv.get_version()))
+        logging.debug('CUDA version: '+'.'.join(map(str,drv.get_version())))
 
         drv.init()
-        logging.debug('Starting with default context, then checking all available devices...')
+        logging.debug('Starting with default pyCUDA context,' \
+                      ' then checking all available devices...')
         context0 = pycuda.tools.make_default_context()
 
         num_gpus = drv.Device.count()
@@ -131,7 +142,8 @@ def init_transient_fstat_map_features ( wantCuda=False, cudaDeviceName=None ):
             devn = drv.Device(n)
             devices.append(devn)
             devnames[n] = devn.name().replace(' ','-').replace('_','-')
-            logging.debug('device {}: model: {}, RAM: {}MB'.format(n,devnames[n],devn.total_memory()/(2.**20) ))
+            logging.debug('device {}: model: {}, RAM: {}MB'.format(
+                n, devnames[n], devn.total_memory()/(2.**20) ))
 
         if 'CUDA_DEVICE' in os.environ:
             devnum0 = int(os.environ['CUDA_DEVICE'])
@@ -141,14 +153,19 @@ def init_transient_fstat_map_features ( wantCuda=False, cudaDeviceName=None ):
         matchbit = ''
         if cudaDeviceName:
             # allow partial matches in device names
-            devmatches = [devidx for devidx, devname in enumerate(devnames) if cudaDeviceName in devname]
+            devmatches = [devidx for devidx, devname in enumerate(devnames)
+                          if cudaDeviceName in devname]
             if len(devmatches) == 0:
                 context0.detach()
-                raise RuntimeError('Requested CUDA device "{}" not found. Available devices: [{}]'.format(cudaDeviceName,','.join(devnames)))
+                raise RuntimeError('Requested CUDA device "{}" not found.' \
+                                   ' Available devices: [{}]'.format(
+                                   cudaDeviceName,','.join(devnames)))
             else:
                 devnum = devmatches[0]
                 if len(devmatches) > 1:
-                    logging.warning('Found {} CUDA devices matching name "{}". Choosing first one with index {}.'.format(len(devmatches),cudaDeviceName,devnum))
+                    logging.warning('Found {} CUDA devices matching name "{}".' \
+                                    ' Choosing first one with index {}.'.format(
+                                    len(devmatches),cudaDeviceName,devnum))
                 os.environ['CUDA_DEVICE'] = str(devnum)
                 matchbit = '(matched to user request "{}")'.format(cudaDeviceName)
         elif 'CUDA_DEVICE' in os.environ:
@@ -156,7 +173,9 @@ def init_transient_fstat_map_features ( wantCuda=False, cudaDeviceName=None ):
         else:
             devnum = 0
         devn = devices[devnum]
-        logging.info('Choosing CUDA device {}, of {} devices present: {}{}...'.format(devnum,num_gpus,devn.name(),matchbit))
+        logging.info('Choosing CUDA device {},' \
+                     ' of {} devices present: {}{}...'.format(
+                     devnum, num_gpus, devn.name(), matchbit))
         if devnum == devnum0:
             gpu_context = context0
         else:
@@ -164,23 +183,28 @@ def init_transient_fstat_map_features ( wantCuda=False, cudaDeviceName=None ):
             gpu_context = pycuda.tools.make_default_context()
             gpu_context.push()
 
-        logging.debug('Available GPU memory: {}/{} MB free'.format(drv.mem_get_info()[0]/(2.**20),drv.mem_get_info()[1]/(2.**20)))
+        _print_GPU_memory_MB('Available')
     else:
         gpu_context = None
 
     return features, gpu_context
 
 
-def call_compute_transient_fstat_map ( version, features, multiFstatAtoms=None, windowRange=None ):
+def call_compute_transient_fstat_map ( version,
+                                       features,
+                                       multiFstatAtoms=None,
+                                       windowRange=None ):
     '''Choose which version of the ComputeTransientFstatMap function to call.'''
 
     if version in fstatmap_versions:
         if features[version]:
             FstatMap = fstatmap_versions[version](multiFstatAtoms, windowRange)
         else:
-            raise Exception('Required module(s) for transient F-stat map method "{}" not available!'.format(version))
+            raise Exception('Required module(s) for transient F-stat map' \
+                            ' method "{}" not available!'.format(version))
     else:
-        raise Exception('Transient F-stat map method "{}" not implemented!'.format(version))
+        raise Exception('Transient F-stat map method "{}"' \
+                        ' not implemented!'.format(version))
 
     return FstatMap
 
@@ -215,12 +239,19 @@ def reshape_FstatAtomsVector ( atomsVector ):
     return atomsDict
 
 
-def get_absolute_kernel_path ( kernel ):
+def _get_absolute_kernel_path ( kernel ):
     pyfstatdir = os.path.dirname(os.path.abspath(os.path.realpath(__file__)))
     kernelfile = kernel + '.cu'
     return os.path.join(pyfstatdir,'pyCUDAkernels',kernelfile)
 
 
+def _print_GPU_memory_MB ( key ):
+    mem_free_MB = drv.mem_get_info()[0]/(2.**20)
+    mem_total_MB = drv.mem_get_info()[1]/(2.**20)
+    logging.debug('{} GPU memory: {:.4f} / {:.4f} MB free'.format(
+        key, mem_free_MB, mem_total_MB))
+
+
 def pycuda_compute_transient_fstat_map ( multiFstatAtoms, windowRange ):
     '''
     GPU version of the function to compute transient-window "F-statistic map"
@@ -237,7 +268,9 @@ def pycuda_compute_transient_fstat_map ( multiFstatAtoms, windowRange ):
     '''
 
     if ( windowRange.type >= lalpulsar.TRANSIENT_LAST ):
-        raise ValueError ('Unknown window-type ({}) passed as input. Allowed are [0,{}].'.format(windowRange.type, lalpulsar.TRANSIENT_LAST-1))
+        raise ValueError ('Unknown window-type ({}) passed as input.' \
+                          ' Allowed are [0,{}].'.format(
+                          windowRange.type, lalpulsar.TRANSIENT_LAST-1))
 
     # internal dict for search/setup parameters
     tCWparams = {}
@@ -264,13 +297,16 @@ def pycuda_compute_transient_fstat_map ( multiFstatAtoms, windowRange ):
     # actual data spans [t0_data, t0_data + tCWparams['numAtoms'] * TAtom]
     # in steps of TAtom
     tCWparams['t0_data'] = int(atoms.data[0].timestamp)
-    tCWparams['t1_data'] = int(atoms.data[tCWparams['numAtoms']-1].timestamp + tCWparams['TAtom'])
+    tCWparams['t1_data'] = int(atoms.data[tCWparams['numAtoms']-1].timestamp +
+                               tCWparams['TAtom'])
 
-    logging.debug('t0_data={:d}, t1_data={:d}'.format(tCWparams['t0_data'],
-                                                      tCWparams['t1_data']))
-    logging.debug('numAtoms={:d}, TAtom={:d}, TAtomHalf={:d}'.format(tCWparams['numAtoms'],
-                                                                     tCWparams['TAtom'],
-                                                                     TAtomHalf))
+    logging.debug('Transient F-stat map:' \
+                  ' t0_data={:d}, t1_data={:d}'.format(
+                  tCWparams['t0_data'], tCWparams['t1_data']))
+    logging.debug('Transient F-stat map:' \
+                  ' numAtoms={:d}, TAtom={:d},' \
+                  ' TAtomHalf={:d}'.format(
+                  tCWparams['numAtoms'], tCWparams['TAtom'], TAtomHalf))
 
     # special treatment of window_type = none
     # ==> replace by rectangular window spanning all the data
@@ -283,16 +319,21 @@ def pycuda_compute_transient_fstat_map ( multiFstatAtoms, windowRange ):
         windowRange.tauBand = 0;
         windowRange.dtau = tCWparams['TAtom'] # irrelevant
 
-    """ NOTE: indices {i,j} enumerate *actual* atoms and their timestamps t_i, while the
-    * indices {m,n} enumerate the full grid of values in [t0_min, t0_max]x[Tcoh_min, Tcoh_max] in
-    * steps of deltaT. This allows us to deal with gaps in the data in a transparent way.
+    """ NOTE: indices {i,j} enumerate *actual* atoms and their timestamps t_i,
+    * while the indices {m,n} enumerate the full grid of values
+    * in [t0_min, t0_max]x[Tcoh_min, Tcoh_max] in steps of deltaT.
+    * This allows us to deal with gaps in the data in a transparent way.
     *
-    * NOTE2: we operate on the 'binned' atoms returned from XLALmergeMultiFstatAtomsBinned(),
-    * which means we can safely assume all atoms to be lined up perfectly on a 'deltaT' binned grid.
+    * NOTE2: we operate on the 'binned' atoms returned
+    * from XLALmergeMultiFstatAtomsBinned(),
+    * which means we can safely assume all atoms to be lined up
+    * perfectly on a 'deltaT' binned grid.
     *
     * The mapping used will therefore be {i,j} -> {m,n}:
-    *   m = offs_i / deltaT = start-time offset from t0_min measured in deltaT
-    *   n = Tcoh_ij / deltaT = duration Tcoh_ij measured in deltaT,
+    *   m = offs_i / deltaT
+    *       start-time offset from t0_min measured in deltaT
+    *   n = Tcoh_ij / deltaT
+    *       duration Tcoh_ij measured in deltaT,
     *
     * where
     *   offs_i = t_i - t0_min
@@ -301,24 +342,31 @@ def pycuda_compute_transient_fstat_map ( multiFstatAtoms, windowRange ):
     """
 
     # We allocate a matrix {m x n} = t0Range * TcohRange elements
-    # covering the full timerange the transient window-range [t0,t0+t0Band]x[tau,tau+tauBand]
-    tCWparams['N_t0Range']  = int(np.floor( 1.0*windowRange.t0Band / windowRange.dt0 ) + 1)
-    tCWparams['N_tauRange'] = int(np.floor( 1.0*windowRange.tauBand / windowRange.dtau ) + 1)
-    FstatMap = pyTransientFstatMap ( tCWparams['N_t0Range'], tCWparams['N_tauRange'] )
-
-    logging.debug('N_t0Range={:d}, N_tauRange={:d}'.format(tCWparams['N_t0Range'],
-                                                           tCWparams['N_tauRange']))
+    # covering the full transient window-range [t0,t0+t0Band]x[tau,tau+tauBand]
+    tCWparams['N_t0Range']  = int(np.floor( 1.0*windowRange.t0Band /
+                                            windowRange.dt0 ) + 1)
+    tCWparams['N_tauRange'] = int(np.floor( 1.0*windowRange.tauBand /
+                                            windowRange.dtau ) + 1)
+    FstatMap = pyTransientFstatMap ( tCWparams['N_t0Range'],
+                                     tCWparams['N_tauRange'] )
+
+    logging.debug('Transient F-stat map:' \
+                  ' N_t0Range={:d}, N_tauRange={:d},' \
+                  ' total grid points: {:d}'.format(
+                  tCWparams['N_t0Range'], tCWparams['N_tauRange'],
+                  tCWparams['N_t0Range']*tCWparams['N_tauRange']))
 
     if ( windowRange.type == lalpulsar.TRANSIENT_RECTANGULAR ):
-        FstatMap.F_mn = pycuda_compute_transient_fstat_map_rect ( atomsInputMatrix,
-                                                                  windowRange,
-                                                                  tCWparams )
+        FstatMap.F_mn = pycuda_compute_transient_fstat_map_rect (
+            atomsInputMatrix, windowRange, tCWparams )
     elif ( windowRange.type == lalpulsar.TRANSIENT_EXPONENTIAL ):
-        FstatMap.F_mn = pycuda_compute_transient_fstat_map_exp ( atomsInputMatrix,
-                                                                 windowRange,
-                                                                 tCWparams )
+        FstatMap.F_mn = pycuda_compute_transient_fstat_map_exp (
+            atomsInputMatrix, windowRange, tCWparams )
     else:
-        raise ValueError('Invalid transient window type {} not in [{}, {}].'.format(windowRange.type, lalpulsar.TRANSIENT_NONE, lalpulsar.TRANSIENT_LAST-1))
+        raise ValueError('Invalid transient window type {}' \
+                         ' not in [{}, {}].'.format(
+                         windowRange.type, lalpulsar.TRANSIENT_NONE,
+                         lalpulsar.TRANSIENT_LAST-1))
 
     # out of loop: get max2F and ML estimates over the m x n matrix
     FstatMap.maxF = FstatMap.F_mn.max()
@@ -328,23 +376,32 @@ def pycuda_compute_transient_fstat_map ( multiFstatAtoms, windowRange ):
     FstatMap.t0_ML = windowRange.t0 + maxidx[0] * windowRange.dt0
     FstatMap.tau_ML = windowRange.tau + maxidx[1] * windowRange.dtau
 
-    logging.debug('Done computing transient F-stat map. maxF={}, t0_ML={}, tau_ML={}'.format(FstatMap.maxF , FstatMap.t0_ML, FstatMap.tau_ML))
+    logging.debug('Done computing transient F-stat map.' \
+                  ' maxF={:.4f}, t0_ML={}, tau_ML={}'.format(
+                  FstatMap.maxF, FstatMap.t0_ML, FstatMap.tau_ML))
 
     return FstatMap
 
 
-def pycuda_compute_transient_fstat_map_rect ( atomsInputMatrix, windowRange, tCWparams ):
-    '''only GPU-parallizing outer loop, keeping partial sums with memory in kernel'''
+def pycuda_compute_transient_fstat_map_rect ( atomsInputMatrix,
+                                              windowRange,
+                                              tCWparams ):
+    '''
+    only GPU-parallelizing outer loop,
+    keeping partial sums with memory in kernel
+    '''
 
     # gpu data setup and transfer
-    logging.debug('Initial GPU memory: {}/{} MB free'.format(drv.mem_get_info()[0]/(2.**20),drv.mem_get_info()[1]/(2.**20)))
+    _print_GPU_memory_MB('Initial')
     input_gpu = gpuarray.to_gpu ( atomsInputMatrix )
-    Fmn_gpu   = gpuarray.GPUArray ( (tCWparams['N_t0Range'],tCWparams['N_tauRange']), dtype=np.float32 )
-    logging.debug('GPU memory with input+output allocated: {}/{} MB free'.format(drv.mem_get_info()[0]/(2.**20), drv.mem_get_info()[1]/(2.**20)))
+    Fmn_gpu   = gpuarray.GPUArray ( (tCWparams['N_t0Range'],
+                                     tCWparams['N_tauRange']),
+                                     dtype=np.float32 )
+    _print_GPU_memory_MB('After input+output allocation:')
 
     # GPU kernel
     kernel = 'cudaTransientFstatRectWindow'
-    kernelfile = get_absolute_kernel_path ( kernel )
+    kernelfile = _get_absolute_kernel_path ( kernel )
     partial_Fstat_cuda_code = cudacomp.SourceModule(open(kernelfile,'r').read())
     partial_Fstat_cuda = partial_Fstat_cuda_code.get_function(kernel)
     partial_Fstat_cuda.prepare('PIIIIIIIIP')
@@ -354,32 +411,48 @@ def pycuda_compute_transient_fstat_map_rect ( atomsInputMatrix, windowRange, tCW
     blockCols = 1
     gridRows  = int(np.ceil(1.0*tCWparams['N_t0Range']/blockRows))
     gridCols  = 1
-    logging.debug('Looking to compute transient F-stat map over a N={}*{}={} (t0,tau) grid...'.format(tCWparams['N_t0Range'],tCWparams['N_tauRange'],tCWparams['N_t0Range']*tCWparams['N_tauRange']))
 
     # running the kernel
-    logging.debug('Calling kernel with a grid of {}*{}={} blocks of {}*{}={} threads each: {} total threads...'.format(gridRows, gridCols, gridRows*gridCols, blockRows, blockCols, blockRows*blockCols, gridRows*gridCols*blockRows*blockCols))
-    partial_Fstat_cuda.prepared_call ( (gridRows,gridCols), (blockRows,blockCols,1), input_gpu.gpudata, tCWparams['numAtoms'], tCWparams['TAtom'], tCWparams['t0_data'], windowRange.t0, windowRange.dt0, windowRange.tau, windowRange.dtau, tCWparams['N_tauRange'], Fmn_gpu.gpudata )
+    logging.debug('Calling pyCUDA kernel with a grid of {}*{}={} blocks' \
+                  ' of {}*{}={} threads each: {} total threads...'.format(
+                  gridRows, gridCols, gridRows*gridCols,
+                  blockRows, blockCols, blockRows*blockCols,
+                  gridRows*gridCols*blockRows*blockCols))
+    partial_Fstat_cuda.prepared_call ( (gridRows,gridCols),
+                                       (blockRows,blockCols,1),
+                                       input_gpu.gpudata,
+                                       tCWparams['numAtoms'],
+                                       tCWparams['TAtom'],
+                                       tCWparams['t0_data'],
+                                       windowRange.t0, windowRange.dt0,
+                                       windowRange.tau, windowRange.dtau,
+                                       tCWparams['N_tauRange'],
+                                       Fmn_gpu.gpudata )
 
     # return results to host
     F_mn = Fmn_gpu.get()
 
-    logging.debug('Final GPU memory: {}/{} MB free'.format(drv.mem_get_info()[0]/(2.**20),drv.mem_get_info()[1]/(2.**20)))
+    _print_GPU_memory_MB('Final')
 
     return F_mn
 
 
-def pycuda_compute_transient_fstat_map_exp ( atomsInputMatrix, windowRange, tCWparams ):
+def pycuda_compute_transient_fstat_map_exp ( atomsInputMatrix,
+                                             windowRange,
+                                             tCWparams ):
     '''exponential window, inner and outer loop GPU-parallelized'''
 
     # gpu data setup and transfer
-    logging.debug('Initial GPU memory: {}/{} MB free'.format(drv.mem_get_info()[0]/(2.**20),drv.mem_get_info()[1]/(2.**20)))
+    _print_GPU_memory_MB('Initial')
     input_gpu = gpuarray.to_gpu ( atomsInputMatrix )
-    Fmn_gpu   = gpuarray.GPUArray ( (tCWparams['N_t0Range'],tCWparams['N_tauRange']), dtype=np.float32 )
-    logging.debug('GPU memory with input+output allocated: {}/{} MB free'.format(drv.mem_get_info()[0]/(2.**20), drv.mem_get_info()[1]/(2.**20)))
+    Fmn_gpu   = gpuarray.GPUArray ( (tCWparams['N_t0Range'],
+                                     tCWparams['N_tauRange']),
+                                     dtype=np.float32 )
+    _print_GPU_memory_MB('After input+output allocation:')
 
     # GPU kernel
     kernel = 'cudaTransientFstatExpWindow'
-    kernelfile = get_absolute_kernel_path ( kernel )
+    kernelfile = _get_absolute_kernel_path ( kernel )
     partial_Fstat_cuda_code = cudacomp.SourceModule(open(kernelfile,'r').read())
     partial_Fstat_cuda = partial_Fstat_cuda_code.get_function(kernel)
     partial_Fstat_cuda.prepare('PIIIIIIIIIP')
@@ -389,15 +462,28 @@ def pycuda_compute_transient_fstat_map_exp ( atomsInputMatrix, windowRange, tCWp
     blockCols = min(32,tCWparams['N_tauRange'])
     gridRows  = int(np.ceil(1.0*tCWparams['N_t0Range']/blockRows))
     gridCols  = int(np.ceil(1.0*tCWparams['N_tauRange']/blockCols))
-    logging.debug('Looking to compute transient F-stat map over a N={}*{}={} (t0,tau) grid...'.format(tCWparams['N_t0Range'], tCWparams['N_tauRange'], tCWparams['N_t0Range']*tCWparams['N_tauRange']))
 
     # running the kernel
-    logging.debug('Calling kernel with a grid of {}*{}={} blocks of {}*{}={} threads each: {} total threads...'.format(gridRows, gridCols, gridRows*gridCols, blockRows, blockCols, blockRows*blockCols, gridRows*gridCols*blockRows*blockCols))
-    partial_Fstat_cuda.prepared_call ( (gridRows,gridCols), (blockRows,blockCols,1), input_gpu.gpudata, tCWparams['numAtoms'], tCWparams['TAtom'], tCWparams['t0_data'], windowRange.t0, windowRange.dt0, windowRange.tau, windowRange.dtau, tCWparams['N_t0Range'], tCWparams['N_tauRange'], Fmn_gpu.gpudata )
+    logging.debug('Calling kernel with a grid of {}*{}={} blocks' \
+                  ' of {}*{}={} threads each: {} total threads...'.format(
+                  gridRows, gridCols, gridRows*gridCols,
+                  blockRows, blockCols, blockRows*blockCols,
+                  gridRows*gridCols*blockRows*blockCols))
+    partial_Fstat_cuda.prepared_call ( (gridRows,gridCols),
+                                       (blockRows,blockCols,1),
+                                       input_gpu.gpudata,
+                                       tCWparams['numAtoms'],
+                                       tCWparams['TAtom'],
+                                       tCWparams['t0_data'],
+                                       windowRange.t0, windowRange.dt0,
+                                       windowRange.tau, windowRange.dtau,
+                                       tCWparams['N_t0Range'],
+                                       tCWparams['N_tauRange'],
+                                       Fmn_gpu.gpudata )
 
     # return results to host
     F_mn = Fmn_gpu.get()
 
-    logging.debug('Final GPU memory: {}/{} MB free'.format(drv.mem_get_info()[0]/(2.**20),drv.mem_get_info()[1]/(2.**20)))
+    _print_GPU_memory_MB('Final')
 
     return F_mn
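
A minimal usage sketch of the helper API touched by this patch, for orientation
only: it assumes that F-stat atoms (multiFstatAtoms) and a lalpulsar
transientWindowRange_t (windowRange) have already been prepared by a
ComputeFstat setup as in core.py, and that the module is importable as
pyfstat.tcw_fstat_map_funcs; the variable names are illustrative and not part
of the patch.

    import numpy as np
    import pyfstat.tcw_fstat_map_funcs as tcw

    # Probe which transient F-stat map backends are importable and,
    # if requested, set up a pyCUDA context on the chosen device.
    features, gpu_context = tcw.init_transient_fstat_map_features(
        wantCuda=True, cudaDeviceName=None)
    version = 'pycuda' if features['pycuda'] else 'lal'

    # multiFstatAtoms and windowRange are assumed to come from an
    # existing ComputeFstat search (see core.py above).
    FstatMap = tcw.call_compute_transient_fstat_map(
        version, features, multiFstatAtoms=multiFstatAtoms,
        windowRange=windowRange)

    # The 'lal' backend wraps F_mn in a gsl matrix, the pycuda backend
    # returns a plain numpy array (see grid_based_searches.py above).
    F_mn = FstatMap.F_mn.data if version == 'lal' else FstatMap.F_mn
    max_twoF = 2 * np.max(F_mn)

    # Release the CUDA context if one was created
    # (detach() as used in init_transient_fstat_map_features above).
    if gpu_context:
        gpu_context.detach()

The pycuda path parallelizes the evaluation of the (t0, tau) grid on the GPU,
while the 'lal' path remains the fallback whenever pycuda or a GPU is
unavailable.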