Commit 6de7b834 authored by David Keitel

tCW pyCUDA: minor code cleanup

 -make some functions internal in tcw_fstat_map_funcs.py
 -clean up debug messages
 -clean up code formatting
parent 691dae29
@@ -632,8 +632,8 @@ class ComputeFstat(BaseSearchClass):
             self.windowRange.dt0 = self.Tsft
             self.windowRange.dtau = self.Tsft
-            # special treatment of window_type = none ==> replace by
-            # rectangular window spanning all the data
+            # special treatment of window_type = none
+            # ==> replace by rectangular window spanning all the data
             if self.windowRange.type == lalpulsar.TRANSIENT_NONE:
                 self.windowRange.t0 = int(self.minStartTime)
                 self.windowRange.t0Band = 0
@@ -661,6 +661,7 @@ class ComputeFstat(BaseSearchClass):
             if self.dtau:
                 self.windowRange.dtau = self.dtau
+            logging.info('Initialising transient FstatMap features...')
             self.tCWFstatMapFeatures, self.gpu_context = (
                 tcw.init_transient_fstat_map_features(
                     self.tCWFstatMapVersion == 'pycuda', self.cudaDeviceName))
@@ -707,15 +708,6 @@ class ComputeFstat(BaseSearchClass):
             # F-stat computation
             self.windowRange.tau = int(2*self.Tsft)
-        # logging.debug(
-        #     'Calling "%s" version of ComputeTransientFstatMap() with\
-        #     windowRange: (type=%d (%s), t0=%f, t0Band=%f, dt0=%f, tau=%f,\
-        #     tauBand=%f, dtau=%f)...' % (
-        #         self.tCWFstatMapVersion, self.windowRange.type,
-        #         self.transientWindowType, self.windowRange.t0,
-        #         self.windowRange.t0Band, self.windowRange.dt0,
-        #         self.windowRange.tau, self.windowRange.tauBand,
-        #         self.windowRange.dtau))
         self.FstatMap = tcw.call_compute_transient_fstat_map(
             self.tCWFstatMapVersion, self.tCWFstatMapFeatures,
             self.FstatResults.multiFatoms[0], self.windowRange)
@@ -724,13 +716,6 @@ class ComputeFstat(BaseSearchClass):
         else:
             F_mn = self.FstatMap.F_mn
-        # logging.debug('maxF: {}'.format(self.FstatMap.maxF))
-        # logging.debug('t0_ML: %ds=T0+%fd' % (
-        #     self.FstatMap.t0_ML, (self.FstatMap.t0_ML-tstart)/(3600.*24.)))
-        # logging.debug('tau_ML: %ds=%fd' % (
-        #     self.FstatMap.tau_ML, self.FstatMap.tau_ML/(3600.*24.)))
-        # logging.debug('F_mn: {}'.format(F_mn))
         twoF = 2*np.max(F_mn)
         if self.BSGL is False:
             if np.isnan(twoF):
...
@@ -435,6 +435,10 @@ class TransientGridSearch(GridSearch):
         self.inititate_search_object()
         data = []
+        if self.outputTransientFstatMap:
+            tCWfilebase = os.path.splitext(self.out_file)[0] + '_tCW_'
+            logging.info('Will save per-Doppler Fstatmap' \
+                         ' results to {}*.dat'.format(tCWfilebase))
         for vals in tqdm(self.input_data):
             detstat = self.search.get_det_stat(*vals)
             windowRange = getattr(self.search, 'windowRange', None)
@@ -446,11 +450,18 @@ class TransientGridSearch(GridSearch):
                 else:
                     F_mn = FstatMap.F_mn
                 if self.outputTransientFstatMap:
-                    tCWfile = os.path.splitext(self.out_file)[0]+'_tCW_%.16f_%.16f_%.16f_%.16g_%.16g.dat' % (vals[2],vals[5],vals[6],vals[3],vals[4]) # freq alpha delta f1dot f2dot
+                    # per-Doppler filename convention:
+                    # freq alpha delta f1dot f2dot
+                    tCWfile = ( tCWfilebase
+                                + '%.16f_%.16f_%.16f_%.16g_%.16g.dat' %
+                                (vals[2],vals[5],vals[6],vals[3],vals[4]) )
                     if self.tCWFstatMapVersion == 'lal':
                         fo = lal.FileOpen(tCWfile, 'w')
-                        lalpulsar.write_transientFstatMap_to_fp ( fo, FstatMap, windowRange, None )
-                        del fo # instead of lal.FileClose() which is not SWIG-exported
+                        lalpulsar.write_transientFstatMap_to_fp (
+                            fo, FstatMap, windowRange, None )
+                        # instead of lal.FileClose(),
+                        # which is not SWIG-exported:
+                        del fo
                     else:
                         self.write_F_mn ( tCWfile, F_mn, windowRange)
                 maxidx = np.unravel_index(F_mn.argmax(), F_mn.shape)
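For illustration, a minimal Python sketch of the per-Doppler filename convention used in this hunk; the standalone build_tCW_filename helper and the example values are made up for this note and are not part of the commit:

import os

def build_tCW_filename(out_file, freq, alpha, delta, f1dot, f2dot):
    # illustrative helper: same convention as in the diff above,
    # '<out_file stem>_tCW_<freq>_<alpha>_<delta>_<f1dot>_<f2dot>.dat'
    tCWfilebase = os.path.splitext(out_file)[0] + '_tCW_'
    return (tCWfilebase
            + '%.16f_%.16f_%.16f_%.16g_%.16g.dat'
            % (freq, alpha, delta, f1dot, f2dot))

# example with hypothetical values:
print(build_tCW_filename('search_results.txt', 100.5, 1.2, -0.3, -1e-10, 0.0))
# -> search_results_tCW_100.5000000000000000_1.2000000000000000_-0.3000000000000000_-1e-10_0.dat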
...
@@ -26,7 +26,8 @@ __global__ void cudaTransientFstatExpWindow ( float *input,
   /* compute Fstat-atom index i_t0 in [0, numAtoms) */
   unsigned int TAtomHalf = TAtom/2; // integer division
   unsigned int t0 = win_t0 + m * win_dt0;
-  int i_tmp = ( t0 - t0_data + TAtomHalf ) / TAtom; // integer round: floor(x+0.5)
+  /* integer round: floor(x+0.5) */
+  int i_tmp = ( t0 - t0_data + TAtomHalf ) / TAtom;
   if ( i_tmp < 0 ) {
     i_tmp = 0;
   }
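The relocated comment refers to the usual integer-arithmetic trick for rounding a non-negative ratio to the nearest atom index. A small Python sketch of the same idea, with made-up values and variable names mirroring the kernel:

import math

TAtom = 1800                      # SFT length in seconds (example value)
t0_data = 1000000000              # start of the data (example value)
t0 = 1000000000 + 7 * 1800 + 950  # some window start time (example value)

# kernel-style integer rounding: floor(x + 0.5) done with integer division
i_t0_kernel = (t0 - t0_data + TAtom // 2) // TAtom

# reference computation in floating point
i_t0_float = math.floor((t0 - t0_data) / TAtom + 0.5)

assert i_t0_kernel == i_t0_float == 8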
@@ -35,22 +36,26 @@ __global__ void cudaTransientFstatExpWindow ( float *input,
     i_t0 = numAtoms - 1;
   }
-  /* translate n into an atoms end-index for this search interval [t0, t0+Tcoh],
+  /* translate n into an atoms end-index
+   * for this search interval [t0, t0+Tcoh],
    * giving the index range of atoms to sum over
    */
   unsigned int tau = win_tau + n * win_dtau;
   /* get end-time t1 of this transient-window search
    * for given tau, what Tcoh should the exponential window cover?
-   * for speed reasons we want to truncate Tcoh = tau * TRANSIENT_EXP_EFOLDING
+   * for speed reasons we want to truncate
+   * Tcoh = tau * TRANSIENT_EXP_EFOLDING
    * with the e-folding factor chosen such that the window-value
    * is practically negligible after that, where it will be set to 0
    */
   // unsigned int t1 = lround( win_t0 + TRANSIENT_EXP_EFOLDING * win_tau);
   unsigned int t1 = t0 + TRANSIENT_EXP_EFOLDING * tau;
-  /* compute window end-time Fstat-atom index i_t1 in [0, numAtoms) */
-  i_tmp = ( t1 - t0_data + TAtomHalf ) / TAtom - 1; // integer round: floor(x+0.5)
+  /* compute window end-time Fstat-atom index i_t1 in [0, numAtoms)
+   * using integer round: floor(x+0.5)
+   */
+  i_tmp = ( t1 - t0_data + TAtomHalf ) / TAtom - 1;
   if ( i_tmp < 0 ) {
     i_tmp = 0;
   }
...
@@ -20,7 +20,8 @@ __global__ void cudaTransientFstatRectWindow ( float *input,
   /* compute Fstat-atom index i_t0 in [0, numAtoms) */
   unsigned int TAtomHalf = TAtom/2; // integer division
   unsigned int t0 = win_t0 + m * win_dt0;
-  int i_tmp = ( t0 - t0_data + TAtomHalf ) / TAtom; // integer round: floor(x+0.5)
+  /* integer round: floor(x+0.5) */
+  int i_tmp = ( t0 - t0_data + TAtomHalf ) / TAtom;
   if ( i_tmp < 0 ) {
     i_tmp = 0;
   }
@@ -46,7 +47,8 @@ __global__ void cudaTransientFstatRectWindow ( float *input,
   if ( (m < N_tauRange) && (n < N_tauRange) ) {
-    /* translate n into an atoms end-index for this search interval [t0, t0+Tcoh],
+    /* translate n into an atoms end-index
+     * for this search interval [t0, t0+Tcoh],
      * giving the index range of atoms to sum over
      */
     unsigned int tau = win_tau + n * win_dtau;
@@ -54,8 +56,10 @@ __global__ void cudaTransientFstatRectWindow ( float *input,
     /* get end-time t1 of this transient-window search */
     unsigned int t1 = t0 + tau;
-    /* compute window end-time Fstat-atom index i_t1 in [0, numAtoms) */
-    i_tmp = ( t1 - t0_data + TAtomHalf ) / TAtom - 1; // integer round: floor(x+0.5)
+    /* compute window end-time Fstat-atom index i_t1 in [0, numAtoms)
+     * using integer round: floor(x+0.5)
+     */
+    i_tmp = ( t1 - t0_data + TAtomHalf ) / TAtom - 1;
     if ( i_tmp < 0 ) {
       i_tmp = 0;
     }
@@ -72,7 +76,8 @@ __global__ void cudaTransientFstatRectWindow ( float *input,
       /* sum up atoms,
        * special optimiziation in the rectangular-window case:
        * just add on to previous tau values,
-       * ie re-use the sum over [i_t0, i_t1_last] from the pevious tau-loop iteration
+       * ie re-use the sum over [i_t0, i_t1_last]
+       * from the pevious tau-loop iteration
        */
       Ad += input[i*input_cols+0]; // a2_alpha
       Bd += input[i*input_cols+1]; // b2_alpha
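The comment reflowed here describes the rectangular-window optimisation of re-using the partial sum from the previous tau iteration instead of recomputing it from scratch. A toy Python sketch of that idea (the array and the index values are made up):

import numpy as np

# toy stand-in for one column of the atoms matrix (e.g. a2_alpha values)
atoms = np.random.rand(100).astype(np.float32)

i_t0 = 10                        # fixed start index for this t0 (example)
end_indices = [20, 30, 40, 50]   # i_t1 for successive tau steps (example)

# naive: recompute the full sum for every tau
naive = [atoms[i_t0:i_t1 + 1].sum() for i_t1 in end_indices]

# rectangular-window trick: keep a running sum and only add the new atoms
running, i_t1_last, incremental = 0.0, i_t0 - 1, []
for i_t1 in end_indices:
    running += atoms[i_t1_last + 1:i_t1 + 1].sum()
    i_t1_last = i_t1
    incremental.append(running)

assert np.allclose(naive, incremental)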
...
@@ -9,7 +9,7 @@ import logging
 import importlib as imp
-def optional_import ( modulename, shorthand=None ):
+def _optional_import ( modulename, shorthand=None ):
     '''
     Import a module/submodule only if it's available.
@@ -29,12 +29,16 @@ def optional_import ( modulename, shorthand=None ):
     if('pycuda' in sys.modules):
         try:
             globals()[shorthand] = imp.import_module(modulename)
-            logging.debug('Successfully imported module %s%s.' % (modulename, shorthandbit))
+            logging.debug('Successfully imported module %s%s.'
+                          % (modulename, shorthandbit))
             success = True
         except pycuda._driver.LogicError, e:
             if e.message == 'cuDeviceGet failed: invalid device ordinal':
                 devn = int(os.environ['CUDA_DEVICE'])
-                raise RuntimeError('Requested CUDA device number {} exceeds number of available devices! Please change through environment variable $CUDA_DEVICE.'.format(devn))
+                raise RuntimeError('Requested CUDA device number {} exceeds' \
+                                   ' number of available devices!' \
+                                   ' Please change through environment' \
+                                   ' variable $CUDA_DEVICE.'.format(devn))
             else:
                 raise pycuda._driver.LogicError(e.message)
         except ImportError, e:
@@ -46,7 +50,8 @@ def optional_import ( modulename, shorthand=None ):
     else:
         try:
             globals()[shorthand] = imp.import_module(modulename)
-            logging.debug('Successfully imported module %s%s.' % (modulename, shorthandbit))
+            logging.debug('Successfully imported module %s%s.'
+                          % (modulename, shorthandbit))
             success = True
         except ImportError, e:
             if e.message == 'No module named '+modulename:
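The _optional_import helper made internal in this commit follows a common optional-dependency pattern. As a rough Python 3 sketch of the same idea (the original module is Python 2, and this standalone optional_import function is illustrative only):

import importlib
import logging

def optional_import(modulename, shorthand=None):
    """Try to import a module; return True on success, False if unavailable."""
    name = shorthand or modulename
    try:
        # bind the module into globals() under its shorthand, as in the diff above
        globals()[name] = importlib.import_module(modulename)
        logging.debug('Successfully imported module %s as %s.', modulename, name)
        return True
    except ImportError:
        logging.debug('Failed to import module %s.', modulename)
        return False

have_numpy = optional_import('numpy', 'np')   # example usage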
@@ -72,7 +77,11 @@ class pyTransientFstatMap(object):
     def __init__(self, N_t0Range, N_tauRange):
         self.F_mn = np.zeros((N_t0Range, N_tauRange), dtype=np.float32)
-        self.maxF = float(-1.0) # Initializing to a negative value ensures that we always update at least once and hence return sane t0_d_ML, tau_d_ML even if there is only a single bin where F=0 happens.
+        # Initializing maxF to a negative value ensures
+        # that we always update at least once and hence return
+        # sane t0_d_ML, tau_d_ML
+        # even if there is only a single bin where F=0 happens.
+        self.maxF = float(-1.0)
         self.t0_ML = float(0.0)
         self.tau_ML = float(0.0)
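The reformatted comment explains why maxF starts out negative; a toy loop illustrating that sentinel logic (this is not the library's actual maximisation code, which on the pycuda path simply calls F_mn.max()):

import numpy as np

F_mn = np.zeros((1, 1), dtype=np.float32)   # degenerate map: a single bin with F=0

maxF = -1.0          # sentinel: any real F value, including 0, beats it
t0_idx = tau_idx = None
for (m, n), F in np.ndenumerate(F_mn):
    if F > maxF:     # with a start value of 0.0 this update would never fire here
        maxF, t0_idx, tau_idx = F, m, n

assert (maxF, t0_idx, tau_idx) == (0.0, 0, 0)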
@@ -100,26 +109,28 @@ def init_transient_fstat_map_features ( wantCuda=False, cudaDeviceName=None ):
     features = {}
-    have_lal = optional_import('lal')
-    have_lalpulsar = optional_import('lalpulsar')
+    have_lal = _optional_import('lal')
+    have_lalpulsar = _optional_import('lalpulsar')
     features['lal'] = have_lal and have_lalpulsar
     # import GPU features
-    have_pycuda = optional_import('pycuda')
-    have_pycuda_drv = optional_import('pycuda.driver', 'drv')
-    have_pycuda_gpuarray = optional_import('pycuda.gpuarray', 'gpuarray')
-    have_pycuda_tools = optional_import('pycuda.tools', 'cudatools')
-    have_pycuda_compiler = optional_import('pycuda.compiler', 'cudacomp')
-    features['pycuda'] = have_pycuda_drv and have_pycuda_gpuarray and have_pycuda_tools and have_pycuda_compiler
+    have_pycuda = _optional_import('pycuda')
+    have_pycuda_drv = _optional_import('pycuda.driver', 'drv')
+    have_pycuda_gpuarray = _optional_import('pycuda.gpuarray', 'gpuarray')
+    have_pycuda_tools = _optional_import('pycuda.tools', 'cudatools')
+    have_pycuda_compiler = _optional_import('pycuda.compiler', 'cudacomp')
+    features['pycuda'] = ( have_pycuda_drv and have_pycuda_gpuarray and
+                           have_pycuda_tools and have_pycuda_compiler )
     logging.debug('Got the following features for transient F-stat maps:')
     logging.debug(features)
     if wantCuda and features['pycuda']:
-        logging.debug('CUDA version: {}'.format(drv.get_version()))
+        logging.debug('CUDA version: '+'.'.join(map(str,drv.get_version())))
         drv.init()
-        logging.debug('Starting with default context, then checking all available devices...')
+        logging.debug('Starting with default pyCUDA context,' \
+                      ' then checking all available devices...')
         context0 = pycuda.tools.make_default_context()
         num_gpus = drv.Device.count()
@@ -131,7 +142,8 @@ def init_transient_fstat_map_features ( wantCuda=False, cudaDeviceName=None ):
             devn = drv.Device(n)
             devices.append(devn)
             devnames[n] = devn.name().replace(' ','-').replace('_','-')
-            logging.debug('device {}: model: {}, RAM: {}MB'.format(n,devnames[n],devn.total_memory()/(2.**20) ))
+            logging.debug('device {}: model: {}, RAM: {}MB'.format(
+                n, devnames[n], devn.total_memory()/(2.**20) ))
         if 'CUDA_DEVICE' in os.environ:
             devnum0 = int(os.environ['CUDA_DEVICE'])
@@ -141,14 +153,19 @@ def init_transient_fstat_map_features ( wantCuda=False, cudaDeviceName=None ):
         matchbit = ''
         if cudaDeviceName:
             # allow partial matches in device names
-            devmatches = [devidx for devidx, devname in enumerate(devnames) if cudaDeviceName in devname]
+            devmatches = [devidx for devidx, devname in enumerate(devnames)
+                          if cudaDeviceName in devname]
             if len(devmatches) == 0:
                 context0.detach()
-                raise RuntimeError('Requested CUDA device "{}" not found. Available devices: [{}]'.format(cudaDeviceName,','.join(devnames)))
+                raise RuntimeError('Requested CUDA device "{}" not found.' \
+                                   ' Available devices: [{}]'.format(
+                                   cudaDeviceName,','.join(devnames)))
             else:
                 devnum = devmatches[0]
                 if len(devmatches) > 1:
-                    logging.warning('Found {} CUDA devices matching name "{}". Choosing first one with index {}.'.format(len(devmatches),cudaDeviceName,devnum))
+                    logging.warning('Found {} CUDA devices matching name "{}".' \
+                                    ' Choosing first one with index {}.'.format(
+                                    len(devmatches),cudaDeviceName,devnum))
             os.environ['CUDA_DEVICE'] = str(devnum)
             matchbit = '(matched to user request "{}")'.format(cudaDeviceName)
         elif 'CUDA_DEVICE' in os.environ:
@@ -156,7 +173,9 @@ def init_transient_fstat_map_features ( wantCuda=False, cudaDeviceName=None ):
         else:
             devnum = 0
         devn = devices[devnum]
-        logging.info('Choosing CUDA device {}, of {} devices present: {}{}...'.format(devnum,num_gpus,devn.name(),matchbit))
+        logging.info('Choosing CUDA device {},' \
+                     ' of {} devices present: {}{}...'.format(
+                     devnum, num_gpus, devn.name(), matchbit))
         if devnum == devnum0:
             gpu_context = context0
         else:
@@ -164,23 +183,28 @@ def init_transient_fstat_map_features ( wantCuda=False, cudaDeviceName=None ):
             gpu_context = pycuda.tools.make_default_context()
             gpu_context.push()
-        logging.debug('Available GPU memory: {}/{} MB free'.format(drv.mem_get_info()[0]/(2.**20),drv.mem_get_info()[1]/(2.**20)))
+        _print_GPU_memory_MB('Available')
     else:
         gpu_context = None
     return features, gpu_context
 
-def call_compute_transient_fstat_map ( version, features, multiFstatAtoms=None, windowRange=None ):
+def call_compute_transient_fstat_map ( version,
+                                       features,
+                                       multiFstatAtoms=None,
+                                       windowRange=None ):
     '''Choose which version of the ComputeTransientFstatMap function to call.'''
     if version in fstatmap_versions:
         if features[version]:
             FstatMap = fstatmap_versions[version](multiFstatAtoms, windowRange)
         else:
-            raise Exception('Required module(s) for transient F-stat map method "{}" not available!'.format(version))
+            raise Exception('Required module(s) for transient F-stat map' \
+                            ' method "{}" not available!'.format(version))
     else:
-        raise Exception('Transient F-stat map method "{}" not implemented!'.format(version))
+        raise Exception('Transient F-stat map method "{}"' \
+                        ' not implemented!'.format(version))
     return FstatMap
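Taken together with the ComputeFstat hunks further up, the typical calling sequence might be wrapped as sketched below. The compute_tcw_map wrapper is illustrative only, it assumes the module is importable as pyfstat.tcw_fstat_map_funcs, and the multiFstatAtoms/windowRange arguments are assumed to be lalpulsar objects obtained elsewhere:

import pyfstat.tcw_fstat_map_funcs as tcw

def compute_tcw_map(multiFstatAtoms, windowRange, prefer_gpu=True):
    """Outline of the feature-detection plus dispatch sequence shown in the diff."""
    features, gpu_context = tcw.init_transient_fstat_map_features(
        prefer_gpu, None)   # (wantCuda, cudaDeviceName)
    version = 'pycuda' if (prefer_gpu and features['pycuda']) else 'lal'
    FstatMap = tcw.call_compute_transient_fstat_map(
        version, features, multiFstatAtoms, windowRange)
    if gpu_context is not None:
        # release the CUDA context if one was created (pycuda contexts expose detach())
        gpu_context.detach()
    return FstatMap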
@@ -215,12 +239,19 @@ def reshape_FstatAtomsVector ( atomsVector ):
     return atomsDict
 
-def get_absolute_kernel_path ( kernel ):
+def _get_absolute_kernel_path ( kernel ):
     pyfstatdir = os.path.dirname(os.path.abspath(os.path.realpath(__file__)))
     kernelfile = kernel + '.cu'
     return os.path.join(pyfstatdir,'pyCUDAkernels',kernelfile)
 
+def _print_GPU_memory_MB ( key ):
+    mem_used_MB  = drv.mem_get_info()[0]/(2.**20)
+    mem_total_MB = drv.mem_get_info()[1]/(2.**20)
+    logging.debug('{} GPU memory: {:.4f} / {:.4f} MB free'.format(
+        key, mem_used_MB, mem_total_MB))
 
 def pycuda_compute_transient_fstat_map ( multiFstatAtoms, windowRange ):
     '''
     GPU version of the function to compute transient-window "F-statistic map"
@@ -237,7 +268,9 @@ def pycuda_compute_transient_fstat_map ( multiFstatAtoms, windowRange ):
     '''
     if ( windowRange.type >= lalpulsar.TRANSIENT_LAST ):
-        raise ValueError ('Unknown window-type ({}) passed as input. Allowed are [0,{}].'.format(windowRange.type, lalpulsar.TRANSIENT_LAST-1))
+        raise ValueError ('Unknown window-type ({}) passed as input.' \
+                          ' Allowed are [0,{}].'.format(
+                          windowRange.type, lalpulsar.TRANSIENT_LAST-1))
     # internal dict for search/setup parameters
     tCWparams = {}
@@ -264,13 +297,16 @@ def pycuda_compute_transient_fstat_map ( multiFstatAtoms, windowRange ):
     # actual data spans [t0_data, t0_data + tCWparams['numAtoms'] * TAtom]
     # in steps of TAtom
     tCWparams['t0_data'] = int(atoms.data[0].timestamp)
-    tCWparams['t1_data'] = int(atoms.data[tCWparams['numAtoms']-1].timestamp + tCWparams['TAtom'])
+    tCWparams['t1_data'] = int(atoms.data[tCWparams['numAtoms']-1].timestamp
+                               + tCWparams['TAtom'])
-    logging.debug('t0_data={:d}, t1_data={:d}'.format(tCWparams['t0_data'],
-                                                      tCWparams['t1_data']))
-    logging.debug('numAtoms={:d}, TAtom={:d}, TAtomHalf={:d}'.format(tCWparams['numAtoms'],
-                                                                     tCWparams['TAtom'],
-                                                                     TAtomHalf))
+    logging.debug('Transient F-stat map:' \
+                  ' t0_data={:d}, t1_data={:d}'.format(
+                  tCWparams['t0_data'], tCWparams['t1_data']))
+    logging.debug('Transient F-stat map:' \
+                  ' numAtoms={:d}, TAtom={:d},' \
+                  ' TAtomHalf={:d}'.format(
+                  tCWparams['numAtoms'], tCWparams['TAtom'], TAtomHalf))
     # special treatment of window_type = none
     # ==> replace by rectangular window spanning all the data
@@ -283,16 +319,21 @@ def pycuda_compute_transient_fstat_map ( multiFstatAtoms, windowRange ):
         windowRange.tauBand = 0;
         windowRange.dtau = tCWparams['TAtom'] # irrelevant
-    """ NOTE: indices {i,j} enumerate *actual* atoms and their timestamps t_i, while the
-     * indices {m,n} enumerate the full grid of values in [t0_min, t0_max]x[Tcoh_min, Tcoh_max] in
-     * steps of deltaT. This allows us to deal with gaps in the data in a transparent way.
+    """ NOTE: indices {i,j} enumerate *actual* atoms and their timestamps t_i,
+     * while the indices {m,n} enumerate the full grid of values
+     * in [t0_min, t0_max]x[Tcoh_min, Tcoh_max] in steps of deltaT.
+     * This allows us to deal with gaps in the data in a transparent way.
      *
-     * NOTE2: we operate on the 'binned' atoms returned from XLALmergeMultiFstatAtomsBinned(),
-     * which means we can safely assume all atoms to be lined up perfectly on a 'deltaT' binned grid.
+     * NOTE2: we operate on the 'binned' atoms returned
+     * from XLALmergeMultiFstatAtomsBinned(),
+     * which means we can safely assume all atoms to be lined up
+     * perfectly on a 'deltaT' binned grid.
      *
      * The mapping used will therefore be {i,j} -> {m,n}:
-     *   m = offs_i / deltaT = start-time offset from t0_min measured in deltaT
-     *   n = Tcoh_ij / deltaT = duration Tcoh_ij measured in deltaT,
+     *   m = offs_i / deltaT
+     *       start-time offset from t0_min measured in deltaT
+     *   n = Tcoh_ij / deltaT
+     *       duration Tcoh_ij measured in deltaT,
      *
      * where
      *   offs_i = t_i - t0_min
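A toy numerical illustration of the {i,j} -> {m,n} mapping described in the docstring reflowed above, with made-up values:

deltaT = 1800                       # atom spacing in seconds (example)
t0_min = 1000000000                 # earliest window start time (example)

t_i = t0_min + 4 * deltaT           # start time of some atom i
Tcoh_ij = 10 * deltaT               # coherence time between atoms i and j

offs_i = t_i - t0_min
m = offs_i // deltaT                # start-time offset measured in deltaT
n = Tcoh_ij // deltaT               # duration measured in deltaT

assert (m, n) == (4, 10)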
@@ -301,24 +342,31 @@ def pycuda_compute_transient_fstat_map ( multiFstatAtoms, windowRange ):
     """
     # We allocate a matrix {m x n} = t0Range * TcohRange elements
-    # covering the full timerange the transient window-range [t0,t0+t0Band]x[tau,tau+tauBand]
-    tCWparams['N_t0Range'] = int(np.floor( 1.0*windowRange.t0Band / windowRange.dt0 ) + 1)
-    tCWparams['N_tauRange'] = int(np.floor( 1.0*windowRange.tauBand / windowRange.dtau ) + 1)
-    FstatMap = pyTransientFstatMap ( tCWparams['N_t0Range'], tCWparams['N_tauRange'] )
-    logging.debug('N_t0Range={:d}, N_tauRange={:d}'.format(tCWparams['N_t0Range'],
-                                                           tCWparams['N_tauRange']))
+    # covering the full transient window-range [t0,t0+t0Band]x[tau,tau+tauBand]
+    tCWparams['N_t0Range'] = int(np.floor( 1.0*windowRange.t0Band /
+                                           windowRange.dt0 ) + 1)
+    tCWparams['N_tauRange'] = int(np.floor( 1.0*windowRange.tauBand /
+                                            windowRange.dtau ) + 1)
+    FstatMap = pyTransientFstatMap ( tCWparams['N_t0Range'],
+                                     tCWparams['N_tauRange'] )
+    logging.debug('Transient F-stat map:' \
+                  ' N_t0Range={:d}, N_tauRange={:d},' \
+                  ' total grid points: {:d}'.format(
+                  tCWparams['N_t0Range'], tCWparams['N_tauRange'],
+                  tCWparams['N_t0Range']*tCWparams['N_tauRange']))
     if ( windowRange.type == lalpulsar.TRANSIENT_RECTANGULAR ):
-        FstatMap.F_mn = pycuda_compute_transient_fstat_map_rect ( atomsInputMatrix,
-                                                                  windowRange,
-                                                                  tCWparams )
+        FstatMap.F_mn = pycuda_compute_transient_fstat_map_rect (
+            atomsInputMatrix, windowRange, tCWparams )
     elif ( windowRange.type == lalpulsar.TRANSIENT_EXPONENTIAL ):
-        FstatMap.F_mn = pycuda_compute_transient_fstat_map_exp ( atomsInputMatrix,
-                                                                 windowRange,
-                                                                 tCWparams )
+        FstatMap.F_mn = pycuda_compute_transient_fstat_map_exp (
+            atomsInputMatrix, windowRange, tCWparams )
     else:
-        raise ValueError('Invalid transient window type {} not in [{}, {}].'.format(windowRange.type, lalpulsar.TRANSIENT_NONE, lalpulsar.TRANSIENT_LAST-1))
+        raise ValueError('Invalid transient window type {}' \
+                         ' not in [{}, {}].'.format(
+                         windowRange.type, lalpulsar.TRANSIENT_NONE,
+                         lalpulsar.TRANSIENT_LAST-1))
     # out of loop: get max2F and ML estimates over the m x n matrix
     FstatMap.maxF = FstatMap.F_mn.max()
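The grid dimensions follow from the band and step sizes as floor(band/step)+1; a short Python check of that arithmetic with made-up window-range values:

import numpy as np

# Example window-range values (made up): a 10-day t0 band stepped in 0.5 days,
# and a 5-day tau band stepped in 0.25 days.
t0Band, dt0 = 10 * 86400, 43200
tauBand, dtau = 5 * 86400, 21600

# same arithmetic as in the hunk above: number of grid points per dimension
N_t0Range = int(np.floor(1.0 * t0Band / dt0) + 1)
N_tauRange = int(np.floor(1.0 * tauBand / dtau) + 1)

assert (N_t0Range, N_tauRange) == (21, 21)
print('total (t0, tau) grid points:', N_t0Range * N_tauRange)   # 441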
@@ -328,23 +376,32 @@ def pycuda_compute_transient_fstat_map ( multiFstatAtoms, windowRange ):
     FstatMap.t0_ML = windowRange.t0 + maxidx[0] * windowRange.dt0
     FstatMap.tau_ML = windowRange.tau + maxidx[1] * windowRange.dtau
-    logging.debug('Done computing transient F-stat map. maxF={}, t0_ML={}, tau_ML={}'.format(FstatMap.maxF , FstatMap.t0_ML, FstatMap.tau_ML))
+    logging.debug('Done computing transient F-stat map.' \
+                  ' maxF={:.4f}, t0_ML={}, tau_ML={}'.format(
+                  FstatMap.maxF , FstatMap.t0_ML, FstatMap.tau_ML))
     return FstatMap
 
-def pycuda_compute_transient_fstat_map_rect ( atomsInputMatrix, windowRange, tCWparams ):
-    '''only GPU-parallizing outer loop, keeping partial sums with memory in kernel'''
+def pycuda_compute_transient_fstat_map_rect ( atomsInputMatrix,
+                                              windowRange,
+                                              tCWparams ):
+    '''
+    only GPU-parallizing outer loop,
+    keeping partial sums with memory in kernel
+    '''
     # gpu data setup and transfer
-    logging.debug('Initial GPU memory: {}/{} MB free'.format(drv.mem_get_info()[0]/(2.**20),drv.mem_get_info()[1]/(2.**20)))
+    _print_GPU_memory_MB('Initial')
     input_gpu = gpuarray.to_gpu ( atomsInputMatrix )
-    Fmn_gpu = gpuarray.GPUArray ( (tCWparams['N_t0Range'],tCWparams['N_tauRange']), dtype=np.float32 )
-    logging.debug('GPU memory with input+output allocated: {}/{} MB free'.format(drv.mem_get_info()[0]/(2.**20), drv.mem_get_info()[1]/(2.**20)))
+    Fmn_gpu = gpuarray.GPUArray ( (tCWparams['N_t0Range'],
+                                   tCWparams['N_tauRange']),
+                                  dtype=np.float32 )
+    _print_GPU_memory_MB('After input+output allocation:')
     # GPU kernel
     kernel = 'cudaTransientFstatRectWindow'
-    kernelfile = get_absolute_kernel_path ( kernel )
+    kernelfile = _get_absolute_kernel_path ( kernel )
     partial_Fstat_cuda_code = cudacomp.SourceModule(open(kernelfile,'r').read())
     partial_Fstat_cuda = partial_Fstat_cuda_code.get_function(kernel)
     partial_Fstat_cuda.prepare('PIIIIIIIIP')
@@ -354,32 +411,48 @@ def pycuda_compute_transient_fstat_map_rect ( atomsInputMatrix, windowRange, tCWparams ):
     blockCols = 1
     gridRows = int(np.ceil(1.0*tCWparams['N_t0Range']/blockRows))
     gridCols = 1
-    logging.debug('Looking to compute transient F-stat map over a N={}*{}={} (t0,tau) grid...'.format(tCWparams['N_t0Range'],tCWparams['N_tauRange'],tCWparams['N_t0Range']*tCWparams['N_tauRange']))
     # running the kernel
-    logging.debug('Calling kernel with a grid of {}*{}={} blocks of {}*{}={} threads each: {} total threads...'.format(gridRows, gridCols, gridRows*gridCols, blockRows, blockCols, blockRows*blockCols, gridRows*gridCols*blockRows*blockCols))
-    partial_Fstat_cuda.prepared_call ( (gridRows,gridCols), (blockRows,blockCols,1), input_gpu.gpudata, tCWparams['numAtoms'], tCWparams['TAtom'], tCWparams['t0_data'], windowRange.t0, windowRange.dt0, windowRange.tau, windowRange.dtau, tCWparams['N_tauRange'], Fmn_gpu.gpudata )
+    logging.debug('Calling pyCUDA kernel with a grid of {}*{}={} blocks' \
+                  ' of {}*{}={} threads each: {} total threads...'.format(
+                  gridRows, gridCols, gridRows*gridCols,
+                  blockRows, blockCols, blockRows*blockCols,
+                  gridRows*gridCols*blockRows*blockCols))
+    partial_Fstat_cuda.prepared_call ( (gridRows,gridCols),
+                                       (blockRows,blockCols,1),
+                                       input_gpu.gpudata,
+                                       tCWparams['numAtoms'],
+                                       tCWparams['TAtom'],
+                                       tCWparams['t0_data'],
+                                       windowRange.t0, windowRange.dt0,
+                                       windowRange.tau, windowRange.dtau,
+                                       tCWparams['N_tauRange'],
+                                       Fmn_gpu.gpudata )
     # return results to host
     F_mn = Fmn_gpu.get()
-    logging.debug('Final GPU memory: {}/{} MB free'.format(drv.mem_get_info()[0]/(2.**20),drv.mem_get_info()[1]/(2.**20)))
+    _print_GPU_memory_MB('Final')
     return F_mn
 
-def pycuda_compute_transient_fstat_map_exp ( atomsInputMatrix, windowRange, tCWparams ):
+def pycuda_compute_transient_fstat_map_exp ( atomsInputMatrix,
+                                             windowRange,
+                                             tCWparams ):
     '''exponential window, inner and outer loop GPU-parallelized'''
     # gpu data setup and transfer
-    logging.debug('Initial GPU memory: {}/{} MB free'.format(drv.mem_get_info()[0]/(2.**20),drv.mem_get_info()[1]/(2.**20)))
+    _print_GPU_memory_MB('Initial')
     input_gpu = gpuarray.to_gpu ( atomsInputMatrix )
-    Fmn_gpu = gpuarray.GPUArray ( (tCWparams['N_t0Range'],tCWparams['N_tauRange']), dtype=np.float32 )
-    logging.debug('GPU memory with input+output allocated: {}/{} MB free'.format(drv.mem_get_info()[0]/(2.**20), drv.mem_get_info()[1]/(2.**20)))
+    Fmn_gpu = gpuarray.GPUArray ( (tCWparams['N_t0Range'],
+                                   tCWparams['N_tauRange']),
+                                  dtype=np.float32 )
+    _print_GPU_memory_MB('After input+output allocation:')
     # GPU kernel
     kernel = 'cudaTransientFstatExpWindow'
-    kernelfile = get_absolute_kernel_path ( kernel )
+    kernelfile = _get_absolute_kernel_path ( kernel )
     partial_Fstat_cuda_code = cudacomp.SourceModule(open(kernelfile,'r').read())
     partial_Fstat_cuda = partial_Fstat_cuda_code.get_function(kernel)
     partial_Fstat_cuda.prepare('PIIIIIIIIIP')
@@ -389,15 +462,28 @@ def pycuda_compute_transient_fstat_map_exp ( atomsInputMatrix, windowRange, tCWparams ):
     blockCols = min(32,tCWparams['N_tauRange'])
     gridRows = int(np.ceil(1.0*tCWparams['N_t0Range']/blockRows))
     gridCols = int(np.ceil(1.0*tCWparams['N_tauRange']/blockCols))
-    logging.debug('Looking to compute transient F-stat map over a N={}*{}={} (t0,tau) grid...'.format(tCWparams['N_t0Range'], tCWparams['N_tauRange'], tCWparams['N_t0Range']*tCWparams['N_tauRange']))
     # running the kernel
-    logging.debug('Calling kernel with a grid of {}*{}={} blocks of {}*{}={} threads each: {} total threads...'.format(gridRows, gridCols, gridRows*gridCols, blockRows, blockCols, blockRows*blockCols, gridRows*gridCols*blockRows*blockCols))
-    partial_Fstat_cuda.prepared_call ( (gridRows,gridCols), (blockRows,blockCols,1), input_gpu.gpudata, tCWparams['numAtoms'], tCWparams['TAtom'], tCWparams['t0_data'], windowRange.t0, windowRange.dt0, windowRange.tau, windowRange.dtau, tCWparams['N_t0Range'], tCWparams['N_tauRange'], Fmn_gpu.gpudata )
+    logging.debug('Calling kernel with a grid of {}*{}={} blocks' \
+                  ' of {}*{}={} threads each: {} total threads...'.format(
+                  gridRows, gridCols, gridRows*gridCols,
+                  blockRows, blockCols, blockRows*blockCols,
+                  gridRows*gridCols*blockRows*blockCols))
+    partial_Fstat_cuda.prepared_call ( (gridRows,gridCols),
+                                       (blockRows,blockCols,1),
+                                       input_gpu.gpudata,
+                                       tCWparams['numAtoms'],
+                                       tCWparams['TAtom'],
+                                       tCWparams['t0_data'],
+                                       windowRange.t0, windowRange.dt0,
+                                       windowRange.tau, windowRange.dtau,
+                                       tCWparams['N_t0Range'],
+                                       tCWparams['N_tauRange'],
+                                       Fmn_gpu.gpudata )
    # return results to host
     F_mn = Fmn_gpu.get()
-    logging.debug('Final GPU memory: {}/{} MB free'.format(drv.mem_get_info()[0]/(2.**20),drv.mem_get_info()[1]/(2.**20)))
+    _print_GPU_memory_MB('Final')
     return F_mn
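For orientation, the block/grid arithmetic of the exponential-window launch can be checked in plain Python; the values below are made up, and blockRows is assumed to be capped at 32 in the same way as blockCols (its definition lies outside this hunk):

import numpy as np

# Example map dimensions (made up); the arithmetic mirrors the launch above.
N_t0Range, N_tauRange = 50, 70

blockRows = min(32, N_t0Range)
blockCols = min(32, N_tauRange)
gridRows = int(np.ceil(1.0 * N_t0Range / blockRows))
gridCols = int(np.ceil(1.0 * N_tauRange / blockCols))

total_threads = gridRows * gridCols * blockRows * blockCols
assert (blockRows, blockCols, gridRows, gridCols) == (32, 32, 2, 3)
assert total_threads >= N_t0Range * N_tauRange   # grid covers the whole map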