tcw_fstat_map_funcs.py 20 KB
Newer Older
1
2
""" Additional helper functions dealing with transient-CW F(t0,tau) maps """

3
4
import numpy as np
import os
5
import sys
6
7
8
9
10
11
import logging

# optional imports
import importlib as imp


David Keitel's avatar
David Keitel committed
12
def _optional_import ( modulename, shorthand=None ):
13
14
15
16
17
    '''
    Import a module/submodule only if it's available.

    using importlib instead of __import__
    because the latter doesn't handle sub.modules
18
19
20

    Also including a special check to fail more gracefully
    when CUDA_DEVICE is set to too high a number.
21
    '''
22

23
24
25
26
27
    if shorthand is None:
        shorthand    = modulename
        shorthandbit = ''
    else:
        shorthandbit = ' as '+shorthand
28
29
30
31

    if('pycuda' in sys.modules):
        try:
            globals()[shorthand] = imp.import_module(modulename)
David Keitel's avatar
David Keitel committed
32
33
            logging.debug('Successfully imported module %s%s.'
                          % (modulename, shorthandbit))
34
35
36
37
            success = True
        except pycuda._driver.LogicError, e:
            if e.message == 'cuDeviceGet failed: invalid device ordinal':
                devn = int(os.environ['CUDA_DEVICE'])
David Keitel's avatar
David Keitel committed
38
39
40
41
                raise RuntimeError('Requested CUDA device number {} exceeds' \
                                   ' number of available devices!' \
                                   ' Please change through environment' \
                                   ' variable $CUDA_DEVICE.'.format(devn))
42
43
44
45
46
47
48
49
50
51
52
            else:
                raise pycuda._driver.LogicError(e.message)
        except ImportError, e:
            if e.message == 'No module named '+modulename:
                logging.debug('No module {:s} found.'.format(modulename))
                success = False
            else:
                raise
    else:
        try:
            globals()[shorthand] = imp.import_module(modulename)
David Keitel's avatar
David Keitel committed
53
54
            logging.debug('Successfully imported module %s%s.'
                          % (modulename, shorthandbit))
55
56
57
58
59
60
61
62
            success = True
        except ImportError, e:
            if e.message == 'No module named '+modulename:
                logging.debug('No module {:s} found.'.format(modulename))
                success = False
            else:
                raise

63
64
65
    return success


66
67
68
69
70
71
72
73
74
75
76
77
78
79
class pyTransientFstatMap(object):
    '''
    Simplified object class for a F(t0,tau) F-stat map (not 2F!),
    based on LALSuite's transientFstatMap_t type,
    replacing the gsl matrix with a numpy array.

    F_mn:   2D array of 2F values
    maxF:   maximum of F (not 2F!)
    t0_ML:  maximum likelihood transient start time t0 estimate
    tau_ML: maximum likelihood transient duration tau estimate
    '''

    def __init__(self, N_t0Range, N_tauRange):
        # one float32 entry per (start-time, duration) grid point
        self.F_mn = np.zeros((N_t0Range, N_tauRange), dtype=np.float32)
        # Initializing maxF to a negative value ensures
        # that we always update at least once and hence return
        # sane t0_d_ML, tau_d_ML
        # even if there is only a single bin where F=0 happens.
        self.maxF = -1.0
        self.t0_ML = 0.0
        self.tau_ML = 0.0


89
90
91
92
93
94
# dictionary of the actual callable F-stat map functions we support,
# if the corresponding modules are available.
# Both wrappers resolve their target (lalpulsar / the pycuda function)
# only at call time, so this module imports fine without either backend.
def _fstatmap_lal(multiFstatAtoms, windowRange):
    # LALSuite reference implementation (third arg: useFReg=False)
    return getattr(lalpulsar,'ComputeTransientFstatMap')(
        multiFstatAtoms, windowRange, False)

def _fstatmap_pycuda(multiFstatAtoms, windowRange):
    # GPU implementation defined further down in this module
    return pycuda_compute_transient_fstat_map(multiFstatAtoms, windowRange)

fstatmap_versions = {
    'lal':    _fstatmap_lal,
    'pycuda': _fstatmap_pycuda,
}


101
def init_transient_fstat_map_features ( wantCuda=False, cudaDeviceName=None ):
    '''
    Initialization of available modules (or "features") for F-stat maps.

    Returns a dictionary of method names, to match fstatmap_versions
    each key's value set to True only if
    all required modules are importable on this system.

    wantCuda:       if True (and pycuda is available), also initialize a
                    CUDA device context, which is returned as second value
                    (None otherwise).
    cudaDeviceName: optional (partial) device-name string to select a
                    specific GPU; falls back to $CUDA_DEVICE, then device 0.
    '''

    features = {}

    # 'lal' method needs both the lal and lalpulsar python bindings
    have_lal           = _optional_import('lal')
    have_lalpulsar     = _optional_import('lalpulsar')
    features['lal']    = have_lal and have_lalpulsar

    # import GPU features
    # (the _optional_import calls bind drv, gpuarray, cudatools, cudacomp
    # into this module's globals for use by the pycuda functions below)
    have_pycuda          = _optional_import('pycuda')
    have_pycuda_drv      = _optional_import('pycuda.driver', 'drv')
    have_pycuda_gpuarray = _optional_import('pycuda.gpuarray', 'gpuarray')
    have_pycuda_tools    = _optional_import('pycuda.tools', 'cudatools')
    have_pycuda_compiler = _optional_import('pycuda.compiler', 'cudacomp')
    features['pycuda']   = ( have_pycuda_drv and have_pycuda_gpuarray and
                            have_pycuda_tools and have_pycuda_compiler )

    logging.debug('Got the following features for transient F-stat maps:')
    logging.debug(features)

    if wantCuda and features['pycuda']:
        logging.debug('CUDA version: '+'.'.join(map(str,drv.get_version())))

        drv.init()
        logging.debug('Starting with default pyCUDA context,' \
                      ' then checking all available devices...')
        # this context is reused below if the default device is the one
        # we end up choosing; otherwise it gets popped/detached
        context0 = pycuda.tools.make_default_context()

        num_gpus = drv.Device.count()
        logging.debug('Found {} CUDA device(s).'.format(num_gpus))

        devices = []
        # fixed-width byte strings; NOTE(review): under python3 the
        # 'cudaDeviceName in devname' check below compares str vs bytes —
        # confirm behaviour when porting
        devnames = np.empty(num_gpus,dtype='S32')
        for n in range(num_gpus):
            devn = drv.Device(n)
            devices.append(devn)
            # normalize separators so users can match with '-' only
            devnames[n] = devn.name().replace(' ','-').replace('_','-')
            logging.debug('device {}: model: {}, RAM: {}MB'.format(
                n, devnames[n], devn.total_memory()/(2.**20) ))

        # devnum0 = device the default context above was presumably made on
        if 'CUDA_DEVICE' in os.environ:
            devnum0 = int(os.environ['CUDA_DEVICE'])
        else:
            devnum0 = 0

        matchbit = ''
        if cudaDeviceName:
            # allow partial matches in device names
            devmatches = [devidx for devidx, devname in enumerate(devnames)
                          if cudaDeviceName in devname]
            if len(devmatches) == 0:
                # clean up the context before bailing out
                context0.detach()
                raise RuntimeError('Requested CUDA device "{}" not found.' \
                                   ' Available devices: [{}]'.format(
                                      cudaDeviceName,','.join(devnames)))
            else:
                devnum = devmatches[0]
                if len(devmatches) > 1:
                    logging.warning('Found {} CUDA devices matching name "{}".' \
                                    ' Choosing first one with index {}.'.format(
                                        len(devmatches),cudaDeviceName,devnum))
            # export choice so a later make_default_context picks it up
            os.environ['CUDA_DEVICE'] = str(devnum)
            matchbit =  '(matched to user request "{}")'.format(cudaDeviceName)
        elif 'CUDA_DEVICE' in os.environ:
            devnum = int(os.environ['CUDA_DEVICE'])
        else:
            devnum = 0
        devn = devices[devnum]
        logging.info('Choosing CUDA device {},' \
                     ' of {} devices present: {}{}...'.format(
                         devnum, num_gpus, devn.name(), matchbit))
        if devnum == devnum0:
            # default context already lives on the requested device
            gpu_context = context0
        else:
            # replace the default context with one on the chosen device;
            # presumably make_default_context honours $CUDA_DEVICE set
            # above — TODO confirm against pycuda docs
            context0.pop()
            gpu_context = pycuda.tools.make_default_context()
            gpu_context.push()

        _print_GPU_memory_MB('Available')
    else:
        gpu_context = None

    return features, gpu_context
191
192


David Keitel's avatar
David Keitel committed
193
194
195
196
def call_compute_transient_fstat_map ( version,
                                       features,
                                       multiFstatAtoms=None,
                                       windowRange=None ):
    '''
    Choose which version of the ComputeTransientFstatMap function to call.

    version:         method name, must be a key of fstatmap_versions
                     ('lal' or 'pycuda')
    features:        availability dict as returned by
                     init_transient_fstat_map_features()
    multiFstatAtoms: passed through to the chosen implementation
    windowRange:     passed through to the chosen implementation

    Returns the F-stat map produced by the chosen implementation.

    Raises ValueError for an unknown method name and RuntimeError when
    the required modules for a known method are unavailable; both are
    Exception subclasses, so existing 'except Exception' callers
    still work.
    '''

    if version not in fstatmap_versions:
        raise ValueError('Transient F-stat map method "{}"' \
                         ' not implemented!'.format(version))
    if not features[version]:
        raise RuntimeError('Required module(s) for transient F-stat map' \
                           ' method "{}" not available!'.format(version))

    return fstatmap_versions[version](multiFstatAtoms, windowRange)
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241


def reshape_FstatAtomsVector ( atomsVector ):
    '''
    Make a dictionary of ndarrays out of a atoms "vector" structure.

    The input is a "vector"-like structure with times as the higher hierarchical
    level and a set of "atoms" quantities defined at each timestamp.
    The output is a dictionary with an entry for each quantity,
    which is a 1D ndarray over timestamps for that one quantity.

    atomsVector must provide .length (number of atoms) and .data
    (iterable of atoms, each with the attributes listed below).
    '''

    numAtoms = atomsVector.length
    atomsDict = {}
    atom_fieldnames = ['timestamp', 'Fa_alpha', 'Fb_alpha',
                       'a2_alpha', 'ab_alpha', 'b2_alpha']
    atom_dtypes     = [np.uint32, complex, complex,
                       np.float32, np.float32, np.float32]
    # preallocate one 1D array per field
    for field, dtype in zip(atom_fieldnames, atom_dtypes):
        atomsDict[field] = np.ndarray(numAtoms, dtype=dtype)

    # fill the arrays from the per-timestamp atom entries
    for n, atom in enumerate(atomsVector.data):
        for field in atom_fieldnames:
            # getattr is the idiomatic form of atom.__getattribute__(field)
            atomsDict[field][n] = getattr(atom, field)

    # also provide real/imag splits of the complex amplitudes,
    # needed for the float-only matrix transferred to the GPU
    atomsDict['Fa_alpha_re'] = np.float32(atomsDict['Fa_alpha'].real)
    atomsDict['Fa_alpha_im'] = np.float32(atomsDict['Fa_alpha'].imag)
    atomsDict['Fb_alpha_re'] = np.float32(atomsDict['Fb_alpha'].real)
    atomsDict['Fb_alpha_im'] = np.float32(atomsDict['Fb_alpha'].imag)

    return atomsDict


David Keitel's avatar
David Keitel committed
242
def _get_absolute_kernel_path ( kernel ):
243
244
245
246
247
    pyfstatdir = os.path.dirname(os.path.abspath(os.path.realpath(__file__)))
    kernelfile = kernel + '.cu'
    return os.path.join(pyfstatdir,'pyCUDAkernels',kernelfile)


David Keitel's avatar
David Keitel committed
248
249
250
251
252
253
254
def _print_GPU_memory_MB ( key ):
    '''Log free/total GPU memory in MB, prefixed by the given key string.'''
    # drv.mem_get_info() returns (free, total) in bytes;
    # the old local name 'mem_used_MB' for index 0 was misleading
    mem_free_MB, mem_total_MB = [m / (2.**20) for m in drv.mem_get_info()]
    logging.debug('{} GPU memory: {:.4f} / {:.4f} MB free'.format(
                      key, mem_free_MB, mem_total_MB))


255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
def pycuda_compute_transient_fstat_map ( multiFstatAtoms, windowRange ):
    '''
    GPU version of the function to compute transient-window "F-statistic map"
    over start-time and timescale {t0, tau}.
    Based on XLALComputeTransientFstatMap from LALSuite,
    (C) 2009 Reinhard Prix, licensed under GPL

    Returns a 2D matrix F_mn,
    with m = index over start-times t0,
    and  n = index over timescales tau,
    in steps of dt0  in [t0,  t0+t0Band],
    and         dtau in [tau, tau+tauBand]
    as defined in windowRange input.

    NOTE: windowRange is modified in place when its type is TRANSIENT_NONE
    (replaced by a rectangular window spanning all data).

    Raises ValueError for unknown or unsupported window types.
    '''

    # reject window types beyond the range known to lalpulsar
    if ( windowRange.type >= lalpulsar.TRANSIENT_LAST ):
        raise ValueError ('Unknown window-type ({}) passed as input.' \
                          ' Allowed are [0,{}].'.format(
                              windowRange.type, lalpulsar.TRANSIENT_LAST-1))

    # internal dict for search/setup parameters
    tCWparams = {}

    # first combine all multi-atoms
    # into a single atoms-vector with *unique* timestamps
    tCWparams['TAtom'] = multiFstatAtoms.data[0].TAtom
    TAtomHalf          = int(tCWparams['TAtom']/2) # integer division
    atoms = lalpulsar.mergeMultiFstatAtomsBinned ( multiFstatAtoms,
                                                   tCWparams['TAtom'] )

    # make a combined input matrix of all atoms vectors, for transfer to GPU
    tCWparams['numAtoms'] = atoms.length
    atomsDict = reshape_FstatAtomsVector(atoms)
    atomsInputMatrix = np.column_stack ( (atomsDict['a2_alpha'],
                                          atomsDict['b2_alpha'],
                                          atomsDict['ab_alpha'],
                                          atomsDict['Fa_alpha_re'],
                                          atomsDict['Fa_alpha_im'],
                                          atomsDict['Fb_alpha_re'],
                                          atomsDict['Fb_alpha_im'])
                                       )

    # actual data spans [t0_data, t0_data + tCWparams['numAtoms'] * TAtom]
    # in steps of TAtom
    tCWparams['t0_data'] = int(atoms.data[0].timestamp)
    tCWparams['t1_data'] = int(atoms.data[tCWparams['numAtoms']-1].timestamp
                               + tCWparams['TAtom'])

    logging.debug('Transient F-stat map:' \
                  ' t0_data={:d}, t1_data={:d}'.format(
                      tCWparams['t0_data'], tCWparams['t1_data']))
    logging.debug('Transient F-stat map:' \
                  ' numAtoms={:d}, TAtom={:d},' \
                  ' TAtomHalf={:d}'.format(
                      tCWparams['numAtoms'], tCWparams['TAtom'], TAtomHalf))

    # special treatment of window_type = none
    # ==> replace by rectangular window spanning all the data
    if ( windowRange.type == lalpulsar.TRANSIENT_NONE ):
        windowRange.type    = lalpulsar.TRANSIENT_RECTANGULAR
        windowRange.t0      = tCWparams['t0_data']
        windowRange.t0Band  = 0
        windowRange.dt0     = tCWparams['TAtom'] # irrelevant
        windowRange.tau     = tCWparams['numAtoms'] * tCWparams['TAtom']
        windowRange.tauBand = 0;
        windowRange.dtau    = tCWparams['TAtom'] # irrelevant

    """ NOTE: indices {i,j} enumerate *actual* atoms and their timestamps t_i,
    * while the indices {m,n} enumerate the full grid of values
    * in [t0_min, t0_max]x[Tcoh_min, Tcoh_max] in steps of deltaT.
    * This allows us to deal with gaps in the data in a transparent way.
    *
    * NOTE2: we operate on the 'binned' atoms returned
    * from XLALmergeMultiFstatAtomsBinned(),
    * which means we can safely assume all atoms to be lined up
    * perfectly on a 'deltaT' binned grid.
    *
    * The mapping used will therefore be {i,j} -> {m,n}:
    *   m = offs_i  / deltaT
    *   start-time offset from t0_min measured in deltaT
    *   n = Tcoh_ij / deltaT
    *   duration Tcoh_ij measured in deltaT,
    *
    * where
    *   offs_i  = t_i - t0_min
    *   Tcoh_ij = t_j - t_i + deltaT
    *
    """

    # We allocate a matrix  {m x n} = t0Range * TcohRange elements
    # covering the full transient window-range [t0,t0+t0Band]x[tau,tau+tauBand]
    tCWparams['N_t0Range']  = int(np.floor( 1.0*windowRange.t0Band /
                                            windowRange.dt0 ) + 1)
    tCWparams['N_tauRange'] = int(np.floor( 1.0*windowRange.tauBand /
                                            windowRange.dtau ) + 1)
    FstatMap = pyTransientFstatMap ( tCWparams['N_t0Range'],
                                     tCWparams['N_tauRange'] )

    logging.debug('Transient F-stat map:' \
                  ' N_t0Range={:d}, N_tauRange={:d},' \
                  ' total grid points: {:d}'.format(
                      tCWparams['N_t0Range'], tCWparams['N_tauRange'],
                      tCWparams['N_t0Range']*tCWparams['N_tauRange']))

    # dispatch to the per-window-type GPU kernel wrapper
    if ( windowRange.type == lalpulsar.TRANSIENT_RECTANGULAR ):
        FstatMap.F_mn = pycuda_compute_transient_fstat_map_rect (
                           atomsInputMatrix, windowRange, tCWparams )
    elif ( windowRange.type == lalpulsar.TRANSIENT_EXPONENTIAL ):
        FstatMap.F_mn = pycuda_compute_transient_fstat_map_exp (
                           atomsInputMatrix, windowRange, tCWparams )
    else:
        raise ValueError('Invalid transient window type {}' \
                         ' not in [{}, {}].'.format(
                            windowRange.type, lalpulsar.TRANSIENT_NONE,
                            lalpulsar.TRANSIENT_LAST-1))

    # out of loop: get max2F and ML estimates over the m x n matrix
    FstatMap.maxF = FstatMap.F_mn.max()
    maxidx = np.unravel_index ( FstatMap.F_mn.argmax(),
                               (tCWparams['N_t0Range'],
                                tCWparams['N_tauRange']))
    FstatMap.t0_ML  = windowRange.t0  + maxidx[0] * windowRange.dt0
    FstatMap.tau_ML = windowRange.tau + maxidx[1] * windowRange.dtau

    logging.debug('Done computing transient F-stat map.' \
                  ' maxF={:.4f}, t0_ML={}, tau_ML={}'.format(
                      FstatMap.maxF , FstatMap.t0_ML, FstatMap.tau_ML))

    return FstatMap


David Keitel's avatar
David Keitel committed
386
387
388
389
390
391
392
def pycuda_compute_transient_fstat_map_rect ( atomsInputMatrix,
                                              windowRange,
                                              tCWparams ):
    '''
    only GPU-parallizing outer loop,
    keeping partial sums with memory in kernel

    atomsInputMatrix: 2D array of per-atom quantities
                      (columns as stacked in the caller)
    windowRange:      transient window range parameters
                      (t0, dt0, tau, dtau used here)
    tCWparams:        dict with 'numAtoms', 'TAtom', 't0_data',
                      'N_t0Range', 'N_tauRange'

    Returns the (N_t0Range x N_tauRange) F_mn array copied back from GPU.
    '''

    # gpu data setup and transfer
    _print_GPU_memory_MB('Initial')
    input_gpu = gpuarray.to_gpu ( atomsInputMatrix )
    # output buffer allocated directly on the device
    Fmn_gpu   = gpuarray.GPUArray ( (tCWparams['N_t0Range'],
                                     tCWparams['N_tauRange']),
                                    dtype=np.float32 )
    _print_GPU_memory_MB('After input+output allocation:')

    # GPU kernel: compiled at runtime from the .cu source file
    kernel = 'cudaTransientFstatRectWindow'
    kernelfile = _get_absolute_kernel_path ( kernel )
    partial_Fstat_cuda_code = cudacomp.SourceModule(open(kernelfile,'r').read())
    partial_Fstat_cuda = partial_Fstat_cuda_code.get_function(kernel)
    # argument signature: pointer, 8 unsigned ints, pointer
    partial_Fstat_cuda.prepare('PIIIIIIIIP')

    # GPU grid setup: 1D launch over the t0 range only
    # (the tau loop runs inside the kernel)
    blockRows = min(1024,tCWparams['N_t0Range'])
    blockCols = 1
    gridRows  = int(np.ceil(1.0*tCWparams['N_t0Range']/blockRows))
    gridCols  = 1

    # running the kernel
    logging.debug('Calling pyCUDA kernel with a grid of {}*{}={} blocks' \
                  ' of {}*{}={} threads each: {} total threads...'.format(
                      gridRows, gridCols, gridRows*gridCols,
                      blockRows, blockCols, blockRows*blockCols,
                      gridRows*gridCols*blockRows*blockCols))
    partial_Fstat_cuda.prepared_call ( (gridRows,gridCols),
                                       (blockRows,blockCols,1),
                                       input_gpu.gpudata,
                                       tCWparams['numAtoms'],
                                       tCWparams['TAtom'],
                                       tCWparams['t0_data'],
                                       windowRange.t0, windowRange.dt0,
                                       windowRange.tau, windowRange.dtau,
                                       tCWparams['N_tauRange'],
                                       Fmn_gpu.gpudata )

    # return results to host
    F_mn = Fmn_gpu.get()

    _print_GPU_memory_MB('Final')

    return F_mn


David Keitel's avatar
David Keitel committed
440
441
442
def pycuda_compute_transient_fstat_map_exp ( atomsInputMatrix,
                                             windowRange,
                                             tCWparams ):
    '''
    exponential window, inner and outer loop GPU-parallelized

    atomsInputMatrix: 2D array of per-atom quantities
                      (columns as stacked in the caller)
    windowRange:      transient window range parameters
                      (t0, dt0, tau, dtau used here)
    tCWparams:        dict with 'numAtoms', 'TAtom', 't0_data',
                      'N_t0Range', 'N_tauRange'

    Returns the (N_t0Range x N_tauRange) F_mn array copied back from GPU.
    '''

    # gpu data setup and transfer
    _print_GPU_memory_MB('Initial')
    input_gpu = gpuarray.to_gpu ( atomsInputMatrix )
    # output buffer allocated directly on the device
    Fmn_gpu   = gpuarray.GPUArray ( (tCWparams['N_t0Range'],
                                     tCWparams['N_tauRange']),
                                    dtype=np.float32 )
    _print_GPU_memory_MB('After input+output allocation:')

    # GPU kernel: compiled at runtime from the .cu source file
    kernel = 'cudaTransientFstatExpWindow'
    kernelfile = _get_absolute_kernel_path ( kernel )
    partial_Fstat_cuda_code = cudacomp.SourceModule(open(kernelfile,'r').read())
    partial_Fstat_cuda = partial_Fstat_cuda_code.get_function(kernel)
    # one more 'I' than the rect kernel: N_t0Range is also passed in
    partial_Fstat_cuda.prepare('PIIIIIIIIIP')

    # GPU grid setup: 2D launch over both t0 and tau ranges
    blockRows = min(32,tCWparams['N_t0Range'])
    blockCols = min(32,tCWparams['N_tauRange'])
    gridRows  = int(np.ceil(1.0*tCWparams['N_t0Range']/blockRows))
    gridCols  = int(np.ceil(1.0*tCWparams['N_tauRange']/blockCols))

    # running the kernel
    logging.debug('Calling kernel with a grid of {}*{}={} blocks' \
                  ' of {}*{}={} threads each: {} total threads...'.format(
                      gridRows, gridCols, gridRows*gridCols,
                      blockRows, blockCols, blockRows*blockCols,
                      gridRows*gridCols*blockRows*blockCols))
    partial_Fstat_cuda.prepared_call ( (gridRows,gridCols),
                                       (blockRows,blockCols,1),
                                       input_gpu.gpudata,
                                       tCWparams['numAtoms'],
                                       tCWparams['TAtom'],
                                       tCWparams['t0_data'],
                                       windowRange.t0, windowRange.dt0,
                                       windowRange.tau, windowRange.dtau,
                                       tCWparams['N_t0Range'],
                                       tCWparams['N_tauRange'],
                                       Fmn_gpu.gpudata )

    # return results to host
    F_mn = Fmn_gpu.get()

    _print_GPU_memory_MB('Final')

    return F_mn