""" Additional helper functions dealing with transient-CW F(t0,tau) maps """

import os
import logging
from time import time

import numpy as np

# optional imports
import importlib as imp
def _optional_import ( modulename, shorthand=None ):
    '''
    Import a module/submodule only if it's available.

    using importlib instead of __import__
    because the latter doesn't handle sub.modules

    Also including a special check to fail more gracefully
    when CUDA_DEVICE is set to too high a number.

    Parameters
    ----------
    modulename: str
        Full (dotted) name of the module/submodule to import.
    shorthand: str, optional
        Name under which the module is registered in globals();
        defaults to modulename itself.

    Returns
    -------
    success: bool
        True if the import succeeded, False if the module was not found.
    '''

    if shorthand is None:
        shorthand    = modulename
        shorthandbit = ''
    else:
        shorthandbit = ' as '+shorthand

    try:
        globals()[shorthand] = imp.import_module(modulename)
        logging.debug('Successfully imported module %s%s.'
                      % (modulename, shorthandbit))
        success = True
    except ImportError as e:
        # Fixed the Python-2-only "except ImportError, e" syntax and the
        # py2-only e.message attribute. Python 2 formats the error as
        # "No module named X", Python 3 as "No module named 'X'", so test
        # by substring; anything else is a genuine failure and re-raised.
        if 'No module named' in str(e) and modulename in str(e):
            logging.debug('No module {:s} found.'.format(modulename))
            success = False
        else:
            raise

    return success


class pyTransientFstatMap(object):
    '''
    simplified object class for a F(t0,tau) F-stat map (not 2F!)
    based on LALSuite's transientFstatMap_t type
    replacing the gsl matrix with a numpy array

    F_mn:   2D array of 2F values
    maxF:   maximum of F (not 2F!)
    t0_ML:  maximum likelihood transient start time t0 estimate
    tau_ML: maximum likelihood transient duration tau estimate
    '''

    def __init__(self, N_t0Range, N_tauRange):
        # one float32 entry per (t0, tau) grid point
        self.F_mn = np.zeros((N_t0Range, N_tauRange), dtype=np.float32)
        # Seed maxF with a negative value: any real F (>= 0) beats it,
        # so the maximum search always updates at least once and the
        # t0_ML/tau_ML estimates stay sane even if the only bin has F=0.
        self.maxF = float(-1.0)
        self.t0_ML = float(0.0)
        self.tau_ML = float(0.0)


# dictionary of the actual callable F-stat map functions we support,
# if the corresponding modules are available.
# Each value takes (multiFstatAtoms, windowRange) and returns an F-stat map;
# the referenced modules/functions are resolved lazily at call time.
fstatmap_versions = {
    'lal': lambda multiFstatAtoms, windowRange:
        lalpulsar.ComputeTransientFstatMap(multiFstatAtoms, windowRange,
                                           False),
    'pycuda': lambda multiFstatAtoms, windowRange:
        pycuda_compute_transient_fstat_map(multiFstatAtoms, windowRange),
}


def init_transient_fstat_map_features ( wantCuda=False, cudaDeviceName=None ):
    '''
    Initialization of available modules (or "features") for F-stat maps.

    Returns a dictionary of method names, to match fstatmap_versions
    each key's value set to True only if
    all required modules are importable on this system.

    Parameters
    ----------
    wantCuda: bool
        If True and all pycuda modules are importable,
        also initialize a CUDA device context.
    cudaDeviceName: str or None
        Requested CUDA device name (partial matches allowed);
        if None, fall back to $CUDA_DEVICE or device 0.

    Returns
    -------
    features: dict
        Maps method names ('lal', 'pycuda') to availability (bool).
    gpu_context: pycuda context or None
        The active device context if CUDA was requested and available.

    Raises
    ------
    RuntimeError
        If the requested CUDA device (by number or name) does not exist.
    '''

    features = {}

    have_lal           = _optional_import('lal')
    have_lalpulsar     = _optional_import('lalpulsar')
    features['lal']    = have_lal and have_lalpulsar

    # import GPU features
    have_pycuda          = _optional_import('pycuda')
    have_pycuda_drv      = _optional_import('pycuda.driver', 'drv')
    have_pycuda_gpuarray = _optional_import('pycuda.gpuarray', 'gpuarray')
    have_pycuda_tools    = _optional_import('pycuda.tools', 'cudatools')
    have_pycuda_compiler = _optional_import('pycuda.compiler', 'cudacomp')
    features['pycuda']   = ( have_pycuda_drv and have_pycuda_gpuarray and
                            have_pycuda_tools and have_pycuda_compiler )

    logging.debug('Got the following features for transient F-stat maps:')
    logging.debug(features)

    if wantCuda and features['pycuda']:
        logging.debug('CUDA version: '+'.'.join(map(str,drv.get_version())))

        drv.init()
        logging.debug('Starting with default pyCUDA context,' \
                      ' then checking all available devices...')
        try:
            context0 = pycuda.tools.make_default_context()
        # fixed Python-2-only "except X, e" syntax and e.message usage
        except pycuda._driver.LogicError as e:
            if 'cuDeviceGet failed: invalid device ordinal' in str(e):
                devn = int(os.environ['CUDA_DEVICE'])
                raise RuntimeError('Requested CUDA device number {} exceeds' \
                                   ' number of available devices!' \
                                   ' Please change through environment' \
                                   ' variable $CUDA_DEVICE.'.format(devn))
            else:
                # re-raise unchanged, preserving the original traceback
                # (old code re-wrapped as LogicError(e.message))
                raise

        num_gpus = drv.Device.count()
        logging.debug('Found {} CUDA device(s).'.format(num_gpus))

        devices = []
        # unicode dtype (was 'S32'/bytes): device names must be str under
        # Python 3 for the substring matching and ','.join() below
        devnames = np.empty(num_gpus,dtype='U32')
        for n in range(num_gpus):
            devn = drv.Device(n)
            devices.append(devn)
            devnames[n] = devn.name().replace(' ','-').replace('_','-')
            logging.debug('device {}: model: {}, RAM: {}MB'.format(
                n, devnames[n], devn.total_memory()/(2.**20) ))

        # remember which device the default context came up on
        if 'CUDA_DEVICE' in os.environ:
            devnum0 = int(os.environ['CUDA_DEVICE'])
        else:
            devnum0 = 0

        matchbit = ''
        if cudaDeviceName:
            # allow partial matches in device names
            devmatches = [devidx for devidx, devname in enumerate(devnames)
                          if cudaDeviceName in devname]
            if len(devmatches) == 0:
                context0.detach()
                raise RuntimeError('Requested CUDA device "{}" not found.' \
                                   ' Available devices: [{}]'.format(
                                      cudaDeviceName,','.join(devnames)))
            else:
                devnum = devmatches[0]
                if len(devmatches) > 1:
                    logging.warning('Found {} CUDA devices matching name "{}".' \
                                    ' Choosing first one with index {}.'.format(
                                        len(devmatches),cudaDeviceName,devnum))
            os.environ['CUDA_DEVICE'] = str(devnum)
            matchbit =  '(matched to user request "{}")'.format(cudaDeviceName)
        elif 'CUDA_DEVICE' in os.environ:
            devnum = int(os.environ['CUDA_DEVICE'])
        else:
            devnum = 0

        devn = devices[devnum]
        logging.info('Choosing CUDA device {},' \
                     ' of {} devices present: {}{}...'.format(
                         devnum, num_gpus, devn.name(), matchbit))

        # reuse the default context if it is already on the chosen device;
        # otherwise swap in a fresh context on the requested one
        if devnum == devnum0:
            gpu_context = context0
        else:
            context0.pop()
            gpu_context = pycuda.tools.make_default_context()
            gpu_context.push()

        _print_GPU_memory_MB('Available')
    else:
        gpu_context = None

    return features, gpu_context


def call_compute_transient_fstat_map ( version,
                                       features,
                                       multiFstatAtoms=None,
                                       windowRange=None ):
    '''Choose which version of the ComputeTransientFstatMap function to call.

    Dispatches to fstatmap_versions[version], checking first that the
    method is implemented and that its required modules are available,
    and times the actual map computation.
    '''

    # guard clauses: unknown method, then missing modules
    if version not in fstatmap_versions:
        raise Exception('Transient F-stat map method "{}"' \
                        ' not implemented!'.format(version))
    if not features[version]:
        raise Exception('Required module(s) for transient F-stat map' \
                        ' method "{}" not available!'.format(version))

    time0 = time()
    FstatMap = fstatmap_versions[version](multiFstatAtoms, windowRange)
    timingFstatMap = time()-time0
    return FstatMap, timingFstatMap




def reshape_FstatAtomsVector ( atomsVector ):
    '''
    Make a dictionary of ndarrays out of a atoms "vector" structure.

    The input is a "vector"-like structure with times as the higher hierarchical
    level and a set of "atoms" quantities defined at each timestamp.
    The output is a dictionary with an entry for each quantity,
    which is a 1D ndarray over timestamps for that one quantity.

    Parameters
    ----------
    atomsVector: object
        Structure with a .length attribute and .data iterable of atoms;
        each atom must carry the fields listed in atom_fieldnames below.

    Returns
    -------
    atomsDict: dict
        One 1D ndarray per field, plus pre-split real/imaginary float32
        parts of the complex Fa_alpha and Fb_alpha entries
        (used for the flat GPU input matrix elsewhere in this module).
    '''

    numAtoms = atomsVector.length
    atomsDict = {}
    atom_fieldnames = ['timestamp', 'Fa_alpha', 'Fb_alpha',
                       'a2_alpha', 'ab_alpha', 'b2_alpha']
    atom_dtypes     = [np.uint32, complex, complex,
                       np.float32, np.float32, np.float32]
    # np.empty is the idiomatic spelling of the previous bare
    # np.ndarray(...) call: same uninitialized allocation
    for f, field in enumerate(atom_fieldnames):
        atomsDict[field] = np.empty(numAtoms, dtype=atom_dtypes[f])

    for n, atom in enumerate(atomsVector.data):
        for field in atom_fieldnames:
            # getattr() replaces the unidiomatic atom.__getattribute__(field)
            atomsDict[field][n] = getattr(atom, field)

    # split complex amplitudes into separate float32 re/im arrays
    atomsDict['Fa_alpha_re'] = np.float32(atomsDict['Fa_alpha'].real)
    atomsDict['Fa_alpha_im'] = np.float32(atomsDict['Fa_alpha'].imag)
    atomsDict['Fb_alpha_re'] = np.float32(atomsDict['Fb_alpha'].real)
    atomsDict['Fb_alpha_im'] = np.float32(atomsDict['Fb_alpha'].imag)

    return atomsDict


def _get_absolute_kernel_path ( kernel ):
    '''Return the absolute path of the .cu source file for a named kernel.

    Kernel sources live in the 'pyCUDAkernels' directory
    next to this module.
    '''
    here = os.path.realpath(__file__)
    pyfstatdir = os.path.dirname(os.path.abspath(here))
    return os.path.join(pyfstatdir, 'pyCUDAkernels', kernel + '.cu')


def _print_GPU_memory_MB ( key ):
    '''Log free vs. total GPU memory in MB through logging.debug.

    pycuda.driver.mem_get_info() returns (free, total) in bytes; the old
    variable name "mem_used_MB" was misleading (it held the free amount),
    and the driver was queried twice. The log output is unchanged.
    '''
    free_bytes, total_bytes = drv.mem_get_info()  # single driver call
    mem_free_MB  = free_bytes/(2.**20)
    mem_total_MB = total_bytes/(2.**20)
    logging.debug('{} GPU memory: {:.4f} / {:.4f} MB free'.format(
                      key, mem_free_MB, mem_total_MB))




def pycuda_compute_transient_fstat_map ( multiFstatAtoms, windowRange ):
    '''
    GPU version of the function to compute transient-window "F-statistic map"
    over start-time and timescale {t0, tau}.
    Based on XLALComputeTransientFstatMap from LALSuite,
    (C) 2009 Reinhard Prix, licensed under GPL

    Returns a 2D matrix F_mn,
    with m = index over start-times t0,
    and  n = index over timescales tau,
    in steps of dt0  in [t0,  t0+t0Band],
    and         dtau in [tau, tau+tauBand]
    as defined in windowRange input.

    NOTE(review): when windowRange.type == TRANSIENT_NONE, the passed-in
    windowRange object is modified in place (replaced by a rectangular
    window spanning all the data) -- callers should not rely on it
    being unchanged.
    '''

    # reject window types outside the range lalpulsar knows about
    if ( windowRange.type >= lalpulsar.TRANSIENT_LAST ):
        raise ValueError ('Unknown window-type ({}) passed as input.' \
                          ' Allowed are [0,{}].'.format(
                              windowRange.type, lalpulsar.TRANSIENT_LAST-1))

    # internal dict for search/setup parameters
    tCWparams = {}

    # first combine all multi-atoms
    # into a single atoms-vector with *unique* timestamps
    tCWparams['TAtom'] = multiFstatAtoms.data[0].TAtom
    TAtomHalf          = int(tCWparams['TAtom']/2) # integer division
    atoms = lalpulsar.mergeMultiFstatAtomsBinned ( multiFstatAtoms,
                                                   tCWparams['TAtom'] )

    # make a combined input matrix of all atoms vectors, for transfer to GPU
    # (column order must match what the CUDA kernels expect)
    tCWparams['numAtoms'] = atoms.length
    atomsDict = reshape_FstatAtomsVector(atoms)
    atomsInputMatrix = np.column_stack ( (atomsDict['a2_alpha'],
                                          atomsDict['b2_alpha'],
                                          atomsDict['ab_alpha'],
                                          atomsDict['Fa_alpha_re'],
                                          atomsDict['Fa_alpha_im'],
                                          atomsDict['Fb_alpha_re'],
                                          atomsDict['Fb_alpha_im'])
                                       )

    # actual data spans [t0_data, t0_data + tCWparams['numAtoms'] * TAtom]
    # in steps of TAtom
    tCWparams['t0_data'] = int(atoms.data[0].timestamp)
    tCWparams['t1_data'] = int(atoms.data[tCWparams['numAtoms']-1].timestamp
                               + tCWparams['TAtom'])

    logging.debug('Transient F-stat map:' \
                  ' t0_data={:d}, t1_data={:d}'.format(
                      tCWparams['t0_data'], tCWparams['t1_data']))
    logging.debug('Transient F-stat map:' \
                  ' numAtoms={:d}, TAtom={:d},' \
                  ' TAtomHalf={:d}'.format(
                      tCWparams['numAtoms'], tCWparams['TAtom'], TAtomHalf))

    # special treatment of window_type = none
    # ==> replace by rectangular window spanning all the data
    # (this mutates the caller's windowRange object, see docstring note)
    if ( windowRange.type == lalpulsar.TRANSIENT_NONE ):
        windowRange.type    = lalpulsar.TRANSIENT_RECTANGULAR
        windowRange.t0      = tCWparams['t0_data']
        windowRange.t0Band  = 0
        windowRange.dt0     = tCWparams['TAtom'] # irrelevant
        windowRange.tau     = tCWparams['numAtoms'] * tCWparams['TAtom']
        windowRange.tauBand = 0;
        windowRange.dtau    = tCWparams['TAtom'] # irrelevant

    """ NOTE: indices {i,j} enumerate *actual* atoms and their timestamps t_i,
    * while the indices {m,n} enumerate the full grid of values
    * in [t0_min, t0_max]x[Tcoh_min, Tcoh_max] in steps of deltaT.
    * This allows us to deal with gaps in the data in a transparent way.
    *
    * NOTE2: we operate on the 'binned' atoms returned
    * from XLALmergeMultiFstatAtomsBinned(),
    * which means we can safely assume all atoms to be lined up
    * perfectly on a 'deltaT' binned grid.
    *
    * The mapping used will therefore be {i,j} -> {m,n}:
    *   m = offs_i  / deltaT
    *   start-time offset from t0_min measured in deltaT
    *   n = Tcoh_ij / deltaT
    *   duration Tcoh_ij measured in deltaT,
    *
    * where
    *   offs_i  = t_i - t0_min
    *   Tcoh_ij = t_j - t_i + deltaT
    *
    """

    # We allocate a matrix  {m x n} = t0Range * TcohRange elements
    # covering the full transient window-range [t0,t0+t0Band]x[tau,tau+tauBand]
    tCWparams['N_t0Range']  = int(np.floor( 1.0*windowRange.t0Band /
                                            windowRange.dt0 ) + 1)
    tCWparams['N_tauRange'] = int(np.floor( 1.0*windowRange.tauBand /
                                            windowRange.dtau ) + 1)
    FstatMap = pyTransientFstatMap ( tCWparams['N_t0Range'],
                                     tCWparams['N_tauRange'] )

    logging.debug('Transient F-stat map:' \
                  ' N_t0Range={:d}, N_tauRange={:d},' \
                  ' total grid points: {:d}'.format(
                      tCWparams['N_t0Range'], tCWparams['N_tauRange'],
                      tCWparams['N_t0Range']*tCWparams['N_tauRange']))

    # dispatch to the window-type-specific GPU implementation
    if ( windowRange.type == lalpulsar.TRANSIENT_RECTANGULAR ):
        FstatMap.F_mn = pycuda_compute_transient_fstat_map_rect (
                           atomsInputMatrix, windowRange, tCWparams )
    elif ( windowRange.type == lalpulsar.TRANSIENT_EXPONENTIAL ):
        FstatMap.F_mn = pycuda_compute_transient_fstat_map_exp (
                           atomsInputMatrix, windowRange, tCWparams )
    else:
        raise ValueError('Invalid transient window type {}' \
                         ' not in [{}, {}].'.format(
                            windowRange.type, lalpulsar.TRANSIENT_NONE,
                            lalpulsar.TRANSIENT_LAST-1))

    # out of loop: get max2F and ML estimates over the m x n matrix
    FstatMap.maxF = FstatMap.F_mn.max()
    maxidx = np.unravel_index ( FstatMap.F_mn.argmax(),
                               (tCWparams['N_t0Range'],
                                tCWparams['N_tauRange']))
    FstatMap.t0_ML  = windowRange.t0  + maxidx[0] * windowRange.dt0
    FstatMap.tau_ML = windowRange.tau + maxidx[1] * windowRange.dtau

    logging.debug('Done computing transient F-stat map.' \
                  ' maxF={:.4f}, t0_ML={}, tau_ML={}'.format(
                      FstatMap.maxF , FstatMap.t0_ML, FstatMap.tau_ML))

    return FstatMap


def pycuda_compute_transient_fstat_map_rect ( atomsInputMatrix,
                                              windowRange,
                                              tCWparams ):
    '''
    only GPU-parallizing outer loop,
    keeping partial sums with memory in kernel

    Parameters
    ----------
    atomsInputMatrix: np.ndarray
        2D array of per-atom quantities, one row per timestamp
        (column order as built by pycuda_compute_transient_fstat_map).
    windowRange: lalpulsar transientWindowRange-like object
        Provides t0, dt0, tau, dtau for the kernel call.
    tCWparams: dict
        Needs numAtoms, TAtom, t0_data, N_t0Range, N_tauRange.

    Returns
    -------
    F_mn: np.ndarray
        2D float32 array of F values over the (t0, tau) grid.
    '''

    # gpu data setup and transfer
    _print_GPU_memory_MB('Initial')
    input_gpu = gpuarray.to_gpu ( atomsInputMatrix )
    Fmn_gpu   = gpuarray.GPUArray ( (tCWparams['N_t0Range'],
                                     tCWparams['N_tauRange']),
                                    dtype=np.float32 )
    _print_GPU_memory_MB('After input+output allocation:')

    # GPU kernel: compile the .cu source shipped next to this module
    kernel = 'cudaTransientFstatRectWindow'
    kernelfile = _get_absolute_kernel_path ( kernel )
    # context manager closes the source file handle
    # (the old open(...).read() leaked it)
    with open(kernelfile, 'r') as kernelsource:
        partial_Fstat_cuda_code = cudacomp.SourceModule(kernelsource.read())
    partial_Fstat_cuda = partial_Fstat_cuda_code.get_function(kernel)
    partial_Fstat_cuda.prepare('PIIIIIIIIP')

    # GPU grid setup: parallelize only over t0 (rows), 1D blocks
    blockRows = min(1024,tCWparams['N_t0Range'])
    blockCols = 1
    gridRows  = int(np.ceil(1.0*tCWparams['N_t0Range']/blockRows))
    gridCols  = 1

    # running the kernel
    logging.debug('Calling pyCUDA kernel with a grid of {}*{}={} blocks' \
                  ' of {}*{}={} threads each: {} total threads...'.format(
                      gridRows, gridCols, gridRows*gridCols,
                      blockRows, blockCols, blockRows*blockCols,
                      gridRows*gridCols*blockRows*blockCols))
    partial_Fstat_cuda.prepared_call ( (gridRows,gridCols),
                                       (blockRows,blockCols,1),
                                       input_gpu.gpudata,
                                       tCWparams['numAtoms'],
                                       tCWparams['TAtom'],
                                       tCWparams['t0_data'],
                                       windowRange.t0, windowRange.dt0,
                                       windowRange.tau, windowRange.dtau,
                                       tCWparams['N_tauRange'],
                                       Fmn_gpu.gpudata )

    # return results to host
    F_mn = Fmn_gpu.get()

    _print_GPU_memory_MB('Final')

    return F_mn


def pycuda_compute_transient_fstat_map_exp ( atomsInputMatrix,
                                             windowRange,
                                             tCWparams ):
    '''exponential window, inner and outer loop GPU-parallelized

    Parameters
    ----------
    atomsInputMatrix: np.ndarray
        2D array of per-atom quantities, one row per timestamp
        (column order as built by pycuda_compute_transient_fstat_map).
    windowRange: lalpulsar transientWindowRange-like object
        Provides t0, dt0, tau, dtau for the kernel call.
    tCWparams: dict
        Needs numAtoms, TAtom, t0_data, N_t0Range, N_tauRange.

    Returns
    -------
    F_mn: np.ndarray
        2D float32 array of F values over the (t0, tau) grid.
    '''

    # gpu data setup and transfer
    _print_GPU_memory_MB('Initial')
    input_gpu = gpuarray.to_gpu ( atomsInputMatrix )
    Fmn_gpu   = gpuarray.GPUArray ( (tCWparams['N_t0Range'],
                                     tCWparams['N_tauRange']),
                                    dtype=np.float32 )
    _print_GPU_memory_MB('After input+output allocation:')

    # GPU kernel: compile the .cu source shipped next to this module
    kernel = 'cudaTransientFstatExpWindow'
    kernelfile = _get_absolute_kernel_path ( kernel )
    # context manager closes the source file handle
    # (the old open(...).read() leaked it)
    with open(kernelfile, 'r') as kernelsource:
        partial_Fstat_cuda_code = cudacomp.SourceModule(kernelsource.read())
    partial_Fstat_cuda = partial_Fstat_cuda_code.get_function(kernel)
    partial_Fstat_cuda.prepare('PIIIIIIIIIP')

    # GPU grid setup: 2D blocks over both t0 (rows) and tau (cols)
    blockRows = min(32,tCWparams['N_t0Range'])
    blockCols = min(32,tCWparams['N_tauRange'])
    gridRows  = int(np.ceil(1.0*tCWparams['N_t0Range']/blockRows))
    gridCols  = int(np.ceil(1.0*tCWparams['N_tauRange']/blockCols))

    # running the kernel
    logging.debug('Calling kernel with a grid of {}*{}={} blocks' \
                  ' of {}*{}={} threads each: {} total threads...'.format(
                      gridRows, gridCols, gridRows*gridCols,
                      blockRows, blockCols, blockRows*blockCols,
                      gridRows*gridCols*blockRows*blockCols))
    partial_Fstat_cuda.prepared_call ( (gridRows,gridCols),
                                       (blockRows,blockCols,1),
                                       input_gpu.gpudata,
                                       tCWparams['numAtoms'],
                                       tCWparams['TAtom'],
                                       tCWparams['t0_data'],
                                       windowRange.t0, windowRange.dt0,
                                       windowRange.tau, windowRange.dtau,
                                       tCWparams['N_t0Range'],
                                       tCWparams['N_tauRange'],
                                       Fmn_gpu.gpudata )

    # return results to host
    F_mn = Fmn_gpu.get()

    _print_GPU_memory_MB('Final')

    return F_mn