""" Additional helper functions dealing with transient-CW F(t0,tau) maps """

import os
import sys
import logging

import numpy as np

# optional imports
import importlib as imp


def optional_import ( modulename, shorthand=None ):
    '''
    Import a module/submodule only if it's available.

    Uses importlib instead of __import__
    because the latter doesn't handle submodules.

    Also includes a special check to fail more gracefully
    when CUDA_DEVICE is set to too high a device number.
    '''
    if shorthand is None:
        shorthand    = modulename
        shorthandbit = ''
    else:
        shorthandbit = ' as '+shorthand

    if 'pycuda' in sys.modules:
        try:
            globals()[shorthand] = imp.import_module(modulename)
            logging.debug('Successfully imported module %s%s.'
                          % (modulename, shorthandbit))
            success = True
        except pycuda._driver.LogicError as e:
            if 'invalid device ordinal' in str(e):
                devn = int(os.environ['CUDA_DEVICE'])
                raise RuntimeError('Requested CUDA device number {} exceeds'
                                   ' number of available devices!'
                                   ' Please change through environment'
                                   ' variable $CUDA_DEVICE.'.format(devn))
            else:
                raise
        except ImportError as e:
            if 'No module named' in str(e):
                logging.debug('No module {:s} found.'.format(modulename))
                success = False
            else:
                raise
    else:
        try:
            globals()[shorthand] = imp.import_module(modulename)
            logging.debug('Successfully imported module %s%s.'
                          % (modulename, shorthandbit))
            success = True
        except ImportError as e:
            if 'No module named' in str(e):
                logging.debug('No module {:s} found.'.format(modulename))
                success = False
            else:
                raise

    return success

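# A minimal usage sketch (hypothetical call sites, illustration only,
# not executed at import time):
#
#   have_lal = optional_import('lal')                   # True if lal installed
#   have_drv = optional_import('pycuda.driver', 'drv')  # binds global 'drv'
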
class pyTransientFstatMap(object):
    '''
    Simplified object class for a F(t0,tau) F-stat map (not 2F!),
    based on LALSuite's transientFstatMap_t type,
    replacing the gsl matrix with a numpy array.

    F_mn:   2D array of F values (not 2F!)
    maxF:   maximum of F (not 2F!)
    t0_ML:  maximum-likelihood transient start time t0 estimate
    tau_ML: maximum-likelihood transient duration tau estimate
    '''

    def __init__(self, N_t0Range, N_tauRange):
        self.F_mn   = np.zeros((N_t0Range, N_tauRange), dtype=np.float32)
        # Initializing maxF to a negative value ensures that we always update
        # at least once, and hence return sane t0_ML, tau_ML
        # even if there is only a single bin where F=0 happens.
        self.maxF   = float(-1.0)
        self.t0_ML  = float(0.0)
        self.tau_ML = float(0.0)


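# Example (hypothetical sizes; the fields get filled by the map functions below):
#
#   fmap = pyTransientFstatMap(N_t0Range=100, N_tauRange=50)
#   fmap.F_mn.shape  # -> (100, 50)
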
# dictionary of the actual callable F-stat map functions we support,
# if the corresponding modules are available.
fstatmap_versions = {
                     'lal':    lambda multiFstatAtoms, windowRange:
                               getattr(lalpulsar,'ComputeTransientFstatMap')
                                ( multiFstatAtoms, windowRange, False ),
                     'pycuda': lambda multiFstatAtoms, windowRange:
                               pycuda_compute_transient_fstat_map
                                ( multiFstatAtoms, windowRange )
                    }


def init_transient_fstat_map_features ( cudaDeviceName ):
    '''
    Initialization of available modules (or "features") for F-stat maps.

    Returns a dictionary of method names matching fstatmap_versions,
    with each key's value set to True only if
    all required modules are importable on this system.
    '''

    features = {}

    have_lal           = optional_import('lal')
    have_lalpulsar     = optional_import('lalpulsar')
    features['lal']    = have_lal and have_lalpulsar

    # import GPU features
    have_pycuda          = optional_import('pycuda')
    have_pycuda_drv      = optional_import('pycuda.driver', 'drv')
    have_pycuda_gpuarray = optional_import('pycuda.gpuarray', 'gpuarray')
    have_pycuda_tools    = optional_import('pycuda.tools', 'cudatools')
    have_pycuda_compiler = optional_import('pycuda.compiler', 'cudacomp')
    features['pycuda']   = ( have_pycuda_drv and have_pycuda_gpuarray
                             and have_pycuda_tools and have_pycuda_compiler )

    logging.debug('Got the following features for transient F-stat maps:')
    logging.debug(features)

    if features['pycuda']:
        logging.debug('CUDA version: {}'.format(drv.get_version()))

        drv.init()
        logging.debug('Starting with default context, then checking all available devices...')
        context0 = pycuda.tools.make_default_context()

        num_gpus = drv.Device.count()
        logging.debug('Found {} CUDA device(s).'.format(num_gpus))

        devices = []
        devnames = np.empty(num_gpus, dtype='U32') # unicode dtype, so the str comparisons below work
        for n in range(num_gpus):
            devn = drv.Device(n)
            devices.append(devn)
            devnames[n] = devn.name().replace(' ','-').replace('_','-')
            logging.debug('device {}: model: {}, RAM: {}MB'.format(n,devnames[n],devn.total_memory()/(2.**20) ))

        if 'CUDA_DEVICE' in os.environ:
            devnum0 = int(os.environ['CUDA_DEVICE'])
        else:
            devnum0 = 0

        if cudaDeviceName:
            devmatches = np.where(devnames == cudaDeviceName)[0]
            if len(devmatches) == 0:
                context0.detach()
                raise RuntimeError('Requested CUDA device "{}" not found. Available devices: [{}]'.format(cudaDeviceName,','.join(devnames)))
            else:
                devnum = devmatches[0]
                if len(devmatches) > 1:
                    logging.warning('Found {} CUDA devices matching name "{}". Choosing first one with index {}.'.format(len(devmatches),cudaDeviceName,devnum))
            os.environ['CUDA_DEVICE'] = str(devnum)
        elif 'CUDA_DEVICE' in os.environ:
            devnum = int(os.environ['CUDA_DEVICE'])
        else:
            devnum = 0
        devn = devices[devnum]
        logging.info('Choosing CUDA device {}, of {} devices present: {} (matched to user request "{}")...'.format(devnum,num_gpus,devn.name(),devnames[devnum]))
        if devnum == devnum0:
            gpu_context = context0
        else:
            context0.pop()
            gpu_context = pycuda.tools.make_default_context()
            gpu_context.push()

        logging.debug('Available GPU memory: {}/{} MB free'.format(drv.mem_get_info()[0]/(2.**20),drv.mem_get_info()[1]/(2.**20)))
    else:
        gpu_context = None

    return features, gpu_context
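
# Example sketch (hypothetical session; requires lalpulsar and/or pycuda
# to actually be installed):
#
#   features, gpu_context = init_transient_fstat_map_features(None)
#   if features['pycuda']:
#       ...  # GPU method available; gpu_context holds the active CUDA context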


def call_compute_transient_fstat_map ( version, features, multiFstatAtoms=None, windowRange=None ):
    '''Choose which version of the ComputeTransientFstatMap function to call.'''

    if version in fstatmap_versions:
        if features[version]:
            FstatMap = fstatmap_versions[version](multiFstatAtoms, windowRange)
        else:
            raise Exception('Required module(s) for transient F-stat map method "{}" not available!'.format(version))
    else:
        raise Exception('Transient F-stat map method "{}" not implemented!'.format(version))
    return FstatMap
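
# Example sketch (hypothetical `atoms` and `windowRange` objects, as obtained
# from lalpulsar calls not shown here):
#
#   features, _ = init_transient_fstat_map_features(None)
#   FstatMap = call_compute_transient_fstat_map('lal', features,
#                                               multiFstatAtoms=atoms,
#                                               windowRange=windowRange)
#   print(FstatMap.maxF, FstatMap.t0_ML, FstatMap.tau_ML)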


def reshape_FstatAtomsVector ( atomsVector ):
    '''
    Make a dictionary of ndarrays out of an atoms "vector" structure.

    The input is a "vector"-like structure with times as the higher hierarchical
    level and a set of "atoms" quantities defined at each timestamp.
    The output is a dictionary with an entry for each quantity,
    which is a 1D ndarray over timestamps for that one quantity.
    '''

    numAtoms = atomsVector.length
    atomsDict = {}
    atom_fieldnames = ['timestamp', 'Fa_alpha', 'Fb_alpha',
                       'a2_alpha', 'ab_alpha', 'b2_alpha']
    atom_dtypes     = [np.uint32, complex, complex,
                       np.float32, np.float32, np.float32]
    for f, field in enumerate(atom_fieldnames):
        atomsDict[field] = np.ndarray(numAtoms,dtype=atom_dtypes[f])

    for n,atom in enumerate(atomsVector.data):
        for field in atom_fieldnames:
            atomsDict[field][n] = getattr(atom, field)

    atomsDict['Fa_alpha_re'] = np.float32(atomsDict['Fa_alpha'].real)
    atomsDict['Fa_alpha_im'] = np.float32(atomsDict['Fa_alpha'].imag)
    atomsDict['Fb_alpha_re'] = np.float32(atomsDict['Fb_alpha'].real)
    atomsDict['Fb_alpha_im'] = np.float32(atomsDict['Fb_alpha'].imag)

    return atomsDict
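
# The returned dict maps each atom field to a 1D array over timestamps,
# e.g. (shapes only, values hypothetical):
#
#   {'timestamp':   array of uint32, length numAtoms,
#    'Fa_alpha':    array of complex, ...,
#    'Fa_alpha_re': array of float32, ...}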


def get_absolute_kernel_path ( kernel ):
    '''Return the absolute path to a named .cu kernel file shipped with pyfstat.'''
    pyfstatdir = os.path.dirname(os.path.abspath(os.path.realpath(__file__)))
    kernelfile = kernel + '.cu'
    return os.path.join(pyfstatdir,'pyCUDAkernels',kernelfile)
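
# e.g. get_absolute_kernel_path('cudaTransientFstatRectWindow')
# -> '<pyfstat-dir>/pyCUDAkernels/cudaTransientFstatRectWindow.cu'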


def pycuda_compute_transient_fstat_map ( multiFstatAtoms, windowRange ):
    '''
    GPU version of the function to compute transient-window "F-statistic map"
    over start-time and timescale {t0, tau}.
    Based on XLALComputeTransientFstatMap from LALSuite,
    (C) 2009 Reinhard Prix, licensed under GPL

    Returns a 2D matrix F_mn,
    with m = index over start-times t0,
    and  n = index over timescales tau,
    in steps of dt0  in [t0,  t0+t0Band],
    and         dtau in [tau, tau+tauBand]
    as defined in windowRange input.
    '''

    if ( windowRange.type >= lalpulsar.TRANSIENT_LAST ):
        raise ValueError ('Unknown window-type ({}) passed as input. Allowed are [0,{}].'.format(windowRange.type, lalpulsar.TRANSIENT_LAST-1))

    # internal dict for search/setup parameters
    tCWparams = {}

    # first combine all multi-atoms
    # into a single atoms-vector with *unique* timestamps
    tCWparams['TAtom'] = multiFstatAtoms.data[0].TAtom
    TAtomHalf          = tCWparams['TAtom'] // 2 # integer division
    atoms = lalpulsar.mergeMultiFstatAtomsBinned ( multiFstatAtoms,
                                                   tCWparams['TAtom'] )

    # make a combined input matrix of all atoms vectors, for transfer to GPU
    tCWparams['numAtoms'] = atoms.length
    atomsDict = reshape_FstatAtomsVector(atoms)
    atomsInputMatrix = np.column_stack ( (atomsDict['a2_alpha'],
                                          atomsDict['b2_alpha'],
                                          atomsDict['ab_alpha'],
                                          atomsDict['Fa_alpha_re'],
                                          atomsDict['Fa_alpha_im'],
                                          atomsDict['Fb_alpha_re'],
                                          atomsDict['Fb_alpha_im'])
                                       )

    # actual data spans [t0_data, t0_data + tCWparams['numAtoms'] * TAtom]
    # in steps of TAtom
    tCWparams['t0_data'] = int(atoms.data[0].timestamp)
    tCWparams['t1_data'] = int(atoms.data[tCWparams['numAtoms']-1].timestamp + tCWparams['TAtom'])

    logging.debug('t0_data={:d}, t1_data={:d}'.format(tCWparams['t0_data'],
                                                      tCWparams['t1_data']))
    logging.debug('numAtoms={:d}, TAtom={:d}, TAtomHalf={:d}'.format(tCWparams['numAtoms'],
                                                                     tCWparams['TAtom'],
                                                                     TAtomHalf))

    # special treatment of window_type = none
    # ==> replace by rectangular window spanning all the data
    if ( windowRange.type == lalpulsar.TRANSIENT_NONE ):
        windowRange.type    = lalpulsar.TRANSIENT_RECTANGULAR
        windowRange.t0      = tCWparams['t0_data']
        windowRange.t0Band  = 0
        windowRange.dt0     = tCWparams['TAtom'] # irrelevant
        windowRange.tau     = tCWparams['numAtoms'] * tCWparams['TAtom']
        windowRange.tauBand = 0
        windowRange.dtau    = tCWparams['TAtom'] # irrelevant

    """ NOTE: indices {i,j} enumerate *actual* atoms and their timestamps t_i, while the
    * indices {m,n} enumerate the full grid of values in [t0_min, t0_max]x[Tcoh_min, Tcoh_max] in
    * steps of deltaT. This allows us to deal with gaps in the data in a transparent way.
    *
    * NOTE2: we operate on the 'binned' atoms returned from XLALmergeMultiFstatAtomsBinned(),
    * which means we can safely assume all atoms to be lined up perfectly on a 'deltaT' binned grid.
    *
    * The mapping used will therefore be {i,j} -> {m,n}:
    *   m = offs_i  / deltaT		= start-time offset from t0_min measured in deltaT
    *   n = Tcoh_ij / deltaT		= duration Tcoh_ij measured in deltaT,
    *
    * where
    *   offs_i  = t_i - t0_min
    *   Tcoh_ij = t_j - t_i + deltaT
    *
    """

    # We allocate a matrix of {m x n} = N_t0Range * N_tauRange elements,
    # covering the full transient window-range [t0,t0+t0Band]x[tau,tau+tauBand]
    tCWparams['N_t0Range']  = int(np.floor( 1.0*windowRange.t0Band / windowRange.dt0 ) + 1)
    tCWparams['N_tauRange'] = int(np.floor( 1.0*windowRange.tauBand / windowRange.dtau ) + 1)
    FstatMap = pyTransientFstatMap ( tCWparams['N_t0Range'], tCWparams['N_tauRange'] )

    logging.debug('N_t0Range={:d}, N_tauRange={:d}'.format(tCWparams['N_t0Range'],
                                                           tCWparams['N_tauRange']))

    if ( windowRange.type == lalpulsar.TRANSIENT_RECTANGULAR ):
        FstatMap.F_mn = pycuda_compute_transient_fstat_map_rect ( atomsInputMatrix,
                                                                  windowRange,
                                                                  tCWparams )
    elif ( windowRange.type == lalpulsar.TRANSIENT_EXPONENTIAL ):
        FstatMap.F_mn = pycuda_compute_transient_fstat_map_exp ( atomsInputMatrix,
                                                                 windowRange,
                                                                 tCWparams )
    else:
        raise ValueError('Invalid transient window type {} not in [{}, {}].'.format(windowRange.type, lalpulsar.TRANSIENT_NONE, lalpulsar.TRANSIENT_LAST-1))

    # out of loop: get maxF and ML estimates over the m x n matrix
    FstatMap.maxF = FstatMap.F_mn.max()
    maxidx = np.unravel_index ( FstatMap.F_mn.argmax(),
                               (tCWparams['N_t0Range'],
                                tCWparams['N_tauRange']))
    FstatMap.t0_ML  = windowRange.t0  + maxidx[0] * windowRange.dt0
    FstatMap.tau_ML = windowRange.tau + maxidx[1] * windowRange.dtau
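    # (hypothetical numbers: an argmax at maxidx=(3,7) with dt0=dtau=1800
    # yields t0_ML = windowRange.t0 + 5400 and tau_ML = windowRange.tau + 12600)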

    logging.debug('Done computing transient F-stat map. maxF={}, t0_ML={}, tau_ML={}'.format(FstatMap.maxF , FstatMap.t0_ML, FstatMap.tau_ML))

    return FstatMap


def pycuda_compute_transient_fstat_map_rect ( atomsInputMatrix, windowRange, tCWparams ):
    '''Rectangular window: GPU-parallelizing only the outer loop, keeping partial sums in memory within the kernel.'''

    # gpu data setup and transfer
    logging.debug('Initial GPU memory: {}/{} MB free'.format(drv.mem_get_info()[0]/(2.**20),drv.mem_get_info()[1]/(2.**20)))
    input_gpu = gpuarray.to_gpu ( atomsInputMatrix )
    Fmn_gpu   = gpuarray.GPUArray ( (tCWparams['N_t0Range'],tCWparams['N_tauRange']), dtype=np.float32 )
    logging.debug('GPU memory with input+output allocated: {}/{} MB free'.format(drv.mem_get_info()[0]/(2.**20), drv.mem_get_info()[1]/(2.**20)))

    # GPU kernel
    kernel = 'cudaTransientFstatRectWindow'
    kernelfile = get_absolute_kernel_path ( kernel )
    with open(kernelfile, 'r') as f:
        partial_Fstat_cuda_code = cudacomp.SourceModule(f.read())
    partial_Fstat_cuda = partial_Fstat_cuda_code.get_function(kernel)
    partial_Fstat_cuda.prepare('PIIIIIIIIP')

    # GPU grid setup
    blockRows = min(1024,tCWparams['N_t0Range'])
    blockCols = 1
    gridRows  = int(np.ceil(1.0*tCWparams['N_t0Range']/blockRows))
    gridCols  = 1
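    # (e.g., hypothetically, N_t0Range=5000 -> blockRows=1024, gridRows=5:
    # one thread per t0 bin, with the last block partially idle)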
    logging.debug('Looking to compute transient F-stat map over a N={}*{}={} (t0,tau) grid...'.format(tCWparams['N_t0Range'],tCWparams['N_tauRange'],tCWparams['N_t0Range']*tCWparams['N_tauRange']))

    # running the kernel
    logging.debug('Calling kernel with a grid of {}*{}={} blocks of {}*{}={} threads each: {} total threads...'.format(gridRows, gridCols, gridRows*gridCols, blockRows, blockCols, blockRows*blockCols, gridRows*gridCols*blockRows*blockCols))
    partial_Fstat_cuda.prepared_call ( (gridRows, gridCols),
                                       (blockRows, blockCols, 1),
                                       input_gpu.gpudata,
                                       tCWparams['numAtoms'],
                                       tCWparams['TAtom'],
                                       tCWparams['t0_data'],
                                       windowRange.t0, windowRange.dt0,
                                       windowRange.tau, windowRange.dtau,
                                       tCWparams['N_tauRange'],
                                       Fmn_gpu.gpudata )

    # return results to host
    F_mn = Fmn_gpu.get()

    logging.debug('Final GPU memory: {}/{} MB free'.format(drv.mem_get_info()[0]/(2.**20),drv.mem_get_info()[1]/(2.**20)))

    return F_mn


def pycuda_compute_transient_fstat_map_exp ( atomsInputMatrix, windowRange, tCWparams ):
    '''Exponential window: both inner and outer loop GPU-parallelized.'''

    # gpu data setup and transfer
    logging.debug('Initial GPU memory: {}/{} MB free'.format(drv.mem_get_info()[0]/(2.**20),drv.mem_get_info()[1]/(2.**20)))
    input_gpu = gpuarray.to_gpu ( atomsInputMatrix )
    Fmn_gpu   = gpuarray.GPUArray ( (tCWparams['N_t0Range'],tCWparams['N_tauRange']), dtype=np.float32 )
    logging.debug('GPU memory with input+output allocated: {}/{} MB free'.format(drv.mem_get_info()[0]/(2.**20), drv.mem_get_info()[1]/(2.**20)))

    # GPU kernel
    kernel = 'cudaTransientFstatExpWindow'
    kernelfile = get_absolute_kernel_path ( kernel )
    with open(kernelfile, 'r') as f:
        partial_Fstat_cuda_code = cudacomp.SourceModule(f.read())
    partial_Fstat_cuda = partial_Fstat_cuda_code.get_function(kernel)
    partial_Fstat_cuda.prepare('PIIIIIIIIIP')

    # GPU grid setup
    blockRows = min(32,tCWparams['N_t0Range'])
    blockCols = min(32,tCWparams['N_tauRange'])
    gridRows  = int(np.ceil(1.0*tCWparams['N_t0Range']/blockRows))
    gridCols  = int(np.ceil(1.0*tCWparams['N_tauRange']/blockCols))
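    # (e.g., hypothetically, N_t0Range=100, N_tauRange=50 -> 32x32-thread
    # blocks on a 4x2 grid, one thread per (t0,tau) bin)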
    logging.debug('Looking to compute transient F-stat map over a N={}*{}={} (t0,tau) grid...'.format(tCWparams['N_t0Range'], tCWparams['N_tauRange'], tCWparams['N_t0Range']*tCWparams['N_tauRange']))

    # running the kernel
    logging.debug('Calling kernel with a grid of {}*{}={} blocks of {}*{}={} threads each: {} total threads...'.format(gridRows, gridCols, gridRows*gridCols, blockRows, blockCols, blockRows*blockCols, gridRows*gridCols*blockRows*blockCols))
    partial_Fstat_cuda.prepared_call ( (gridRows, gridCols),
                                       (blockRows, blockCols, 1),
                                       input_gpu.gpudata,
                                       tCWparams['numAtoms'],
                                       tCWparams['TAtom'],
                                       tCWparams['t0_data'],
                                       windowRange.t0, windowRange.dt0,
                                       windowRange.tau, windowRange.dtau,
                                       tCWparams['N_t0Range'],
                                       tCWparams['N_tauRange'],
                                       Fmn_gpu.gpudata )

    # return results to host
    F_mn = Fmn_gpu.get()

    logging.debug('Final GPU memory: {}/{} MB free'.format(drv.mem_get_info()[0]/(2.**20),drv.mem_get_info()[1]/(2.**20)))

    return F_mn