""" Additional helper functions dealing with transient-CW F(t0,tau) maps """

import numpy as np
import os
import logging
from time import time

# optional imports
import importlib as imp


def _optional_import(modulename, shorthand=None):
    """
    Import a module/submodule only if it is available.

    We use importlib instead of __import__
    because the latter does not handle submodules properly.

    Also includes a special check to fail more gracefully
    when CUDA_DEVICE is set to too high a number.
    """

    if shorthand is None:
        shorthand = modulename
        shorthandbit = ""
    else:
        shorthandbit = " as " + shorthand

    try:
        globals()[shorthand] = imp.import_module(modulename)
        logging.debug("Successfully imported module %s%s." % (modulename, shorthandbit))
        success = True
    except ImportError:
        logging.debug("Failed to import module {:s}.".format(modulename))
        success = False

    return success
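
# Illustrative usage of _optional_import (a sketch only, not exercised by this
# module; "scipy.linalg" is just an example dependency, not something we need):
#
#     if _optional_import("scipy.linalg", "slinalg"):
#         # the submodule is now bound in this module's globals() as "slinalg"
#         expm0 = slinalg.expm(np.zeros((2, 2)))
#     else:
#         logging.warning("scipy not available, skipping matrix exponential.")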


class pyTransientFstatMap(object):
    """
    Simplified object class for an F(t0,tau) F-stat map (not 2F!),
    based on LALSuite's transientFstatMap_t type,
    replacing the gsl matrix with a numpy array.

    F_mn:   2D array of F values (not 2F!)
    maxF:   maximum of F (not 2F!)
    t0_ML:  maximum likelihood transient start time t0 estimate
    tau_ML: maximum likelihood transient duration tau estimate
    """

    def __init__(self, N_t0Range, N_tauRange):
        self.F_mn = np.zeros((N_t0Range, N_tauRange), dtype=np.float32)
        # Initializing maxF to a negative value ensures
        # that we always update at least once and hence return
        # sane t0_ML, tau_ML
        # even if there is only a single bin where F=0 happens.
        self.maxF = float(-1.0)
        self.t0_ML = float(0.0)
        self.tau_ML = float(0.0)
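
# Illustrative construction of the map object above (a sketch only; in practice
# it is created and filled by the F-stat map functions further down):
#
#     FstatMap = pyTransientFstatMap(N_t0Range=3, N_tauRange=2)
#     FstatMap.F_mn[1, 0] = 10.0
#     FstatMap.maxF = FstatMap.F_mn.max()
#     # t0_ML and tau_ML would then be set from the argmax indices of F_mn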


# dictionary of the actual callable F-stat map functions we support,
# if the corresponding modules are available.
fstatmap_versions = {
    "lal": lambda multiFstatAtoms, windowRange: getattr(
        lalpulsar, "ComputeTransientFstatMap"
    )(multiFstatAtoms, windowRange, False),
    "pycuda": lambda multiFstatAtoms, windowRange: pycuda_compute_transient_fstat_map(
        multiFstatAtoms, windowRange
    ),
}


def init_transient_fstat_map_features(wantCuda=False, cudaDeviceName=None):
    """
    Initialization of available modules (or "features") for F-stat maps.

    Returns a dictionary of method names (matching fstatmap_versions),
    with each key's value set to True only if
    all required modules are importable on this system.
    """

    features = {}

    have_lal = _optional_import("lal")
    have_lalpulsar = _optional_import("lalpulsar")
    features["lal"] = have_lal and have_lalpulsar

    # import GPU features
    have_pycuda = _optional_import("pycuda")
    have_pycuda_drv = _optional_import("pycuda.driver", "drv")
    have_pycuda_gpuarray = _optional_import("pycuda.gpuarray", "gpuarray")
    have_pycuda_tools = _optional_import("pycuda.tools", "cudatools")
    have_pycuda_compiler = _optional_import("pycuda.compiler", "cudacomp")
    features["pycuda"] = (
        have_pycuda_drv
        and have_pycuda_gpuarray
        and have_pycuda_tools
        and have_pycuda_compiler
    )

    logging.debug("Got the following features for transient F-stat maps:")
    logging.debug(features)

    if wantCuda and features["pycuda"]:
        logging.debug("CUDA version: " + ".".join(map(str, drv.get_version())))

        drv.init()
        logging.debug(
            "Starting with default pyCUDA context,"
            " then checking all available devices..."
        )
        try:
            context0 = pycuda.tools.make_default_context()
        except pycuda._driver.LogicError as e:
            if "invalid device ordinal" in str(e):
                devn = int(os.environ["CUDA_DEVICE"])
                raise RuntimeError(
                    "Requested CUDA device number {} exceeds"
                    " number of available devices!"
                    " Please change through environment"
                    " variable $CUDA_DEVICE.".format(devn)
                )
            else:
                raise

        num_gpus = drv.Device.count()
        logging.debug("Found {} CUDA device(s).".format(num_gpus))

        devices = []
        devnames = np.empty(num_gpus, dtype="U32")
        for n in range(num_gpus):
            devn = drv.Device(n)
            devices.append(devn)
            devnames[n] = devn.name().replace(" ", "-").replace("_", "-")
            logging.debug(
                "device {}: model: {}, RAM: {}MB".format(
                    n, devnames[n], devn.total_memory() / (2.0 ** 20)
                )
            )

        if "CUDA_DEVICE" in os.environ:
            devnum0 = int(os.environ["CUDA_DEVICE"])
        else:
            devnum0 = 0

        matchbit = ""
        if cudaDeviceName:
            # allow partial matches in device names
            devmatches = [
                devidx
                for devidx, devname in enumerate(devnames)
                if cudaDeviceName in devname
            ]
            if len(devmatches) == 0:
                context0.detach()
                raise RuntimeError(
                    'Requested CUDA device "{}" not found.'
                    " Available devices: [{}]".format(
                        cudaDeviceName, ",".join(devnames)
                    )
                )
            else:
                devnum = devmatches[0]
                if len(devmatches) > 1:
                    logging.warning(
                        'Found {} CUDA devices matching name "{}".'
                        " Choosing first one with index {}.".format(
                            len(devmatches), cudaDeviceName, devnum
                        )
                    )
            os.environ["CUDA_DEVICE"] = str(devnum)
            matchbit = '(matched to user request "{}")'.format(cudaDeviceName)
        elif "CUDA_DEVICE" in os.environ:
            devnum = int(os.environ["CUDA_DEVICE"])
        else:
            devnum = 0
        devn = devices[devnum]
        logging.info(
            "Choosing CUDA device {},"
            " of {} devices present: {}{}...".format(
                devnum, num_gpus, devn.name(), matchbit
            )
        )
        if devnum == devnum0:
            gpu_context = context0
        else:
            context0.pop()
            gpu_context = pycuda.tools.make_default_context()
            gpu_context.push()

        _print_GPU_memory_MB("Available")
    else:
        gpu_context = None

    return features, gpu_context
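
# Illustrative call pattern for the feature detection above (a sketch;
# the device name "Tesla" is just an example):
#
#     features, gpu_context = init_transient_fstat_map_features(
#         wantCuda=True, cudaDeviceName="Tesla"
#     )
#     version = "pycuda" if features["pycuda"] else "lal"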


def call_compute_transient_fstat_map(
    version, features, multiFstatAtoms=None, windowRange=None
):
    """Choose which version of the ComputeTransientFstatMap function to call."""

    if version in fstatmap_versions:
        if features[version]:
            time0 = time()
            FstatMap = fstatmap_versions[version](multiFstatAtoms, windowRange)
            timingFstatMap = time() - time0
        else:
            raise Exception(
                "Required module(s) for transient F-stat map"
                ' method "{}" not available!'.format(version)
            )
    else:
        raise Exception(
            'Transient F-stat map method "{}" not implemented!'.format(version)
        )
    return FstatMap, timingFstatMap
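
# Illustrative use of the dispatcher above (a sketch, assuming multiFstatAtoms
# and windowRange are the corresponding lalpulsar structures obtained elsewhere):
#
#     FstatMap, timingFstatMap = call_compute_transient_fstat_map(
#         version, features, multiFstatAtoms, windowRange
#     )
#     logging.info(
#         "Max F = {:.4f} (computed in {:.3f}s)".format(FstatMap.maxF, timingFstatMap)
#     )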


def reshape_FstatAtomsVector(atomsVector):
    """
    Make a dictionary of ndarrays out of an atoms "vector" structure.

    The input is a "vector"-like structure with times as the higher hierarchical
    level and a set of "atoms" quantities defined at each timestamp.
    The output is a dictionary with an entry for each quantity,
    which is a 1D ndarray over timestamps for that one quantity.
    """

    numAtoms = atomsVector.length
    atomsDict = {}
    atom_fieldnames = [
        "timestamp",
        "Fa_alpha",
        "Fb_alpha",
        "a2_alpha",
        "ab_alpha",
        "b2_alpha",
    ]
    atom_dtypes = [np.uint32, complex, complex, np.float32, np.float32, np.float32]
    for f, field in enumerate(atom_fieldnames):
        atomsDict[field] = np.ndarray(numAtoms, dtype=atom_dtypes[f])

    for n, atom in enumerate(atomsVector.data):
        for field in atom_fieldnames:
            atomsDict[field][n] = getattr(atom, field)

    atomsDict["Fa_alpha_re"] = np.float32(atomsDict["Fa_alpha"].real)
    atomsDict["Fa_alpha_im"] = np.float32(atomsDict["Fa_alpha"].imag)
    atomsDict["Fb_alpha_re"] = np.float32(atomsDict["Fb_alpha"].real)
    atomsDict["Fb_alpha_im"] = np.float32(atomsDict["Fb_alpha"].imag)

    return atomsDict
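
# Illustrative shape of the output (a sketch, assuming "atoms" is a lalpulsar
# FstatAtomVector, e.g. as returned by mergeMultiFstatAtomsBinned):
#
#     atomsDict = reshape_FstatAtomsVector(atoms)
#     assert atomsDict["timestamp"].shape == (atoms.length,)
#     assert atomsDict["Fa_alpha_re"].dtype == np.float32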


def _get_absolute_kernel_path(kernel):
    pyfstatdir = os.path.dirname(os.path.abspath(os.path.realpath(__file__)))
    kernelfile = kernel + ".cu"
    return os.path.join(pyfstatdir, "pyCUDAkernels", kernelfile)
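
# e.g. _get_absolute_kernel_path("cudaTransientFstatRectWindow") resolves to
# "<this package's directory>/pyCUDAkernels/cudaTransientFstatRectWindow.cu"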


def _print_GPU_memory_MB(key):
    mem_free_MB = drv.mem_get_info()[0] / (2.0 ** 20)
    mem_total_MB = drv.mem_get_info()[1] / (2.0 ** 20)
    logging.debug(
        "{} GPU memory: {:.4f} / {:.4f} MB free".format(key, mem_free_MB, mem_total_MB)
    )


def pycuda_compute_transient_fstat_map(multiFstatAtoms, windowRange):
    """
    GPU version of the function to compute the transient-window "F-statistic map"
    over start-time and timescale {t0, tau}.
    Based on XLALComputeTransientFstatMap from LALSuite,
    (C) 2009 Reinhard Prix, licensed under GPL.

    Returns a 2D matrix F_mn,
    with m = index over start-times t0,
    and  n = index over timescales tau,
    in steps of dt0  in [t0,  t0+t0Band],
    and         dtau in [tau, tau+tauBand],
    as defined in the windowRange input.
    """
    if windowRange.type >= lalpulsar.TRANSIENT_LAST:
        raise ValueError(
            "Unknown window-type ({}) passed as input."
            " Allowed are [0,{}].".format(
                windowRange.type, lalpulsar.TRANSIENT_LAST - 1
            )
        )

    # internal dict for search/setup parameters
    tCWparams = {}

    # first combine all multi-atoms
    # into a single atoms-vector with *unique* timestamps
    tCWparams["TAtom"] = multiFstatAtoms.data[0].TAtom
    TAtomHalf = int(tCWparams["TAtom"] / 2)  # integer division
    atoms = lalpulsar.mergeMultiFstatAtomsBinned(multiFstatAtoms, tCWparams["TAtom"])

    # make a combined input matrix of all atoms vectors, for transfer to GPU
    tCWparams["numAtoms"] = atoms.length
    atomsDict = reshape_FstatAtomsVector(atoms)
    atomsInputMatrix = np.column_stack(
        (
            atomsDict["a2_alpha"],
            atomsDict["b2_alpha"],
            atomsDict["ab_alpha"],
            atomsDict["Fa_alpha_re"],
            atomsDict["Fa_alpha_im"],
            atomsDict["Fb_alpha_re"],
            atomsDict["Fb_alpha_im"],
        )
    )

    # actual data spans [t0_data, t0_data + tCWparams['numAtoms'] * TAtom]
    # in steps of TAtom
    tCWparams["t0_data"] = int(atoms.data[0].timestamp)
    tCWparams["t1_data"] = int(
        atoms.data[tCWparams["numAtoms"] - 1].timestamp + tCWparams["TAtom"]
    )

    logging.debug(
        "Transient F-stat map:"
        " t0_data={:d}, t1_data={:d}".format(tCWparams["t0_data"], tCWparams["t1_data"])
    )
    logging.debug(
        "Transient F-stat map:"
        " numAtoms={:d}, TAtom={:d},"
        " TAtomHalf={:d}".format(tCWparams["numAtoms"], tCWparams["TAtom"], TAtomHalf)
    )

    # special treatment of window_type = none
    # ==> replace by rectangular window spanning all the data
    if windowRange.type == lalpulsar.TRANSIENT_NONE:
        windowRange.type = lalpulsar.TRANSIENT_RECTANGULAR
        windowRange.t0 = tCWparams["t0_data"]
        windowRange.t0Band = 0
        windowRange.dt0 = tCWparams["TAtom"]  # irrelevant
        windowRange.tau = tCWparams["numAtoms"] * tCWparams["TAtom"]
        windowRange.tauBand = 0
        windowRange.dtau = tCWparams["TAtom"]  # irrelevant

    """ NOTE: indices {i,j} enumerate *actual* atoms and their timestamps t_i,
    * while the indices {m,n} enumerate the full grid of values
    * in [t0_min, t0_max]x[Tcoh_min, Tcoh_max] in steps of deltaT.
    * This allows us to deal with gaps in the data in a transparent way.
    *
    * NOTE2: we operate on the 'binned' atoms returned
    * from XLALmergeMultiFstatAtomsBinned(),
    * which means we can safely assume all atoms to be lined up
    * perfectly on a 'deltaT' binned grid.
    *
    * The mapping used will therefore be {i,j} -> {m,n}:
    *   m = offs_i  / deltaT
    *   start-time offset from t0_min measured in deltaT
    *   n = Tcoh_ij / deltaT
    *   duration Tcoh_ij measured in deltaT,
    *
    * where
    *   offs_i  = t_i - t0_min
    *   Tcoh_ij = t_j - t_i + deltaT
    *
    """

    # We allocate a matrix  {m x n} = t0Range * TcohRange elements
    # covering the full transient window-range [t0,t0+t0Band]x[tau,tau+tauBand]
    tCWparams["N_t0Range"] = int(
        np.floor(1.0 * windowRange.t0Band / windowRange.dt0) + 1
    )
    tCWparams["N_tauRange"] = int(
        np.floor(1.0 * windowRange.tauBand / windowRange.dtau) + 1
    )
    FstatMap = pyTransientFstatMap(tCWparams["N_t0Range"], tCWparams["N_tauRange"])

    logging.debug(
        "Transient F-stat map:"
        " N_t0Range={:d}, N_tauRange={:d},"
        " total grid points: {:d}".format(
            tCWparams["N_t0Range"],
            tCWparams["N_tauRange"],
            tCWparams["N_t0Range"] * tCWparams["N_tauRange"],
        )
    )

    if windowRange.type == lalpulsar.TRANSIENT_RECTANGULAR:
        FstatMap.F_mn = pycuda_compute_transient_fstat_map_rect(
            atomsInputMatrix, windowRange, tCWparams
        )
    elif windowRange.type == lalpulsar.TRANSIENT_EXPONENTIAL:
        FstatMap.F_mn = pycuda_compute_transient_fstat_map_exp(
            atomsInputMatrix, windowRange, tCWparams
        )
    else:
        raise ValueError(
            "Invalid transient window type {}"
            " not in [{}, {}].".format(
                windowRange.type, lalpulsar.TRANSIENT_NONE, lalpulsar.TRANSIENT_LAST - 1
            )
        )

    # out of loop: get maxF and ML estimates over the m x n matrix
    FstatMap.maxF = FstatMap.F_mn.max()
    maxidx = np.unravel_index(
        FstatMap.F_mn.argmax(), (tCWparams["N_t0Range"], tCWparams["N_tauRange"])
    )
    FstatMap.t0_ML = windowRange.t0 + maxidx[0] * windowRange.dt0
    FstatMap.tau_ML = windowRange.tau + maxidx[1] * windowRange.dtau

    logging.debug(
        "Done computing transient F-stat map."
        " maxF={:.4f}, t0_ML={}, tau_ML={}".format(
            FstatMap.maxF, FstatMap.t0_ML, FstatMap.tau_ML
        )
    )

    return FstatMap


def pycuda_compute_transient_fstat_map_rect(atomsInputMatrix, windowRange, tCWparams):
    """
    Only GPU-parallelizing the outer loop,
    keeping partial sums with memory in the kernel.
    """

    # gpu data setup and transfer
    _print_GPU_memory_MB("Initial")
    input_gpu = gpuarray.to_gpu(atomsInputMatrix)
    Fmn_gpu = gpuarray.GPUArray(
        (tCWparams["N_t0Range"], tCWparams["N_tauRange"]), dtype=np.float32
    )
    _print_GPU_memory_MB("After input+output allocation:")

    # GPU kernel
    kernel = "cudaTransientFstatRectWindow"
    kernelfile = _get_absolute_kernel_path(kernel)
    partial_Fstat_cuda_code = cudacomp.SourceModule(open(kernelfile, "r").read())
    partial_Fstat_cuda = partial_Fstat_cuda_code.get_function(kernel)
    partial_Fstat_cuda.prepare("PIIIIIIIIP")

    # GPU grid setup
    blockRows = min(1024, tCWparams["N_t0Range"])
    blockCols = 1
    gridRows = int(np.ceil(1.0 * tCWparams["N_t0Range"] / blockRows))
    gridCols = 1
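    # e.g. (illustrative) N_t0Range = 2500 gives blockRows = 1024 and
    # gridRows = ceil(2500/1024) = 3, i.e. 3 blocks of 1024 threads covering
    # all start-time bins (with some overhang in the last block).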

    # running the kernel
    logging.debug(
        "Calling pyCUDA kernel with a grid of {}*{}={} blocks"
        " of {}*{}={} threads each: {} total threads...".format(
            gridRows,
            gridCols,
            gridRows * gridCols,
            blockRows,
            blockCols,
            blockRows * blockCols,
            gridRows * gridCols * blockRows * blockCols,
        )
    )
    partial_Fstat_cuda.prepared_call(
        (gridRows, gridCols),
        (blockRows, blockCols, 1),
        input_gpu.gpudata,
        tCWparams["numAtoms"],
        tCWparams["TAtom"],
        tCWparams["t0_data"],
        windowRange.t0,
        windowRange.dt0,
        windowRange.tau,
        windowRange.dtau,
        tCWparams["N_tauRange"],
        Fmn_gpu.gpudata,
    )

    # return results to host
    F_mn = Fmn_gpu.get()

    _print_GPU_memory_MB("Final")

    return F_mn


def pycuda_compute_transient_fstat_map_exp(atomsInputMatrix, windowRange, tCWparams):
    """exponential window, inner and outer loop GPU-parallelized"""

    # gpu data setup and transfer
    _print_GPU_memory_MB("Initial")
    input_gpu = gpuarray.to_gpu(atomsInputMatrix)
    Fmn_gpu = gpuarray.GPUArray(
        (tCWparams["N_t0Range"], tCWparams["N_tauRange"]), dtype=np.float32
    )
    _print_GPU_memory_MB("After input+output allocation:")

    # GPU kernel
    kernel = "cudaTransientFstatExpWindow"
    kernelfile = _get_absolute_kernel_path(kernel)
    partial_Fstat_cuda_code = cudacomp.SourceModule(open(kernelfile, "r").read())
    partial_Fstat_cuda = partial_Fstat_cuda_code.get_function(kernel)
    partial_Fstat_cuda.prepare("PIIIIIIIIIP")

    # GPU grid setup
    blockRows = min(32, tCWparams["N_t0Range"])
    blockCols = min(32, tCWparams["N_tauRange"])
    gridRows = int(np.ceil(1.0 * tCWparams["N_t0Range"] / blockRows))
    gridCols = int(np.ceil(1.0 * tCWparams["N_tauRange"] / blockCols))
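    # e.g. (illustrative) N_t0Range = 100, N_tauRange = 50 gives 32x32-thread
    # blocks on a grid of ceil(100/32) x ceil(50/32) = 4 x 2 = 8 blocks.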

    # running the kernel
    logging.debug(
        "Calling kernel with a grid of {}*{}={} blocks"
        " of {}*{}={} threads each: {} total threads...".format(
            gridRows,
            gridCols,
            gridRows * gridCols,
            blockRows,
            blockCols,
            blockRows * blockCols,
            gridRows * gridCols * blockRows * blockCols,
        )
    )
    partial_Fstat_cuda.prepared_call(
        (gridRows, gridCols),
        (blockRows, blockCols, 1),
        input_gpu.gpudata,
        tCWparams["numAtoms"],
        tCWparams["TAtom"],
        tCWparams["t0_data"],
        windowRange.t0,
        windowRange.dt0,
        windowRange.tau,
        windowRange.dtau,
        tCWparams["N_t0Range"],
        tCWparams["N_tauRange"],
        Fmn_gpu.gpudata,
    )

    # return results to host
    F_mn = Fmn_gpu.get()

    _print_GPU_memory_MB("Final")

    return F_mn