tcw_fstat_map_funcs.py

    """ Additional helper functions dealing with transient-CW F(t0,tau) maps """
    
    import numpy as np
    import os
    import logging
    from time import time
    
    # optional imports
    import importlib as imp
    
    
    def _optional_import(modulename, shorthand=None):
        """
        Import a module/submodule only if it's available.
    
        Uses importlib instead of __import__
        because the latter doesn't handle submodules.

        Returns True if the import succeeded, and False if an ImportError
        occurred, in which case the module is simply treated as unavailable.
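
        Example (illustrative sketch, mirroring the calls made below in
        init_transient_fstat_map_features):

            have_lalpulsar = _optional_import("lalpulsar")
            # with a shorthand, the module is bound to that name in globals():
            have_pycuda_drv = _optional_import("pycuda.driver", "drv")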
        """
    
        if shorthand is None:
            shorthand = modulename
            shorthandbit = ""
        else:
            shorthandbit = " as " + shorthand
    
        try:
            globals()[shorthand] = imp.import_module(modulename)
            logging.debug("Successfully imported module %s%s." % (modulename, shorthandbit))
            success = True
        except ImportError as e:
            logging.debug("Failed to import module {:s}.".format(modulename))
            success = False
    
        return success
    
    
    class pyTransientFstatMap(object):
        """
        Simplified object class for an F(t0,tau) F-stat map (not 2F!),
        based on LALSuite's transientFstatMap_t type,
        with the gsl matrix replaced by a numpy array.

        F_mn:   2D array of F values (not 2F!)
        maxF:   maximum of F (not 2F!)
        t0_ML:  maximum-likelihood estimate of the transient start time t0
        tau_ML: maximum-likelihood estimate of the transient duration tau
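
        Example (illustrative sketch):

            FstatMap = pyTransientFstatMap(N_t0Range=11, N_tauRange=5)
            FstatMap.F_mn[2, 3] = 10.0  # fill in F values
            FstatMap.maxF = FstatMap.F_mn.max()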
        """
    
        def __init__(self, N_t0Range, N_tauRange):
            self.F_mn = np.zeros((N_t0Range, N_tauRange), dtype=np.float32)
            # Initializing maxF to a negative value ensures
            # that we always update at least once and hence return
            # sane t0_d_ML, tau_d_ML
            # even if there is only a single bin where F=0 happens.
            self.maxF = float(-1.0)
            self.t0_ML = float(0.0)
            self.tau_ML = float(0.0)
    
    
    # dictionary of the actual callable F-stat map functions we support,
    # if the corresponding modules are available.
    fstatmap_versions = {
        "lal": lambda multiFstatAtoms, windowRange: getattr(
            lalpulsar, "ComputeTransientFstatMap"
        )(multiFstatAtoms, windowRange, False),
        "pycuda": lambda multiFstatAtoms, windowRange: pycuda_compute_transient_fstat_map(
            multiFstatAtoms, windowRange
        ),
    }
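
    # Each entry is called as fstatmap_versions[version](multiFstatAtoms, windowRange);
    # call_compute_transient_fstat_map() below selects an entry and first checks
    # that the required modules are actually available.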
    
    
    def init_transient_fstat_map_features(wantCuda=False, cudaDeviceName=None):
        """
        Initialization of available modules (or "features") for F-stat maps.
    
        Returns a tuple (features, gpu_context):
        features is a dictionary with the same keys as fstatmap_versions,
        each value set to True only if all modules required by that method
        are importable on this system;
        gpu_context is a pyCUDA context if wantCuda is set and pycuda is
        available, and None otherwise.
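
        Example (illustrative sketch; the device name "Tesla" is hypothetical):

            features, gpu_context = init_transient_fstat_map_features(
                wantCuda=True, cudaDeviceName="Tesla"
            )
            if not features["pycuda"]:
                print("pycuda not available; only the 'lal' method can be used.")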
        """
    
        features = {}
    
        have_lal = _optional_import("lal")
        have_lalpulsar = _optional_import("lalpulsar")
        features["lal"] = have_lal and have_lalpulsar
    
        # import GPU features
        have_pycuda = _optional_import("pycuda")
        have_pycuda_drv = _optional_import("pycuda.driver", "drv")
        have_pycuda_gpuarray = _optional_import("pycuda.gpuarray", "gpuarray")
        have_pycuda_tools = _optional_import("pycuda.tools", "cudatools")
        have_pycuda_compiler = _optional_import("pycuda.compiler", "cudacomp")
        features["pycuda"] = (
            have_pycuda_drv
            and have_pycuda_gpuarray
            and have_pycuda_tools
            and have_pycuda_compiler
        )
    
        logging.debug("Got the following features for transient F-stat maps:")
        logging.debug(features)
    
        if wantCuda and features["pycuda"]:
            logging.debug("CUDA version: " + ".".join(map(str, drv.get_version())))
    
            drv.init()
            logging.debug(
                "Starting with default pyCUDA context,"
                " then checking all available devices..."
            )
            try:
                context0 = pycuda.tools.make_default_context()
            except pycuda._driver.LogicError as e:
                if "cuDeviceGet failed: invalid device ordinal" in str(e):
                    devn = int(os.environ["CUDA_DEVICE"])
                    raise RuntimeError(
                        "Requested CUDA device number {} exceeds"
                        " number of available devices!"
                        " Please change through environment"
                        " variable $CUDA_DEVICE.".format(devn)
                    )
                else:
                    raise
    
            num_gpus = drv.Device.count()
            logging.debug("Found {} CUDA device(s).".format(num_gpus))
    
            devices = []
            devnames = np.empty(num_gpus, dtype="U32")
            for n in range(num_gpus):
                devn = drv.Device(n)
                devices.append(devn)
                devnames[n] = devn.name().replace(" ", "-").replace("_", "-")
                logging.debug(
                    "device {}: model: {}, RAM: {}MB".format(
                        n, devnames[n], devn.total_memory() / (2.0 ** 20)
                    )
                )
    
            if "CUDA_DEVICE" in os.environ:
                devnum0 = int(os.environ["CUDA_DEVICE"])
            else:
                devnum0 = 0
    
            matchbit = ""
            if cudaDeviceName:
                # allow partial matches in device names
                devmatches = [
                    devidx
                    for devidx, devname in enumerate(devnames)
                    if cudaDeviceName in devname
                ]
                if len(devmatches) == 0:
                    context0.detach()
                    raise RuntimeError(
                        'Requested CUDA device "{}" not found.'
                        " Available devices: [{}]".format(
                            cudaDeviceName, ",".join(devnames)
                        )
                    )
                else:
                    devnum = devmatches[0]
                    if len(devmatches) > 1:
                        logging.warning(
                            'Found {} CUDA devices matching name "{}".'
                            " Choosing first one with index {}.".format(
                                len(devmatches), cudaDeviceName, devnum
                            )
                        )
                os.environ["CUDA_DEVICE"] = str(devnum)
                matchbit = '(matched to user request "{}")'.format(cudaDeviceName)
            elif "CUDA_DEVICE" in os.environ:
                devnum = int(os.environ["CUDA_DEVICE"])
            else:
                devnum = 0
            devn = devices[devnum]
            logging.info(
                "Choosing CUDA device {},"
                " of {} devices present: {}{}...".format(
                    devnum, num_gpus, devn.name(), matchbit
                )
            )
            if devnum == devnum0:
                gpu_context = context0
            else:
                context0.pop()
                gpu_context = pycuda.tools.make_default_context()
                gpu_context.push()
    
            _print_GPU_memory_MB("Available")
        else:
            gpu_context = None
    
        return features, gpu_context
    
    
    def call_compute_transient_fstat_map(
        version, features, multiFstatAtoms=None, windowRange=None
    ):
        """Choose which version of the ComputeTransientFstatMap function to call."""
    
        if version in fstatmap_versions:
            if features[version]:
                time0 = time()
                FstatMap = fstatmap_versions[version](multiFstatAtoms, windowRange)
                timingFstatMap = time() - time0
            else:
                raise Exception(
                    "Required module(s) for transient F-stat map"
                    ' method "{}" not available!'.format(version)
                )
        else:
            raise Exception(
                'Transient F-stat map method "{}" not implemented!'.format(version)
            )
        return FstatMap, timingFstatMap
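
    # Example usage (illustrative sketch; `features` comes from
    # init_transient_fstat_map_features() and the lalpulsar inputs
    # `multiFstatAtoms`, `windowRange` are assumed to be set up by the caller):
    #
    #     FstatMap, timingFstatMap = call_compute_transient_fstat_map(
    #         "lal", features, multiFstatAtoms, windowRange
    #     )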
    
    
    def reshape_FstatAtomsVector(atomsVector):
        """
        Make a dictionary of ndarrays out of an atoms "vector" structure.
    
        The input is a "vector"-like structure with times as the higher hierarchical
        level and a set of "atoms" quantities defined at each timestamp.
        The output is a dictionary with an entry for each quantity,
        which is a 1D ndarray over timestamps for that one quantity.
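
        Example (illustrative sketch; `atoms` is assumed to be a lalpulsar
        FstatAtomVector, e.g. as returned by mergeMultiFstatAtomsBinned):

            atomsDict = reshape_FstatAtomsVector(atoms)
            timestamps = atomsDict["timestamp"]   # 1D uint32 array
            Fa_re = atomsDict["Fa_alpha_re"]      # 1D float32 array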
        """
    
        numAtoms = atomsVector.length
        atomsDict = {}
        atom_fieldnames = [
            "timestamp",
            "Fa_alpha",
            "Fb_alpha",
            "a2_alpha",
            "ab_alpha",
            "b2_alpha",
        ]
        atom_dtypes = [np.uint32, complex, complex, np.float32, np.float32, np.float32]
        for f, field in enumerate(atom_fieldnames):
            atomsDict[field] = np.empty(numAtoms, dtype=atom_dtypes[f])

        for n, atom in enumerate(atomsVector.data):
            for field in atom_fieldnames:
                atomsDict[field][n] = getattr(atom, field)
    
        atomsDict["Fa_alpha_re"] = np.float32(atomsDict["Fa_alpha"].real)
        atomsDict["Fa_alpha_im"] = np.float32(atomsDict["Fa_alpha"].imag)
        atomsDict["Fb_alpha_re"] = np.float32(atomsDict["Fb_alpha"].real)
        atomsDict["Fb_alpha_im"] = np.float32(atomsDict["Fb_alpha"].imag)
    
        return atomsDict
    
    
    def _get_absolute_kernel_path(kernel):
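        """Return the absolute path to the CUDA kernel source file <kernel>.cu,
        shipped in the pyCUDAkernels/ directory next to this module."""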
        pyfstatdir = os.path.dirname(os.path.abspath(os.path.realpath(__file__)))
        kernelfile = kernel + ".cu"
        return os.path.join(pyfstatdir, "pyCUDAkernels", kernelfile)
    
    
    def _print_GPU_memory_MB(key):
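        """Log the currently free and total GPU memory in MB, prefixed by the given key."""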
        mem_free_MB = drv.mem_get_info()[0] / (2.0 ** 20)
        mem_total_MB = drv.mem_get_info()[1] / (2.0 ** 20)
        logging.debug(
            "{} GPU memory: {:.4f} / {:.4f} MB free".format(key, mem_free_MB, mem_total_MB)
        )
    
    
    def pycuda_compute_transient_fstat_map(multiFstatAtoms, windowRange):
        """
        GPU version of the function to compute transient-window "F-statistic map"
        over start-time and timescale {t0, tau}.
        Based on XLALComputeTransientFstatMap from LALSuite,
        (C) 2009 Reinhard Prix, licensed under GPL
    
        Returns a pyTransientFstatMap object whose F_mn member is a 2D matrix,
        with m = index over start-times t0,
        and  n = index over timescales tau,
        in steps of dt0  in [t0,  t0+t0Band]
        and  dtau in [tau, tau+tauBand],
        as defined by the windowRange input.
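
        Example (illustrative sketch; assumes pyCUDA has been initialized via
        init_transient_fstat_map_features and that the lalpulsar inputs
        multiFstatAtoms and windowRange have been prepared by the caller):

            FstatMap = pycuda_compute_transient_fstat_map(multiFstatAtoms, windowRange)
            print(FstatMap.maxF, FstatMap.t0_ML, FstatMap.tau_ML)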
        """
    
        if windowRange.type >= lalpulsar.TRANSIENT_LAST:
            raise ValueError(
                "Unknown window-type ({}) passed as input."
                " Allowed are [0,{}].".format(
                    windowRange.type, lalpulsar.TRANSIENT_LAST - 1
                )
            )
    
        # internal dict for search/setup parameters
        tCWparams = {}
    
        # first combine all multi-atoms
        # into a single atoms-vector with *unique* timestamps
        tCWparams["TAtom"] = multiFstatAtoms.data[0].TAtom
        TAtomHalf = int(tCWparams["TAtom"] / 2)  # integer division
        atoms = lalpulsar.mergeMultiFstatAtomsBinned(multiFstatAtoms, tCWparams["TAtom"])
    
        # make a combined input matrix of all atoms vectors, for transfer to GPU
        tCWparams["numAtoms"] = atoms.length
        atomsDict = reshape_FstatAtomsVector(atoms)
        atomsInputMatrix = np.column_stack(
            (
                atomsDict["a2_alpha"],
                atomsDict["b2_alpha"],
                atomsDict["ab_alpha"],
                atomsDict["Fa_alpha_re"],
                atomsDict["Fa_alpha_im"],
                atomsDict["Fb_alpha_re"],
                atomsDict["Fb_alpha_im"],
            )
        )
    
        # actual data spans [t0_data, t0_data + tCWparams['numAtoms'] * TAtom]
        # in steps of TAtom
        tCWparams["t0_data"] = int(atoms.data[0].timestamp)
        tCWparams["t1_data"] = int(
            atoms.data[tCWparams["numAtoms"] - 1].timestamp + tCWparams["TAtom"]
        )
    
        logging.debug(
            "Transient F-stat map:"
            " t0_data={:d}, t1_data={:d}".format(tCWparams["t0_data"], tCWparams["t1_data"])
        )
        logging.debug(
            "Transient F-stat map:"
            " numAtoms={:d}, TAtom={:d},"
            " TAtomHalf={:d}".format(tCWparams["numAtoms"], tCWparams["TAtom"], TAtomHalf)
        )
    
        # special treatment of window_type = none
        # ==> replace by rectangular window spanning all the data
        if windowRange.type == lalpulsar.TRANSIENT_NONE:
            windowRange.type = lalpulsar.TRANSIENT_RECTANGULAR
            windowRange.t0 = tCWparams["t0_data"]
            windowRange.t0Band = 0
            windowRange.dt0 = tCWparams["TAtom"]  # irrelevant
            windowRange.tau = tCWparams["numAtoms"] * tCWparams["TAtom"]
            windowRange.tauBand = 0
            windowRange.dtau = tCWparams["TAtom"]  # irrelevant
    
        """ NOTE: indices {i,j} enumerate *actual* atoms and their timestamps t_i,
        * while the indices {m,n} enumerate the full grid of values
        * in [t0_min, t0_max]x[Tcoh_min, Tcoh_max] in steps of deltaT.
        * This allows us to deal with gaps in the data in a transparent way.
        *
        * NOTE2: we operate on the 'binned' atoms returned
        * from XLALmergeMultiFstatAtomsBinned(),
        * which means we can safely assume all atoms to be lined up
        * perfectly on a 'deltaT' binned grid.
        *
        * The mapping used will therefore be {i,j} -> {m,n}:
        *   m = offs_i  / deltaT
        *   start-time offset from t0_min measured in deltaT
        *   n = Tcoh_ij / deltaT
        *   duration Tcoh_ij measured in deltaT,
        *
        * where
        *   offs_i  = t_i - t0_min
        *   Tcoh_ij = t_j - t_i + deltaT
        *
        """
    
        # We allocate a matrix  {m x n} = t0Range * TcohRange elements
        # covering the full transient window-range [t0,t0+t0Band]x[tau,tau+tauBand]
        tCWparams["N_t0Range"] = int(
            np.floor(1.0 * windowRange.t0Band / windowRange.dt0) + 1
        )
        tCWparams["N_tauRange"] = int(
            np.floor(1.0 * windowRange.tauBand / windowRange.dtau) + 1
        )
        FstatMap = pyTransientFstatMap(tCWparams["N_t0Range"], tCWparams["N_tauRange"])
    
        logging.debug(
            "Transient F-stat map:"
            " N_t0Range={:d}, N_tauRange={:d},"
            " total grid points: {:d}".format(
                tCWparams["N_t0Range"],
                tCWparams["N_tauRange"],
                tCWparams["N_t0Range"] * tCWparams["N_tauRange"],
            )
        )
    
        if windowRange.type == lalpulsar.TRANSIENT_RECTANGULAR:
            FstatMap.F_mn = pycuda_compute_transient_fstat_map_rect(
                atomsInputMatrix, windowRange, tCWparams
            )
        elif windowRange.type == lalpulsar.TRANSIENT_EXPONENTIAL:
            FstatMap.F_mn = pycuda_compute_transient_fstat_map_exp(
                atomsInputMatrix, windowRange, tCWparams
            )
        else:
            raise ValueError(
                "Invalid transient window type {}"
                " not in [{}, {}].".format(
                    windowRange.type, lalpulsar.TRANSIENT_NONE, lalpulsar.TRANSIENT_LAST - 1
                )
            )
    
        # out of loop: get maxF and ML estimates over the m x n matrix
        FstatMap.maxF = FstatMap.F_mn.max()
        maxidx = np.unravel_index(
            FstatMap.F_mn.argmax(), (tCWparams["N_t0Range"], tCWparams["N_tauRange"])
        )
        FstatMap.t0_ML = windowRange.t0 + maxidx[0] * windowRange.dt0
        FstatMap.tau_ML = windowRange.tau + maxidx[1] * windowRange.dtau
    
        logging.debug(
            "Done computing transient F-stat map."
            " maxF={:.4f}, t0_ML={}, tau_ML={}".format(
                FstatMap.maxF, FstatMap.t0_ML, FstatMap.tau_ML
            )
        )
    
        return FstatMap
    
    
    def pycuda_compute_transient_fstat_map_rect(atomsInputMatrix, windowRange, tCWparams):
        """
        Rectangular-window version: only the outer loop (over start times t0)
        is GPU-parallelized, keeping the partial sums over atoms in kernel memory.
        """
    
        # gpu data setup and transfer
        _print_GPU_memory_MB("Initial")
        input_gpu = gpuarray.to_gpu(atomsInputMatrix)
        Fmn_gpu = gpuarray.GPUArray(
            (tCWparams["N_t0Range"], tCWparams["N_tauRange"]), dtype=np.float32
        )
        _print_GPU_memory_MB("After input+output allocation:")
    
        # GPU kernel
        kernel = "cudaTransientFstatRectWindow"
        kernelfile = _get_absolute_kernel_path(kernel)
        with open(kernelfile, "r") as f:
            partial_Fstat_cuda_code = cudacomp.SourceModule(f.read())
        partial_Fstat_cuda = partial_Fstat_cuda_code.get_function(kernel)
        partial_Fstat_cuda.prepare("PIIIIIIIIP")
    
        # GPU grid setup
        blockRows = min(1024, tCWparams["N_t0Range"])
        blockCols = 1
        gridRows = int(np.ceil(1.0 * tCWparams["N_t0Range"] / blockRows))
        gridCols = 1
    
        # running the kernel
        logging.debug(
            "Calling pyCUDA kernel with a grid of {}*{}={} blocks"
            " of {}*{}={} threads each: {} total threads...".format(
                gridRows,
                gridCols,
                gridRows * gridCols,
                blockRows,
                blockCols,
                blockRows * blockCols,
                gridRows * gridCols * blockRows * blockCols,
            )
        )
        partial_Fstat_cuda.prepared_call(
            (gridRows, gridCols),
            (blockRows, blockCols, 1),
            input_gpu.gpudata,
            tCWparams["numAtoms"],
            tCWparams["TAtom"],
            tCWparams["t0_data"],
            windowRange.t0,
            windowRange.dt0,
            windowRange.tau,
            windowRange.dtau,
            tCWparams["N_tauRange"],
            Fmn_gpu.gpudata,
        )
    
        # return results to host
        F_mn = Fmn_gpu.get()
    
        _print_GPU_memory_MB("Final")
    
        return F_mn
    
    
    def pycuda_compute_transient_fstat_map_exp(atomsInputMatrix, windowRange, tCWparams):
        """exponential window, inner and outer loop GPU-parallelized"""
    
        # gpu data setup and transfer
        _print_GPU_memory_MB("Initial")
        input_gpu = gpuarray.to_gpu(atomsInputMatrix)
        Fmn_gpu = gpuarray.GPUArray(
            (tCWparams["N_t0Range"], tCWparams["N_tauRange"]), dtype=np.float32
        )
        _print_GPU_memory_MB("After input+output allocation:")
    
        # GPU kernel
        kernel = "cudaTransientFstatExpWindow"
        kernelfile = _get_absolute_kernel_path(kernel)
        with open(kernelfile, "r") as f:
            partial_Fstat_cuda_code = cudacomp.SourceModule(f.read())
        partial_Fstat_cuda = partial_Fstat_cuda_code.get_function(kernel)
        partial_Fstat_cuda.prepare("PIIIIIIIIIP")
    
        # GPU grid setup
        blockRows = min(32, tCWparams["N_t0Range"])
        blockCols = min(32, tCWparams["N_tauRange"])
        gridRows = int(np.ceil(1.0 * tCWparams["N_t0Range"] / blockRows))
        gridCols = int(np.ceil(1.0 * tCWparams["N_tauRange"] / blockCols))
    
        # running the kernel
        logging.debug(
            "Calling kernel with a grid of {}*{}={} blocks"
            " of {}*{}={} threads each: {} total threads...".format(
                gridRows,
                gridCols,
                gridRows * gridCols,
                blockRows,
                blockCols,
                blockRows * blockCols,
                gridRows * gridCols * blockRows * blockCols,
            )
        )
        partial_Fstat_cuda.prepared_call(
            (gridRows, gridCols),
            (blockRows, blockCols, 1),
            input_gpu.gpudata,
            tCWparams["numAtoms"],
            tCWparams["TAtom"],
            tCWparams["t0_data"],
            windowRange.t0,
            windowRange.dt0,
            windowRange.tau,
            windowRange.dtau,
            tCWparams["N_t0Range"],
            tCWparams["N_tauRange"],
            Fmn_gpu.gpudata,
        )
    
        # return results to host
        F_mn = Fmn_gpu.get()
    
        _print_GPU_memory_MB("Final")
    
        return F_mn