""" Additional helper functions dealing with transient-CW F(t0,tau) maps """ import numpy as np import os import logging from time import time # optional imports import importlib as imp def _optional_import(modulename, shorthand=None): """ Import a module/submodule only if it's available. using importlib instead of __import__ because the latter doesn't handle sub.modules Also including a special check to fail more gracefully when CUDA_DEVICE is set to too high a number. """ if shorthand is None: shorthand = modulename shorthandbit = "" else: shorthandbit = " as " + shorthand try: globals()[shorthand] = imp.import_module(modulename) logging.debug("Successfully imported module %s%s." % (modulename, shorthandbit)) success = True except ImportError as e: logging.debug("Failed to import module {:s}.".format(modulename)) success = False return success class pyTransientFstatMap(object): """ simplified object class for a F(t0,tau) F-stat map (not 2F!) based on LALSuite's transientFstatMap_t type replacing the gsl matrix with a numpy array F_mn: 2D array of 2F values maxF: maximum of F (not 2F!) t0_ML: maximum likelihood transient start time t0 estimate tau_ML: maximum likelihood transient duration tau estimate """ def __init__(self, N_t0Range, N_tauRange): self.F_mn = np.zeros((N_t0Range, N_tauRange), dtype=np.float32) # Initializing maxF to a negative value ensures # that we always update at least once and hence return # sane t0_d_ML, tau_d_ML # even if there is only a single bin where F=0 happens. self.maxF = float(-1.0) self.t0_ML = float(0.0) self.tau_ML = float(0.0) # dictionary of the actual callable F-stat map functions we support, # if the corresponding modules are available. fstatmap_versions = { "lal": lambda multiFstatAtoms, windowRange: getattr( lalpulsar, "ComputeTransientFstatMap" )(multiFstatAtoms, windowRange, False), "pycuda": lambda multiFstatAtoms, windowRange: pycuda_compute_transient_fstat_map( multiFstatAtoms, windowRange ), } def init_transient_fstat_map_features(wantCuda=False, cudaDeviceName=None): """ Initialization of available modules (or "features") for F-stat maps. Returns a dictionary of method names, to match fstatmap_versions each key's value set to True only if all required modules are importable on this system. """ features = {} have_lal = _optional_import("lal") have_lalpulsar = _optional_import("lalpulsar") features["lal"] = have_lal and have_lalpulsar # import GPU features have_pycuda = _optional_import("pycuda") have_pycuda_drv = _optional_import("pycuda.driver", "drv") have_pycuda_gpuarray = _optional_import("pycuda.gpuarray", "gpuarray") have_pycuda_tools = _optional_import("pycuda.tools", "cudatools") have_pycuda_compiler = _optional_import("pycuda.compiler", "cudacomp") features["pycuda"] = ( have_pycuda_drv and have_pycuda_gpuarray and have_pycuda_tools and have_pycuda_compiler ) logging.debug("Got the following features for transient F-stat maps:") logging.debug(features) if wantCuda and features["pycuda"]: logging.debug("CUDA version: " + ".".join(map(str, drv.get_version()))) drv.init() logging.debug( "Starting with default pyCUDA context," " then checking all available devices..." ) try: context0 = pycuda.tools.make_default_context() except pycuda._driver.LogicError as e: if e.message == "cuDeviceGet failed: invalid device ordinal": devn = int(os.environ["CUDA_DEVICE"]) raise RuntimeError( "Requested CUDA device number {} exceeds" " number of available devices!" " Please change through environment" " variable $CUDA_DEVICE.".format(devn) ) else: raise pycuda._driver.LogicError(e.message) num_gpus = drv.Device.count() logging.debug("Found {} CUDA device(s).".format(num_gpus)) devices = [] devnames = np.empty(num_gpus, dtype="S32") for n in range(num_gpus): devn = drv.Device(n) devices.append(devn) devnames[n] = devn.name().replace(" ", "-").replace("_", "-") logging.debug( "device {}: model: {}, RAM: {}MB".format( n, devnames[n], devn.total_memory() / (2.0 ** 20) ) ) if "CUDA_DEVICE" in os.environ: devnum0 = int(os.environ["CUDA_DEVICE"]) else: devnum0 = 0 matchbit = "" if cudaDeviceName: # allow partial matches in device names devmatches = [ devidx for devidx, devname in enumerate(devnames) if cudaDeviceName in devname ] if len(devmatches) == 0: context0.detach() raise RuntimeError( 'Requested CUDA device "{}" not found.' " Available devices: [{}]".format( cudaDeviceName, ",".join(devnames) ) ) else: devnum = devmatches[0] if len(devmatches) > 1: logging.warning( 'Found {} CUDA devices matching name "{}".' " Choosing first one with index {}.".format( len(devmatches), cudaDeviceName, devnum ) ) os.environ["CUDA_DEVICE"] = str(devnum) matchbit = '(matched to user request "{}")'.format(cudaDeviceName) elif "CUDA_DEVICE" in os.environ: devnum = int(os.environ["CUDA_DEVICE"]) else: devnum = 0 devn = devices[devnum] logging.info( "Choosing CUDA device {}," " of {} devices present: {}{}...".format( devnum, num_gpus, devn.name(), matchbit ) ) if devnum == devnum0: gpu_context = context0 else: context0.pop() gpu_context = pycuda.tools.make_default_context() gpu_context.push() _print_GPU_memory_MB("Available") else: gpu_context = None return features, gpu_context def call_compute_transient_fstat_map( version, features, multiFstatAtoms=None, windowRange=None ): """Choose which version of the ComputeTransientFstatMap function to call.""" if version in fstatmap_versions: if features[version]: time0 = time() FstatMap = fstatmap_versions[version](multiFstatAtoms, windowRange) timingFstatMap = time() - time0 else: raise Exception( "Required module(s) for transient F-stat map" ' method "{}" not available!'.format(version) ) else: raise Exception( 'Transient F-stat map method "{}"' " not implemented!".format(version) ) return FstatMap, timingFstatMap def reshape_FstatAtomsVector(atomsVector): """ Make a dictionary of ndarrays out of a atoms "vector" structure. The input is a "vector"-like structure with times as the higher hierarchical level and a set of "atoms" quantities defined at each timestamp. The output is a dictionary with an entry for each quantity, which is a 1D ndarray over timestamps for that one quantity. """ numAtoms = atomsVector.length atomsDict = {} atom_fieldnames = [ "timestamp", "Fa_alpha", "Fb_alpha", "a2_alpha", "ab_alpha", "b2_alpha", ] atom_dtypes = [np.uint32, complex, complex, np.float32, np.float32, np.float32] for f, field in enumerate(atom_fieldnames): atomsDict[field] = np.ndarray(numAtoms, dtype=atom_dtypes[f]) for n, atom in enumerate(atomsVector.data): for field in atom_fieldnames: atomsDict[field][n] = atom.__getattribute__(field) atomsDict["Fa_alpha_re"] = np.float32(atomsDict["Fa_alpha"].real) atomsDict["Fa_alpha_im"] = np.float32(atomsDict["Fa_alpha"].imag) atomsDict["Fb_alpha_re"] = np.float32(atomsDict["Fb_alpha"].real) atomsDict["Fb_alpha_im"] = np.float32(atomsDict["Fb_alpha"].imag) return atomsDict def _get_absolute_kernel_path(kernel): pyfstatdir = os.path.dirname(os.path.abspath(os.path.realpath(__file__))) kernelfile = kernel + ".cu" return os.path.join(pyfstatdir, "pyCUDAkernels", kernelfile) def _print_GPU_memory_MB(key): mem_used_MB = drv.mem_get_info()[0] / (2.0 ** 20) mem_total_MB = drv.mem_get_info()[1] / (2.0 ** 20) logging.debug( "{} GPU memory: {:.4f} / {:.4f} MB free".format(key, mem_used_MB, mem_total_MB) ) def pycuda_compute_transient_fstat_map(multiFstatAtoms, windowRange): """ GPU version of the function to compute transient-window "F-statistic map" over start-time and timescale {t0, tau}. Based on XLALComputeTransientFstatMap from LALSuite, (C) 2009 Reinhard Prix, licensed under GPL Returns a 2D matrix F_mn, with m = index over start-times t0, and n = index over timescales tau, in steps of dt0 in [t0, t0+t0Band], and dtau in [tau, tau+tauBand] as defined in windowRange input. """ if windowRange.type >= lalpulsar.TRANSIENT_LAST: raise ValueError( "Unknown window-type ({}) passed as input." " Allowed are [0,{}].".format( windowRange.type, lalpulsar.TRANSIENT_LAST - 1 ) ) # internal dict for search/setup parameters tCWparams = {} # first combine all multi-atoms # into a single atoms-vector with *unique* timestamps tCWparams["TAtom"] = multiFstatAtoms.data[0].TAtom TAtomHalf = int(tCWparams["TAtom"] / 2) # integer division atoms = lalpulsar.mergeMultiFstatAtomsBinned(multiFstatAtoms, tCWparams["TAtom"]) # make a combined input matrix of all atoms vectors, for transfer to GPU tCWparams["numAtoms"] = atoms.length atomsDict = reshape_FstatAtomsVector(atoms) atomsInputMatrix = np.column_stack( ( atomsDict["a2_alpha"], atomsDict["b2_alpha"], atomsDict["ab_alpha"], atomsDict["Fa_alpha_re"], atomsDict["Fa_alpha_im"], atomsDict["Fb_alpha_re"], atomsDict["Fb_alpha_im"], ) ) # actual data spans [t0_data, t0_data + tCWparams['numAtoms'] * TAtom] # in steps of TAtom tCWparams["t0_data"] = int(atoms.data[0].timestamp) tCWparams["t1_data"] = int( atoms.data[tCWparams["numAtoms"] - 1].timestamp + tCWparams["TAtom"] ) logging.debug( "Transient F-stat map:" " t0_data={:d}, t1_data={:d}".format(tCWparams["t0_data"], tCWparams["t1_data"]) ) logging.debug( "Transient F-stat map:" " numAtoms={:d}, TAtom={:d}," " TAtomHalf={:d}".format(tCWparams["numAtoms"], tCWparams["TAtom"], TAtomHalf) ) # special treatment of window_type = none # ==> replace by rectangular window spanning all the data if windowRange.type == lalpulsar.TRANSIENT_NONE: windowRange.type = lalpulsar.TRANSIENT_RECTANGULAR windowRange.t0 = tCWparams["t0_data"] windowRange.t0Band = 0 windowRange.dt0 = tCWparams["TAtom"] # irrelevant windowRange.tau = tCWparams["numAtoms"] * tCWparams["TAtom"] windowRange.tauBand = 0 windowRange.dtau = tCWparams["TAtom"] # irrelevant """ NOTE: indices {i,j} enumerate *actual* atoms and their timestamps t_i, * while the indices {m,n} enumerate the full grid of values * in [t0_min, t0_max]x[Tcoh_min, Tcoh_max] in steps of deltaT. * This allows us to deal with gaps in the data in a transparent way. * * NOTE2: we operate on the 'binned' atoms returned * from XLALmergeMultiFstatAtomsBinned(), * which means we can safely assume all atoms to be lined up * perfectly on a 'deltaT' binned grid. * * The mapping used will therefore be {i,j} -> {m,n}: * m = offs_i / deltaT * start-time offset from t0_min measured in deltaT * n = Tcoh_ij / deltaT * duration Tcoh_ij measured in deltaT, * * where * offs_i = t_i - t0_min * Tcoh_ij = t_j - t_i + deltaT * """ # We allocate a matrix {m x n} = t0Range * TcohRange elements # covering the full transient window-range [t0,t0+t0Band]x[tau,tau+tauBand] tCWparams["N_t0Range"] = int( np.floor(1.0 * windowRange.t0Band / windowRange.dt0) + 1 ) tCWparams["N_tauRange"] = int( np.floor(1.0 * windowRange.tauBand / windowRange.dtau) + 1 ) FstatMap = pyTransientFstatMap(tCWparams["N_t0Range"], tCWparams["N_tauRange"]) logging.debug( "Transient F-stat map:" " N_t0Range={:d}, N_tauRange={:d}," " total grid points: {:d}".format( tCWparams["N_t0Range"], tCWparams["N_tauRange"], tCWparams["N_t0Range"] * tCWparams["N_tauRange"], ) ) if windowRange.type == lalpulsar.TRANSIENT_RECTANGULAR: FstatMap.F_mn = pycuda_compute_transient_fstat_map_rect( atomsInputMatrix, windowRange, tCWparams ) elif windowRange.type == lalpulsar.TRANSIENT_EXPONENTIAL: FstatMap.F_mn = pycuda_compute_transient_fstat_map_exp( atomsInputMatrix, windowRange, tCWparams ) else: raise ValueError( "Invalid transient window type {}" " not in [{}, {}].".format( windowRange.type, lalpulsar.TRANSIENT_NONE, lalpulsar.TRANSIENT_LAST - 1 ) ) # out of loop: get max2F and ML estimates over the m x n matrix FstatMap.maxF = FstatMap.F_mn.max() maxidx = np.unravel_index( FstatMap.F_mn.argmax(), (tCWparams["N_t0Range"], tCWparams["N_tauRange"]) ) FstatMap.t0_ML = windowRange.t0 + maxidx[0] * windowRange.dt0 FstatMap.tau_ML = windowRange.tau + maxidx[1] * windowRange.dtau logging.debug( "Done computing transient F-stat map." " maxF={:.4f}, t0_ML={}, tau_ML={}".format( FstatMap.maxF, FstatMap.t0_ML, FstatMap.tau_ML ) ) return FstatMap def pycuda_compute_transient_fstat_map_rect(atomsInputMatrix, windowRange, tCWparams): """ only GPU-parallizing outer loop, keeping partial sums with memory in kernel """ # gpu data setup and transfer _print_GPU_memory_MB("Initial") input_gpu = gpuarray.to_gpu(atomsInputMatrix) Fmn_gpu = gpuarray.GPUArray( (tCWparams["N_t0Range"], tCWparams["N_tauRange"]), dtype=np.float32 ) _print_GPU_memory_MB("After input+output allocation:") # GPU kernel kernel = "cudaTransientFstatRectWindow" kernelfile = _get_absolute_kernel_path(kernel) partial_Fstat_cuda_code = cudacomp.SourceModule(open(kernelfile, "r").read()) partial_Fstat_cuda = partial_Fstat_cuda_code.get_function(kernel) partial_Fstat_cuda.prepare("PIIIIIIIIP") # GPU grid setup blockRows = min(1024, tCWparams["N_t0Range"]) blockCols = 1 gridRows = int(np.ceil(1.0 * tCWparams["N_t0Range"] / blockRows)) gridCols = 1 # running the kernel logging.debug( "Calling pyCUDA kernel with a grid of {}*{}={} blocks" " of {}*{}={} threads each: {} total threads...".format( gridRows, gridCols, gridRows * gridCols, blockRows, blockCols, blockRows * blockCols, gridRows * gridCols * blockRows * blockCols, ) ) partial_Fstat_cuda.prepared_call( (gridRows, gridCols), (blockRows, blockCols, 1), input_gpu.gpudata, tCWparams["numAtoms"], tCWparams["TAtom"], tCWparams["t0_data"], windowRange.t0, windowRange.dt0, windowRange.tau, windowRange.dtau, tCWparams["N_tauRange"], Fmn_gpu.gpudata, ) # return results to host F_mn = Fmn_gpu.get() _print_GPU_memory_MB("Final") return F_mn def pycuda_compute_transient_fstat_map_exp(atomsInputMatrix, windowRange, tCWparams): """exponential window, inner and outer loop GPU-parallelized""" # gpu data setup and transfer _print_GPU_memory_MB("Initial") input_gpu = gpuarray.to_gpu(atomsInputMatrix) Fmn_gpu = gpuarray.GPUArray( (tCWparams["N_t0Range"], tCWparams["N_tauRange"]), dtype=np.float32 ) _print_GPU_memory_MB("After input+output allocation:") # GPU kernel kernel = "cudaTransientFstatExpWindow" kernelfile = _get_absolute_kernel_path(kernel) partial_Fstat_cuda_code = cudacomp.SourceModule(open(kernelfile, "r").read()) partial_Fstat_cuda = partial_Fstat_cuda_code.get_function(kernel) partial_Fstat_cuda.prepare("PIIIIIIIIIP") # GPU grid setup blockRows = min(32, tCWparams["N_t0Range"]) blockCols = min(32, tCWparams["N_tauRange"]) gridRows = int(np.ceil(1.0 * tCWparams["N_t0Range"] / blockRows)) gridCols = int(np.ceil(1.0 * tCWparams["N_tauRange"] / blockCols)) # running the kernel logging.debug( "Calling kernel with a grid of {}*{}={} blocks" " of {}*{}={} threads each: {} total threads...".format( gridRows, gridCols, gridRows * gridCols, blockRows, blockCols, blockRows * blockCols, gridRows * gridCols * blockRows * blockCols, ) ) partial_Fstat_cuda.prepared_call( (gridRows, gridCols), (blockRows, blockCols, 1), input_gpu.gpudata, tCWparams["numAtoms"], tCWparams["TAtom"], tCWparams["t0_data"], windowRange.t0, windowRange.dt0, windowRange.tau, windowRange.dtau, tCWparams["N_t0Range"], tCWparams["N_tauRange"], Fmn_gpu.gpudata, ) # return results to host F_mn = Fmn_gpu.get() _print_GPU_memory_MB("Final") return F_mn