Commit 7ae31caa authored by David Keitel's avatar David Keitel

add pyCUDA version of TransientFstatMap

 -optional import of pyCUDA package only if requested
 -including kernel .cu files in packaging
parent 7fae97b6
......@@ -438,6 +438,10 @@ class TransientGridSearch(GridSearch):
FstatMap = getattr(self.search, 'FstatMap', None)
thisCand = list(vals) + [detstat]
if getattr(self, 'transientWindowType', None):
if self.tCWFstatMapVersion == 'lal':
F_mn = FstatMap.F_mn.data
else:
F_mn = FstatMap.F_mn
if self.outputTransientFstatMap:
tCWfile = os.path.splitext(self.out_file)[0]+'_tCW_%.16f_%.16f_%.16f_%.16g_%.16g.dat' % (vals[2],vals[5],vals[6],vals[3],vals[4]) # freq alpha delta f1dot f2dot
if self.tCWFstatMapVersion == 'lal':
......@@ -445,9 +449,8 @@ class TransientGridSearch(GridSearch):
lalpulsar.write_transientFstatMap_to_fp ( fo, FstatMap, windowRange, None )
del fo # instead of lal.FileClose() which is not SWIG-exported
else:
np.savetxt(tCWfile, 2.0*FstatMap.F_mn, delimiter=' ')
Fmn = FstatMap.F_mn.data
maxidx = np.unravel_index(Fmn.argmax(), Fmn.shape)
np.savetxt(tCWfile, 2.0*F_mn, delimiter=' ')
maxidx = np.unravel_index(F_mn.argmax(), F_mn.shape)
thisCand += [windowRange.t0+maxidx[0]*windowRange.dt0,
windowRange.tau+maxidx[1]*windowRange.dtau]
data.append(thisCand)
......
__global__ void cudaTransientFstatExpWindow ( float *input,
unsigned int numAtoms,
unsigned int TAtom,
unsigned int t0_data,
unsigned int win_t0,
unsigned int win_dt0,
unsigned int win_tau,
unsigned int win_dtau,
unsigned int Fmn_rows,
unsigned int Fmn_cols,
float *Fmn
)
{
/* match CUDA thread indexing and high-level (t0,tau) indexing */
unsigned int m = blockDim.x * blockIdx.x + threadIdx.x; // t0: row
unsigned int n = blockDim.y * blockIdx.y + threadIdx.y; // tau: column
/* unraveled 1D index for 2D output array */
unsigned int outidx = Fmn_cols * m + n;
/* hardcoded copy from lalpulsar */
unsigned int TRANSIENT_EXP_EFOLDING = 3;
if ( (m < Fmn_rows) && (n < Fmn_cols) ) {
/* compute Fstat-atom index i_t0 in [0, numAtoms) */
unsigned int TAtomHalf = TAtom/2; // integer division
unsigned int t0 = win_t0 + m * win_dt0;
int i_tmp = ( t0 - t0_data + TAtomHalf ) / TAtom; // integer round: floor(x+0.5)
if ( i_tmp < 0 ) {
i_tmp = 0;
}
unsigned int i_t0 = (unsigned int)i_tmp;
if ( i_t0 >= numAtoms ) {
i_t0 = numAtoms - 1;
}
/* translate n into an atoms end-index for this search interval [t0, t0+Tcoh],
* giving the index range of atoms to sum over
*/
unsigned int tau = win_tau + n * win_dtau;
/* get end-time t1 of this transient-window search
* for given tau, what Tcoh should the exponential window cover?
* for speed reasons we want to truncate Tcoh = tau * TRANSIENT_EXP_EFOLDING
* with the e-folding factor chosen such that the window-value
* is practically negligible after that, where it will be set to 0
*/
// unsigned int t1 = lround( win_t0 + TRANSIENT_EXP_EFOLDING * win_tau);
unsigned int t1 = t0 + TRANSIENT_EXP_EFOLDING * tau;
/* compute window end-time Fstat-atom index i_t1 in [0, numAtoms) */
i_tmp = ( t1 - t0_data + TAtomHalf ) / TAtom - 1; // integer round: floor(x+0.5)
if ( i_tmp < 0 ) {
i_tmp = 0;
}
unsigned int i_t1 = (unsigned int)i_tmp;
if ( i_t1 >= numAtoms ) {
i_t1 = numAtoms - 1;
}
/* now we have two valid atoms-indices [i_t0, i_t1]
* spanning our Fstat-window to sum over
*/
float Ad = 0.0f;
float Bd = 0.0f;
float Cd = 0.0f;
float Fa_re = 0.0f;
float Fa_im = 0.0f;
float Fb_re = 0.0f;
float Fb_im = 0.0f;
unsigned short input_cols = 7; // must match input matrix!
/* sum up atoms */
for ( unsigned int i=i_t0; i<=i_t1; i++ ) {
unsigned int t_i = t0_data + i * TAtom;
float win_i = 0.0;
if ( t_i >= t0 && t_i <= t1 ) {
float x = 1.0 * ( t_i - t0 ) / tau;
win_i = exp ( -x );
}
float win2_i = win_i * win_i;
Ad += input[i*input_cols+0] * win2_i; // a2_alpha
Bd += input[i*input_cols+1] * win2_i; // b2_alpha
Cd += input[i*input_cols+2] * win2_i; // ab_alpha
Fa_re += input[i*input_cols+3] * win_i; // Fa_alpha_re
Fa_im += input[i*input_cols+4] * win_i; // Fa_alpha_im
Fb_re += input[i*input_cols+5] * win_i; // Fb_alpha_re
Fb_im += input[i*input_cols+6] * win_i; // Fb_alpha_im
}
/* get determinant */
float Dd = ( Ad * Bd - Cd * Cd );
float DdInv = 0.0f;
/* safety catch as in XLALWeightMultiAMCoeffs():
* make it so that in the end F=0 instead of -nan
*/
if ( Dd > 0.0 ) {
DdInv = 1.0 / Dd;
}
/* from XLALComputeFstatFromFaFb */
float F = DdInv * ( Bd * ( Fa_re*Fa_re + Fa_im*Fa_im )
+ Ad * ( Fb_re*Fb_re + Fb_im*Fb_im )
- 2.0 * Cd * ( Fa_re * Fb_re + Fa_im * Fb_im )
);
/* store result in Fstat-matrix
* at unraveled index of element {m,n}
*/
Fmn[outidx] = F;
} // ( (m < Fmn_rows) && (n < Fmn_cols) )
} // cudaTransientFstatExpWindow()
__global__ void cudaTransientFstatRectWindow ( float *input,
unsigned int numAtoms,
unsigned int TAtom,
unsigned int t0_data,
unsigned int win_t0,
unsigned int win_dt0,
unsigned int win_tau,
unsigned int win_dtau,
unsigned int N_tauRange,
float *Fmn
)
{
/* match CUDA thread indexing and high-level (t0,tau) indexing */
// assume 1D block, grid setup
unsigned int m = blockDim.x * blockIdx.x + threadIdx.x; // t0: row
unsigned short input_cols = 7; // must match input matrix!
/* compute Fstat-atom index i_t0 in [0, numAtoms) */
unsigned int TAtomHalf = TAtom/2; // integer division
unsigned int t0 = win_t0 + m * win_dt0;
int i_tmp = ( t0 - t0_data + TAtomHalf ) / TAtom; // integer round: floor(x+0.5)
if ( i_tmp < 0 ) {
i_tmp = 0;
}
unsigned int i_t0 = (unsigned int)i_tmp;
if ( i_t0 >= numAtoms ) {
i_t0 = numAtoms - 1;
}
float Ad = 0.0f;
float Bd = 0.0f;
float Cd = 0.0f;
float Fa_re = 0.0f;
float Fa_im = 0.0f;
float Fb_re = 0.0f;
float Fb_im = 0.0f;
unsigned int i_t1_last = i_t0;
/* INNER loop over timescale-parameter tau
* NOT parallelized so that we can still use the i_t1_last trick
* (empirically seems to be faster than 2D CUDA version)
*/
for ( unsigned int n = 0; n < N_tauRange; n ++ ) {
if ( (m < N_tauRange) && (n < N_tauRange) ) {
/* translate n into an atoms end-index for this search interval [t0, t0+Tcoh],
* giving the index range of atoms to sum over
*/
unsigned int tau = win_tau + n * win_dtau;
/* get end-time t1 of this transient-window search */
unsigned int t1 = t0 + tau;
/* compute window end-time Fstat-atom index i_t1 in [0, numAtoms) */
i_tmp = ( t1 - t0_data + TAtomHalf ) / TAtom - 1; // integer round: floor(x+0.5)
if ( i_tmp < 0 ) {
i_tmp = 0;
}
unsigned int i_t1 = (unsigned int)i_tmp;
if ( i_t1 >= numAtoms ) {
i_t1 = numAtoms - 1;
}
/* now we have two valid atoms-indices [i_t0, i_t1]
* spanning our Fstat-window to sum over
*/
for ( unsigned int i = i_t1_last; i <= i_t1; i ++ ) {
/* sum up atoms,
* special optimiziation in the rectangular-window case:
* just add on to previous tau values,
* ie re-use the sum over [i_t0, i_t1_last] from the pevious tau-loop iteration
*/
Ad += input[i*input_cols+0]; // a2_alpha
Bd += input[i*input_cols+1]; // b2_alpha
Cd += input[i*input_cols+2]; // ab_alpha
Fa_re += input[i*input_cols+3]; // Fa_alpha_re
Fa_im += input[i*input_cols+4]; // Fa_alpha_im
Fb_re += input[i*input_cols+5]; // Fb_alpha_re
Fb_im += input[i*input_cols+6]; // Fb_alpha_im
/* keep track of up to where we summed for the next iteration */
i_t1_last = i_t1 + 1;
}
/* get determinant */
float Dd = ( Ad * Bd - Cd * Cd );
float DdInv = 0.0f;
/* safety catch as in XLALWeightMultiAMCoeffs():
* make it so that in the end F=0 instead of -nan
*/
if ( Dd > 0.0 ) {
DdInv = 1.0 / Dd;
}
/* from XLALComputeFstatFromFaFb */
float F = DdInv * ( Bd * ( Fa_re*Fa_re + Fa_im*Fa_im )
+ Ad * ( Fb_re*Fb_re + Fb_im*Fb_im )
- 2.0 * Cd * ( Fa_re * Fb_re + Fa_im * Fb_im )
);
/* store result in Fstat-matrix
* at unraveled index of element {m,n}
*/
unsigned int outidx = m * N_tauRange + n;
Fmn[outidx] = F;
} // if ( (m < N_tauRange) && (n < N_tauRange) )
} // for ( unsigned int n = 0; n < N_tauRange; n ++ )
} // cudaTransientFstatRectWindow()
""" Additional helper functions dealing with transient-CW F(t0,tau) maps """
import numpy as np
import os
import logging
# optional imports
......@@ -31,15 +33,34 @@ def optional_import ( modulename, shorthand=None ):
return success
class pyTransientFstatMap(object):
'''
simplified object class for a F(t0,tau) F-stat map (not 2F!)
based on LALSuite's transientFstatMap_t type
replacing the gsl matrix with a numpy array
F_mn: 2D array of 2F values
maxF: maximum of F (not 2F!)
t0_ML: maximum likelihood transient start time t0 estimate
tau_ML: maximum likelihood transient duration tau estimate
'''
def __init__(self, N_t0Range, N_tauRange):
self.F_mn = np.zeros((N_t0Range, N_tauRange), dtype=np.float32)
self.maxF = float(-1.0) # Initializing to a negative value ensures that we always update at least once and hence return sane t0_d_ML, tau_d_ML even if there is only a single bin where F=0 happens.
self.t0_ML = float(0.0)
self.tau_ML = float(0.0)
# dictionary of the actual callable F-stat map functions we support,
# if the corresponding modules are available.
fstatmap_versions = {
'lal': lambda multiFstatAtoms, windowRange:
getattr(lalpulsar,'ComputeTransientFstatMap')
( multiFstatAtoms, windowRange, False ),
#'pycuda': lambda multiFstatAtoms, windowRange:
#pycuda_compute_transient_fstat_map
#( multiFstatAtoms, windowRange )
'pycuda': lambda multiFstatAtoms, windowRange:
pycuda_compute_transient_fstat_map
( multiFstatAtoms, windowRange )
}
......@@ -51,13 +72,24 @@ def init_transient_fstat_map_features ( ):
each key's value set to True only if
all required modules are importable on this system.
'''
features = {}
have_lal = optional_import('lal')
have_lalpulsar = optional_import('lalpulsar')
features['lal'] = have_lal and have_lalpulsar
features['pycuda'] = False
# import GPU features
have_pycuda_drv = optional_import('pycuda.driver', 'drv')
have_pycuda_init = optional_import('pycuda.autoinit', 'autoinit')
have_pycuda_gpuarray = optional_import('pycuda.gpuarray', 'gpuarray')
have_pycuda_tools = optional_import('pycuda.tools', 'cudatools')
have_pycuda_compiler = optional_import('pycuda.compiler', 'cudacomp')
features['pycuda'] = have_pycuda_drv and have_pycuda_init and have_pycuda_gpuarray and have_pycuda_tools and have_pycuda_compiler
logging.debug('Got the following features for transient F-stat maps:')
logging.debug(features)
return features
......@@ -72,3 +104,222 @@ def call_compute_transient_fstat_map ( version, features, multiFstatAtoms=None,
else:
raise Exception('Transient F-stat map method "{}" not implemented!'.format(version))
return FstatMap
def reshape_FstatAtomsVector ( atomsVector ):
'''
Make a dictionary of ndarrays out of a atoms "vector" structure.
The input is a "vector"-like structure with times as the higher hierarchical
level and a set of "atoms" quantities defined at each timestamp.
The output is a dictionary with an entry for each quantity,
which is a 1D ndarray over timestamps for that one quantity.
'''
numAtoms = atomsVector.length
atomsDict = {}
atom_fieldnames = ['timestamp', 'Fa_alpha', 'Fb_alpha',
'a2_alpha', 'ab_alpha', 'b2_alpha']
atom_dtypes = [np.uint32, complex, complex,
np.float32, np.float32, np.float32]
for f, field in enumerate(atom_fieldnames):
atomsDict[field] = np.ndarray(numAtoms,dtype=atom_dtypes[f])
for n,atom in enumerate(atomsVector.data):
for field in atom_fieldnames:
atomsDict[field][n] = atom.__getattribute__(field)
atomsDict['Fa_alpha_re'] = np.float32(atomsDict['Fa_alpha'].real)
atomsDict['Fa_alpha_im'] = np.float32(atomsDict['Fa_alpha'].imag)
atomsDict['Fb_alpha_re'] = np.float32(atomsDict['Fb_alpha'].real)
atomsDict['Fb_alpha_im'] = np.float32(atomsDict['Fb_alpha'].imag)
return atomsDict
def get_absolute_kernel_path ( kernel ):
pyfstatdir = os.path.dirname(os.path.abspath(os.path.realpath(__file__)))
kernelfile = kernel + '.cu'
return os.path.join(pyfstatdir,'pyCUDAkernels',kernelfile)
def pycuda_compute_transient_fstat_map ( multiFstatAtoms, windowRange ):
'''
GPU version of the function to compute transient-window "F-statistic map"
over start-time and timescale {t0, tau}.
Based on XLALComputeTransientFstatMap from LALSuite,
(C) 2009 Reinhard Prix, licensed under GPL
Returns a 2D matrix F_mn,
with m = index over start-times t0,
and n = index over timescales tau,
in steps of dt0 in [t0, t0+t0Band],
and dtau in [tau, tau+tauBand]
as defined in windowRange input.
'''
if ( windowRange.type >= lalpulsar.TRANSIENT_LAST ):
raise ValueError ('Unknown window-type ({}) passed as input. Allowed are [0,{}].'.format(windowRange.type, lalpulsar.TRANSIENT_LAST-1))
# internal dict for search/setup parameters
tCWparams = {}
# first combine all multi-atoms
# into a single atoms-vector with *unique* timestamps
tCWparams['TAtom'] = multiFstatAtoms.data[0].TAtom
TAtomHalf = int(tCWparams['TAtom']/2) # integer division
atoms = lalpulsar.mergeMultiFstatAtomsBinned ( multiFstatAtoms,
tCWparams['TAtom'] )
# make a combined input matrix of all atoms vectors, for transfer to GPU
tCWparams['numAtoms'] = atoms.length
atomsDict = reshape_FstatAtomsVector(atoms)
atomsInputMatrix = np.column_stack ( (atomsDict['a2_alpha'],
atomsDict['b2_alpha'],
atomsDict['ab_alpha'],
atomsDict['Fa_alpha_re'],
atomsDict['Fa_alpha_im'],
atomsDict['Fb_alpha_re'],
atomsDict['Fb_alpha_im'])
)
# actual data spans [t0_data, t0_data + tCWparams['numAtoms'] * TAtom]
# in steps of TAtom
tCWparams['t0_data'] = int(atoms.data[0].timestamp)
tCWparams['t1_data'] = int(atoms.data[tCWparams['numAtoms']-1].timestamp + tCWparams['TAtom'])
logging.debug('t0_data={:d}, t1_data={:d}'.format(tCWparams['t0_data'],
tCWparams['t1_data']))
logging.debug('numAtoms={:d}, TAtom={:d}, TAtomHalf={:d}'.format(tCWparams['numAtoms'],
tCWparams['TAtom'],
TAtomHalf))
# special treatment of window_type = none
# ==> replace by rectangular window spanning all the data
if ( windowRange.type == lalpulsar.TRANSIENT_NONE ):
windowRange.type = lalpulsar.TRANSIENT_RECTANGULAR
windowRange.t0 = tCWparams['t0_data']
windowRange.t0Band = 0
windowRange.dt0 = tCWparams['TAtom'] # irrelevant
windowRange.tau = tCWparams['numAtoms'] * tCWparams['TAtom']
windowRange.tauBand = 0;
windowRange.dtau = tCWparams['TAtom'] # irrelevant
""" NOTE: indices {i,j} enumerate *actual* atoms and their timestamps t_i, while the
* indices {m,n} enumerate the full grid of values in [t0_min, t0_max]x[Tcoh_min, Tcoh_max] in
* steps of deltaT. This allows us to deal with gaps in the data in a transparent way.
*
* NOTE2: we operate on the 'binned' atoms returned from XLALmergeMultiFstatAtomsBinned(),
* which means we can safely assume all atoms to be lined up perfectly on a 'deltaT' binned grid.
*
* The mapping used will therefore be {i,j} -> {m,n}:
* m = offs_i / deltaT = start-time offset from t0_min measured in deltaT
* n = Tcoh_ij / deltaT = duration Tcoh_ij measured in deltaT,
*
* where
* offs_i = t_i - t0_min
* Tcoh_ij = t_j - t_i + deltaT
*
"""
# We allocate a matrix {m x n} = t0Range * TcohRange elements
# covering the full timerange the transient window-range [t0,t0+t0Band]x[tau,tau+tauBand]
tCWparams['N_t0Range'] = int(np.floor( 1.0*windowRange.t0Band / windowRange.dt0 ) + 1)
tCWparams['N_tauRange'] = int(np.floor( 1.0*windowRange.tauBand / windowRange.dtau ) + 1)
FstatMap = pyTransientFstatMap ( tCWparams['N_t0Range'], tCWparams['N_tauRange'] )
logging.debug('N_t0Range={:d}, N_tauRange={:d}'.format(tCWparams['N_t0Range'],
tCWparams['N_tauRange']))
if ( windowRange.type == lalpulsar.TRANSIENT_RECTANGULAR ):
FstatMap.F_mn = pycuda_compute_transient_fstat_map_rect ( atomsInputMatrix,
windowRange,
tCWparams )
elif ( windowRange.type == lalpulsar.TRANSIENT_EXPONENTIAL ):
FstatMap.F_mn = pycuda_compute_transient_fstat_map_exp ( atomsInputMatrix,
windowRange,
tCWparams )
else:
raise ValueError('Invalid transient window type {} not in [{}, {}].'.format(windowRange.type, lalpulsar.TRANSIENT_NONE, lalpulsar.TRANSIENT_LAST-1))
# out of loop: get max2F and ML estimates over the m x n matrix
FstatMap.maxF = FstatMap.F_mn.max()
maxidx = np.unravel_index ( FstatMap.F_mn.argmax(),
(tCWparams['N_t0Range'],
tCWparams['N_tauRange']))
FstatMap.t0_ML = windowRange.t0 + maxidx[0] * windowRange.dt0
FstatMap.tau_ML = windowRange.tau + maxidx[1] * windowRange.dtau
logging.debug('Done computing transient F-stat map. maxF={}, t0_ML={}, tau_ML={}'.format(FstatMap.maxF , FstatMap.t0_ML, FstatMap.tau_ML))
return FstatMap
def pycuda_compute_transient_fstat_map_rect ( atomsInputMatrix, windowRange, tCWparams ):
'''only GPU-parallizing outer loop, keeping partial sums with memory in kernel'''
# gpu data setup and transfer
logging.debug('Initial GPU memory: {}/{} MB free'.format(drv.mem_get_info()[0]/(2.**20),drv.mem_get_info()[1]/(2.**20)))
input_gpu = gpuarray.to_gpu ( atomsInputMatrix )
Fmn_gpu = gpuarray.GPUArray ( (tCWparams['N_t0Range'],tCWparams['N_tauRange']), dtype=np.float32 )
logging.debug('GPU memory with input+output allocated: {}/{} MB free'.format(drv.mem_get_info()[0]/(2.**20), drv.mem_get_info()[1]/(2.**20)))
# GPU kernel
kernel = 'cudaTransientFstatRectWindow'
kernelfile = get_absolute_kernel_path ( kernel )
partial_Fstat_cuda_code = cudacomp.SourceModule(open(kernelfile,'r').read())
partial_Fstat_cuda = partial_Fstat_cuda_code.get_function(kernel)
partial_Fstat_cuda.prepare('PIIIIIIIIP')
# GPU grid setup
blockRows = min(1024,tCWparams['N_t0Range'])
blockCols = 1
gridRows = int(np.ceil(1.0*tCWparams['N_t0Range']/blockRows))
gridCols = 1
logging.debug('Looking to compute transient F-stat map over a N={}*{}={} (t0,tau) grid...'.format(tCWparams['N_t0Range'],tCWparams['N_tauRange'],tCWparams['N_t0Range']*tCWparams['N_tauRange']))
# running the kernel
logging.debug('Calling kernel with a grid of {}*{}={} blocks of {}*{}={} threads each: {} total threads...'.format(gridRows, gridCols, gridRows*gridCols, blockRows, blockCols, blockRows*blockCols, gridRows*gridCols*blockRows*blockCols))
partial_Fstat_cuda.prepared_call ( (gridRows,gridCols), (blockRows,blockCols,1), input_gpu.gpudata, tCWparams['numAtoms'], tCWparams['TAtom'], tCWparams['t0_data'], windowRange.t0, windowRange.dt0, windowRange.tau, windowRange.dtau, tCWparams['N_tauRange'], Fmn_gpu.gpudata )
# return results to host
F_mn = Fmn_gpu.get()
logging.debug('Final GPU memory: {}/{} MB free'.format(drv.mem_get_info()[0]/(2.**20),drv.mem_get_info()[1]/(2.**20)))
return F_mn
def pycuda_compute_transient_fstat_map_exp ( atomsInputMatrix, windowRange, tCWparams ):
'''exponential window, inner and outer loop GPU-parallelized'''
# gpu data setup and transfer
logging.debug('Initial GPU memory: {}/{} MB free'.format(drv.mem_get_info()[0]/(2.**20),drv.mem_get_info()[1]/(2.**20)))
input_gpu = gpuarray.to_gpu ( atomsInputMatrix )
Fmn_gpu = gpuarray.GPUArray ( (tCWparams['N_t0Range'],tCWparams['N_tauRange']), dtype=np.float32 )
logging.debug('GPU memory with input+output allocated: {}/{} MB free'.format(drv.mem_get_info()[0]/(2.**20), drv.mem_get_info()[1]/(2.**20)))
# GPU kernel
kernel = 'cudaTransientFstatExpWindow'
kernelfile = get_absolute_kernel_path ( kernel )
partial_Fstat_cuda_code = cudacomp.SourceModule(open(kernelfile,'r').read())
partial_Fstat_cuda = partial_Fstat_cuda_code.get_function(kernel)
partial_Fstat_cuda.prepare('PIIIIIIIIIP')
# GPU grid setup
blockRows = min(32,tCWparams['N_t0Range'])
blockCols = min(32,tCWparams['N_tauRange'])
gridRows = int(np.ceil(1.0*tCWparams['N_t0Range']/blockRows))
gridCols = int(np.ceil(1.0*tCWparams['N_tauRange']/blockCols))
logging.debug('Looking to compute transient F-stat map over a N={}*{}={} (t0,tau) grid...'.format(tCWparams['N_t0Range'], tCWparams['N_tauRange'], tCWparams['N_t0Range']*tCWparams['N_tauRange']))
# running the kernel
logging.debug('Calling kernel with a grid of {}*{}={} blocks of {}*{}={} threads each: {} total threads...'.format(gridRows, gridCols, gridRows*gridCols, blockRows, blockCols, blockRows*blockCols, gridRows*gridCols*blockRows*blockCols))
partial_Fstat_cuda.prepared_call ( (gridRows,gridCols), (blockRows,blockCols,1), input_gpu.gpudata, tCWparams['numAtoms'], tCWparams['TAtom'], tCWparams['t0_data'], windowRange.t0, windowRange.dt0, windowRange.tau, windowRange.dtau, tCWparams['N_t0Range'], tCWparams['N_tauRange'], Fmn_gpu.gpudata )
# return results to host
F_mn = Fmn_gpu.get()
logging.debug('Final GPU memory: {}/{} MB free'.format(drv.mem_get_info()[0]/(2.**20),drv.mem_get_info()[1]/(2.**20)))
return F_mn
......@@ -7,4 +7,7 @@ setup(name='PyFstat',
author='Gregory Ashton',
author_email='gregory.ashton@ligo.org',
packages=['pyfstat'],
include_package_data=True,
package_data={'pyfstat': ['pyCUDAkernels/cudaTransientFstatExpWindow.cu',
'pyCUDAkernels/cudaTransientFstatRectWindow.cu']},
)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment