Commit 7ae31caa authored by David Keitel

add pyCUDA version of TransientFstatMap

 - optional import of pyCUDA package only if requested
 - including kernel .cu files in packaging
parent 7fae97b6
@@ -438,6 +438,10 @@ class TransientGridSearch(GridSearch):
             FstatMap = getattr(self.search, 'FstatMap', None)
             thisCand = list(vals) + [detstat]
             if getattr(self, 'transientWindowType', None):
+                if self.tCWFstatMapVersion == 'lal':
+                    F_mn = FstatMap.F_mn.data
+                else:
+                    F_mn = FstatMap.F_mn
                 if self.outputTransientFstatMap:
                     tCWfile = os.path.splitext(self.out_file)[0]+'_tCW_%.16f_%.16f_%.16f_%.16g_%.16g.dat' % (vals[2],vals[5],vals[6],vals[3],vals[4]) # freq alpha delta f1dot f2dot
                     if self.tCWFstatMapVersion == 'lal':
@@ -445,9 +449,8 @@ class TransientGridSearch(GridSearch):
                         lalpulsar.write_transientFstatMap_to_fp ( fo, FstatMap, windowRange, None )
                         del fo # instead of lal.FileClose() which is not SWIG-exported
                     else:
-                        np.savetxt(tCWfile, 2.0*FstatMap.F_mn, delimiter=' ')
+                        np.savetxt(tCWfile, 2.0*F_mn, delimiter=' ')
-                Fmn = FstatMap.F_mn.data
-                maxidx = np.unravel_index(Fmn.argmax(), Fmn.shape)
+                maxidx = np.unravel_index(F_mn.argmax(), F_mn.shape)
                 thisCand += [windowRange.t0+maxidx[0]*windowRange.dt0,
                              windowRange.tau+maxidx[1]*windowRange.dtau]
             data.append(thisCand)
pyfstat/pyCUDAkernels/cudaTransientFstatExpWindow.cu

__global__ void cudaTransientFstatExpWindow ( float *input,
                                              unsigned int numAtoms,
                                              unsigned int TAtom,
                                              unsigned int t0_data,
                                              unsigned int win_t0,
                                              unsigned int win_dt0,
                                              unsigned int win_tau,
                                              unsigned int win_dtau,
                                              unsigned int Fmn_rows,
                                              unsigned int Fmn_cols,
                                              float *Fmn
                                            )
{

  /* match CUDA thread indexing and high-level (t0,tau) indexing */
  unsigned int m = blockDim.x * blockIdx.x + threadIdx.x; // t0:  row
  unsigned int n = blockDim.y * blockIdx.y + threadIdx.y; // tau: column
  /* unraveled 1D index for 2D output array */
  unsigned int outidx = Fmn_cols * m + n;

  /* hardcoded copy from lalpulsar */
  unsigned int TRANSIENT_EXP_EFOLDING = 3;

  if ( (m < Fmn_rows) && (n < Fmn_cols) ) {

    /* compute Fstat-atom index i_t0 in [0, numAtoms) */
    unsigned int TAtomHalf = TAtom/2; // integer division
    unsigned int t0 = win_t0 + m * win_dt0;
    int i_tmp = ( t0 - t0_data + TAtomHalf ) / TAtom; // integer round: floor(x+0.5)
    if ( i_tmp < 0 ) {
      i_tmp = 0;
    }
    unsigned int i_t0 = (unsigned int)i_tmp;
    if ( i_t0 >= numAtoms ) {
      i_t0 = numAtoms - 1;
    }

    /* translate n into an atoms end-index for this search interval [t0, t0+Tcoh],
     * giving the index range of atoms to sum over
     */
    unsigned int tau = win_tau + n * win_dtau;

    /* get end-time t1 of this transient-window search:
     * for the given tau, what Tcoh should the exponential window cover?
     * for speed reasons we want to truncate Tcoh = tau * TRANSIENT_EXP_EFOLDING,
     * with the e-folding factor chosen such that the window-value
     * is practically negligible after that, where it will be set to 0
     */
//  unsigned int t1 = lround( win_t0 + TRANSIENT_EXP_EFOLDING * win_tau);
    unsigned int t1 = t0 + TRANSIENT_EXP_EFOLDING * tau;

    /* compute window end-time Fstat-atom index i_t1 in [0, numAtoms) */
    i_tmp = ( t1 - t0_data + TAtomHalf ) / TAtom - 1; // integer round: floor(x+0.5)
    if ( i_tmp < 0 ) {
      i_tmp = 0;
    }
    unsigned int i_t1 = (unsigned int)i_tmp;
    if ( i_t1 >= numAtoms ) {
      i_t1 = numAtoms - 1;
    }

    /* now we have two valid atoms-indices [i_t0, i_t1]
     * spanning our Fstat-window to sum over
     */

    float Ad    = 0.0f;
    float Bd    = 0.0f;
    float Cd    = 0.0f;
    float Fa_re = 0.0f;
    float Fa_im = 0.0f;
    float Fb_re = 0.0f;
    float Fb_im = 0.0f;

    unsigned short input_cols = 7; // must match input matrix!

    /* sum up atoms */
    for ( unsigned int i=i_t0; i<=i_t1; i++ ) {

      unsigned int t_i = t0_data + i * TAtom;

      float win_i = 0.0;
      if ( t_i >= t0 && t_i <= t1 ) {
        float x = 1.0 * ( t_i - t0 ) / tau;
        win_i = exp ( -x );
      }
      float win2_i = win_i * win_i;

      Ad    += input[i*input_cols+0] * win2_i; // a2_alpha
      Bd    += input[i*input_cols+1] * win2_i; // b2_alpha
      Cd    += input[i*input_cols+2] * win2_i; // ab_alpha
      Fa_re += input[i*input_cols+3] * win_i;  // Fa_alpha_re
      Fa_im += input[i*input_cols+4] * win_i;  // Fa_alpha_im
      Fb_re += input[i*input_cols+5] * win_i;  // Fb_alpha_re
      Fb_im += input[i*input_cols+6] * win_i;  // Fb_alpha_im

    }

    /* get determinant */
    float Dd = ( Ad * Bd - Cd * Cd );
    float DdInv = 0.0f;
    /* safety catch as in XLALWeightMultiAMCoeffs():
     * make it so that in the end F=0 instead of -nan
     */
    if ( Dd > 0.0 ) {
      DdInv = 1.0 / Dd;
    }

    /* from XLALComputeFstatFromFaFb */
    float F = DdInv * (   Bd * ( Fa_re*Fa_re + Fa_im*Fa_im )
                        + Ad * ( Fb_re*Fb_re + Fb_im*Fb_im )
                        - 2.0 * Cd * ( Fa_re * Fb_re + Fa_im * Fb_im )
                      );

    /* store result in Fstat-matrix
     * at unraveled index of element {m,n}
     */
    Fmn[outidx] = F;

  } // ( (m < Fmn_rows) && (n < Fmn_cols) )

} // cudaTransientFstatExpWindow()
pyfstat/pyCUDAkernels/cudaTransientFstatRectWindow.cu

__global__ void cudaTransientFstatRectWindow ( float *input,
                                               unsigned int numAtoms,
                                               unsigned int TAtom,
                                               unsigned int t0_data,
                                               unsigned int win_t0,
                                               unsigned int win_dt0,
                                               unsigned int win_tau,
                                               unsigned int win_dtau,
                                               unsigned int N_tauRange,
                                               float *Fmn
                                             )
{

  /* match CUDA thread indexing and high-level (t0,tau) indexing */
  // assume 1D block, grid setup
  unsigned int m = blockDim.x * blockIdx.x + threadIdx.x; // t0: row

  unsigned short input_cols = 7; // must match input matrix!

  /* compute Fstat-atom index i_t0 in [0, numAtoms) */
  unsigned int TAtomHalf = TAtom/2; // integer division
  unsigned int t0 = win_t0 + m * win_dt0;
  int i_tmp = ( t0 - t0_data + TAtomHalf ) / TAtom; // integer round: floor(x+0.5)
  if ( i_tmp < 0 ) {
    i_tmp = 0;
  }
  unsigned int i_t0 = (unsigned int)i_tmp;
  if ( i_t0 >= numAtoms ) {
    i_t0 = numAtoms - 1;
  }

  float Ad    = 0.0f;
  float Bd    = 0.0f;
  float Cd    = 0.0f;
  float Fa_re = 0.0f;
  float Fa_im = 0.0f;
  float Fb_re = 0.0f;
  float Fb_im = 0.0f;
  unsigned int i_t1_last = i_t0;

  /* INNER loop over timescale-parameter tau
   * NOT parallelized so that we can still use the i_t1_last trick
   * (empirically seems to be faster than a 2D CUDA version)
   */
  for ( unsigned int n = 0; n < N_tauRange; n ++ ) {

    if ( (m < N_tauRange) && (n < N_tauRange) ) {

      /* translate n into an atoms end-index for this search interval [t0, t0+Tcoh],
       * giving the index range of atoms to sum over
       */
      unsigned int tau = win_tau + n * win_dtau;

      /* get end-time t1 of this transient-window search */
      unsigned int t1 = t0 + tau;

      /* compute window end-time Fstat-atom index i_t1 in [0, numAtoms) */
      i_tmp = ( t1 - t0_data + TAtomHalf ) / TAtom - 1; // integer round: floor(x+0.5)
      if ( i_tmp < 0 ) {
        i_tmp = 0;
      }
      unsigned int i_t1 = (unsigned int)i_tmp;
      if ( i_t1 >= numAtoms ) {
        i_t1 = numAtoms - 1;
      }

      /* now we have two valid atoms-indices [i_t0, i_t1]
       * spanning our Fstat-window to sum over
       */

      for ( unsigned int i = i_t1_last; i <= i_t1; i ++ ) {
        /* sum up atoms;
         * special optimization in the rectangular-window case:
         * just add on to the previous tau values,
         * i.e. re-use the sum over [i_t0, i_t1_last] from the previous tau-loop iteration
         */
        Ad    += input[i*input_cols+0]; // a2_alpha
        Bd    += input[i*input_cols+1]; // b2_alpha
        Cd    += input[i*input_cols+2]; // ab_alpha
        Fa_re += input[i*input_cols+3]; // Fa_alpha_re
        Fa_im += input[i*input_cols+4]; // Fa_alpha_im
        Fb_re += input[i*input_cols+5]; // Fb_alpha_re
        Fb_im += input[i*input_cols+6]; // Fb_alpha_im
        /* keep track of up to where we summed, for the next iteration */
        i_t1_last = i_t1 + 1;
      }

      /* get determinant */
      float Dd = ( Ad * Bd - Cd * Cd );
      float DdInv = 0.0f;
      /* safety catch as in XLALWeightMultiAMCoeffs():
       * make it so that in the end F=0 instead of -nan
       */
      if ( Dd > 0.0 ) {
        DdInv = 1.0 / Dd;
      }

      /* from XLALComputeFstatFromFaFb */
      float F = DdInv * (   Bd * ( Fa_re*Fa_re + Fa_im*Fa_im )
                          + Ad * ( Fb_re*Fb_re + Fb_im*Fb_im )
                          - 2.0 * Cd * ( Fa_re * Fb_re + Fa_im * Fb_im )
                        );

      /* store result in Fstat-matrix
       * at unraveled index of element {m,n}
       */
      unsigned int outidx = m * N_tauRange + n;
      Fmn[outidx] = F;

    } // if ( (m < N_tauRange) && (n < N_tauRange) )

  } // for ( unsigned int n = 0; n < N_tauRange; n ++ )

} // cudaTransientFstatRectWindow()
""" Additional helper functions dealing with transient-CW F(t0,tau) maps """ """ Additional helper functions dealing with transient-CW F(t0,tau) maps """
import numpy as np
import os
import logging import logging
# optional imports # optional imports
...@@ -31,15 +33,34 @@ def optional_import ( modulename, shorthand=None ): ...@@ -31,15 +33,34 @@ def optional_import ( modulename, shorthand=None ):
return success return success
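
The body of optional_import() is collapsed in this hunk. Judging from the call sites below, which use the returned success flag and then refer to the shorthand names (drv, gpuarray, cudacomp, ...) as module-level globals, it follows roughly this pattern (a minimal sketch assuming an importlib-based implementation, not the verbatim PyFstat code):

import importlib

def optional_import(modulename, shorthand=None):
    '''Try to import a module; return True on success, False if it is unavailable.
    On success, the module is bound under `shorthand` (or its own name) in the
    module globals, so later code can simply use e.g. drv, gpuarray, cudacomp.'''
    if shorthand is None:
        shorthand = modulename
    try:
        globals()[shorthand] = importlib.import_module(modulename)
        return True
    except ImportError:
        return False
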
class pyTransientFstatMap(object):
    '''
    Simplified object class for an F(t0,tau) F-stat map (not 2F!),
    based on LALSuite's transientFstatMap_t type,
    replacing the gsl matrix with a numpy array.

    F_mn:   2D array of F values
    maxF:   maximum of F (not 2F!)
    t0_ML:  maximum-likelihood transient start time t0 estimate
    tau_ML: maximum-likelihood transient duration tau estimate
    '''

    def __init__(self, N_t0Range, N_tauRange):
        self.F_mn = np.zeros((N_t0Range, N_tauRange), dtype=np.float32)
        # Initializing maxF to a negative value ensures that we always update
        # at least once and hence return sane t0_ML, tau_ML
        # even if there is only a single bin where F=0 happens.
        self.maxF = float(-1.0)
        self.t0_ML = float(0.0)
        self.tau_ML = float(0.0)
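
Both the 'lal' and the new 'pycuda' map objects can then be handled uniformly, as in the TransientGridSearch hunk above; a minimal access sketch (FstatMap and tCWFstatMapVersion stand in for the corresponding search attributes):

if tCWFstatMapVersion == 'lal':
    F_mn = FstatMap.F_mn.data   # gsl matrix: the numpy view lives in .data
else:
    F_mn = FstatMap.F_mn        # pyTransientFstatMap: already a numpy array
print(2.0 * F_mn.max())         # detection statistic 2F at the loudest (t0,tau) grid point
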
# dictionary of the actual callable F-stat map functions we support,
# if the corresponding modules are available.
fstatmap_versions = {
                     'lal': lambda multiFstatAtoms, windowRange:
                            getattr(lalpulsar,'ComputeTransientFstatMap')
                            ( multiFstatAtoms, windowRange, False ),
-                    #'pycuda': lambda multiFstatAtoms, windowRange:
-                    #          pycuda_compute_transient_fstat_map
-                    #          ( multiFstatAtoms, windowRange )
+                    'pycuda': lambda multiFstatAtoms, windowRange:
+                              pycuda_compute_transient_fstat_map
+                              ( multiFstatAtoms, windowRange )
                    }
@@ -51,13 +72,24 @@ def init_transient_fstat_map_features ( ):
    each key's value set to True only if
    all required modules are importable on this system.
    '''

    features = {}

    have_lal = optional_import('lal')
    have_lalpulsar = optional_import('lalpulsar')
    features['lal'] = have_lal and have_lalpulsar

-   features['pycuda'] = False
+   # import GPU features
+   have_pycuda_drv      = optional_import('pycuda.driver', 'drv')
+   have_pycuda_init     = optional_import('pycuda.autoinit', 'autoinit')
+   have_pycuda_gpuarray = optional_import('pycuda.gpuarray', 'gpuarray')
+   have_pycuda_tools    = optional_import('pycuda.tools', 'cudatools')
+   have_pycuda_compiler = optional_import('pycuda.compiler', 'cudacomp')
+   features['pycuda'] = ( have_pycuda_drv and have_pycuda_init and have_pycuda_gpuarray
+                          and have_pycuda_tools and have_pycuda_compiler )

    logging.debug('Got the following features for transient F-stat maps:')
    logging.debug(features)

    return features
@@ -72,3 +104,222 @@ def call_compute_transient_fstat_map ( version, features, multiFstatAtoms=None,
    else:
        raise Exception('Transient F-stat map method "{}" not implemented!'.format(version))
    return FstatMap
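
A short usage sketch of the feature detection and dispatch above (keyword names beyond those visible in the truncated signature, and the way the atoms are obtained, are assumptions):

features = init_transient_fstat_map_features()
# multiFstatAtoms and windowRange as obtained from lalpulsar,
# e.g. F-stat atoms from ComputeFstat() and a transientWindowRange_t setup
FstatMap = call_compute_transient_fstat_map('pycuda', features,
                                            multiFstatAtoms=multiFstatAtoms,
                                            windowRange=windowRange)
logging.info('maxF={}, t0_ML={}, tau_ML={}'.format(FstatMap.maxF,
                                                   FstatMap.t0_ML,
                                                   FstatMap.tau_ML))
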
def reshape_FstatAtomsVector ( atomsVector ):
    '''
    Make a dictionary of ndarrays out of an atoms "vector" structure.

    The input is a "vector"-like structure with times as the higher hierarchical
    level and a set of "atoms" quantities defined at each timestamp.
    The output is a dictionary with an entry for each quantity,
    which is a 1D ndarray over timestamps for that one quantity.
    '''

    numAtoms = atomsVector.length
    atomsDict = {}
    atom_fieldnames = ['timestamp', 'Fa_alpha', 'Fb_alpha',
                       'a2_alpha', 'ab_alpha', 'b2_alpha']
    atom_dtypes = [np.uint32, complex, complex,
                   np.float32, np.float32, np.float32]
    for f, field in enumerate(atom_fieldnames):
        atomsDict[field] = np.ndarray(numAtoms, dtype=atom_dtypes[f])

    for n, atom in enumerate(atomsVector.data):
        for field in atom_fieldnames:
            atomsDict[field][n] = atom.__getattribute__(field)

    atomsDict['Fa_alpha_re'] = np.float32(atomsDict['Fa_alpha'].real)
    atomsDict['Fa_alpha_im'] = np.float32(atomsDict['Fa_alpha'].imag)
    atomsDict['Fb_alpha_re'] = np.float32(atomsDict['Fb_alpha'].real)
    atomsDict['Fb_alpha_im'] = np.float32(atomsDict['Fb_alpha'].imag)

    return atomsDict
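
For reference, the per-atom quantities extracted here are later stacked into a 7-column float32 matrix whose column order must match the hard-coded input_cols = 7 indexing in both .cu kernels:

# column ordering of atomsInputMatrix expected by the CUDA kernels:
# col 0: a2_alpha     col 1: b2_alpha     col 2: ab_alpha
# col 3: Fa_alpha_re  col 4: Fa_alpha_im  col 5: Fb_alpha_re  col 6: Fb_alpha_im
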
def get_absolute_kernel_path ( kernel ):
    pyfstatdir = os.path.dirname(os.path.abspath(os.path.realpath(__file__)))
    kernelfile = kernel + '.cu'
    return os.path.join(pyfstatdir, 'pyCUDAkernels', kernelfile)
def pycuda_compute_transient_fstat_map ( multiFstatAtoms, windowRange ):
    '''
    GPU version of the function to compute the transient-window "F-statistic map"
    over start-time and timescale {t0, tau}.
    Based on XLALComputeTransientFstatMap from LALSuite,
    (C) 2009 Reinhard Prix, licensed under GPL.

    Returns a 2D matrix F_mn,
    with m = index over start-times t0,
    and n = index over timescales tau,
    in steps of dt0 in [t0, t0+t0Band]
    and dtau in [tau, tau+tauBand],
    as defined in the windowRange input.
    '''

    if ( windowRange.type >= lalpulsar.TRANSIENT_LAST ):
        raise ValueError('Unknown window-type ({}) passed as input. Allowed are [0,{}].'.format(windowRange.type, lalpulsar.TRANSIENT_LAST-1))

    # internal dict for search/setup parameters
    tCWparams = {}

    # first combine all multi-atoms
    # into a single atoms-vector with *unique* timestamps
    tCWparams['TAtom'] = multiFstatAtoms.data[0].TAtom
    TAtomHalf = int(tCWparams['TAtom']/2) # integer division
    atoms = lalpulsar.mergeMultiFstatAtomsBinned ( multiFstatAtoms,
                                                   tCWparams['TAtom'] )

    # make a combined input matrix of all atoms vectors, for transfer to GPU
    tCWparams['numAtoms'] = atoms.length
    atomsDict = reshape_FstatAtomsVector(atoms)
    atomsInputMatrix = np.column_stack ( (atomsDict['a2_alpha'],
                                          atomsDict['b2_alpha'],
                                          atomsDict['ab_alpha'],
                                          atomsDict['Fa_alpha_re'],
                                          atomsDict['Fa_alpha_im'],
                                          atomsDict['Fb_alpha_re'],
                                          atomsDict['Fb_alpha_im'])
                                        )

    # actual data spans [t0_data, t0_data + tCWparams['numAtoms'] * TAtom]
    # in steps of TAtom
    tCWparams['t0_data'] = int(atoms.data[0].timestamp)
    tCWparams['t1_data'] = int(atoms.data[tCWparams['numAtoms']-1].timestamp
                               + tCWparams['TAtom'])

    logging.debug('t0_data={:d}, t1_data={:d}'.format(tCWparams['t0_data'],
                                                      tCWparams['t1_data']))
    logging.debug('numAtoms={:d}, TAtom={:d}, TAtomHalf={:d}'.format(tCWparams['numAtoms'],
                                                                     tCWparams['TAtom'],
                                                                     TAtomHalf))

    # special treatment of window_type = none
    # ==> replace by rectangular window spanning all the data
    if ( windowRange.type == lalpulsar.TRANSIENT_NONE ):
        windowRange.type    = lalpulsar.TRANSIENT_RECTANGULAR
        windowRange.t0      = tCWparams['t0_data']
        windowRange.t0Band  = 0
        windowRange.dt0     = tCWparams['TAtom'] # irrelevant
        windowRange.tau     = tCWparams['numAtoms'] * tCWparams['TAtom']
        windowRange.tauBand = 0
        windowRange.dtau    = tCWparams['TAtom'] # irrelevant

    """ NOTE: indices {i,j} enumerate *actual* atoms and their timestamps t_i, while the
      * indices {m,n} enumerate the full grid of values in [t0_min, t0_max]x[Tcoh_min, Tcoh_max] in
      * steps of deltaT. This allows us to deal with gaps in the data in a transparent way.
      *
      * NOTE2: we operate on the 'binned' atoms returned from XLALmergeMultiFstatAtomsBinned(),
      * which means we can safely assume all atoms to be lined up perfectly on a 'deltaT' binned grid.
      *
      * The mapping used will therefore be {i,j} -> {m,n}:
      *   m = offs_i  / deltaT = start-time offset from t0_min, measured in deltaT
      *   n = Tcoh_ij / deltaT = duration Tcoh_ij, measured in deltaT,
      *
      * where
      *   offs_i  = t_i - t0_min
      *   Tcoh_ij = t_j - t_i + deltaT
      """
    # We allocate a matrix {m x n} = t0Range * TcohRange elements
    # covering the full transient window-range [t0,t0+t0Band]x[tau,tau+tauBand]
    tCWparams['N_t0Range']  = int(np.floor( 1.0*windowRange.t0Band / windowRange.dt0 ) + 1)
    tCWparams['N_tauRange'] = int(np.floor( 1.0*windowRange.tauBand / windowRange.dtau ) + 1)
    FstatMap = pyTransientFstatMap ( tCWparams['N_t0Range'], tCWparams['N_tauRange'] )

    logging.debug('N_t0Range={:d}, N_tauRange={:d}'.format(tCWparams['N_t0Range'],
                                                           tCWparams['N_tauRange']))

    if ( windowRange.type == lalpulsar.TRANSIENT_RECTANGULAR ):
        FstatMap.F_mn = pycuda_compute_transient_fstat_map_rect ( atomsInputMatrix,
                                                                  windowRange,
                                                                  tCWparams )
    elif ( windowRange.type == lalpulsar.TRANSIENT_EXPONENTIAL ):
        FstatMap.F_mn = pycuda_compute_transient_fstat_map_exp ( atomsInputMatrix,
                                                                 windowRange,
                                                                 tCWparams )
    else:
        raise ValueError('Invalid transient window type {} not in [{}, {}].'.format(windowRange.type, lalpulsar.TRANSIENT_NONE, lalpulsar.TRANSIENT_LAST-1))

    # out of the loop: get maxF and ML estimates over the m x n matrix
    FstatMap.maxF = FstatMap.F_mn.max()
    maxidx = np.unravel_index ( FstatMap.F_mn.argmax(),
                                (tCWparams['N_t0Range'],
                                 tCWparams['N_tauRange']) )
    FstatMap.t0_ML  = windowRange.t0  + maxidx[0] * windowRange.dt0
    FstatMap.tau_ML = windowRange.tau + maxidx[1] * windowRange.dtau

    logging.debug('Done computing transient F-stat map. maxF={}, t0_ML={}, tau_ML={}'.format(FstatMap.maxF, FstatMap.t0_ML, FstatMap.tau_ML))

    return FstatMap
def pycuda_compute_transient_fstat_map_rect ( atomsInputMatrix, windowRange, tCWparams ):
    '''only GPU-parallelizing the outer loop over t0, keeping partial sums with memory in the kernel'''

    # GPU data setup and transfer
    logging.debug('Initial GPU memory: {}/{} MB free'.format(drv.mem_get_info()[0]/(2.**20), drv.mem_get_info()[1]/(2.**20)))
    input_gpu = gpuarray.to_gpu ( atomsInputMatrix )
    Fmn_gpu = gpuarray.GPUArray ( (tCWparams['N_t0Range'], tCWparams['N_tauRange']), dtype=np.float32 )
    logging.debug('GPU memory with input+output allocated: {}/{} MB free'.format(drv.mem_get_info()[0]/(2.**20), drv.mem_get_info()[1]/(2.**20)))

    # GPU kernel
    kernel = 'cudaTransientFstatRectWindow'
    kernelfile = get_absolute_kernel_path ( kernel )
    partial_Fstat_cuda_code = cudacomp.SourceModule(open(kernelfile, 'r').read())
    partial_Fstat_cuda = partial_Fstat_cuda_code.get_function(kernel)
    # argument types: input pointer, 8 unsigned ints, output pointer
    # (must match the kernel signature in the .cu file)
    partial_Fstat_cuda.prepare('PIIIIIIIIP')

    # GPU grid setup: 1D, parallelizing only over the t0 range
    blockRows = min(1024, tCWparams['N_t0Range'])
    blockCols = 1
    gridRows = int(np.ceil(1.0*tCWparams['N_t0Range']/blockRows))
    gridCols = 1
    logging.debug('Looking to compute transient F-stat map over a N={}*{}={} (t0,tau) grid...'.format(tCWparams['N_t0Range'], tCWparams['N_tauRange'], tCWparams['N_t0Range']*tCWparams['N_tauRange']))

    # running the kernel
    logging.debug('Calling kernel with a grid of {}*{}={} blocks of {}*{}={} threads each: {} total threads...'.format(gridRows, gridCols, gridRows*gridCols, blockRows, blockCols, blockRows*blockCols, gridRows*gridCols*blockRows*blockCols))
    partial_Fstat_cuda.prepared_call ( (gridRows, gridCols), (blockRows, blockCols, 1), input_gpu.gpudata, tCWparams['numAtoms'], tCWparams['TAtom'], tCWparams['t0_data'], windowRange.t0, windowRange.dt0, windowRange.tau, windowRange.dtau, tCWparams['N_tauRange'], Fmn_gpu.gpudata )

    # return results to host
    F_mn = Fmn_gpu.get()
    logging.debug('Final GPU memory: {}/{} MB free'.format(drv.mem_get_info()[0]/(2.**20), drv.mem_get_info()[1]/(2.**20)))

    return F_mn
def pycuda_compute_transient_fstat_map_exp ( atomsInputMatrix, windowRange, tCWparams ):
    '''exponential window, inner and outer loop GPU-parallelized'''

    # GPU data setup and transfer
    logging.debug('Initial GPU memory: {}/{} MB free'.format(drv.mem_get_info()[0]/(2.**20), drv.mem_get_info()[1]/(2.**20)))
    input_gpu = gpuarray.to_gpu ( atomsInputMatrix )
    Fmn_gpu = gpuarray.GPUArray ( (tCWparams['N_t0Range'], tCWparams['N_tauRange']), dtype=np.float32 )
    logging.debug('GPU memory with input+output allocated: {}/{} MB free'.format(drv.mem_get_info()[0]/(2.**20), drv.mem_get_info()[1]/(2.**20)))

    # GPU kernel
    kernel = 'cudaTransientFstatExpWindow'
    kernelfile = get_absolute_kernel_path ( kernel )
    partial_Fstat_cuda_code = cudacomp.SourceModule(open(kernelfile, 'r').read())
    partial_Fstat_cuda = partial_Fstat_cuda_code.get_function(kernel)
    # argument types: input pointer, 9 unsigned ints, output pointer
    # (must match the kernel signature in the .cu file)
    partial_Fstat_cuda.prepare('PIIIIIIIIIP')

    # GPU grid setup: 2D, parallelizing over both the t0 and tau ranges
    blockRows = min(32, tCWparams['N_t0Range'])
    blockCols = min(32, tCWparams['N_tauRange'])
    gridRows = int(np.ceil(1.0*tCWparams['N_t0Range']/blockRows))
    gridCols = int(np.ceil(1.0*tCWparams['N_tauRange']/blockCols))
    logging.debug('Looking to compute transient F-stat map over a N={}*{}={} (t0,tau) grid...'.format(tCWparams['N_t0Range'], tCWparams['N_tauRange'], tCWparams['N_t0Range']*tCWparams['N_tauRange']))

    # running the kernel
    logging.debug('Calling kernel with a grid of {}*{}={} blocks of {}*{}={} threads each: {} total threads...'.format(gridRows, gridCols, gridRows*gridCols, blockRows, blockCols, blockRows*blockCols, gridRows*gridCols*blockRows*blockCols))
    partial_Fstat_cuda.prepared_call ( (gridRows, gridCols), (blockRows, blockCols, 1), input_gpu.gpudata, tCWparams['numAtoms'], tCWparams['TAtom'], tCWparams['t0_data'], windowRange.t0, windowRange.dt0, windowRange.tau, windowRange.dtau, tCWparams['N_t0Range'], tCWparams['N_tauRange'], Fmn_gpu.gpudata )

    # return results to host
    F_mn = Fmn_gpu.get()
    logging.debug('Final GPU memory: {}/{} MB free'.format(drv.mem_get_info()[0]/(2.**20), drv.mem_get_info()[1]/(2.**20)))

    return F_mn
@@ -7,4 +7,7 @@ setup(name='PyFstat',
      author='Gregory Ashton',
      author_email='gregory.ashton@ligo.org',
      packages=['pyfstat'],
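
The three added setup.py lines themselves are collapsed in this hunk; per the commit message they include the kernel .cu files in the packaging, which in a setuptools-based setup.py is typically done along these lines (an illustrative sketch only, not the literal diff; the package_data glob is an assumption based on get_absolute_kernel_path above):

from setuptools import setup

setup(name='PyFstat',
      packages=['pyfstat'],
      # ship the CUDA kernel sources alongside the Python package
      package_data={'pyfstat': ['pyCUDAkernels/*.cu']})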