Commit d55178b6 authored by David Keitel's avatar David Keitel

Merge branch 'tCWgpu' into 'master'

pyCUDA version of transient F-stat

See merge request !13
parents 793e1f4e 7e71b9d2
......@@ -50,7 +50,8 @@ search2 = pyfstat.TransientGridSearch(
minStartTime=minStartTime, maxStartTime=maxStartTime,
transientWindowType='rect', t0Band=Tspan-2*Tsft, tauBand=Tspan,
BSGL=False,
outputTransientFstatMap=True)
outputTransientFstatMap=True,
tCWFstatMapVersion='lal')
search2.run()
search2.print_max_twoF()
......
......@@ -13,6 +13,7 @@ import scipy.optimize
import lal
import lalpulsar
import pyfstat.helper_functions as helper_functions
import pyfstat.tcw_fstat_map_funcs as tcw
# workaround for matplotlib on X-less remote logins
if 'DISPLAY' in os.environ:
......@@ -335,7 +336,8 @@ class ComputeFstat(BaseSearchClass):
dt0=None, dtau=None,
detectors=None, minCoverFreq=None, maxCoverFreq=None,
injectSources=None, injectSqrtSX=None, assumeSqrtSX=None,
SSBprec=None):
SSBprec=None,
tCWFstatMapVersion='lal', cudaDeviceName=None):
"""
Parameters
----------
......@@ -383,6 +385,11 @@ class ComputeFstat(BaseSearchClass):
SSBprec : int
Flag to set the SSB calculation: 0=Newtonian, 1=relativistic,
2=relativisitic optimised, 3=DMoff, 4=NO_SPIN
tCWFstatMapVersion: str
Choose between standard 'lal' implementation,
'pycuda' for gpu, and some others for devel/debug.
cudaDeviceName: str
GPU name to be matched against drv.Device output.
"""
......@@ -625,13 +632,14 @@ class ComputeFstat(BaseSearchClass):
self.windowRange.dt0 = self.Tsft
self.windowRange.dtau = self.Tsft
# special treatment of window_type = none ==> replace by rectangular window spanning all the data
# special treatment of window_type = none
# ==> replace by rectangular window spanning all the data
if self.windowRange.type == lalpulsar.TRANSIENT_NONE:
self.windowRange.t0 = int(self.minStartTime)
self.windowRange.t0Band = 0
self.windowRange.tau = int(self.maxStartTime-self.minStartTime)
self.windowRange.tauBand = 0
else: # user-set bands and spacings
else: # user-set bands and spacings
if self.t0Band is None:
self.windowRange.t0Band = 0
else:
......@@ -653,6 +661,11 @@ class ComputeFstat(BaseSearchClass):
if self.dtau:
self.windowRange.dtau = self.dtau
logging.info('Initialising transient FstatMap features...')
self.tCWFstatMapFeatures, self.gpu_context = (
tcw.init_transient_fstat_map_features(
self.tCWFstatMapVersion == 'pycuda', self.cudaDeviceName))
def get_fullycoherent_twoF(self, tstart, tend, F0, F1, F2, Alpha, Delta,
asini=None, period=None, ecc=None, tp=None,
argp=None):
......@@ -695,9 +708,13 @@ class ComputeFstat(BaseSearchClass):
# F-stat computation
self.windowRange.tau = int(2*self.Tsft)
self.FstatMap = lalpulsar.ComputeTransientFstatMap(
self.FstatResults.multiFatoms[0], self.windowRange, False)
F_mn = self.FstatMap.F_mn.data
self.FstatMap = tcw.call_compute_transient_fstat_map(
self.tCWFstatMapVersion, self.tCWFstatMapFeatures,
self.FstatResults.multiFatoms[0], self.windowRange)
if self.tCWFstatMapVersion == 'lal':
F_mn = self.FstatMap.F_mn.data
else:
F_mn = self.FstatMap.F_mn
twoF = 2*np.max(F_mn)
if self.BSGL is False:
......@@ -920,6 +937,15 @@ class ComputeFstat(BaseSearchClass):
raise RuntimeError('Cannot print atoms vector to file: no FstatResults.multiFatoms, or it is None!')
def __del__(self):
"""
In pyCuda case without autoinit,
we need to make sure the context is removed at the end
"""
if hasattr(self,'gpu_context') and self.gpu_context:
self.gpu_context.detach()
class SemiCoherentSearch(ComputeFstat):
""" A semi-coherent search """
......@@ -950,6 +976,8 @@ class SemiCoherentSearch(ComputeFstat):
self.transientWindowType = 'rect'
self.t0Band = None
self.tauBand = None
self.tCWFstatMapVersion = 'lal'
self.cudaDeviceName = None
self.init_computefstatistic_single_point()
self.init_semicoherent_parameters()
......@@ -1089,6 +1117,8 @@ class SemiCoherentGlitchSearch(ComputeFstat):
self.transientWindowType = 'rect'
self.t0Band = None
self.tauBand = None
self.tCWFstatMapVersion = 'lal'
self.cudaDeviceName = None
self.binary = False
self.init_computefstatistic_single_point()
......
......@@ -355,7 +355,8 @@ class TransientGridSearch(GridSearch):
transientWindowType=None, t0Band=None, tauBand=None,
dt0=None, dtau=None,
outputTransientFstatMap=False,
outputAtoms=False):
outputAtoms=False,
tCWFstatMapVersion='lal', cudaDeviceName=None):
"""
Parameters
----------
......@@ -388,6 +389,11 @@ class TransientGridSearch(GridSearch):
outputTransientFstatMap: bool
if true, write output files for (t0,tau) Fstat maps
(one file for each doppler grid point!)
tCWFstatMapVersion: str
Choose between standard 'lal' implementation,
'pycuda' for gpu, and some others for devel/debug.
cudaDeviceName: str
GPU name to be matched against drv.Device output.
For all other parameters, see `pyfstat.ComputeFStat` for details
"""
......@@ -413,7 +419,9 @@ class TransientGridSearch(GridSearch):
minStartTime=self.minStartTime, maxStartTime=self.maxStartTime,
BSGL=self.BSGL, SSBprec=self.SSBprec,
injectSources=self.injectSources,
assumeSqrtSX=self.assumeSqrtSX)
assumeSqrtSX=self.assumeSqrtSX,
tCWFstatMapVersion=self.tCWFstatMapVersion,
cudaDeviceName=self.cudaDeviceName)
self.search.get_det_stat = self.search.get_fullycoherent_twoF
def run(self, return_data=False):
......@@ -427,19 +435,36 @@ class TransientGridSearch(GridSearch):
self.inititate_search_object()
data = []
if self.outputTransientFstatMap:
tCWfilebase = os.path.splitext(self.out_file)[0] + '_tCW_'
logging.info('Will save per-Doppler Fstatmap' \
' results to {}*.dat'.format(tCWfilebase))
for vals in tqdm(self.input_data):
detstat = self.search.get_det_stat(*vals)
windowRange = getattr(self.search, 'windowRange', None)
FstatMap = getattr(self.search, 'FstatMap', None)
thisCand = list(vals) + [detstat]
if getattr(self, 'transientWindowType', None):
if self.tCWFstatMapVersion == 'lal':
F_mn = FstatMap.F_mn.data
else:
F_mn = FstatMap.F_mn
if self.outputTransientFstatMap:
tCWfile = os.path.splitext(self.out_file)[0]+'_tCW_%.16f_%.16f_%.16f_%.16g_%.16g.dat' % (vals[2],vals[5],vals[6],vals[3],vals[4]) # freq alpha delta f1dot f2dot
fo = lal.FileOpen(tCWfile, 'w')
lalpulsar.write_transientFstatMap_to_fp ( fo, FstatMap, windowRange, None )
del fo # instead of lal.FileClose() which is not SWIG-exported
Fmn = FstatMap.F_mn.data
maxidx = np.unravel_index(Fmn.argmax(), Fmn.shape)
# per-Doppler filename convention:
# freq alpha delta f1dot f2dot
tCWfile = ( tCWfilebase
+ '%.16f_%.16f_%.16f_%.16g_%.16g.dat' %
(vals[2],vals[5],vals[6],vals[3],vals[4]) )
if self.tCWFstatMapVersion == 'lal':
fo = lal.FileOpen(tCWfile, 'w')
lalpulsar.write_transientFstatMap_to_fp (
fo, FstatMap, windowRange, None )
# instead of lal.FileClose(),
# which is not SWIG-exported:
del fo
else:
self.write_F_mn ( tCWfile, F_mn, windowRange)
maxidx = np.unravel_index(F_mn.argmax(), F_mn.shape)
thisCand += [windowRange.t0+maxidx[0]*windowRange.dt0,
windowRange.tau+maxidx[1]*windowRange.dtau]
data.append(thisCand)
......@@ -453,6 +478,19 @@ class TransientGridSearch(GridSearch):
self.save_array_to_disk(data)
self.data = data
def write_F_mn (self, tCWfile, F_mn, windowRange ):
with open(tCWfile, 'w') as tfp:
tfp.write('# t0 [s] tau [s] 2F\n')
for m, F_m in enumerate(F_mn):
this_t0 = windowRange.t0 + m * windowRange.dt0
for n, this_F in enumerate(F_m):
this_tau = windowRange.tau + n * windowRange.dtau;
tfp.write(' %10d %10d %- 11.8g\n' % (this_t0, this_tau, 2.0*this_F))
def __del__(self):
if hasattr(self,'search'):
self.search.__del__()
class SliceGridSearch(GridSearch):
""" Slice gridded search using ComputeFstat """
......
......@@ -82,6 +82,9 @@ class MCMCSearch(core.BaseSearchClass):
('none' instead of None explicitly calls the transient-window function,
but with the full range, for debugging)
Currently only supported for nsegs=1.
tCWFstatMapVersion: str
Choose between standard 'lal' implementation,
'pycuda' for gpu, and some others for devel/debug.
Attributes
----------
......@@ -115,7 +118,7 @@ class MCMCSearch(core.BaseSearchClass):
rhohatmax=1000, binary=False, BSGL=False,
SSBprec=None, minCoverFreq=None, maxCoverFreq=None,
injectSources=None, assumeSqrtSX=None,
transientWindowType=None):
transientWindowType=None, tCWFstatMapVersion='lal'):
if os.path.isdir(outdir) is False:
os.mkdir(outdir)
......@@ -161,7 +164,8 @@ class MCMCSearch(core.BaseSearchClass):
transientWindowType=self.transientWindowType,
minStartTime=self.minStartTime, maxStartTime=self.maxStartTime,
binary=self.binary, injectSources=self.injectSources,
assumeSqrtSX=self.assumeSqrtSX, SSBprec=self.SSBprec)
assumeSqrtSX=self.assumeSqrtSX, SSBprec=self.SSBprec,
tCWFstatMapVersion=self.tCWFstatMapVersion)
if self.minStartTime is None:
self.minStartTime = self.search.minStartTime
if self.maxStartTime is None:
......@@ -2212,7 +2216,8 @@ class MCMCTransientSearch(MCMCSearch):
transientWindowType=self.transientWindowType,
minStartTime=self.minStartTime, maxStartTime=self.maxStartTime,
BSGL=self.BSGL, binary=self.binary,
injectSources=self.injectSources)
injectSources=self.injectSources,
tCWFstatMapVersion=self.tCWFstatMapVersion)
if self.minStartTime is None:
self.minStartTime = self.search.minStartTime
if self.maxStartTime is None:
......
__global__ void cudaTransientFstatExpWindow ( float *input,
unsigned int numAtoms,
unsigned int TAtom,
unsigned int t0_data,
unsigned int win_t0,
unsigned int win_dt0,
unsigned int win_tau,
unsigned int win_dtau,
unsigned int Fmn_rows,
unsigned int Fmn_cols,
float *Fmn
)
{
/* match CUDA thread indexing and high-level (t0,tau) indexing */
unsigned int m = blockDim.x * blockIdx.x + threadIdx.x; // t0: row
unsigned int n = blockDim.y * blockIdx.y + threadIdx.y; // tau: column
/* unraveled 1D index for 2D output array */
unsigned int outidx = Fmn_cols * m + n;
/* hardcoded copy from lalpulsar */
unsigned int TRANSIENT_EXP_EFOLDING = 3;
if ( (m < Fmn_rows) && (n < Fmn_cols) ) {
/* compute Fstat-atom index i_t0 in [0, numAtoms) */
unsigned int TAtomHalf = TAtom/2; // integer division
unsigned int t0 = win_t0 + m * win_dt0;
/* integer round: floor(x+0.5) */
int i_tmp = ( t0 - t0_data + TAtomHalf ) / TAtom;
if ( i_tmp < 0 ) {
i_tmp = 0;
}
unsigned int i_t0 = (unsigned int)i_tmp;
if ( i_t0 >= numAtoms ) {
i_t0 = numAtoms - 1;
}
/* translate n into an atoms end-index
* for this search interval [t0, t0+Tcoh],
* giving the index range of atoms to sum over
*/
unsigned int tau = win_tau + n * win_dtau;
/* get end-time t1 of this transient-window search
* for given tau, what Tcoh should the exponential window cover?
* for speed reasons we want to truncate
* Tcoh = tau * TRANSIENT_EXP_EFOLDING
* with the e-folding factor chosen such that the window-value
* is practically negligible after that, where it will be set to 0
*/
// unsigned int t1 = lround( win_t0 + TRANSIENT_EXP_EFOLDING * win_tau);
unsigned int t1 = t0 + TRANSIENT_EXP_EFOLDING * tau;
/* compute window end-time Fstat-atom index i_t1 in [0, numAtoms)
* using integer round: floor(x+0.5)
*/
i_tmp = ( t1 - t0_data + TAtomHalf ) / TAtom - 1;
if ( i_tmp < 0 ) {
i_tmp = 0;
}
unsigned int i_t1 = (unsigned int)i_tmp;
if ( i_t1 >= numAtoms ) {
i_t1 = numAtoms - 1;
}
/* now we have two valid atoms-indices [i_t0, i_t1]
* spanning our Fstat-window to sum over
*/
float Ad = 0.0f;
float Bd = 0.0f;
float Cd = 0.0f;
float Fa_re = 0.0f;
float Fa_im = 0.0f;
float Fb_re = 0.0f;
float Fb_im = 0.0f;
unsigned short input_cols = 7; // must match input matrix!
/* sum up atoms */
for ( unsigned int i=i_t0; i<=i_t1; i++ ) {
unsigned int t_i = t0_data + i * TAtom;
float win_i = 0.0;
if ( t_i >= t0 && t_i <= t1 ) {
float x = 1.0 * ( t_i - t0 ) / tau;
win_i = exp ( -x );
}
float win2_i = win_i * win_i;
Ad += input[i*input_cols+0] * win2_i; // a2_alpha
Bd += input[i*input_cols+1] * win2_i; // b2_alpha
Cd += input[i*input_cols+2] * win2_i; // ab_alpha
Fa_re += input[i*input_cols+3] * win_i; // Fa_alpha_re
Fa_im += input[i*input_cols+4] * win_i; // Fa_alpha_im
Fb_re += input[i*input_cols+5] * win_i; // Fb_alpha_re
Fb_im += input[i*input_cols+6] * win_i; // Fb_alpha_im
}
/* get determinant */
float Dd = ( Ad * Bd - Cd * Cd );
float DdInv = 0.0f;
/* safety catch as in XLALWeightMultiAMCoeffs():
* make it so that in the end F=0 instead of -nan
*/
if ( Dd > 0.0 ) {
DdInv = 1.0 / Dd;
}
/* from XLALComputeFstatFromFaFb */
float F = DdInv * ( Bd * ( Fa_re*Fa_re + Fa_im*Fa_im )
+ Ad * ( Fb_re*Fb_re + Fb_im*Fb_im )
- 2.0 * Cd * ( Fa_re * Fb_re + Fa_im * Fb_im )
);
/* store result in Fstat-matrix
* at unraveled index of element {m,n}
*/
Fmn[outidx] = F;
} // ( (m < Fmn_rows) && (n < Fmn_cols) )
} // cudaTransientFstatExpWindow()
__global__ void cudaTransientFstatRectWindow ( float *input,
unsigned int numAtoms,
unsigned int TAtom,
unsigned int t0_data,
unsigned int win_t0,
unsigned int win_dt0,
unsigned int win_tau,
unsigned int win_dtau,
unsigned int N_tauRange,
float *Fmn
)
{
/* match CUDA thread indexing and high-level (t0,tau) indexing */
// assume 1D block, grid setup
unsigned int m = blockDim.x * blockIdx.x + threadIdx.x; // t0: row
unsigned short input_cols = 7; // must match input matrix!
/* compute Fstat-atom index i_t0 in [0, numAtoms) */
unsigned int TAtomHalf = TAtom/2; // integer division
unsigned int t0 = win_t0 + m * win_dt0;
/* integer round: floor(x+0.5) */
int i_tmp = ( t0 - t0_data + TAtomHalf ) / TAtom;
if ( i_tmp < 0 ) {
i_tmp = 0;
}
unsigned int i_t0 = (unsigned int)i_tmp;
if ( i_t0 >= numAtoms ) {
i_t0 = numAtoms - 1;
}
float Ad = 0.0f;
float Bd = 0.0f;
float Cd = 0.0f;
float Fa_re = 0.0f;
float Fa_im = 0.0f;
float Fb_re = 0.0f;
float Fb_im = 0.0f;
unsigned int i_t1_last = i_t0;
/* INNER loop over timescale-parameter tau
* NOT parallelized so that we can still use the i_t1_last trick
* (empirically seems to be faster than 2D CUDA version)
*/
for ( unsigned int n = 0; n < N_tauRange; n ++ ) {
if ( (m < N_tauRange) && (n < N_tauRange) ) {
/* translate n into an atoms end-index
* for this search interval [t0, t0+Tcoh],
* giving the index range of atoms to sum over
*/
unsigned int tau = win_tau + n * win_dtau;
/* get end-time t1 of this transient-window search */
unsigned int t1 = t0 + tau;
/* compute window end-time Fstat-atom index i_t1 in [0, numAtoms)
* using integer round: floor(x+0.5)
*/
i_tmp = ( t1 - t0_data + TAtomHalf ) / TAtom - 1;
if ( i_tmp < 0 ) {
i_tmp = 0;
}
unsigned int i_t1 = (unsigned int)i_tmp;
if ( i_t1 >= numAtoms ) {
i_t1 = numAtoms - 1;
}
/* now we have two valid atoms-indices [i_t0, i_t1]
* spanning our Fstat-window to sum over
*/
for ( unsigned int i = i_t1_last; i <= i_t1; i ++ ) {
/* sum up atoms,
* special optimiziation in the rectangular-window case:
* just add on to previous tau values,
* ie re-use the sum over [i_t0, i_t1_last]
from the pevious tau-loop iteration
*/
Ad += input[i*input_cols+0]; // a2_alpha
Bd += input[i*input_cols+1]; // b2_alpha
Cd += input[i*input_cols+2]; // ab_alpha
Fa_re += input[i*input_cols+3]; // Fa_alpha_re
Fa_im += input[i*input_cols+4]; // Fa_alpha_im
Fb_re += input[i*input_cols+5]; // Fb_alpha_re
Fb_im += input[i*input_cols+6]; // Fb_alpha_im
/* keep track of up to where we summed for the next iteration */
i_t1_last = i_t1 + 1;
}
/* get determinant */
float Dd = ( Ad * Bd - Cd * Cd );
float DdInv = 0.0f;
/* safety catch as in XLALWeightMultiAMCoeffs():
* make it so that in the end F=0 instead of -nan
*/
if ( Dd > 0.0 ) {
DdInv = 1.0 / Dd;
}
/* from XLALComputeFstatFromFaFb */
float F = DdInv * ( Bd * ( Fa_re*Fa_re + Fa_im*Fa_im )
+ Ad * ( Fb_re*Fb_re + Fb_im*Fb_im )
- 2.0 * Cd * ( Fa_re * Fb_re + Fa_im * Fb_im )
);
/* store result in Fstat-matrix
* at unraveled index of element {m,n}
*/
unsigned int outidx = m * N_tauRange + n;
Fmn[outidx] = F;
} // if ( (m < N_tauRange) && (n < N_tauRange) )
} // for ( unsigned int n = 0; n < N_tauRange; n ++ )
} // cudaTransientFstatRectWindow()
""" Additional helper functions dealing with transient-CW F(t0,tau) maps """
import numpy as np
import os
import sys
import logging
# optional imports
import importlib as imp
def _optional_import ( modulename, shorthand=None ):
'''
Import a module/submodule only if it's available.
using importlib instead of __import__
because the latter doesn't handle sub.modules
Also including a special check to fail more gracefully
when CUDA_DEVICE is set to too high a number.
'''
if shorthand is None:
shorthand = modulename
shorthandbit = ''
else:
shorthandbit = ' as '+shorthand
try:
globals()[shorthand] = imp.import_module(modulename)
logging.debug('Successfully imported module %s%s.'
% (modulename, shorthandbit))
success = True
except ImportError, e:
if e.message == 'No module named '+modulename:
logging.debug('No module {:s} found.'.format(modulename))
success = False
else:
raise
return success
class pyTransientFstatMap(object):
'''
simplified object class for a F(t0,tau) F-stat map (not 2F!)
based on LALSuite's transientFstatMap_t type
replacing the gsl matrix with a numpy array
F_mn: 2D array of 2F values
maxF: maximum of F (not 2F!)
t0_ML: maximum likelihood transient start time t0 estimate
tau_ML: maximum likelihood transient duration tau estimate
'''
def __init__(self, N_t0Range, N_tauRange):
self.F_mn = np.zeros((N_t0Range, N_tauRange), dtype=np.float32)
# Initializing maxF to a negative value ensures
# that we always update at least once and hence return
# sane t0_d_ML, tau_d_ML
# even if there is only a single bin where F=0 happens.
self.maxF = float(-1.0)
self.t0_ML = float(0.0)
self.tau_ML = float(0.0)
# dictionary of the actual callable F-stat map functions we support,
# if the corresponding modules are available.
fstatmap_versions = {
'lal': lambda multiFstatAtoms, windowRange:
getattr(lalpulsar,'ComputeTransientFstatMap')
( multiFstatAtoms, windowRange, False ),
'pycuda': lambda multiFstatAtoms, windowRange:
pycuda_compute_transient_fstat_map
( multiFstatAtoms, windowRange )
}
def init_transient_fstat_map_features ( wantCuda=False, cudaDeviceName=None ):
'''
Initialization of available modules (or "features") for F-stat maps.
Returns a dictionary of method names, to match fstatmap_versions
each key's value set to True only if
all required modules are importable on this system.
'''
features = {}
have_lal = _optional_import('lal')
have_lalpulsar = _optional_import('lalpulsar')
features['lal'] = have_lal and have_lalpulsar
# import GPU features
have_pycuda = _optional_import('pycuda')
have_pycuda_drv = _optional_import('pycuda.driver', 'drv')
have_pycuda_gpuarray = _optional_import('pycuda.gpuarray', 'gpuarray')
have_pycuda_tools = _optional_import('pycuda.tools', 'cudatools')
have_pycuda_compiler = _optional_import('pycuda.compiler', 'cudacomp')
features['pycuda'] = ( have_pycuda_drv and have_pycuda_gpuarray and
have_pycuda_tools and have_pycuda_compiler )
logging.debug('Got the following features for transient F-stat maps:')
logging.debug(features)
if wantCuda and features['pycuda']:
logging.debug('CUDA version: '+'.'.join(map(str,drv.get_version())))
drv.init()
logging.debug('Starting with default pyCUDA context,' \
' then checking all available devices...')
try:
context0 = pycuda.tools.make_default_context()
except pycuda._driver.LogicError, e:
if e.message == 'cuDeviceGet failed: invalid device ordinal':
devn = int(os.environ['CUDA_DEVICE'])
raise RuntimeError('Requested CUDA device number {} exceeds' \
' number of available devices!' \
' Please change through environment' \
' variable $CUDA_DEVICE.'.format(devn))
else:
raise pycuda._driver.LogicError(e.message)
num_gpus = drv.Device.count()
logging.debug('Found {} CUDA device(s).'.format(num_gpus))
devices = []
devnames = np.empty(num_gpus,dtype='S32')
for n in range(num_gpus):
devn = drv.Device(n)