Commit d55178b6 authored by David Keitel's avatar David Keitel
Browse files

Merge branch 'tCWgpu' into 'master'

pyCUDA version of transient F-stat

See merge request !13
parents 793e1f4e 7e71b9d2
...@@ -50,7 +50,8 @@ search2 = pyfstat.TransientGridSearch( ...@@ -50,7 +50,8 @@ search2 = pyfstat.TransientGridSearch(
minStartTime=minStartTime, maxStartTime=maxStartTime, minStartTime=minStartTime, maxStartTime=maxStartTime,
transientWindowType='rect', t0Band=Tspan-2*Tsft, tauBand=Tspan, transientWindowType='rect', t0Band=Tspan-2*Tsft, tauBand=Tspan,
BSGL=False, BSGL=False,
outputTransientFstatMap=True) outputTransientFstatMap=True,
tCWFstatMapVersion='lal')
search2.run() search2.run()
search2.print_max_twoF() search2.print_max_twoF()
......
...@@ -13,6 +13,7 @@ import scipy.optimize ...@@ -13,6 +13,7 @@ import scipy.optimize
import lal import lal
import lalpulsar import lalpulsar
import pyfstat.helper_functions as helper_functions import pyfstat.helper_functions as helper_functions
import pyfstat.tcw_fstat_map_funcs as tcw
# workaround for matplotlib on X-less remote logins # workaround for matplotlib on X-less remote logins
if 'DISPLAY' in os.environ: if 'DISPLAY' in os.environ:
...@@ -335,7 +336,8 @@ class ComputeFstat(BaseSearchClass): ...@@ -335,7 +336,8 @@ class ComputeFstat(BaseSearchClass):
dt0=None, dtau=None, dt0=None, dtau=None,
detectors=None, minCoverFreq=None, maxCoverFreq=None, detectors=None, minCoverFreq=None, maxCoverFreq=None,
injectSources=None, injectSqrtSX=None, assumeSqrtSX=None, injectSources=None, injectSqrtSX=None, assumeSqrtSX=None,
SSBprec=None): SSBprec=None,
tCWFstatMapVersion='lal', cudaDeviceName=None):
""" """
Parameters Parameters
---------- ----------
...@@ -383,6 +385,11 @@ class ComputeFstat(BaseSearchClass): ...@@ -383,6 +385,11 @@ class ComputeFstat(BaseSearchClass):
SSBprec : int SSBprec : int
Flag to set the SSB calculation: 0=Newtonian, 1=relativistic, Flag to set the SSB calculation: 0=Newtonian, 1=relativistic,
2=relativisitic optimised, 3=DMoff, 4=NO_SPIN 2=relativisitic optimised, 3=DMoff, 4=NO_SPIN
tCWFstatMapVersion: str
Choose between standard 'lal' implementation,
'pycuda' for gpu, and some others for devel/debug.
cudaDeviceName: str
GPU name to be matched against drv.Device output.
""" """
...@@ -625,13 +632,14 @@ class ComputeFstat(BaseSearchClass): ...@@ -625,13 +632,14 @@ class ComputeFstat(BaseSearchClass):
self.windowRange.dt0 = self.Tsft self.windowRange.dt0 = self.Tsft
self.windowRange.dtau = self.Tsft self.windowRange.dtau = self.Tsft
# special treatment of window_type = none ==> replace by rectangular window spanning all the data # special treatment of window_type = none
# ==> replace by rectangular window spanning all the data
if self.windowRange.type == lalpulsar.TRANSIENT_NONE: if self.windowRange.type == lalpulsar.TRANSIENT_NONE:
self.windowRange.t0 = int(self.minStartTime) self.windowRange.t0 = int(self.minStartTime)
self.windowRange.t0Band = 0 self.windowRange.t0Band = 0
self.windowRange.tau = int(self.maxStartTime-self.minStartTime) self.windowRange.tau = int(self.maxStartTime-self.minStartTime)
self.windowRange.tauBand = 0 self.windowRange.tauBand = 0
else: # user-set bands and spacings else: # user-set bands and spacings
if self.t0Band is None: if self.t0Band is None:
self.windowRange.t0Band = 0 self.windowRange.t0Band = 0
else: else:
...@@ -653,6 +661,11 @@ class ComputeFstat(BaseSearchClass): ...@@ -653,6 +661,11 @@ class ComputeFstat(BaseSearchClass):
if self.dtau: if self.dtau:
self.windowRange.dtau = self.dtau self.windowRange.dtau = self.dtau
logging.info('Initialising transient FstatMap features...')
self.tCWFstatMapFeatures, self.gpu_context = (
tcw.init_transient_fstat_map_features(
self.tCWFstatMapVersion == 'pycuda', self.cudaDeviceName))
def get_fullycoherent_twoF(self, tstart, tend, F0, F1, F2, Alpha, Delta, def get_fullycoherent_twoF(self, tstart, tend, F0, F1, F2, Alpha, Delta,
asini=None, period=None, ecc=None, tp=None, asini=None, period=None, ecc=None, tp=None,
argp=None): argp=None):
...@@ -695,9 +708,13 @@ class ComputeFstat(BaseSearchClass): ...@@ -695,9 +708,13 @@ class ComputeFstat(BaseSearchClass):
# F-stat computation # F-stat computation
self.windowRange.tau = int(2*self.Tsft) self.windowRange.tau = int(2*self.Tsft)
self.FstatMap = lalpulsar.ComputeTransientFstatMap( self.FstatMap = tcw.call_compute_transient_fstat_map(
self.FstatResults.multiFatoms[0], self.windowRange, False) self.tCWFstatMapVersion, self.tCWFstatMapFeatures,
F_mn = self.FstatMap.F_mn.data self.FstatResults.multiFatoms[0], self.windowRange)
if self.tCWFstatMapVersion == 'lal':
F_mn = self.FstatMap.F_mn.data
else:
F_mn = self.FstatMap.F_mn
twoF = 2*np.max(F_mn) twoF = 2*np.max(F_mn)
if self.BSGL is False: if self.BSGL is False:
...@@ -920,6 +937,15 @@ class ComputeFstat(BaseSearchClass): ...@@ -920,6 +937,15 @@ class ComputeFstat(BaseSearchClass):
raise RuntimeError('Cannot print atoms vector to file: no FstatResults.multiFatoms, or it is None!') raise RuntimeError('Cannot print atoms vector to file: no FstatResults.multiFatoms, or it is None!')
def __del__(self):
"""
In pyCuda case without autoinit,
we need to make sure the context is removed at the end
"""
if hasattr(self,'gpu_context') and self.gpu_context:
self.gpu_context.detach()
class SemiCoherentSearch(ComputeFstat): class SemiCoherentSearch(ComputeFstat):
""" A semi-coherent search """ """ A semi-coherent search """
...@@ -950,6 +976,8 @@ class SemiCoherentSearch(ComputeFstat): ...@@ -950,6 +976,8 @@ class SemiCoherentSearch(ComputeFstat):
self.transientWindowType = 'rect' self.transientWindowType = 'rect'
self.t0Band = None self.t0Band = None
self.tauBand = None self.tauBand = None
self.tCWFstatMapVersion = 'lal'
self.cudaDeviceName = None
self.init_computefstatistic_single_point() self.init_computefstatistic_single_point()
self.init_semicoherent_parameters() self.init_semicoherent_parameters()
...@@ -1089,6 +1117,8 @@ class SemiCoherentGlitchSearch(ComputeFstat): ...@@ -1089,6 +1117,8 @@ class SemiCoherentGlitchSearch(ComputeFstat):
self.transientWindowType = 'rect' self.transientWindowType = 'rect'
self.t0Band = None self.t0Band = None
self.tauBand = None self.tauBand = None
self.tCWFstatMapVersion = 'lal'
self.cudaDeviceName = None
self.binary = False self.binary = False
self.init_computefstatistic_single_point() self.init_computefstatistic_single_point()
......
...@@ -355,7 +355,8 @@ class TransientGridSearch(GridSearch): ...@@ -355,7 +355,8 @@ class TransientGridSearch(GridSearch):
transientWindowType=None, t0Band=None, tauBand=None, transientWindowType=None, t0Band=None, tauBand=None,
dt0=None, dtau=None, dt0=None, dtau=None,
outputTransientFstatMap=False, outputTransientFstatMap=False,
outputAtoms=False): outputAtoms=False,
tCWFstatMapVersion='lal', cudaDeviceName=None):
""" """
Parameters Parameters
---------- ----------
...@@ -388,6 +389,11 @@ class TransientGridSearch(GridSearch): ...@@ -388,6 +389,11 @@ class TransientGridSearch(GridSearch):
outputTransientFstatMap: bool outputTransientFstatMap: bool
if true, write output files for (t0,tau) Fstat maps if true, write output files for (t0,tau) Fstat maps
(one file for each doppler grid point!) (one file for each doppler grid point!)
tCWFstatMapVersion: str
Choose between standard 'lal' implementation,
'pycuda' for gpu, and some others for devel/debug.
cudaDeviceName: str
GPU name to be matched against drv.Device output.
For all other parameters, see `pyfstat.ComputeFStat` for details For all other parameters, see `pyfstat.ComputeFStat` for details
""" """
...@@ -413,7 +419,9 @@ class TransientGridSearch(GridSearch): ...@@ -413,7 +419,9 @@ class TransientGridSearch(GridSearch):
minStartTime=self.minStartTime, maxStartTime=self.maxStartTime, minStartTime=self.minStartTime, maxStartTime=self.maxStartTime,
BSGL=self.BSGL, SSBprec=self.SSBprec, BSGL=self.BSGL, SSBprec=self.SSBprec,
injectSources=self.injectSources, injectSources=self.injectSources,
assumeSqrtSX=self.assumeSqrtSX) assumeSqrtSX=self.assumeSqrtSX,
tCWFstatMapVersion=self.tCWFstatMapVersion,
cudaDeviceName=self.cudaDeviceName)
self.search.get_det_stat = self.search.get_fullycoherent_twoF self.search.get_det_stat = self.search.get_fullycoherent_twoF
def run(self, return_data=False): def run(self, return_data=False):
...@@ -427,19 +435,36 @@ class TransientGridSearch(GridSearch): ...@@ -427,19 +435,36 @@ class TransientGridSearch(GridSearch):
self.inititate_search_object() self.inititate_search_object()
data = [] data = []
if self.outputTransientFstatMap:
tCWfilebase = os.path.splitext(self.out_file)[0] + '_tCW_'
logging.info('Will save per-Doppler Fstatmap' \
' results to {}*.dat'.format(tCWfilebase))
for vals in tqdm(self.input_data): for vals in tqdm(self.input_data):
detstat = self.search.get_det_stat(*vals) detstat = self.search.get_det_stat(*vals)
windowRange = getattr(self.search, 'windowRange', None) windowRange = getattr(self.search, 'windowRange', None)
FstatMap = getattr(self.search, 'FstatMap', None) FstatMap = getattr(self.search, 'FstatMap', None)
thisCand = list(vals) + [detstat] thisCand = list(vals) + [detstat]
if getattr(self, 'transientWindowType', None): if getattr(self, 'transientWindowType', None):
if self.tCWFstatMapVersion == 'lal':
F_mn = FstatMap.F_mn.data
else:
F_mn = FstatMap.F_mn
if self.outputTransientFstatMap: if self.outputTransientFstatMap:
tCWfile = os.path.splitext(self.out_file)[0]+'_tCW_%.16f_%.16f_%.16f_%.16g_%.16g.dat' % (vals[2],vals[5],vals[6],vals[3],vals[4]) # freq alpha delta f1dot f2dot # per-Doppler filename convention:
fo = lal.FileOpen(tCWfile, 'w') # freq alpha delta f1dot f2dot
lalpulsar.write_transientFstatMap_to_fp ( fo, FstatMap, windowRange, None ) tCWfile = ( tCWfilebase
del fo # instead of lal.FileClose() which is not SWIG-exported + '%.16f_%.16f_%.16f_%.16g_%.16g.dat' %
Fmn = FstatMap.F_mn.data (vals[2],vals[5],vals[6],vals[3],vals[4]) )
maxidx = np.unravel_index(Fmn.argmax(), Fmn.shape) if self.tCWFstatMapVersion == 'lal':
fo = lal.FileOpen(tCWfile, 'w')
lalpulsar.write_transientFstatMap_to_fp (
fo, FstatMap, windowRange, None )
# instead of lal.FileClose(),
# which is not SWIG-exported:
del fo
else:
self.write_F_mn ( tCWfile, F_mn, windowRange)
maxidx = np.unravel_index(F_mn.argmax(), F_mn.shape)
thisCand += [windowRange.t0+maxidx[0]*windowRange.dt0, thisCand += [windowRange.t0+maxidx[0]*windowRange.dt0,
windowRange.tau+maxidx[1]*windowRange.dtau] windowRange.tau+maxidx[1]*windowRange.dtau]
data.append(thisCand) data.append(thisCand)
...@@ -453,6 +478,19 @@ class TransientGridSearch(GridSearch): ...@@ -453,6 +478,19 @@ class TransientGridSearch(GridSearch):
self.save_array_to_disk(data) self.save_array_to_disk(data)
self.data = data self.data = data
def write_F_mn (self, tCWfile, F_mn, windowRange ):
with open(tCWfile, 'w') as tfp:
tfp.write('# t0 [s] tau [s] 2F\n')
for m, F_m in enumerate(F_mn):
this_t0 = windowRange.t0 + m * windowRange.dt0
for n, this_F in enumerate(F_m):
this_tau = windowRange.tau + n * windowRange.dtau;
tfp.write(' %10d %10d %- 11.8g\n' % (this_t0, this_tau, 2.0*this_F))
def __del__(self):
if hasattr(self,'search'):
self.search.__del__()
class SliceGridSearch(GridSearch): class SliceGridSearch(GridSearch):
""" Slice gridded search using ComputeFstat """ """ Slice gridded search using ComputeFstat """
......
...@@ -82,6 +82,9 @@ class MCMCSearch(core.BaseSearchClass): ...@@ -82,6 +82,9 @@ class MCMCSearch(core.BaseSearchClass):
('none' instead of None explicitly calls the transient-window function, ('none' instead of None explicitly calls the transient-window function,
but with the full range, for debugging) but with the full range, for debugging)
Currently only supported for nsegs=1. Currently only supported for nsegs=1.
tCWFstatMapVersion: str
Choose between standard 'lal' implementation,
'pycuda' for gpu, and some others for devel/debug.
Attributes Attributes
---------- ----------
...@@ -115,7 +118,7 @@ class MCMCSearch(core.BaseSearchClass): ...@@ -115,7 +118,7 @@ class MCMCSearch(core.BaseSearchClass):
rhohatmax=1000, binary=False, BSGL=False, rhohatmax=1000, binary=False, BSGL=False,
SSBprec=None, minCoverFreq=None, maxCoverFreq=None, SSBprec=None, minCoverFreq=None, maxCoverFreq=None,
injectSources=None, assumeSqrtSX=None, injectSources=None, assumeSqrtSX=None,
transientWindowType=None): transientWindowType=None, tCWFstatMapVersion='lal'):
if os.path.isdir(outdir) is False: if os.path.isdir(outdir) is False:
os.mkdir(outdir) os.mkdir(outdir)
...@@ -161,7 +164,8 @@ class MCMCSearch(core.BaseSearchClass): ...@@ -161,7 +164,8 @@ class MCMCSearch(core.BaseSearchClass):
transientWindowType=self.transientWindowType, transientWindowType=self.transientWindowType,
minStartTime=self.minStartTime, maxStartTime=self.maxStartTime, minStartTime=self.minStartTime, maxStartTime=self.maxStartTime,
binary=self.binary, injectSources=self.injectSources, binary=self.binary, injectSources=self.injectSources,
assumeSqrtSX=self.assumeSqrtSX, SSBprec=self.SSBprec) assumeSqrtSX=self.assumeSqrtSX, SSBprec=self.SSBprec,
tCWFstatMapVersion=self.tCWFstatMapVersion)
if self.minStartTime is None: if self.minStartTime is None:
self.minStartTime = self.search.minStartTime self.minStartTime = self.search.minStartTime
if self.maxStartTime is None: if self.maxStartTime is None:
...@@ -2212,7 +2216,8 @@ class MCMCTransientSearch(MCMCSearch): ...@@ -2212,7 +2216,8 @@ class MCMCTransientSearch(MCMCSearch):
transientWindowType=self.transientWindowType, transientWindowType=self.transientWindowType,
minStartTime=self.minStartTime, maxStartTime=self.maxStartTime, minStartTime=self.minStartTime, maxStartTime=self.maxStartTime,
BSGL=self.BSGL, binary=self.binary, BSGL=self.BSGL, binary=self.binary,
injectSources=self.injectSources) injectSources=self.injectSources,
tCWFstatMapVersion=self.tCWFstatMapVersion)
if self.minStartTime is None: if self.minStartTime is None:
self.minStartTime = self.search.minStartTime self.minStartTime = self.search.minStartTime
if self.maxStartTime is None: if self.maxStartTime is None:
......
__global__ void cudaTransientFstatExpWindow ( float *input,
unsigned int numAtoms,
unsigned int TAtom,
unsigned int t0_data,
unsigned int win_t0,
unsigned int win_dt0,
unsigned int win_tau,
unsigned int win_dtau,
unsigned int Fmn_rows,
unsigned int Fmn_cols,
float *Fmn
)
{
/* match CUDA thread indexing and high-level (t0,tau) indexing */
unsigned int m = blockDim.x * blockIdx.x + threadIdx.x; // t0: row
unsigned int n = blockDim.y * blockIdx.y + threadIdx.y; // tau: column
/* unraveled 1D index for 2D output array */
unsigned int outidx = Fmn_cols * m + n;
/* hardcoded copy from lalpulsar */
unsigned int TRANSIENT_EXP_EFOLDING = 3;
if ( (m < Fmn_rows) && (n < Fmn_cols) ) {
/* compute Fstat-atom index i_t0 in [0, numAtoms) */
unsigned int TAtomHalf = TAtom/2; // integer division
unsigned int t0 = win_t0 + m * win_dt0;
/* integer round: floor(x+0.5) */
int i_tmp = ( t0 - t0_data + TAtomHalf ) / TAtom;
if ( i_tmp < 0 ) {
i_tmp = 0;
}
unsigned int i_t0 = (unsigned int)i_tmp;
if ( i_t0 >= numAtoms ) {
i_t0 = numAtoms - 1;
}
/* translate n into an atoms end-index
* for this search interval [t0, t0+Tcoh],
* giving the index range of atoms to sum over
*/
unsigned int tau = win_tau + n * win_dtau;
/* get end-time t1 of this transient-window search
* for given tau, what Tcoh should the exponential window cover?
* for speed reasons we want to truncate
* Tcoh = tau * TRANSIENT_EXP_EFOLDING
* with the e-folding factor chosen such that the window-value
* is practically negligible after that, where it will be set to 0
*/
// unsigned int t1 = lround( win_t0 + TRANSIENT_EXP_EFOLDING * win_tau);
unsigned int t1 = t0 + TRANSIENT_EXP_EFOLDING * tau;
/* compute window end-time Fstat-atom index i_t1 in [0, numAtoms)
* using integer round: floor(x+0.5)
*/
i_tmp = ( t1 - t0_data + TAtomHalf ) / TAtom - 1;
if ( i_tmp < 0 ) {
i_tmp = 0;
}
unsigned int i_t1 = (unsigned int)i_tmp;
if ( i_t1 >= numAtoms ) {
i_t1 = numAtoms - 1;
}
/* now we have two valid atoms-indices [i_t0, i_t1]
* spanning our Fstat-window to sum over
*/
float Ad = 0.0f;
float Bd = 0.0f;
float Cd = 0.0f;
float Fa_re = 0.0f;
float Fa_im = 0.0f;
float Fb_re = 0.0f;
float Fb_im = 0.0f;
unsigned short input_cols = 7; // must match input matrix!
/* sum up atoms */
for ( unsigned int i=i_t0; i<=i_t1; i++ ) {
unsigned int t_i = t0_data + i * TAtom;
float win_i = 0.0;
if ( t_i >= t0 && t_i <= t1 ) {
float x = 1.0 * ( t_i - t0 ) / tau;
win_i = exp ( -x );
}
float win2_i = win_i * win_i;
Ad += input[i*input_cols+0] * win2_i; // a2_alpha
Bd += input[i*input_cols+1] * win2_i; // b2_alpha
Cd += input[i*input_cols+2] * win2_i; // ab_alpha
Fa_re += input[i*input_cols+3] * win_i; // Fa_alpha_re
Fa_im += input[i*input_cols+4] * win_i; // Fa_alpha_im
Fb_re += input[i*input_cols+5] * win_i; // Fb_alpha_re
Fb_im += input[i*input_cols+6] * win_i; // Fb_alpha_im
}
/* get determinant */
float Dd = ( Ad * Bd - Cd * Cd );
float DdInv = 0.0f;
/* safety catch as in XLALWeightMultiAMCoeffs():
* make it so that in the end F=0 instead of -nan
*/
if ( Dd > 0.0 ) {
DdInv = 1.0 / Dd;
}
/* from XLALComputeFstatFromFaFb */
float F = DdInv * ( Bd * ( Fa_re*Fa_re + Fa_im*Fa_im )
+ Ad * ( Fb_re*Fb_re + Fb_im*Fb_im )
- 2.0 * Cd * ( Fa_re * Fb_re + Fa_im * Fb_im )
);
/* store result in Fstat-matrix
* at unraveled index of element {m,n}
*/
Fmn[outidx] = F;
} // ( (m < Fmn_rows) && (n < Fmn_cols) )
} // cudaTransientFstatExpWindow()
__global__ void cudaTransientFstatRectWindow ( float *input,
unsigned int numAtoms,
unsigned int TAtom,
unsigned int t0_data,
unsigned int win_t0,
unsigned int win_dt0,
unsigned int win_tau,
unsigned int win_dtau,
unsigned int N_tauRange,
float *Fmn
)
{
/* match CUDA thread indexing and high-level (t0,tau) indexing */
// assume 1D block, grid setup
unsigned int m = blockDim.x * blockIdx.x + threadIdx.x; // t0: row
unsigned short input_cols = 7; // must match input matrix!
/* compute Fstat-atom index i_t0 in [0, numAtoms) */
unsigned int TAtomHalf = TAtom/2; // integer division
unsigned int t0 = win_t0 + m * win_dt0;
/* integer round: floor(x+0.5) */
int i_tmp = ( t0 - t0_data + TAtomHalf ) / TAtom;
if ( i_tmp < 0 ) {
i_tmp = 0;
}
unsigned int i_t0 = (unsigned int)i_tmp;
if ( i_t0 >= numAtoms ) {
i_t0 = numAtoms - 1;
}
float Ad = 0.0f;
float Bd = 0.0f;
float Cd = 0.0f;
float Fa_re = 0.0f;
float Fa_im = 0.0f;
float Fb_re = 0.0f;
float Fb_im = 0.0f;
unsigned int i_t1_last = i_t0;
/* INNER loop over timescale-parameter tau
* NOT parallelized so that we can still use the i_t1_last trick
* (empirically seems to be faster than 2D CUDA version)
*/
for ( unsigned int n = 0; n < N_tauRange; n ++ ) {
if ( (m < N_tauRange) && (n < N_tauRange) ) {
/* translate n into an atoms end-index
* for this search interval [t0, t0+Tcoh],
* giving the index range of atoms to sum over
*/
unsigned int tau = win_tau + n * win_dtau;
/* get end-time t1 of this transient-window search */
unsigned int t1 = t0 + tau;
/* compute window end-time Fstat-atom index i_t1 in [0, numAtoms)
* using integer round: floor(x+0.5)
*/
i_tmp = ( t1 - t0_data + TAtomHalf ) / TAtom - 1;
if ( i_tmp < 0 ) {
i_tmp = 0;
}
unsigned int i_t1 = (unsigned int)i_tmp;
if ( i_t1 >= numAtoms ) {
i_t1 = numAtoms - 1;
}
/* now we have two valid atoms-indices [i_t0, i_t1]
* spanning our Fstat-window to sum over
*/
for ( unsigned int i = i_t1_last; i <= i_t1; i ++ ) {
/* sum up atoms,
* special optimiziation in the rectangular-window case:
* just add on to previous tau values,
* ie re-use the sum over [i_t0, i_t1_last]
from the pevious tau-loop iteration
*/
Ad += input[i*input_cols+0]; // a2_alpha
Bd += input[i*input_cols+1]; // b2_alpha
Cd += input[i*input_cols+2]; // ab_alpha
Fa_re += input[i*input_cols+3]; // Fa_alpha_re
Fa_im += input[i*input_cols+4]; // Fa_alpha_im
Fb_re += input[i*input_cols+5]; // Fb_alpha_re
Fb_im += input[i*input_cols+6]; // Fb_alpha_im
/* keep track of up to where we summed for the next iteration */
i_t1_last = i_t1 + 1;
}
/* get determinant */
float Dd = ( Ad * Bd - Cd * Cd );
float DdInv = 0.0f;
/* safety catch as in XLALWeightMultiAMCoeffs():
* make it so that in the end F=0 instead of -nan
*/
if ( Dd > 0.0 ) {
DdInv = 1.0 / Dd;
}
/* from XLALComputeFstatFromFaFb */
float F = DdInv * ( Bd * ( Fa_re*Fa_re + Fa_im*Fa_im )
+ Ad * ( Fb_re*Fb_re + Fb_im*Fb_im )
- 2.0 * Cd * ( Fa_re * Fb_re + Fa_im * Fb_im )
);
/* store result in Fstat-matrix
* at unraveled index of element {m,n}
*/
unsigned int outidx = m * N_tauRange + n;
Fmn[outidx] = F;
} // if ( (m < N_tauRange) && (n < N_tauRange) )
} // for ( unsigned int n = 0; n < N_tauRange; n ++ )
} // cudaTransientFstatRectWindow()
""" Additional helper functions dealing with transient-CW F(t0,tau) maps """
import numpy as np
import os
import sys