diff --git a/pyfstat/core.py b/pyfstat/core.py
index 26fcf4e9a15a15f0f7a4793d94ab7818cf21bf14..50705e73f48d8ceb6c0ff0db3314a49c3426b7d2 100755
--- a/pyfstat/core.py
+++ b/pyfstat/core.py
@@ -632,8 +632,8 @@ class ComputeFstat(BaseSearchClass):
             self.windowRange.dt0 = self.Tsft
             self.windowRange.dtau = self.Tsft
 
-            # special treatment of window_type = none ==> replace by
-            # rectangular window spanning all the data
+            # special treatment of window_type = none
+            # ==> replace by rectangular window spanning all the data
             if self.windowRange.type == lalpulsar.TRANSIENT_NONE:
                 self.windowRange.t0 = int(self.minStartTime)
                 self.windowRange.t0Band = 0
@@ -661,6 +661,7 @@ class ComputeFstat(BaseSearchClass):
                     if self.dtau:
                         self.windowRange.dtau = self.dtau
 
+            logging.info('Initialising transient FstatMap features...')
             self.tCWFstatMapFeatures, self.gpu_context = (
                 tcw.init_transient_fstat_map_features(
                     self.tCWFstatMapVersion == 'pycuda', self.cudaDeviceName))
@@ -707,15 +708,6 @@ class ComputeFstat(BaseSearchClass):
             # F-stat computation
             self.windowRange.tau = int(2*self.Tsft)
 
-        # logging.debug(
-        #     'Calling "%s" version of ComputeTransientFstatMap() with\
-        #     windowRange: (type=%d (%s), t0=%f, t0Band=%f, dt0=%f, tau=%f,\
-        #     tauBand=%f, dtau=%f)...' % (
-        #         self.tCWFstatMapVersion, self.windowRange.type,
-        #         self.transientWindowType, self.windowRange.t0,
-        #         self.windowRange.t0Band, self.windowRange.dt0,
-        #         self.windowRange.tau, self.windowRange.tauBand,
-        #         self.windowRange.dtau))
         self.FstatMap = tcw.call_compute_transient_fstat_map(
             self.tCWFstatMapVersion, self.tCWFstatMapFeatures,
             self.FstatResults.multiFatoms[0], self.windowRange)
@@ -724,13 +716,6 @@ class ComputeFstat(BaseSearchClass):
         else:
             F_mn = self.FstatMap.F_mn
 
-        # logging.debug('maxF:   {}'.format(self.FstatMap.maxF))
-        # logging.debug('t0_ML:  %ds=T0+%fd' % (
-        #     self.FstatMap.t0_ML, (self.FstatMap.t0_ML-tstart)/(3600.*24.)))
-        # logging.debug('tau_ML: %ds=%fd' % (
-        #     self.FstatMap.tau_ML, self.FstatMap.tau_ML/(3600.*24.)))
-        # logging.debug('F_mn:   {}'.format(F_mn))
-
         twoF = 2*np.max(F_mn)
         if self.BSGL is False:
             if np.isnan(twoF):
diff --git a/pyfstat/grid_based_searches.py b/pyfstat/grid_based_searches.py
index 1dc24f2d5ae71e6f90ec1229c1312e74e088b29c..8d830c291da203b1c05f8f8008f7cd96cb675c2e 100644
--- a/pyfstat/grid_based_searches.py
+++ b/pyfstat/grid_based_searches.py
@@ -435,6 +435,10 @@ class TransientGridSearch(GridSearch):
             self.inititate_search_object()
 
         data = []
+        if self.outputTransientFstatMap:
+            tCWfilebase = os.path.splitext(self.out_file)[0] + '_tCW_'
+            logging.info('Will save per-Doppler Fstatmap' \
+                         ' results to {}*.dat'.format(tCWfilebase))
         for vals in tqdm(self.input_data):
             detstat = self.search.get_det_stat(*vals)
             windowRange = getattr(self.search, 'windowRange', None)
@@ -446,11 +450,18 @@ class TransientGridSearch(GridSearch):
                 else:
                     F_mn = FstatMap.F_mn
                 if self.outputTransientFstatMap:
-                    tCWfile = os.path.splitext(self.out_file)[0]+'_tCW_%.16f_%.16f_%.16f_%.16g_%.16g.dat' % (vals[2],vals[5],vals[6],vals[3],vals[4]) # freq alpha delta f1dot f2dot
+                    # per-Doppler filename convention:
+                    # freq alpha delta f1dot f2dot
+                    tCWfile = ( tCWfilebase
+                                + '%.16f_%.16f_%.16f_%.16g_%.16g.dat' %
+                                (vals[2],vals[5],vals[6],vals[3],vals[4]) )
                     if self.tCWFstatMapVersion == 'lal':
                         fo = lal.FileOpen(tCWfile, 'w')
-                        lalpulsar.write_transientFstatMap_to_fp ( fo, FstatMap, windowRange, None )
-                        del fo # instead of lal.FileClose() which is not SWIG-exported
+                        lalpulsar.write_transientFstatMap_to_fp (
+                            fo, FstatMap, windowRange, None )
+                        # instead of lal.FileClose(),
+                        # which is not SWIG-exported:
+                        del fo
                     else:
                         self.write_F_mn ( tCWfile, F_mn, windowRange)
                 maxidx = np.unravel_index(F_mn.argmax(), F_mn.shape)
diff --git a/pyfstat/pyCUDAkernels/cudaTransientFstatExpWindow.cu b/pyfstat/pyCUDAkernels/cudaTransientFstatExpWindow.cu
index 28b9d8e288f142a8d4222adb56709ef561599ef4..85d9972c5d6cee5c6e8c4443e8c425b8cb335b8d 100644
--- a/pyfstat/pyCUDAkernels/cudaTransientFstatExpWindow.cu
+++ b/pyfstat/pyCUDAkernels/cudaTransientFstatExpWindow.cu
@@ -26,7 +26,8 @@ __global__ void cudaTransientFstatExpWindow ( float *input,
     /* compute Fstat-atom index i_t0 in [0, numAtoms) */
     unsigned int TAtomHalf = TAtom/2; // integer division
     unsigned int t0 = win_t0 + m * win_dt0;
-    int i_tmp = ( t0 - t0_data + TAtomHalf ) / TAtom; // integer round: floor(x+0.5)
+    /* integer round: floor(x+0.5) */
+    int i_tmp = ( t0 - t0_data + TAtomHalf ) / TAtom;
     if ( i_tmp < 0 ) {
         i_tmp = 0;
     }
@@ -35,22 +36,26 @@ __global__ void cudaTransientFstatExpWindow ( float *input,
         i_t0 = numAtoms - 1;
     }
 
-    /* translate n into an atoms end-index for this search interval [t0, t0+Tcoh],
+    /* translate n into an atoms end-index
+     * for this search interval [t0, t0+Tcoh],
      * giving the index range of atoms to sum over
      */
     unsigned int tau = win_tau + n * win_dtau;
 
     /* get end-time t1 of this transient-window search
      * for given tau, what Tcoh should the exponential window cover?
-     * for speed reasons we want to truncate Tcoh = tau * TRANSIENT_EXP_EFOLDING
+     * for speed reasons we want to truncate
+     * Tcoh = tau * TRANSIENT_EXP_EFOLDING
      * with the e-folding factor chosen such that the window-value
      * is practically negligible after that, where it will be set to 0
      */
 //     unsigned int t1 = lround( win_t0 + TRANSIENT_EXP_EFOLDING * win_tau);
     unsigned int t1 = t0 + TRANSIENT_EXP_EFOLDING * tau;
 
-    /* compute window end-time Fstat-atom index i_t1 in [0, numAtoms) */
-    i_tmp = ( t1 - t0_data + TAtomHalf ) / TAtom  - 1; // integer round: floor(x+0.5)
+      /* compute window end-time Fstat-atom index i_t1 in [0, numAtoms)
+       * using integer round: floor(x+0.5)
+       */
+    i_tmp = ( t1 - t0_data + TAtomHalf ) / TAtom  - 1;
     if ( i_tmp < 0 ) {
         i_tmp = 0;
     }
diff --git a/pyfstat/pyCUDAkernels/cudaTransientFstatRectWindow.cu b/pyfstat/pyCUDAkernels/cudaTransientFstatRectWindow.cu
index 276cc2e27cb78ae008e8d74e235675f7510a1fde..dd353bbcf7fb52d2ff88e8b59d114057754a5f8c 100644
--- a/pyfstat/pyCUDAkernels/cudaTransientFstatRectWindow.cu
+++ b/pyfstat/pyCUDAkernels/cudaTransientFstatRectWindow.cu
@@ -20,7 +20,8 @@ __global__ void cudaTransientFstatRectWindow ( float *input,
   /* compute Fstat-atom index i_t0 in [0, numAtoms) */
   unsigned int TAtomHalf = TAtom/2; // integer division
   unsigned int t0 = win_t0 + m * win_dt0;
-  int i_tmp = ( t0 - t0_data + TAtomHalf ) / TAtom; // integer round: floor(x+0.5)
+  /* integer round: floor(x+0.5) */
+  int i_tmp = ( t0 - t0_data + TAtomHalf ) / TAtom;
   if ( i_tmp < 0 ) {
     i_tmp = 0;
   }
@@ -46,7 +47,8 @@ __global__ void cudaTransientFstatRectWindow ( float *input,
 
     if ( (m < N_tauRange) && (n < N_tauRange) ) {
 
-      /* translate n into an atoms end-index for this search interval [t0, t0+Tcoh],
+      /* translate n into an atoms end-index
+       * for this search interval [t0, t0+Tcoh],
        * giving the index range of atoms to sum over
        */
       unsigned int tau = win_tau + n * win_dtau;
@@ -54,8 +56,10 @@ __global__ void cudaTransientFstatRectWindow ( float *input,
       /* get end-time t1 of this transient-window search */
       unsigned int t1 = t0 + tau;
 
-      /* compute window end-time Fstat-atom index i_t1 in [0, numAtoms) */
-      i_tmp = ( t1 - t0_data + TAtomHalf ) / TAtom  - 1; // integer round: floor(x+0.5)
+      /* compute window end-time Fstat-atom index i_t1 in [0, numAtoms)
+       * using integer round: floor(x+0.5)
+       */
+      i_tmp = ( t1 - t0_data + TAtomHalf ) / TAtom  - 1;
       if ( i_tmp < 0 ) {
         i_tmp = 0;
       }
@@ -72,7 +76,8 @@ __global__ void cudaTransientFstatRectWindow ( float *input,
         /* sum up atoms,
          * special optimiziation in the rectangular-window case:
          * just add on to previous tau values,
-         * ie re-use the sum over [i_t0, i_t1_last] from the pevious tau-loop iteration
+         * ie re-use the sum over [i_t0, i_t1_last]
+         from the pevious tau-loop iteration
          */
         Ad    += input[i*input_cols+0]; // a2_alpha
         Bd    += input[i*input_cols+1]; // b2_alpha
diff --git a/pyfstat/tcw_fstat_map_funcs.py b/pyfstat/tcw_fstat_map_funcs.py
index c95509a1b97e376f310e1f77dbbaaafb7f139974..f84d71da5d7dd83ff455b43cd54b2880986483f7 100644
--- a/pyfstat/tcw_fstat_map_funcs.py
+++ b/pyfstat/tcw_fstat_map_funcs.py
@@ -9,7 +9,7 @@ import logging
 import importlib as imp
 
 
-def optional_import ( modulename, shorthand=None ):
+def _optional_import ( modulename, shorthand=None ):
     '''
     Import a module/submodule only if it's available.
 
@@ -29,12 +29,16 @@ def optional_import ( modulename, shorthand=None ):
     if('pycuda' in sys.modules):
         try:
             globals()[shorthand] = imp.import_module(modulename)
-            logging.debug('Successfully imported module %s%s.' % (modulename, shorthandbit))
+            logging.debug('Successfully imported module %s%s.'
+                          % (modulename, shorthandbit))
             success = True
         except pycuda._driver.LogicError, e:
             if e.message == 'cuDeviceGet failed: invalid device ordinal':
                 devn = int(os.environ['CUDA_DEVICE'])
-                raise RuntimeError('Requested CUDA device number {} exceeds number of available devices! Please change through environment variable $CUDA_DEVICE.'.format(devn))
+                raise RuntimeError('Requested CUDA device number {} exceeds' \
+                                   ' number of available devices!' \
+                                   ' Please change through environment' \
+                                   ' variable $CUDA_DEVICE.'.format(devn))
             else:
                 raise pycuda._driver.LogicError(e.message)
         except ImportError, e:
@@ -46,7 +50,8 @@ def optional_import ( modulename, shorthand=None ):
     else:
         try:
             globals()[shorthand] = imp.import_module(modulename)
-            logging.debug('Successfully imported module %s%s.' % (modulename, shorthandbit))
+            logging.debug('Successfully imported module %s%s.'
+                          % (modulename, shorthandbit))
             success = True
         except ImportError, e:
             if e.message == 'No module named '+modulename:
@@ -72,7 +77,11 @@ class pyTransientFstatMap(object):
 
     def __init__(self, N_t0Range, N_tauRange):
         self.F_mn   = np.zeros((N_t0Range, N_tauRange), dtype=np.float32)
-        self.maxF   = float(-1.0) # Initializing to a negative value ensures that we always update at least once and hence return sane t0_d_ML, tau_d_ML even if there is only a single bin where F=0 happens.
+        # Initializing maxF to a negative value ensures
+        # that we always update at least once and hence return
+        # sane t0_d_ML, tau_d_ML
+        # even if there is only a single bin where F=0 happens.
+        self.maxF   = float(-1.0)
         self.t0_ML  = float(0.0)
         self.tau_ML = float(0.0)
 
@@ -100,26 +109,28 @@ def init_transient_fstat_map_features ( wantCuda=False, cudaDeviceName=None ):
 
     features = {}
 
-    have_lal           = optional_import('lal')
-    have_lalpulsar     = optional_import('lalpulsar')
+    have_lal           = _optional_import('lal')
+    have_lalpulsar     = _optional_import('lalpulsar')
     features['lal']    = have_lal and have_lalpulsar
 
     # import GPU features
-    have_pycuda          = optional_import('pycuda')
-    have_pycuda_drv      = optional_import('pycuda.driver', 'drv')
-    have_pycuda_gpuarray = optional_import('pycuda.gpuarray', 'gpuarray')
-    have_pycuda_tools    = optional_import('pycuda.tools', 'cudatools')
-    have_pycuda_compiler = optional_import('pycuda.compiler', 'cudacomp')
-    features['pycuda']   = have_pycuda_drv and have_pycuda_gpuarray and have_pycuda_tools and have_pycuda_compiler
+    have_pycuda          = _optional_import('pycuda')
+    have_pycuda_drv      = _optional_import('pycuda.driver', 'drv')
+    have_pycuda_gpuarray = _optional_import('pycuda.gpuarray', 'gpuarray')
+    have_pycuda_tools    = _optional_import('pycuda.tools', 'cudatools')
+    have_pycuda_compiler = _optional_import('pycuda.compiler', 'cudacomp')
+    features['pycuda']   = ( have_pycuda_drv and have_pycuda_gpuarray and
+                            have_pycuda_tools and have_pycuda_compiler )
 
     logging.debug('Got the following features for transient F-stat maps:')
     logging.debug(features)
 
     if wantCuda and features['pycuda']:
-        logging.debug('CUDA version: {}'.format(drv.get_version()))
+        logging.debug('CUDA version: '+'.'.join(map(str,drv.get_version())))
 
         drv.init()
-        logging.debug('Starting with default context, then checking all available devices...')
+        logging.debug('Starting with default pyCUDA context,' \
+                      ' then checking all available devices...')
         context0 = pycuda.tools.make_default_context()
 
         num_gpus = drv.Device.count()
@@ -131,7 +142,8 @@ def init_transient_fstat_map_features ( wantCuda=False, cudaDeviceName=None ):
             devn = drv.Device(n)
             devices.append(devn)
             devnames[n] = devn.name().replace(' ','-').replace('_','-')
-            logging.debug('device {}: model: {}, RAM: {}MB'.format(n,devnames[n],devn.total_memory()/(2.**20) ))
+            logging.debug('device {}: model: {}, RAM: {}MB'.format(
+                n, devnames[n], devn.total_memory()/(2.**20) ))
 
         if 'CUDA_DEVICE' in os.environ:
             devnum0 = int(os.environ['CUDA_DEVICE'])
@@ -141,14 +153,19 @@ def init_transient_fstat_map_features ( wantCuda=False, cudaDeviceName=None ):
         matchbit = ''
         if cudaDeviceName:
             # allow partial matches in device names
-            devmatches = [devidx for devidx, devname in enumerate(devnames) if cudaDeviceName in devname]
+            devmatches = [devidx for devidx, devname in enumerate(devnames)
+                          if cudaDeviceName in devname]
             if len(devmatches) == 0:
                 context0.detach()
-                raise RuntimeError('Requested CUDA device "{}" not found. Available devices: [{}]'.format(cudaDeviceName,','.join(devnames)))
+                raise RuntimeError('Requested CUDA device "{}" not found.' \
+                                   ' Available devices: [{}]'.format(
+                                      cudaDeviceName,','.join(devnames)))
             else:
                 devnum = devmatches[0]
                 if len(devmatches) > 1:
-                    logging.warning('Found {} CUDA devices matching name "{}". Choosing first one with index {}.'.format(len(devmatches),cudaDeviceName,devnum))
+                    logging.warning('Found {} CUDA devices matching name "{}".' \
+                                    ' Choosing first one with index {}.'.format(
+                                        len(devmatches),cudaDeviceName,devnum))
             os.environ['CUDA_DEVICE'] = str(devnum)
             matchbit =  '(matched to user request "{}")'.format(cudaDeviceName)
         elif 'CUDA_DEVICE' in os.environ:
@@ -156,7 +173,9 @@ def init_transient_fstat_map_features ( wantCuda=False, cudaDeviceName=None ):
         else:
             devnum = 0
         devn = devices[devnum]
-        logging.info('Choosing CUDA device {}, of {} devices present: {}{}...'.format(devnum,num_gpus,devn.name(),matchbit))
+        logging.info('Choosing CUDA device {},' \
+                     ' of {} devices present: {}{}...'.format(
+                         devnum, num_gpus, devn.name(), matchbit))
         if devnum == devnum0:
             gpu_context = context0
         else:
@@ -164,23 +183,28 @@ def init_transient_fstat_map_features ( wantCuda=False, cudaDeviceName=None ):
             gpu_context = pycuda.tools.make_default_context()
             gpu_context.push()
 
-        logging.debug('Available GPU memory: {}/{} MB free'.format(drv.mem_get_info()[0]/(2.**20),drv.mem_get_info()[1]/(2.**20)))
+        _print_GPU_memory_MB('Available')
     else:
         gpu_context = None
 
     return features, gpu_context
 
 
-def call_compute_transient_fstat_map ( version, features, multiFstatAtoms=None, windowRange=None ):
+def call_compute_transient_fstat_map ( version,
+                                       features,
+                                       multiFstatAtoms=None,
+                                       windowRange=None ):
     '''Choose which version of the ComputeTransientFstatMap function to call.'''
 
     if version in fstatmap_versions:
         if features[version]:
             FstatMap = fstatmap_versions[version](multiFstatAtoms, windowRange)
         else:
-            raise Exception('Required module(s) for transient F-stat map method "{}" not available!'.format(version))
+            raise Exception('Required module(s) for transient F-stat map' \
+                            ' method "{}" not available!'.format(version))
     else:
-        raise Exception('Transient F-stat map method "{}" not implemented!'.format(version))
+        raise Exception('Transient F-stat map method "{}"' \
+                        ' not implemented!'.format(version))
     return FstatMap
 
 
@@ -215,12 +239,19 @@ def reshape_FstatAtomsVector ( atomsVector ):
     return atomsDict
 
 
-def get_absolute_kernel_path ( kernel ):
+def _get_absolute_kernel_path ( kernel ):
     pyfstatdir = os.path.dirname(os.path.abspath(os.path.realpath(__file__)))
     kernelfile = kernel + '.cu'
     return os.path.join(pyfstatdir,'pyCUDAkernels',kernelfile)
 
 
+def _print_GPU_memory_MB ( key ):
+    mem_used_MB  = drv.mem_get_info()[0]/(2.**20)
+    mem_total_MB = drv.mem_get_info()[1]/(2.**20)
+    logging.debug('{} GPU memory: {:.4f} / {:.4f} MB free'.format(
+                      key, mem_used_MB, mem_total_MB))
+
+
 def pycuda_compute_transient_fstat_map ( multiFstatAtoms, windowRange ):
     '''
     GPU version of the function to compute transient-window "F-statistic map"
@@ -237,7 +268,9 @@ def pycuda_compute_transient_fstat_map ( multiFstatAtoms, windowRange ):
     '''
 
     if ( windowRange.type >= lalpulsar.TRANSIENT_LAST ):
-        raise ValueError ('Unknown window-type ({}) passed as input. Allowed are [0,{}].'.format(windowRange.type, lalpulsar.TRANSIENT_LAST-1))
+        raise ValueError ('Unknown window-type ({}) passed as input.' \
+                          ' Allowed are [0,{}].'.format(
+                              windowRange.type, lalpulsar.TRANSIENT_LAST-1))
 
     # internal dict for search/setup parameters
     tCWparams = {}
@@ -264,13 +297,16 @@ def pycuda_compute_transient_fstat_map ( multiFstatAtoms, windowRange ):
     # actual data spans [t0_data, t0_data + tCWparams['numAtoms'] * TAtom]
     # in steps of TAtom
     tCWparams['t0_data'] = int(atoms.data[0].timestamp)
-    tCWparams['t1_data'] = int(atoms.data[tCWparams['numAtoms']-1].timestamp + tCWparams['TAtom'])
+    tCWparams['t1_data'] = int(atoms.data[tCWparams['numAtoms']-1].timestamp
+                               + tCWparams['TAtom'])
 
-    logging.debug('t0_data={:d}, t1_data={:d}'.format(tCWparams['t0_data'],
-                                                      tCWparams['t1_data']))
-    logging.debug('numAtoms={:d}, TAtom={:d}, TAtomHalf={:d}'.format(tCWparams['numAtoms'],
-                                                                     tCWparams['TAtom'],
-                                                                     TAtomHalf))
+    logging.debug('Transient F-stat map:' \
+                  ' t0_data={:d}, t1_data={:d}'.format(
+                      tCWparams['t0_data'], tCWparams['t1_data']))
+    logging.debug('Transient F-stat map:' \
+                  ' numAtoms={:d}, TAtom={:d},' \
+                  ' TAtomHalf={:d}'.format(
+                      tCWparams['numAtoms'], tCWparams['TAtom'], TAtomHalf))
 
     # special treatment of window_type = none
     # ==> replace by rectangular window spanning all the data
@@ -283,16 +319,21 @@ def pycuda_compute_transient_fstat_map ( multiFstatAtoms, windowRange ):
         windowRange.tauBand = 0;
         windowRange.dtau    = tCWparams['TAtom'] # irrelevant
 
-    """ NOTE: indices {i,j} enumerate *actual* atoms and their timestamps t_i, while the
-    * indices {m,n} enumerate the full grid of values in [t0_min, t0_max]x[Tcoh_min, Tcoh_max] in
-    * steps of deltaT. This allows us to deal with gaps in the data in a transparent way.
+    """ NOTE: indices {i,j} enumerate *actual* atoms and their timestamps t_i,
+    * while the indices {m,n} enumerate the full grid of values
+    * in [t0_min, t0_max]x[Tcoh_min, Tcoh_max] in steps of deltaT.
+    * This allows us to deal with gaps in the data in a transparent way.
     *
-    * NOTE2: we operate on the 'binned' atoms returned from XLALmergeMultiFstatAtomsBinned(),
-    * which means we can safely assume all atoms to be lined up perfectly on a 'deltaT' binned grid.
+    * NOTE2: we operate on the 'binned' atoms returned
+    * from XLALmergeMultiFstatAtomsBinned(),
+    * which means we can safely assume all atoms to be lined up
+    * perfectly on a 'deltaT' binned grid.
     *
     * The mapping used will therefore be {i,j} -> {m,n}:
-    *   m = offs_i  / deltaT		= start-time offset from t0_min measured in deltaT
-    *   n = Tcoh_ij / deltaT		= duration Tcoh_ij measured in deltaT,
+    *   m = offs_i  / deltaT
+    *   start-time offset from t0_min measured in deltaT
+    *   n = Tcoh_ij / deltaT
+    *   duration Tcoh_ij measured in deltaT,
     *
     * where
     *   offs_i  = t_i - t0_min
@@ -301,24 +342,31 @@ def pycuda_compute_transient_fstat_map ( multiFstatAtoms, windowRange ):
     """
 
     # We allocate a matrix  {m x n} = t0Range * TcohRange elements
-    # covering the full timerange the transient window-range [t0,t0+t0Band]x[tau,tau+tauBand]
-    tCWparams['N_t0Range']  = int(np.floor( 1.0*windowRange.t0Band / windowRange.dt0 ) + 1)
-    tCWparams['N_tauRange'] = int(np.floor( 1.0*windowRange.tauBand / windowRange.dtau ) + 1)
-    FstatMap = pyTransientFstatMap ( tCWparams['N_t0Range'], tCWparams['N_tauRange'] )
-
-    logging.debug('N_t0Range={:d}, N_tauRange={:d}'.format(tCWparams['N_t0Range'],
-                                                           tCWparams['N_tauRange']))
+    # covering the full transient window-range [t0,t0+t0Band]x[tau,tau+tauBand]
+    tCWparams['N_t0Range']  = int(np.floor( 1.0*windowRange.t0Band /
+                                            windowRange.dt0 ) + 1)
+    tCWparams['N_tauRange'] = int(np.floor( 1.0*windowRange.tauBand /
+                                            windowRange.dtau ) + 1)
+    FstatMap = pyTransientFstatMap ( tCWparams['N_t0Range'],
+                                     tCWparams['N_tauRange'] )
+
+    logging.debug('Transient F-stat map:' \
+                  ' N_t0Range={:d}, N_tauRange={:d},' \
+                  ' total grid points: {:d}'.format(
+                      tCWparams['N_t0Range'], tCWparams['N_tauRange'],
+                      tCWparams['N_t0Range']*tCWparams['N_tauRange']))
 
     if ( windowRange.type == lalpulsar.TRANSIENT_RECTANGULAR ):
-        FstatMap.F_mn = pycuda_compute_transient_fstat_map_rect ( atomsInputMatrix,
-                                                                  windowRange,
-                                                                  tCWparams )
+        FstatMap.F_mn = pycuda_compute_transient_fstat_map_rect (
+                           atomsInputMatrix, windowRange, tCWparams )
     elif ( windowRange.type == lalpulsar.TRANSIENT_EXPONENTIAL ):
-        FstatMap.F_mn = pycuda_compute_transient_fstat_map_exp ( atomsInputMatrix,
-                                                                 windowRange,
-                                                                 tCWparams )
+        FstatMap.F_mn = pycuda_compute_transient_fstat_map_exp (
+                           atomsInputMatrix, windowRange, tCWparams )
     else:
-        raise ValueError('Invalid transient window type {} not in [{}, {}].'.format(windowRange.type, lalpulsar.TRANSIENT_NONE, lalpulsar.TRANSIENT_LAST-1))
+        raise ValueError('Invalid transient window type {}' \
+                         ' not in [{}, {}].'.format(
+                            windowRange.type, lalpulsar.TRANSIENT_NONE,
+                            lalpulsar.TRANSIENT_LAST-1))
 
     # out of loop: get max2F and ML estimates over the m x n matrix
     FstatMap.maxF = FstatMap.F_mn.max()
@@ -328,23 +376,32 @@ def pycuda_compute_transient_fstat_map ( multiFstatAtoms, windowRange ):
     FstatMap.t0_ML  = windowRange.t0  + maxidx[0] * windowRange.dt0
     FstatMap.tau_ML = windowRange.tau + maxidx[1] * windowRange.dtau
 
-    logging.debug('Done computing transient F-stat map. maxF={}, t0_ML={}, tau_ML={}'.format(FstatMap.maxF , FstatMap.t0_ML, FstatMap.tau_ML))
+    logging.debug('Done computing transient F-stat map.' \
+                  ' maxF={:.4f}, t0_ML={}, tau_ML={}'.format(
+                      FstatMap.maxF , FstatMap.t0_ML, FstatMap.tau_ML))
 
     return FstatMap
 
 
-def pycuda_compute_transient_fstat_map_rect ( atomsInputMatrix, windowRange, tCWparams ):
-    '''only GPU-parallizing outer loop, keeping partial sums with memory in kernel'''
+def pycuda_compute_transient_fstat_map_rect ( atomsInputMatrix,
+                                              windowRange,
+                                              tCWparams ):
+    '''
+    only GPU-parallizing outer loop,
+    keeping partial sums with memory in kernel
+    '''
 
     # gpu data setup and transfer
-    logging.debug('Initial GPU memory: {}/{} MB free'.format(drv.mem_get_info()[0]/(2.**20),drv.mem_get_info()[1]/(2.**20)))
+    _print_GPU_memory_MB('Initial')
     input_gpu = gpuarray.to_gpu ( atomsInputMatrix )
-    Fmn_gpu   = gpuarray.GPUArray ( (tCWparams['N_t0Range'],tCWparams['N_tauRange']), dtype=np.float32 )
-    logging.debug('GPU memory with input+output allocated: {}/{} MB free'.format(drv.mem_get_info()[0]/(2.**20), drv.mem_get_info()[1]/(2.**20)))
+    Fmn_gpu   = gpuarray.GPUArray ( (tCWparams['N_t0Range'],
+                                     tCWparams['N_tauRange']),
+                                    dtype=np.float32 )
+    _print_GPU_memory_MB('After input+output allocation:')
 
     # GPU kernel
     kernel = 'cudaTransientFstatRectWindow'
-    kernelfile = get_absolute_kernel_path ( kernel )
+    kernelfile = _get_absolute_kernel_path ( kernel )
     partial_Fstat_cuda_code = cudacomp.SourceModule(open(kernelfile,'r').read())
     partial_Fstat_cuda = partial_Fstat_cuda_code.get_function(kernel)
     partial_Fstat_cuda.prepare('PIIIIIIIIP')
@@ -354,32 +411,48 @@ def pycuda_compute_transient_fstat_map_rect ( atomsInputMatrix, windowRange, tCW
     blockCols = 1
     gridRows  = int(np.ceil(1.0*tCWparams['N_t0Range']/blockRows))
     gridCols  = 1
-    logging.debug('Looking to compute transient F-stat map over a N={}*{}={} (t0,tau) grid...'.format(tCWparams['N_t0Range'],tCWparams['N_tauRange'],tCWparams['N_t0Range']*tCWparams['N_tauRange']))
 
     # running the kernel
-    logging.debug('Calling kernel with a grid of {}*{}={} blocks of {}*{}={} threads each: {} total threads...'.format(gridRows, gridCols, gridRows*gridCols, blockRows, blockCols, blockRows*blockCols, gridRows*gridCols*blockRows*blockCols))
-    partial_Fstat_cuda.prepared_call ( (gridRows,gridCols), (blockRows,blockCols,1), input_gpu.gpudata, tCWparams['numAtoms'], tCWparams['TAtom'], tCWparams['t0_data'], windowRange.t0, windowRange.dt0, windowRange.tau, windowRange.dtau, tCWparams['N_tauRange'], Fmn_gpu.gpudata )
+    logging.debug('Calling pyCUDA kernel with a grid of {}*{}={} blocks' \
+                  ' of {}*{}={} threads each: {} total threads...'.format(
+                      gridRows, gridCols, gridRows*gridCols,
+                      blockRows, blockCols, blockRows*blockCols,
+                      gridRows*gridCols*blockRows*blockCols))
+    partial_Fstat_cuda.prepared_call ( (gridRows,gridCols),
+                                       (blockRows,blockCols,1),
+                                       input_gpu.gpudata,
+                                       tCWparams['numAtoms'],
+                                       tCWparams['TAtom'],
+                                       tCWparams['t0_data'],
+                                       windowRange.t0, windowRange.dt0,
+                                       windowRange.tau, windowRange.dtau,
+                                       tCWparams['N_tauRange'],
+                                       Fmn_gpu.gpudata )
 
     # return results to host
     F_mn = Fmn_gpu.get()
 
-    logging.debug('Final GPU memory: {}/{} MB free'.format(drv.mem_get_info()[0]/(2.**20),drv.mem_get_info()[1]/(2.**20)))
+    _print_GPU_memory_MB('Final')
 
     return F_mn
 
 
-def pycuda_compute_transient_fstat_map_exp ( atomsInputMatrix, windowRange, tCWparams ):
+def pycuda_compute_transient_fstat_map_exp ( atomsInputMatrix,
+                                             windowRange,
+                                             tCWparams ):
     '''exponential window, inner and outer loop GPU-parallelized'''
 
     # gpu data setup and transfer
-    logging.debug('Initial GPU memory: {}/{} MB free'.format(drv.mem_get_info()[0]/(2.**20),drv.mem_get_info()[1]/(2.**20)))
+    _print_GPU_memory_MB('Initial')
     input_gpu = gpuarray.to_gpu ( atomsInputMatrix )
-    Fmn_gpu   = gpuarray.GPUArray ( (tCWparams['N_t0Range'],tCWparams['N_tauRange']), dtype=np.float32 )
-    logging.debug('GPU memory with input+output allocated: {}/{} MB free'.format(drv.mem_get_info()[0]/(2.**20), drv.mem_get_info()[1]/(2.**20)))
+    Fmn_gpu   = gpuarray.GPUArray ( (tCWparams['N_t0Range'],
+                                     tCWparams['N_tauRange']),
+                                    dtype=np.float32 )
+    _print_GPU_memory_MB('After input+output allocation:')
 
     # GPU kernel
     kernel = 'cudaTransientFstatExpWindow'
-    kernelfile = get_absolute_kernel_path ( kernel )
+    kernelfile = _get_absolute_kernel_path ( kernel )
     partial_Fstat_cuda_code = cudacomp.SourceModule(open(kernelfile,'r').read())
     partial_Fstat_cuda = partial_Fstat_cuda_code.get_function(kernel)
     partial_Fstat_cuda.prepare('PIIIIIIIIIP')
@@ -389,15 +462,28 @@ def pycuda_compute_transient_fstat_map_exp ( atomsInputMatrix, windowRange, tCWp
     blockCols = min(32,tCWparams['N_tauRange'])
     gridRows  = int(np.ceil(1.0*tCWparams['N_t0Range']/blockRows))
     gridCols  = int(np.ceil(1.0*tCWparams['N_tauRange']/blockCols))
-    logging.debug('Looking to compute transient F-stat map over a N={}*{}={} (t0,tau) grid...'.format(tCWparams['N_t0Range'], tCWparams['N_tauRange'], tCWparams['N_t0Range']*tCWparams['N_tauRange']))
 
     # running the kernel
-    logging.debug('Calling kernel with a grid of {}*{}={} blocks of {}*{}={} threads each: {} total threads...'.format(gridRows, gridCols, gridRows*gridCols, blockRows, blockCols, blockRows*blockCols, gridRows*gridCols*blockRows*blockCols))
-    partial_Fstat_cuda.prepared_call ( (gridRows,gridCols), (blockRows,blockCols,1), input_gpu.gpudata, tCWparams['numAtoms'], tCWparams['TAtom'], tCWparams['t0_data'], windowRange.t0, windowRange.dt0, windowRange.tau, windowRange.dtau, tCWparams['N_t0Range'], tCWparams['N_tauRange'], Fmn_gpu.gpudata )
+    logging.debug('Calling kernel with a grid of {}*{}={} blocks' \
+                  ' of {}*{}={} threads each: {} total threads...'.format(
+                      gridRows, gridCols, gridRows*gridCols,
+                      blockRows, blockCols, blockRows*blockCols,
+                      gridRows*gridCols*blockRows*blockCols))
+    partial_Fstat_cuda.prepared_call ( (gridRows,gridCols),
+                                       (blockRows,blockCols,1),
+                                       input_gpu.gpudata,
+                                       tCWparams['numAtoms'],
+                                       tCWparams['TAtom'],
+                                       tCWparams['t0_data'],
+                                       windowRange.t0, windowRange.dt0,
+                                       windowRange.tau, windowRange.dtau,
+                                       tCWparams['N_t0Range'],
+                                       tCWparams['N_tauRange'],
+                                       Fmn_gpu.gpudata )
 
     # return results to host
     F_mn = Fmn_gpu.get()
 
-    logging.debug('Final GPU memory: {}/{} MB free'.format(drv.mem_get_info()[0]/(2.**20),drv.mem_get_info()[1]/(2.**20)))
+    _print_GPU_memory_MB('Final')
 
     return F_mn