diff --git a/pyfstat/core.py b/pyfstat/core.py
index abf18565e240dcad542e5446a82b0e51d902a0d8..6a624985e5c9e256ed76c098c9b984d6969e62a9 100755
--- a/pyfstat/core.py
+++ b/pyfstat/core.py
@@ -337,7 +337,7 @@ class ComputeFstat(BaseSearchClass):
                  detectors=None, minCoverFreq=None, maxCoverFreq=None,
                  injectSources=None, injectSqrtSX=None, assumeSqrtSX=None,
                  SSBprec=None,
-                 tCWFstatMapVersion='lal'):
+                 tCWFstatMapVersion='lal', cudaDeviceName=None):
         """
         Parameters
         ----------
@@ -388,6 +388,8 @@ class ComputeFstat(BaseSearchClass):
         tCWFstatMapVersion: str
             Choose between standard 'lal' implementation,
             'pycuda' for gpu, and some others for devel/debug.
+        cudaDeviceName: str
+            GPU to use: must equal a drv.Device name with ' ' and '_' replaced by '-' (default None: use $CUDA_DEVICE, else device 0).
 
         """
 
@@ -658,7 +660,7 @@ class ComputeFstat(BaseSearchClass):
                     if self.dtau:
                         self.windowRange.dtau = self.dtau
 
-            self.tCWFstatMapFeatures = tcw.init_transient_fstat_map_features()
+            self.tCWFstatMapFeatures, self.gpu_context = tcw.init_transient_fstat_map_features(self.cudaDeviceName)
 
     def get_fullycoherent_twoF(self, tstart, tend, F0, F1, F2, Alpha, Delta,
                                asini=None, period=None, ecc=None, tp=None,
@@ -939,6 +941,15 @@ class ComputeFstat(BaseSearchClass):
             raise RuntimeError('Cannot print atoms vector to file: no FstatResults.multiFatoms, or it is None!')
 
 
+    def __del__(self):
+        """ Detach the pyCuda context, which is needed when pycuda.autoinit is not used. """
+        # Forget the context once detached: TransientGridSearch.__del__ calls this
+        # method explicitly, so the finalizer may run it again on the same object.
+        if getattr(self, 'gpu_context', None):
+            self.gpu_context.detach()
+            self.gpu_context = None
+
+
 class SemiCoherentSearch(ComputeFstat):
     """ A semi-coherent search """
 
diff --git a/pyfstat/grid_based_searches.py b/pyfstat/grid_based_searches.py
index 29d590f88d9d3e318a912779ac21f3dd0d040622..1dc24f2d5ae71e6f90ec1229c1312e74e088b29c 100644
--- a/pyfstat/grid_based_searches.py
+++ b/pyfstat/grid_based_searches.py
@@ -356,7 +356,7 @@ class TransientGridSearch(GridSearch):
                  dt0=None, dtau=None,
                  outputTransientFstatMap=False,
                  outputAtoms=False,
-                 tCWFstatMapVersion='lal'):
+                 tCWFstatMapVersion='lal', cudaDeviceName=None):
         """
         Parameters
         ----------
@@ -392,6 +392,8 @@ class TransientGridSearch(GridSearch):
         tCWFstatMapVersion: str
             Choose between standard 'lal' implementation,
             'pycuda' for gpu, and some others for devel/debug.
+        cudaDeviceName: str
+            GPU to use: must equal a drv.Device name with ' ' and '_' replaced by '-' (default None: use $CUDA_DEVICE, else device 0).
 
         For all other parameters, see `pyfstat.ComputeFStat` for details
         """
@@ -418,7 +420,8 @@ class TransientGridSearch(GridSearch):
             BSGL=self.BSGL, SSBprec=self.SSBprec,
             injectSources=self.injectSources,
             assumeSqrtSX=self.assumeSqrtSX,
-            tCWFstatMapVersion=self.tCWFstatMapVersion)
+            tCWFstatMapVersion=self.tCWFstatMapVersion,
+            cudaDeviceName=self.cudaDeviceName)
         self.search.get_det_stat = self.search.get_fullycoherent_twoF
 
     def run(self, return_data=False):
@@ -473,6 +476,10 @@ class TransientGridSearch(GridSearch):
                     this_tau = windowRange.tau + n * windowRange.dtau;
                     tfp.write('  %10d %10d %- 11.8g\n' % (this_t0, this_tau, 2.0*this_F))
 
+    def __del__(self):  # forward finalization to the wrapped search object
+        if hasattr(self,'search'):  # 'search' may be missing if its setup never ran
+            self.search.__del__()  # explicit call; presumably ComputeFstat.__del__, which detaches the GPU context -- TODO confirm
+
 
 class SliceGridSearch(GridSearch):
     """ Slice gridded search using ComputeFstat """
diff --git a/pyfstat/tcw_fstat_map_funcs.py b/pyfstat/tcw_fstat_map_funcs.py
index b054d20ff01978d92023732ae77b72488f422769..d149a6bfc4c02db3a882189431eca108867a346f 100644
--- a/pyfstat/tcw_fstat_map_funcs.py
+++ b/pyfstat/tcw_fstat_map_funcs.py
@@ -89,7 +89,7 @@ fstatmap_versions = {
                     }
 
 
-def init_transient_fstat_map_features ( ):
+def init_transient_fstat_map_features ( cudaDeviceName ):
     '''
     Initialization of available modules (or "features") for F-stat maps.
 
@@ -106,12 +106,11 @@ def init_transient_fstat_map_features ( ):
 
     # import GPU features
     have_pycuda          = optional_import('pycuda')
-    have_pycuda_init     = optional_import('pycuda.autoinit', 'autoinit')
     have_pycuda_drv      = optional_import('pycuda.driver', 'drv')
     have_pycuda_gpuarray = optional_import('pycuda.gpuarray', 'gpuarray')
     have_pycuda_tools    = optional_import('pycuda.tools', 'cudatools')
     have_pycuda_compiler = optional_import('pycuda.compiler', 'cudacomp')
-    features['pycuda']   = have_pycuda_drv and have_pycuda_init and have_pycuda_gpuarray and have_pycuda_tools and have_pycuda_compiler
+    features['pycuda']   = have_pycuda_drv and have_pycuda_gpuarray and have_pycuda_tools and have_pycuda_compiler
 
     logging.debug('Got the following features for transient F-stat maps:')
     logging.debug(features)
@@ -119,25 +118,54 @@ def init_transient_fstat_map_features ( ):
     if features['pycuda']:
         logging.debug('CUDA version: {}'.format(drv.get_version()))
 
+        drv.init()
+        logging.debug('Starting with default context, then checking all available devices...')
+        context0 = pycuda.tools.make_default_context()
+
         num_gpus = drv.Device.count()
         logging.debug('Found {} CUDA device(s).'.format(num_gpus))
 
         devices = []
+        devnames = np.empty(num_gpus,dtype=object)  # keep names as str: an 'S32' array holds bytes on Python 3, which never equal cudaDeviceName and break ','.join()
         for n in range(num_gpus):
-            devices.append(drv.Device(n))
-
-        for n, devn in enumerate(devices):
-            logging.debug('device {} model: {}, RAM: {}MB'.format(n,devn.name(),devn.total_memory()/(2.**20) ))
+            devn = drv.Device(n)
+            devices.append(devn)
+            devnames[n] = devn.name().replace(' ','-').replace('_','-')
+            logging.debug('device {}: model: {}, RAM: {}MB'.format(n,devnames[n],devn.total_memory()/(2.**20) ))
 
         if 'CUDA_DEVICE' in os.environ:
+            devnum0 = int(os.environ['CUDA_DEVICE'])
+        else:
+            devnum0 = 0
+
+        if cudaDeviceName:
+            devmatches = np.where(devnames == cudaDeviceName)[0]
+            if len(devmatches) == 0:
+                context0.detach()
+                raise RuntimeError('Requested CUDA device "{}" not found. Available devices: [{}]'.format(cudaDeviceName,','.join(devnames)))
+            else:
+                devnum = devmatches[0]
+                if len(devmatches) > 1:
+                    logging.warning('Found {} CUDA devices matching name "{}". Choosing first one with index {}.'.format(len(devmatches),cudaDeviceName,devnum))
+            os.environ['CUDA_DEVICE'] = str(devnum)
+        elif 'CUDA_DEVICE' in os.environ:
             devnum = int(os.environ['CUDA_DEVICE'])
         else:
             devnum = 0
-        devn = drv.Device(devnum)
-        logging.info('Choosing CUDA device {}, of {} devices present: {}... (Can be changed through environment variable $CUDA_DEVICE.)'.format(devnum,num_gpus,devn.name()))
+        devn = devices[devnum]
+        logging.info('Choosing CUDA device {}, of {} devices present: {} (matched to user request "{}")...'.format(devnum,num_gpus,devn.name(),devnames[devnum]))
+        if devnum == devnum0:
+            gpu_context = context0
+        else:
+            context0.pop()
+            gpu_context = pycuda.tools.make_default_context()
+            gpu_context.push()
+
         logging.debug('Available GPU memory: {}/{} MB free'.format(drv.mem_get_info()[0]/(2.**20),drv.mem_get_info()[1]/(2.**20)))
+    else:
+        gpu_context = None
 
-    return features
+    return features, gpu_context
 
 
 def call_compute_transient_fstat_map ( version, features, multiFstatAtoms=None, windowRange=None ):