diff --git a/superpixel_classification/SuperpixelClassification/SuperpixelClassification.xml b/superpixel_classification/SuperpixelClassification/SuperpixelClassification.xml
index 38c7b77..bbb276a 100644
--- a/superpixel_classification/SuperpixelClassification/SuperpixelClassification.xml
+++ b/superpixel_classification/SuperpixelClassification/SuperpixelClassification.xml
@@ -100,6 +100,13 @@
       <label>Train model</label>
       <default>true</default>
     </boolean>
+    <boolean>
+      <name>useCuda</name>
+      <longflag>usecuda</longflag>
+      <description>Whether to run on the GPU via CUDA (true) or on the CPU (false).</description>
+      <label>Use CUDA</label>
+      <default>true</default>
+    </boolean>
     <integer>
       <name>batchSize</name>
       <longflag>batchsize</longflag>
diff --git a/superpixel_classification/SuperpixelClassification/SuperpixelClassificationBase.py b/superpixel_classification/SuperpixelClassification/SuperpixelClassificationBase.py
index adc1148..6502496 100644
--- a/superpixel_classification/SuperpixelClassification/SuperpixelClassificationBase.py
+++ b/superpixel_classification/SuperpixelClassification/SuperpixelClassificationBase.py
@@ -505,7 +505,7 @@ def trainModelAddItem(self, gc, record, item, annotrec, elem, feature,
 
     def trainModel(self, gc, folderId, annotationName, features, modelFolderId,
                    batchSize, epochs, trainingSplit, randomInput, labelList,
-                   excludeLabelList, prog):
+                   excludeLabelList, use_cuda, prog):
         itemsAndAnnot = self.getItemsAndAnnotations(gc, folderId, annotationName)
         with tempfile.TemporaryDirectory(dir=os.getcwd()) as tempdir:
             trainingPath = os.path.join(tempdir, 'training.h5')
@@ -544,7 +544,7 @@ def trainModel(self, gc, folderId, annotationName, features, modelFolderId,
                 prog.progress(0)
                 history, modelPath = self.trainModelDetails(
                     record, annotationName, batchSize, epochs, itemsAndAnnot, prog, tempdir,
-                    trainingSplit)
+                    trainingSplit, use_cuda)
 
                 modTrainingPath = os.path.join(tempdir, '%s ModTraining Epoch %d.h5' % (
                     annotationName, self.getCurrentEpoch(itemsAndAnnot)))
@@ -568,7 +568,7 @@ def trainModel(self, gc, folderId, annotationName, features, modelFolderId,
 
     def predictLabelsForItem(self, gc, annotationName, annotationFolderId, tempdir, model, item,
                              annotrec, elem, feature, curEpoch, userId, labels, groups,
-                             makeHeatmaps, radius, magnification, certainty, batchSize, prog):
+                             makeHeatmaps, radius, magnification, certainty, batchSize, use_cuda, prog):
         import al_bench.factory
 
         print('Predicting %s' % (item['name']))
@@ -771,7 +771,7 @@ def makeHeatmapsForItem(self, gc, annotationName, userId, tempdir, radius, item,
 
     def predictLabels(self, gc, folderId, annotationName, features, modelFolderId,
                       annotationFolderId, saliencyMaps, radius, magnification,
-                      certainty, batchSize, prog):
+                      certainty, batchSize, use_cuda, prog):
         itemsAndAnnot = self.getItemsAndAnnotations(gc, folderId, annotationName)
         curEpoch = self.getCurrentEpoch(itemsAndAnnot)
         folder = gc.getFolder(folderId)
@@ -833,7 +833,7 @@ def predictLabels(self, gc, folderId, annotationName, features, modelFolderId,
                 self.predictLabelsForItem(
                     gc, annotationName, annotationFolderId, tempdir, model, item, annotrec, elem,
                     features.get(item['_id']), curEpoch, userId, labels, groups, saliencyMaps,
-                    radius, magnification, certainty, batchSize, prog)
+                    radius, magnification, certainty, batchSize, use_cuda, prog)
             prog.progress(1)
 
     def main(self, args):
@@ -864,5 +864,5 @@ def main(self, args):
 
             self.predictLabels(
                 gc, args.images, args.annotationName, features, args.modeldir, args.annotationDir,
-                args.heatmaps, args.radius, args.magnification, args.certainty, args.batchSize,
+                args.heatmaps, args.radius, args.magnification, args.certainty, args.batchSize, args.useCuda,
                 prog)
diff --git a/superpixel_classification/SuperpixelClassification/SuperpixelClassificationTensorflow.py b/superpixel_classification/SuperpixelClassification/SuperpixelClassificationTensorflow.py
index 0af02d8..27ab67e 100644
--- a/superpixel_classification/SuperpixelClassification/SuperpixelClassificationTensorflow.py
+++ b/superpixel_classification/SuperpixelClassification/SuperpixelClassificationTensorflow.py
@@ -35,33 +35,56 @@ class SuperpixelClassificationTensorflow(SuperpixelClassificationBase):
     def __init__(self):
         self.training_optimal_batchsize: Optional[int] = None
         self.prediction_optimal_batchsize: Optional[int] = None
+        self.use_cuda = False  # overwritten on each trainModelDetails call
 
     def trainModelDetails(self, record, annotationName, batchSize, epochs, itemsAndAnnot, prog,
-                          tempdir, trainingSplit):
-        # print(f'Tensorflow trainModelDetails(batchSize={batchSize}, ...)')
-        # make model
-        num_classes = len(record['labels'])
-        model = tf.keras.Sequential([
-            tf.keras.layers.Rescaling(1.0 / 255),
-            tf.keras.layers.Conv2D(16, 3, padding='same', activation='relu'),
-            tf.keras.layers.MaxPooling2D(),
-            tf.keras.layers.Conv2D(32, 3, padding='same', activation='relu'),
-            tf.keras.layers.MaxPooling2D(),
-            tf.keras.layers.Conv2D(64, 3, padding='same', activation='relu'),
-            tf.keras.layers.MaxPooling2D(),
-            tf.keras.layers.Flatten(),
-            # tf.keras.layers.Dropout(0.2),
-            tf.keras.layers.Dense(128, activation='relu'),
-            tf.keras.layers.Dense(num_classes)])
-        prog.progress(0.2)
-        model.compile(optimizer='adam',
-                      loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
-                      metrics=['accuracy'])
+                          tempdir, trainingSplit, use_cuda):
+        self.use_cuda = use_cuda
+
+        # Enable GPU memory growth globally to avoid precondition errors
+        gpus = tf.config.list_physical_devices('GPU')
+        if gpus and self.use_cuda:
+            try:
+                for gpu in gpus:
+                    tf.config.experimental.set_memory_growth(gpu, True)
+            except RuntimeError as e:
+                print(f"Could not set memory growth: {e}")
+        if not self.use_cuda:
+            tf.config.set_visible_devices([], 'GPU')
+        device = "gpu" if use_cuda else "cpu"
+        print(f"Using device: {device}")
+
+        # Dataset preparation (outside strategy scope)
+        ds_h5 = record['ds']
+        labelds_h5 = record['labelds']
+        # Fully load to memory and break h5py reference
+        ds_numpy = np.array(ds_h5[:])
+        labelds_numpy = np.array(labelds_h5[:])
+
+        strategy = tf.distribute.MirroredStrategy()
+        with strategy.scope():
+            num_classes = len(record['labels'])
+            model = tf.keras.Sequential([
+                tf.keras.layers.Rescaling(1.0 / 255),
+                tf.keras.layers.Conv2D(16, 3, padding='same', activation='relu'),
+                tf.keras.layers.MaxPooling2D(),
+                tf.keras.layers.Conv2D(32, 3, padding='same', activation='relu'),
+                tf.keras.layers.MaxPooling2D(),
+                tf.keras.layers.Conv2D(64, 3, padding='same', activation='relu'),
+                tf.keras.layers.MaxPooling2D(),
+                tf.keras.layers.Flatten(),
+                tf.keras.layers.Dense(128, activation='relu'),
+                tf.keras.layers.Dense(num_classes)])
+            prog.progress(0.2)
+            model.compile(optimizer='adam',
+                          loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
+                          metrics=['accuracy'])
+
         prog.progress(0.7)
-        # generate split
-        full_ds = tf.data.Dataset.from_tensor_slices((record['ds'], record['labelds']))
-        full_ds = full_ds.shuffle(1000)  # add seed=123 ?
-        count = len(full_ds)
+        # generate split using numpy arrays
+        full_ds = tf.data.Dataset.from_tensor_slices((ds_numpy, labelds_numpy))
+        full_ds = full_ds.shuffle(1000)
+        count = len(ds_numpy)
         train_size = int(count * trainingSplit)
         if batchSize < 1:
             batchSize = self.findOptimalBatchSize(model, full_ds, training=True)
@@ -85,24 +108,53 @@ def trainModelDetails(self, record, annotationName, batchSize, epochs, itemsAndA
         self.saveModel(model, modelPath)
         return history, modelPath
 
+    def _get_device(self, use_cuda):
+        """Return '/GPU:0' if use_cuda is set and a GPU is visible, else '/CPU:0'."""
+        gpu_visible = bool(tf.config.list_physical_devices('GPU'))
+        return '/GPU:0' if gpu_visible and use_cuda else '/CPU:0'
+
     def predictLabelsForItemDetails(
-        self, batchSize, ds: h5py._hl.dataset.Dataset, item, model, prog,
+        self, batchSize, ds: h5py._hl.dataset.Dataset, indices, item, model, use_cuda, prog,
     ):
-        # print(f'Tensorflow predictLabelsForItemDetails(batchSize={batchSize}, ...)')
+        """Predict model outputs for one item in batches, keeping row indices.
+
+        :param batchSize: rows per batch; values below 1 select one automatically.
+        :param ds: h5py dataset holding the model inputs, one row per entry.
+        :param indices: one index per row of ``ds``, sliced in lockstep with it.
+        :param item: the item being processed; used here for progress reports.
+        :param model: the Keras model used for prediction.
+        :param use_cuda: run on the first GPU if one is visible, else the CPU.
+        :param prog: progress helper; ``item_progress`` is called per batch.
+        :returns: numpy arrays ``(catWeights, predictions, indices)``: softmax
+            scores, raw model outputs, and the indices cast to int64.
+        """
         if batchSize < 1:
             batchSize = self.findOptimalBatchSize(
                 model, tf.data.Dataset.from_tensor_slices(ds), training=False,
             )
             print(f'Optimal batch size for prediction = {batchSize}')
-        predictions = model.predict(
-            ds,
-            batch_size=batchSize,
-            callbacks=[_LogTensorflowProgress(
-                prog, (ds.shape[0] + batchSize - 1) // batchSize, 0.05, 0.35, item)])
-        prog.item_progress(item, 0.4)
-        # softmax to scale to 0 to 1
-        catWeights = tf.nn.softmax(predictions)
-        return catWeights, predictions
+
+        with tf.device(self._get_device(use_cuda)):
+            # Pair each row with its index so results can be matched up later.
+            dataset = tf.data.Dataset.from_tensor_slices((ds, indices)).batch(batchSize)
+            num_batches = max(1, (ds.shape[0] + batchSize - 1) // batchSize)
+            all_predictions = []
+            all_indices = []
+            for batch_num, (data, batch_indices) in enumerate(dataset, start=1):
+                # predict() is not meant to be called once per batch from a
+                # Python loop; predict_on_batch runs a single batch directly.
+                all_predictions.append(model.predict_on_batch(data))
+                all_indices.append(batch_indices)
+                # Report a rising fraction rather than a constant 0.4; the 0.05-0.4
+                # span mirrors the replaced callback's arguments (TODO confirm).
+                prog.item_progress(item, 0.05 + 0.35 * batch_num / num_batches)
+
+            predictions = tf.concat(all_predictions, axis=0)
+            # Softmax is row-wise, so one pass over the joined outputs suffices.
+            catWeights = tf.nn.softmax(predictions)
+            final_indices = tf.concat(all_indices, axis=0)
+
+        return catWeights.numpy(), predictions.numpy(), final_indices.numpy().astype(np.int64)
 
     def findOptimalBatchSize(self, model, ds, training) -> int:
         if training and self.training_optimal_batchsize is not None:
diff --git a/superpixel_classification/SuperpixelClassification/SuperpixelClassificationTorch.py b/superpixel_classification/SuperpixelClassification/SuperpixelClassificationTorch.py
index e06d247..85acfb3 100644
--- a/superpixel_classification/SuperpixelClassification/SuperpixelClassificationTorch.py
+++ b/superpixel_classification/SuperpixelClassification/SuperpixelClassificationTorch.py
@@ -66,12 +66,10 @@ class _BayesianPatchTorchModel(bbald.consistent_mc_dropout.BayesianModule):
     # A Bayesian model that takes patches (2-dimensional shape) rather than vectors
     # (1-dimensional shape) as input.  It is useful when feature != 'vector' and
     # SuperpixelClassificationBase.certainty == 'batchbald'.
-    def __init__(self, num_classes: int) -> None:
+    def __init__(self, num_classes: int, device: torch.device) -> None:
         # Set `self.device` as early as possible so that other code does not lock out
         # what we want.
-        self.device: str = torch.device(
-            ('cuda' if torch.cuda.is_available() and torch.cuda.device_count() > 0 else 'cpu'),
-        )
+        self.device : torch.device = device
         # print(f'Initial model.device = {self.device}')
         super(_BayesianPatchTorchModel, self).__init__()
 
@@ -311,7 +309,10 @@ def trainModelDetails(
         prog: ProgressHelper,
         tempdir: str,
         trainingSplit: float,
+        cuda : bool,
     ):
+        device = torch.device("cuda" if cuda else "cpu")
+        print(f"Using device: {device}")
         # make model
         num_classes: int = len(record['labels'])
         model: torch.nn.Module
@@ -507,7 +508,7 @@ def fitModel(
         return history
 
     def predictLabelsForItemDetails(
-        self, batchSize: int, ds_h5, item, model: torch.nn.Module, prog: ProgressHelper,
+        self, batchSize: int, ds_h5, item, model: torch.nn.Module, use_cuda : bool, prog: ProgressHelper,
     ):
         # print(f'Torch predictLabelsForItemDetails(batchSize={batchSize}, ...)')
         num_superpixels: int = ds_h5.shape[0]
@@ -528,6 +529,9 @@ def predictLabelsForItemDetails(
             )
             if self.certainty == 'batchbald'
             else dict(num_superpixels=num_superpixels, num_classes=num_classes)
+        # TODO: use_cuda is not applied yet; ideally build
+        # torch.device("cuda" if use_cuda else "cpu") and also set model.device.
+
         )
         for cb in callbacks:
             cb.on_predict_begin(logs=logs)