diff --git a/README.md b/README.md
index 7cb75563..e149f4fd 100644
--- a/README.md
+++ b/README.md
@@ -118,12 +118,16 @@ evaluate-detector --trained_model_path <path-to-ckpt-file>
 This command assumes the trained detector model (a `.ckpt` checkpoint file) is saved in an MLflow database structure. That is, the checkpoint is assumed to be under a `checkpoints` directory, which in turn should be under a `<experiment-ID>/<run-ID>` directory. This will be the case if the model has been trained using the `train-detector` command.
 
-The `evaluate-detector` command will print to screen the average precision and average recall of the detector on the test set. It will also log those metrics to the MLflow database, along with the hyperparameters of the evaluation job. To visualise the MLflow summary of the evaluation job, run:
+The `evaluate-detector` command will print to screen the average precision and average recall of the detector on the validation set by default. To evaluate the model on the test set instead, use the `--use_test_set` flag.
+
+The command will also log those performance metrics to the MLflow database, along with the hyperparameters of the evaluation job. To visualise the MLflow summary of the evaluation job, run:
 ```
 mlflow ui --backend-store-uri file:///<path-to-ml-runs>
 ```
 where `<path-to-ml-runs>` is the path to the directory where the MLflow output is.
 
+The evaluated samples can be inspected visually by exporting them using the `--save_frames` flag. In this case, the frames with the predicted and ground-truth bounding boxes are saved in a directory called `evaluation_output_<timestamp>` under the current working directory.
+
 To see the full list of possible arguments to the `evaluate-detector` command, run it with the `--help` flag.
 
 ### Run detector+tracking on a video
 
@@ -134,7 +138,7 @@ To track crabs in a new video, using a trained detector and a tracker, run the f
 detect-and-track-video --trained_model_path <path-to-ckpt-file> --video_path <path-to-video>
 ```
 
-This will produce a `tracking_output_<timestamp>` directory with the output from tracking.
+This will produce a `tracking_output_<timestamp>` directory with the output from tracking under the current working directory.
 
 The tracking output consists of:
 - a .csv file named `<video-name>_tracks.csv`, with the tracked bounding boxes data;
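For reference, a minimal usage sketch combining the options documented in the README hunk above (not part of the diff; the checkpoint path is a placeholder and the MLflow folder is assumed to be the default `./ml-runs`):

```
# evaluate on the test split and export frames with predicted and ground-truth boxes
evaluate-detector --trained_model_path <path-to-ckpt-file> --use_test_set --save_frames

# inspect the logged metrics and hyperparameters
mlflow ui --backend-store-uri file:///$(pwd)/ml-runs
```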
diff --git a/bash_scripts/run_evaluation_array.sh b/bash_scripts/run_evaluation_array.sh
new file mode 100644
index 00000000..aed3f2b8
--- /dev/null
+++ b/bash_scripts/run_evaluation_array.sh
@@ -0,0 +1,116 @@
+#!/bin/bash
+
+#SBATCH -p gpu # a100 # partition
+#SBATCH --gres=gpu:1 # gpu:a100_2g.10gb # For any GPU: --gres=gpu:1. For a specific one: --gres=gpu:rtx5000
+#SBATCH -N 1 # number of nodes
+#SBATCH --ntasks-per-node 8 # 2 # max number of tasks per node
+#SBATCH --mem 32G # memory pool for all cores
+#SBATCH -t 3-00:00 # time (D-HH:MM)
+#SBATCH -o slurm_array.%A-%a.%N.out
+#SBATCH -e slurm_array.%A-%a.%N.err
+#SBATCH --mail-type=ALL
+#SBATCH --mail-user=s.minano@ucl.ac.uk
+#SBATCH --array=0-2%3
+
+
+# NOTE on SBATCH command for array jobs
+# with "SBATCH --array=0-n%m" ---> runs (n+1) separate jobs (task IDs 0 to n), but not more than m at a time.
+# the number of array jobs should match the number of input files
+
+# ---------------------
+# Source bashrc
+# ----------------------
+# Otherwise `which python` points to the miniconda module's Python
+source ~/.bashrc
+
+
+# memory
+# see https://pytorch.org/docs/stable/notes/cuda.html#environment-variables
+export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+
+# -----------------------------
+# Error settings for bash
+# -----------------------------
+# see https://wizardzines.com/comics/bash-errors/
+set -e  # do not continue after errors
+set -u  # throw error if variable is unset
+set -o pipefail  # make the pipe fail if any part of it fails
+
+# ---------------------
+# Define variables
+# ----------------------
+
+# List of models to evaluate
+MLFLOW_CKPTS_FOLDER=/ceph/zoo/users/sminano/ml-runs-all/ml-runs/317777717624044570/fe9a6c2f491a4496aade5034c75316cc/checkpoints
+LIST_CKPT_FILES=("$MLFLOW_CKPTS_FOLDER"/*.ckpt)
+
+# selected model
+CKPT_PATH=${LIST_CKPT_FILES[${SLURM_ARRAY_TASK_ID}]}
+
+# destination mlflow folder
+# EXPERIMENT_NAME="Sept2023" ----> get from training job
+MLFLOW_FOLDER=/ceph/zoo/users/sminano/ml-runs-all/ml-runs
+
+# version of the codebase
+GIT_BRANCH=main
+
+# --------------------
+# Check inputs
+# --------------------
+# Check len(list of input data) matches max SLURM_ARRAY_TASK_COUNT
+# if not, exit
+if [[ $SLURM_ARRAY_TASK_COUNT -ne ${#LIST_CKPT_FILES[@]} ]]; then
+    echo "The number of array tasks does not match the number of .ckpt files"
+    exit 1
+fi
+
+# -----------------------------
+# Create virtual environment
+# -----------------------------
+module load miniconda
+
+# Define an environment for each array task in the
+# temporary directory of the compute node
+ENV_NAME=crabs-dev-$SLURM_ARRAY_JOB_ID-$SLURM_ARRAY_TASK_ID
+ENV_PREFIX=$TMPDIR/$ENV_NAME
+
+# create environment
+conda create \
+    --prefix $ENV_PREFIX \
+    -y \
+    python=3.10
+
+# activate environment
+conda activate $ENV_PREFIX
+
+# install crabs package in virtual env
+python -m pip install git+https://github.com/SainsburyWellcomeCentre/crabs-exploration.git@$GIT_BRANCH
+
+
+# log pip and python locations
+echo $ENV_PREFIX
+which python
+which pip
+
+# print the version of crabs package (last number is the commit hash)
+echo "Git branch: $GIT_BRANCH"
+conda list crabs
+echo "-----"
+
+# ------------------------------------
+# GPU specs
+# ------------------------------------
+echo "Memory used per GPU before evaluation"
+echo $(nvidia-smi --query-gpu=name,memory.total,memory.free,memory.used --format=csv) #noheader
+echo "-----"
+
+
+# ---------------------
+# Run evaluation script
+# ---------------------
+echo "Evaluating trained model at $CKPT_PATH: "
+evaluate-detector \
+    --trained_model_path $CKPT_PATH \
+    --accelerator gpu \
+    --mlflow_folder $MLFLOW_FOLDER
+echo "-----"
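A usage sketch for the array script above (not part of the diff; standard SLURM commands are assumed to be available on the cluster). The `--array` range must match the number of `.ckpt` files in `MLFLOW_CKPTS_FOLDER`, and can be overridden at submission time without editing the script:

```
# submit the evaluation array job (3 checkpoints -> task IDs 0-2)
sbatch bash_scripts/run_evaluation_array.sh

# or override the array range at submission time, e.g. for 5 checkpoints
sbatch --array=0-4%5 bash_scripts/run_evaluation_array.sh

# monitor the array tasks
squeue -u $USER
```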
diff --git a/crabs/detector/evaluate_model.py b/crabs/detector/evaluate_model.py
index 37c3cf01..58f9a9cf 100644
--- a/crabs/detector/evaluate_model.py
+++ b/crabs/detector/evaluate_model.py
@@ -4,6 +4,7 @@
 import logging
 import os
 import sys
+from pathlib import Path
 
 import lightning
 import torch
@@ -20,9 +21,13 @@
     get_cli_arg_from_ckpt,
     get_config_from_ckpt,
     get_img_directories_from_ckpt,
+    get_mlflow_experiment_name_from_ckpt,
+    get_mlflow_parameters_from_ckpt,
 )
 from crabs.detector.utils.visualization import save_images_with_boxes
 
+logging.getLogger().setLevel(logging.INFO)
+
 
 class DetectorEvaluate:
     """Interface for evaluating an object detector.
@@ -39,10 +44,17 @@ def __init__(self, args: argparse.Namespace) -> None:
         # CLI inputs
         self.args = args
 
-        # trained model
+        # trained model data
        self.trained_model_path = args.trained_model_path
+        trained_model_params = get_mlflow_parameters_from_ckpt(
+            self.trained_model_path
+        )
+        self.trained_model_run_name = trained_model_params["run_name"]
+        self.trained_model_expt_name = trained_model_params[
+            "cli_args/experiment_name"
+        ]
 
-        # config: retreieve from ckpt if not passed as CLI argument
+        # config: retrieve from ckpt if not passed as CLI argument
         self.config_file = args.config_file
         self.config = get_config_from_ckpt(
             config_file=self.config_file,
@@ -61,28 +73,38 @@ def __init__(self, args: argparse.Namespace) -> None:
             cli_arg_str="seed_n",
             trained_model_path=self.trained_model_path,
         )
+        self.evaluation_split = "test" if self.args.use_test_set else "val"
 
         # Hardware
         self.accelerator = args.accelerator
 
-        # MLflow
-        self.experiment_name = args.experiment_name
+        # MLflow experiment name and run name
+        self.experiment_name = get_mlflow_experiment_name_from_ckpt(
+            args=self.args, trained_model_path=self.trained_model_path
+        )
+        self.run_name = set_mlflow_run_name()
         self.mlflow_folder = args.mlflow_folder
 
-        # Debugging
+        # Debugging settings
         self.fast_dev_run = args.fast_dev_run
         self.limit_test_batches = args.limit_test_batches
 
+        # Log dataset information to screen
         logging.info("Dataset")
         logging.info(f"Images directories: {self.images_dirs}")
         logging.info(f"Annotation files: {self.annotation_files}")
         logging.info(f"Seed: {self.seed_n}")
+        logging.info("---------------------------------")
+
+        # Log MLflow information to screen
+        logging.info("MLflow logs for current job")
+        logging.info(f"Experiment name: {self.experiment_name}")
+        logging.info(f"Run name: {self.run_name}")
+        logging.info(f"Folder: {Path(self.mlflow_folder).resolve()}")
+        logging.info("---------------------------------")
 
     def setup_trainer(self):
         """Set up trainer object with logging for testing."""
-        # Assign run name
-        self.run_name = set_mlflow_run_name()
-
         # Setup logger
         mlf_logger = setup_mlflow_logger(
             experiment_name=self.experiment_name,
@@ -91,6 +113,25 @@ def setup_trainer(self):
             cli_args=self.args,
         )
 
+        # Add trained model section to MLflow hyperparameters
+        mlf_logger.log_hyperparams(
+            {
+                "trained_model/experiment_name": self.trained_model_expt_name,
+                "trained_model/run_name": self.trained_model_run_name,
+                "trained_model/ckpt_file": Path(self.trained_model_path).name,
+            }
+        )
+
+        # Add dataset section to MLflow hyperparameters
+        mlf_logger.log_hyperparams(
+            {
+                "dataset/images_dir": self.images_dirs,
+                "dataset/annotation_files": self.annotation_files,
+                "dataset/seed": self.seed_n,
+                "dataset/evaluation_split": self.evaluation_split,
+            }
+        )
+
         # Return trainer linked to logger
         return lightning.Trainer(
             accelerator=self.accelerator,
@@ -107,6 +148,7 @@ def evaluate_model(self) -> None:
             list_annotation_files=self.annotation_files,
             split_seed=self.seed_n,
             config=self.config,
+            no_data_augmentation=True,
         )
 
         # Get trained model
@@ -114,19 +156,34 @@ def evaluate_model(self) -> None:
             self.trained_model_path, config=self.config
         )
 
-        # Run testing
+        # Evaluate model on either the validation or the test split
         trainer = self.setup_trainer()
-        trainer.test(
-            trained_model,
-            data_module,
-        )
+        if self.args.use_test_set:
+            trainer.test(
+                trained_model,
+                data_module,
+            )
+        else:
+            trainer.validate(
+                trained_model,
+                data_module,
+            )
 
-        # Save images if required
+        # Save images with bounding boxes if required
         if self.args.save_frames:
+            # get relevant dataloader
+            if self.args.use_test_set:
+                eval_dataloader = data_module.test_dataloader()
+            else:
+                eval_dataloader = data_module.val_dataloader()
+
             save_images_with_boxes(
-                test_dataloader=data_module.test_dataloader(),
+                dataloader=eval_dataloader,
                 trained_model=trained_model,
-                output_dir=self.args.frames_output_dir,
+                output_dir=str(
+                    Path(self.args.frames_output_dir)
+                    / f"evaluation_output_{self.evaluation_split}"
+                ),
                 score_threshold=self.args.frames_score_threshold,
             )
 
@@ -205,7 +262,14 @@ def evaluate_parse_args(args):
             "the trained model is used."
         ),
     )
-
+    parser.add_argument(
+        "--use_test_set",
+        action="store_true",
+        help=(
+            "Evaluate the model on the test split, rather than on the default "
+            "validation split."
+        ),
+    )
     parser.add_argument(
         "--accelerator",
         type=str,
@@ -220,35 +284,20 @@ def evaluate_parse_args(args):
     parser.add_argument(
         "--experiment_name",
         type=str,
-        default="Sept2023_evaluation",
         help=(
             "Name of the experiment in MLflow, under which the current run "
             "will be logged. "
-            "For example, the name of the dataset could be used, to group "
-            "runs using the same data. "
-            "Default: Sept2023_evaluation"
-        ),
-    )
-    parser.add_argument(
-        "--fast_dev_run",
-        action="store_true",
-        help="Debugging option to run training for one batch and one epoch",
-    )
-    parser.add_argument(
-        "--limit_test_batches",
-        type=float,
-        default=1.0,
-        help=(
-            "Debugging option to run training on a fraction of "
-            "the training set."
-            "Default: 1.0 (all the training set)"
+            "By default: <trained-model-experiment-name>_evaluation."
         ),
     )
     parser.add_argument(
         "--mlflow_folder",
         type=str,
         default="./ml-runs",
-        help=("Path to MLflow directory. Default: ./ml-runs"),
+        help=(
+            "Path to MLflow directory where to log the evaluation data. "
+            "Default: ./ml-runs"
+        ),
     )
     parser.add_argument(
         "--save_frames",
@@ -269,12 +318,29 @@ def evaluate_parse_args(args):
         type=str,
         default="",
         help=(
-            "Output directory for the exported frames. "
+            "Output directory for the evaluated frames, with bounding boxes. "
+            "Predicted boxes are plotted in red, and ground-truth boxes in "
+            "green. "
             "By default, the frames are saved in a "
-            "`results_<timestamp>` folder "
+            "`evaluation_output_<timestamp>` folder "
             "under the current working directory."
         ),
     )
+    parser.add_argument(
+        "--fast_dev_run",
+        action="store_true",
+        help="Debugging option to run the evaluation on one batch and one epoch",
+    )
+    parser.add_argument(
+        "--limit_test_batches",
+        type=float,
+        default=1.0,
+        help=(
+            "Debugging option to run the evaluation on a fraction of "
+            "the evaluation set. "
+            "Default: 1.0 (the full set)"
+        ),
+    )
 
     return parser.parse_args(args)
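A hedged sketch of how the debugging and MLflow options exposed by this parser might be combined (not part of the diff; the checkpoint path is a placeholder and `my_custom_evaluation` is a hypothetical experiment name):

```
# quick sanity check: run the evaluation on a single batch for one epoch
evaluate-detector --trained_model_path <path-to-ckpt-file> --fast_dev_run

# evaluate on half of the test-split batches, logging under a custom experiment name
evaluate-detector \
    --trained_model_path <path-to-ckpt-file> \
    --use_test_set \
    --limit_test_batches 0.5 \
    --experiment_name my_custom_evaluation
```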
diff --git a/crabs/detector/utils/evaluate.py b/crabs/detector/utils/evaluate.py
index e2633291..3b0d25cc 100644
--- a/crabs/detector/utils/evaluate.py
+++ b/crabs/detector/utils/evaluate.py
@@ -2,7 +2,6 @@
 
 import argparse
 import ast
-import logging
 import sys
 from pathlib import Path
 
@@ -14,8 +13,6 @@
     prep_img_directories,
 )
 
-logging.basicConfig(level=logging.INFO)
-
 
 def compute_precision_recall(class_stats: dict) -> tuple[float, float, dict]:
     """Compute precision and recall.
@@ -143,6 +140,7 @@ def get_mlflow_parameters_from_ckpt(trained_model_path: str) -> dict:
     # get parameters of the run
     run = mlrun_client.get_run(ckpt_runID)
     params = run.data.params
+    params["run_name"] = run.info.run_name
 
     return params
 
@@ -192,7 +190,7 @@ def get_config_from_ckpt(config_file: str, trained_model_path: str) -> dict:
 def get_cli_arg_from_ckpt(
     args: argparse.Namespace, cli_arg_str: str, trained_model_path: str
 ):
-    """Get CLI argument from checkpoint if not in args."""
+    """Get CLI argument from checkpoint if not passed as CLI argument."""
     if getattr(args, cli_arg_str):
         cli_arg = getattr(args, cli_arg_str)
     else:
@@ -242,3 +240,20 @@ def get_annotation_files_from_ckpt(
         input_annotation_files, dataset_dirs
     )
     return annotation_files
+
+
+def get_mlflow_experiment_name_from_ckpt(
+    args: argparse.Namespace, trained_model_path: str
+) -> str:
+    """Define the MLflow experiment name from the training job.
+
+    Only used if the experiment name is not passed via CLI.
+    """
+    if args.experiment_name:
+        experiment_name = args.experiment_name
+    else:
+        params = get_mlflow_parameters_from_ckpt(trained_model_path)
+        trained_model_expt_name = params["cli_args/experiment_name"]
+        experiment_name = trained_model_expt_name + "_evaluation"
+
+    return experiment_name
diff --git a/crabs/detector/utils/visualization.py b/crabs/detector/utils/visualization.py
index 784eb76a..fc10deab 100644
--- a/crabs/detector/utils/visualization.py
+++ b/crabs/detector/utils/visualization.py
@@ -154,7 +154,7 @@ def draw_detection(
 
 
 def save_images_with_boxes(
-    test_dataloader: torch.utils.data.DataLoader,
+    dataloader: torch.utils.data.DataLoader,
     trained_model: torch.nn.Module,
     output_dir: str,
     score_threshold: float,
@@ -163,12 +163,13 @@ def save_images_with_boxes(
 
     Parameters
     ----------
-    test_dataloader : DataLoader
-        DataLoader for the test dataset.
+    dataloader : DataLoader
+        DataLoader with the images to save.
     trained_model : torch.nn.Module
         The trained object detection model.
     output_dir : str
-        Directory to save the images with bounding boxes.
+        Path to the directory where to save the images with bounding boxes.
+        A timestamp is appended to the directory name.
     score_threshold : float
         Threshold for object detection.
@@ -186,14 +187,14 @@ def save_images_with_boxes(
     trained_model.to(device)
     trained_model.eval()
 
-    if not output_dir:
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        output_dir = f"results_{timestamp}"
+    # set output directory
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    output_dir = f"{output_dir}_{timestamp}"
     os.makedirs(output_dir, exist_ok=True)
 
     with torch.no_grad():
         imgs_id = 0
-        for imgs, annotations in test_dataloader:
+        for imgs, annotations in dataloader:
             imgs_id += 1  # noqa: SIM113
             imgs = list(img.to(device) for img in imgs)
diff --git a/tests/test_unit/test_visualization.py b/tests/test_unit/test_visualization.py
index bd6728fe..6bc62639 100644
--- a/tests/test_unit/test_visualization.py
+++ b/tests/test_unit/test_visualization.py
@@ -151,8 +151,8 @@ def test_draw_detection(annotations, detections):
 
 
 @pytest.mark.parametrize(
-    "output_dir_name, expected_dir_name",
-    [("output", r"^output$"), ("", r"^results_\d{8}_\d{6}$")],
+    "output_dir_name",
+    ["output", "evaluation_output"],
 )
 @pytest.mark.parametrize(
     "detections",
@@ -176,7 +176,10 @@ def test_draw_detection(annotations, detections):
 @patch("crabs.detector.utils.visualization.cv2.imwrite")
 @patch("crabs.detector.utils.visualization.os.makedirs")
 def test_save_images_with_boxes(
-    mock_makedirs, mock_imwrite, detections, output_dir_name, expected_dir_name
+    mock_makedirs,
+    mock_imwrite,
+    detections,
+    output_dir_name,
 ):
     trained_model = MagicMock()
     test_dataloader = MagicMock()
@@ -190,7 +193,9 @@ def test_save_images_with_boxes(
     )
 
     # extract and check first positional argument to (mocked) os.makedirs
+    output_dir_regexp = re.compile(rf"{output_dir_name}_\d{{8}}_\d{{6}}$")
     input_path_makedirs = mock_makedirs.call_args[0][0]
-    assert re.match(expected_dir_name, input_path_makedirs)
+    assert output_dir_regexp.match(input_path_makedirs)
 
+    # should be called as many times as batches in the dataloader
     assert mock_imwrite.call_count == len(test_dataloader)
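To exercise the updated unit tests locally, something like the following should work (assuming the package and `pytest` are installed and the repository root is the working directory):

```
pytest tests/test_unit/test_visualization.py -k test_save_images_with_boxes
```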