Run Inference on cluster #189

Status: Open
Wants to merge 60 commits into base: main
Changes shown from 46 of the 60 commits.
4d1383a
adding config file, load from checkpoint
nikk-nikaznan Jun 17, 2024
94761b8
adding inference to toml
nikk-nikaznan Jun 17, 2024
e4f1bac
adding bash script
nikk-nikaznan Jun 18, 2024
0b3ddd9
change variable
nikk-nikaznan Jun 18, 2024
892914e
change variable
nikk-nikaznan Jun 18, 2024
66c22be
naming error
nikk-nikaznan Jun 18, 2024
3fab713
naming error
nikk-nikaznan Jun 18, 2024
2b0d273
fixed import
nikk-nikaznan Jun 18, 2024
85452af
cleaned up sort
nikk-nikaznan Jun 18, 2024
f056b41
add app_wrapper
nikk-nikaznan Jun 18, 2024
8780c36
changed accelerator
nikk-nikaznan Jun 18, 2024
56b74ff
bugs
nikk-nikaznan Jun 18, 2024
a30b0dc
removed accelerator
nikk-nikaznan Jun 18, 2024
918674d
removed accelerator
nikk-nikaznan Jun 18, 2024
2d6da1e
wrong path
nikk-nikaznan Jun 18, 2024
e458c6d
edit path
nikk-nikaznan Jun 19, 2024
29cfea6
adding batches
nikk-nikaznan Jun 19, 2024
ec6886a
debugging oom
nikk-nikaznan Jun 19, 2024
83ed342
save video to false
nikk-nikaznan Jun 19, 2024
d3942ff
save video to false
nikk-nikaznan Jun 19, 2024
2900a9e
adding device
nikk-nikaznan Jun 19, 2024
500d274
revert the batch out
nikk-nikaznan Jun 20, 2024
7260ca8
modify bash script
nikk-nikaznan Jun 20, 2024
def687a
add guide
nikk-nikaznan Jun 21, 2024
1a5d853
debugging
nikk-nikaznan Jun 21, 2024
8ca41c3
fixed codec
nikk-nikaznan Jun 21, 2024
be6cff9
cleaned up
nikk-nikaznan Jun 21, 2024
7117511
adding gt_dir
nikk-nikaznan Jun 21, 2024
45cd8bd
codev revert
nikk-nikaznan Jun 21, 2024
1c56dfc
Merge branch 'main' into nikkna/inference_cluster
nikk-nikaznan Jun 21, 2024
6077a7e
adding some logging
nikk-nikaznan Jun 21, 2024
e5d362f
Merge branch 'main' of github.com:SainsburyWellcomeCentre/crabs-explo…
nikk-nikaznan Jun 28, 2024
a114200
cleaned up rebase
nikk-nikaznan Jun 28, 2024
17146ad
some changes based on the new modules
nikk-nikaznan Jun 28, 2024
1e250b0
Merge branch 'main' into nikkna/inference_cluster
nikk-nikaznan Jul 4, 2024
3ccc258
Merge branch 'main' into nikkna/inference_cluster
nikk-nikaznan Jul 4, 2024
6d22c4f
Merge branch 'main' into nikkna/inference_cluster
nikk-nikaznan Jul 8, 2024
bfd97bd
Merge branch 'main' into nikkna/inference_cluster
nikk-nikaznan Jul 9, 2024
8284157
adding bash script for running all escape events
nikk-nikaznan Jul 9, 2024
cf04af3
small changes on the bash script
nikk-nikaznan Jul 9, 2024
8d4c5a2
changed to the correct video example
nikk-nikaznan Jul 9, 2024
2ffce7a
changes of guide
nikk-nikaznan Jul 9, 2024
8663563
removed device, already set in code
nikk-nikaznan Jul 9, 2024
9af60ee
check cuda status
nikk-nikaznan Jul 9, 2024
86a309b
modified some path
nikk-nikaznan Jul 9, 2024
3d33730
changes branch to main
nikk-nikaznan Jul 9, 2024
b72b4b3
add args to handle run on directory on the cluster
nikk-nikaznan Jul 10, 2024
2b9973e
add args to handle run on directory on the cluster
nikk-nikaznan Jul 10, 2024
feace52
cleaned up
nikk-nikaznan Jul 10, 2024
7977b48
cleaned up
nikk-nikaznan Jul 10, 2024
bff7606
forgot the args
nikk-nikaznan Jul 10, 2024
9c0a560
Merge branch 'main' into nikkna/inference_cluster
nikk-nikaznan Jul 22, 2024
c5bd870
Update guides/TrackingModelHPC.md
nikk-nikaznan Jul 22, 2024
cd497d7
extension, check dir
nikk-nikaznan Jul 22, 2024
f87814c
Merge branch 'nikkna/inference_cluster' of github.com:SainsburyWellco…
nikk-nikaznan Jul 22, 2024
586d412
Update bash_scripts/run_tracking.sh
nikk-nikaznan Jul 22, 2024
742ee1a
debug
nikk-nikaznan Jul 29, 2024
b96d4fb
debug
nikk-nikaznan Jul 29, 2024
e8d77f0
add log
nikk-nikaznan Jul 29, 2024
5121e45
add log
nikk-nikaznan Jul 29, 2024
101 changes: 101 additions & 0 deletions bash_scripts/run_tracking.sh
@@ -0,0 +1,101 @@
#!/bin/bash

#SBATCH -p gpu # a100 # partition
#SBATCH --gres=gpu:1
#SBATCH -N 1 # number of nodes
#SBATCH --ntasks-per-node 8 # 2 # max number of tasks per node
#SBATCH --mem 64G # memory pool for all cores
#SBATCH -t 3-00:00 # time (D-HH:MM)
#SBATCH -o slurm.%A.%N.out
#SBATCH -e slurm.%A.%N.err
#SBATCH --mail-type=ALL
#SBATCH [email protected]

# ---------------------
# Source bashrc
# ----------------------
# Otherwise `which python` points to the miniconda module's Python
source ~/.bashrc

# memory
# see https://pytorch.org/docs/stable/notes/cuda.html#environment-variables
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True # export so the Python process sees it

# -----------------------------
# Error settings for bash
# -----------------------------
# see https://wizardzines.com/comics/bash-errors/
set -e # do not continue after errors
set -u # throw error if variable is unset
set -o pipefail # make the pipe fail if any part of it fails

# ---------------------
# Define variables
# ----------------------

# video and inference config
VIDEO_PATH=/ceph/zoo/users/sminano/crabs_tracks_label/04.09.2023-04-Right_RE_test/04.09.2023-04-Right_RE_test.mp4
CONFIG_FILE=/ceph/zoo/users/sminano/cluster_tracking_config.yaml

# checkpoint
TRAINED_MODEL_PATH=/ceph/zoo/users/sminano/ml-runs-all/ml_runs-nikkna-copy/243676951438603508/8dbe61069f17453a87c27b4f61f6e681/checkpoints/last.ckpt


# output directory
OUTPUT_DIR=/ceph/zoo/users/sminano/crabs_track_output

# ground truth is available
GT_PATH=/ceph/zoo/users/sminano/crabs_tracks_label/04.09.2023-04-Right_RE_test/04.09.2023-04-Right_RE_test_corrected_ST_csv.csv

# version of the codebase
GIT_BRANCH=main

# -----------------------------
# Create virtual environment
# -----------------------------
module load miniconda

# Define an environment for each job in the
# temporary directory of the compute node
ENV_NAME=crabs-dev-$SLURM_JOB_ID
ENV_PREFIX=$TMPDIR/$ENV_NAME

# create environment
conda create \
--prefix $ENV_PREFIX \
-y \
python=3.10

# activate environment
conda activate $ENV_PREFIX

# install crabs package in virtual env
python -m pip install git+https://github.com/SainsburyWellcomeCentre/crabs-exploration.git@$GIT_BRANCH


# log pip and python locations
echo $ENV_PREFIX
which python
which pip

# print the version of crabs package (last number is the commit hash)
echo "Git branch: $GIT_BRANCH"
conda list crabs
echo "-----"

# ------------------------------------
# GPU specs
# ------------------------------------
echo "Memory used per GPU before running inference"
echo $(nvidia-smi --query-gpu=name,memory.total,memory.free,memory.used --format=csv) #noheader
echo "-----"

# -------------------
# Run tracking script
# -------------------
detect-and-track-video \
--trained_model_path $TRAINED_MODEL_PATH \
--video_path $VIDEO_PATH \
--config_file $CONFIG_FILE \
--output_dir $OUTPUT_DIR \
--gt_path $GT_PATH
104 changes: 104 additions & 0 deletions bash_scripts/run_tracking_all_escape_events.sh
Reviewer comment:
I like this one! ✨

Maybe going forwards I can combine them to read a dir or a single video, but this is a great starting point

@@ -0,0 +1,104 @@
#!/bin/bash

#SBATCH -p gpu # a100 # partition
#SBATCH --gres=gpu:1
#SBATCH -N 1 # number of nodes
#SBATCH --ntasks-per-node 8 # 2 # max number of tasks per node
#SBATCH --mem 64G # memory pool for all cores
#SBATCH -t 3-00:00 # time (D-HH:MM)
#SBATCH -o slurm.%A.%N.out
#SBATCH -e slurm.%A.%N.err
#SBATCH --mail-type=ALL
#SBATCH [email protected]

# ---------------------
# Source bashrc
# ----------------------
# Otherwise `which python` points to the miniconda module's Python
source ~/.bashrc

# memory
# see https://pytorch.org/docs/stable/notes/cuda.html#environment-variables
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True # export so the Python process sees it

# -----------------------------
# Error settings for bash
# -----------------------------
# see https://wizardzines.com/comics/bash-errors/
set -e # do not continue after errors
set -u # throw error if variable is unset
set -o pipefail # make the pipe fail if any part of it fails

# ---------------------
# Define variables
# ----------------------

# video and inference config
VIDEO_DIR=/ceph/zoo/raw/CrabField/ramalhete_2023/Escapes
PATTERN="*.mov"
CONFIG_FILE=/ceph/zoo/users/sminano/cluster_tracking_config.yaml

# checkpoint
TRAINED_MODEL_PATH=/ceph/zoo/users/sminano/ml-runs-all/ml_runs-nikkna-copy/243676951438603508/8dbe61069f17453a87c27b4f61f6e681/checkpoints/last.ckpt

# output directory
OUTPUT_DIR=/ceph/zoo/users/sminano/crabs_track_output

# version of the codebase
GIT_BRANCH=main

# Exit early if VIDEO_DIR is not a directory
if [ ! -d "$VIDEO_DIR" ]; then
    echo "ERROR: $VIDEO_DIR is not a directory" >&2
    exit 1
fi

# -----------------------------
# Create virtual environment
# -----------------------------
module load miniconda

# Define an environment for each job in the
# temporary directory of the compute node
ENV_NAME=crabs-dev-$SLURM_JOB_ID
ENV_PREFIX=$TMPDIR/$ENV_NAME

# create environment
conda create \
--prefix $ENV_PREFIX \
-y \
python=3.10

# activate environment
conda activate $ENV_PREFIX

# install crabs package in virtual env
python -m pip install git+https://github.com/SainsburyWellcomeCentre/crabs-exploration.git@$GIT_BRANCH

# log pip and python locations
echo $ENV_PREFIX
which python
which pip

# print the version of crabs package (last number is the commit hash)
echo "Git branch: $GIT_BRANCH"
conda list crabs
echo "-----"

# ------------------------------------
# GPU specs
# ------------------------------------
echo "Memory used per GPU before running inference"
echo $(nvidia-smi --query-gpu=name,memory.total,memory.free,memory.used --format=csv) #noheader
echo "-----"

# -------------------
# Run tracking script for each .mov file in VIDEO_DIR
# -------------------
for VIDEO_PATH in "$VIDEO_DIR"/*.mov; do
echo "Processing video: $VIDEO_PATH"
detect-and-track-video \
--trained_model_path $TRAINED_MODEL_PATH \
--video_path $VIDEO_PATH \
--config_file $CONFIG_FILE \
--output_dir $OUTPUT_DIR
done
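Note that the loop above hardcodes `*.mov` even though `PATTERN` is defined earlier in the script. A variant that actually uses the variable, and skips cleanly when nothing matches, might look like this (a sketch, assuming bash's `nullglob` option is acceptable here; `process_videos` is a hypothetical helper name):

```shell
#!/bin/bash
# Process every video matching $pattern inside $video_dir.
# With nullglob set, an unmatched glob expands to nothing, so the loop
# simply runs zero times instead of iterating over the literal pattern.
process_videos() {
    local video_dir=$1 pattern=$2
    shopt -s nullglob
    for video_path in "$video_dir"/$pattern; do
        echo "Processing video: $video_path"
        # detect-and-track-video --video_path "$video_path" ...  (as in the script above)
    done
    shopt -u nullglob
}
```

Keeping the pattern in a single variable means switching from `*.mov` to, say, `*.avi` only requires editing one line.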
5 changes: 5 additions & 0 deletions crabs/tracker/track_video.py
@@ -63,6 +63,11 @@ def setup(self):
"""
Load tracking config, trained model and input video path.
"""
# Check for CUDA availability
if self.device == "cuda" and not torch.cuda.is_available():
print("CUDA is not available. Falling back to CPU.")
self.device = "cpu"

with open(self.config_file, "r") as f:
self.config = yaml.safe_load(f)

165 changes: 165 additions & 0 deletions guides/TrackingModelHPC.md
@@ -0,0 +1,165 @@
# Run tracking inference on the cluster

1. **Preparatory steps**

- If you are not connected to the SWC network: connect to the SWC VPN.

1. **Connect to the SWC HPC cluster**

```
ssh <SWC-USERNAME>@ssh.swc.ucl.ac.uk
ssh hpc-gw1
```

It may ask for your password twice. To set up SSH keys for the SWC cluster, see [this guide](https://howto.neuroinformatics.dev/programming/SSH-SWC-cluster.html#ssh-keys).

1. **Download the tracking script from the 🦀 repository**

   To do so, run one of the following commands. Each downloads a bash script for tracking (`run_tracking.sh` or `run_tracking_all_escape_events.sh`) from the `main` branch of the [🦀 repository](https://github.com/SainsburyWellcomeCentre/crabs-exploration) to the current working directory.

- To run video tracking on a specific video: download the `run_tracking.sh` file

```
curl https://raw.githubusercontent.com/SainsburyWellcomeCentre/crabs-exploration/main/bash_scripts/run_tracking.sh > run_tracking.sh
```

- To run video tracking on all escape events (or on a directory): download the `run_tracking_all_escape_events.sh` file

```
curl https://raw.githubusercontent.com/SainsburyWellcomeCentre/crabs-exploration/main/bash_scripts/run_tracking_all_escape_events.sh > run_tracking_all_escape_events.sh
```

These bash scripts launch a SLURM job that:

- gets the 🦀 package from git,
- installs it in the compute node,
- and runs video tracking on the specified video(s).

> [!TIP]
> To retrieve a version of these files that is different from the files at the tip of `main`, edit the remote file path in the curl command:
>
> - For example, to download the version of the file at the tip of a branch called `<BRANCH-NAME>`, edit the path above to replace `main` with `<BRANCH-NAME>`:
> ```
> https://raw.githubusercontent.com/SainsburyWellcomeCentre/crabs-exploration/<BRANCH-NAME>/bash_scripts/run_tracking.sh
> ```
> - To download the version of the file at a specific commit, replace `main` with the commit hash `<COMMIT-HASH>` (raw URLs take the commit hash directly, without `blob/`):
> ```
> https://raw.githubusercontent.com/SainsburyWellcomeCentre/crabs-exploration/<COMMIT-HASH>/bash_scripts/run_tracking.sh
> ```
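For instance, to fetch `run_tracking.sh` from a hypothetical branch named `my-feature` (the branch name below is illustrative), the curl command becomes:

```shell
BRANCH=my-feature   # hypothetical branch name -- substitute your own
URL="https://raw.githubusercontent.com/SainsburyWellcomeCentre/crabs-exploration/${BRANCH}/bash_scripts/run_tracking.sh"
echo "$URL"
# curl "$URL" > run_tracking.sh   # uncomment to actually download
```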

4. **Edit the bash script!**

   To run the tracker, we need to ensure the correct trained model is used. All the parameters used in a training run are logged to `mlflow`.

   We can see the performance of each training session by inspecting the `metrics` tab in the `mlflow` UI, where the `training loss`, `validation precision` and `validation recall` are plotted. The trained model checkpoint path is logged in the `parameters` section under the `overview` tab.

   When launching a tracking job, we may want to edit in the bash script:

- The `TRAINED_MODEL_PATH`
- The `OUTPUT_DIR`
- The `VIDEO_PATH` (for `run_tracking.sh`) or `VIDEO_DIR` (for `run_tracking_all_escape_events.sh`)

Less frequently, one may need to edit:

- the `CONFIG_FILE`: usually we point to the same config file used to train the model, at `/ceph/zoo/users/sminano/cluster_tracking_config.yaml`, which we can edit.
- the `GIT_BRANCH`, if we want to use a specific version of the 🦀 package. Usually we run the version of the 🦀 package in `main`.
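For reference, the variable block these edits target sits near the top of `run_tracking.sh`. A minimal sketch with placeholder values (the `<...>` segments are illustrative, not real locations):

```shell
#!/bin/bash
# Variables typically edited per tracking job -- all values below are
# illustrative placeholders, not real paths.
TRAINED_MODEL_PATH="/ceph/zoo/users/<USER>/ml-runs/<EXPERIMENT-ID>/<RUN-ID>/checkpoints/last.ckpt"
OUTPUT_DIR="/ceph/zoo/users/<USER>/crabs_track_output"
VIDEO_PATH="/ceph/zoo/users/<USER>/videos/example.mp4"   # or VIDEO_DIR for the all-events script

# Less frequently edited
CONFIG_FILE="/ceph/zoo/users/sminano/cluster_tracking_config.yaml"
GIT_BRANCH=main

echo "Tracking ${VIDEO_PATH} with model ${TRAINED_MODEL_PATH}"
```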

5. **Other inference options**

   By default, inference saves the tracking output to a CSV file. Other options can be enabled via CLI arguments:

   - `save_video`: saves a video with the tracked bounding boxes drawn on every frame.
   - `save_frames`: saves the individual frames corresponding to the CSV output. This is needed if we want to correct the tracking labels.

   Additionally, if we have ground truth for the video, we can pass it in to compute a tracking evaluation:

   - `GT_PATH`

We can add all these arguments in the bash script, for example:

```
detect-and-track-video \
 --trained_model_path $TRAINED_MODEL_PATH \
 --video_path $VIDEO_PATH \
 --config_file $CONFIG_FILE \
 --gt_path $GT_PATH \
 --device $DEVICE \
 --save_video \
 --save_frames
```

6. **Run the inference job using the SLURM scheduler**

To launch a job, use the `sbatch` command with the relevant inference bash script:

```
sbatch <path-to-inference-bash-script>
```

7. **Check the status of the inference job**

To do this, we can:

- Check the SLURM logs: these should be created automatically in the directory from which the `sbatch` command is run.
- Run supporting SLURM commands (see [below](#some-useful-slurm-commands)).
- Check the MLFlow logs. To do this, first create or activate an existing conda environment with `mlflow` installed, and then run the `mlflow` command from the login node.

- Create and activate a conda environment.
```
module load miniconda
conda create -n mlflow-env python=3.10 mlflow -y
conda activate mlflow-env
```
- Run `mlflow` to visualise the results logged to the `ml-runs` folder.

- If using the "scratch" folder:

```
mlflow ui --backend-store-uri file:///ceph/zoo/users/sminano/ml-runs-all/ml-runs-scratch
```

- If using the selected runs folder:

```
mlflow ui --backend-store-uri file:///ceph/zoo/users/sminano/ml-runs-all/ml-runs
```

### Some useful SLURM commands

To check the status of your jobs in the queue

```
squeue -u <username>
```

To show details of the latest jobs (including completed or cancelled jobs)

```
sacct -X -u <username>
```

To specify columns to display use `--format` (e.g., `Elapsed`)

```
sacct -X --format="JobID, JobName, Partition, Account, State, Elapsed" -u <username>
```

To check specific jobs by ID

```
sacct -X -j 3813494,3813184
```

To check the time limit of the jobs submitted by a user (for example, `sminano`)

```
squeue -u sminano --format="%i %P %j %u %T %l %C %S"
```

To cancel a job

```
scancel <jobID>
```