diff --git a/bash_scripts/run_detect_and_track_array.sh b/bash_scripts/run_detect_and_track_array.sh
new file mode 100644
index 00000000..d6ad33b0
--- /dev/null
+++ b/bash_scripts/run_detect_and_track_array.sh
@@ -0,0 +1,172 @@
+#!/bin/bash
+
+#SBATCH -p gpu # partition
+#SBATCH --gres=gpu:1 # For any GPU: --gres=gpu:1. For a specific one: --gres=gpu:rtx5000
+#SBATCH -N 1 # number of nodes
+#SBATCH --ntasks-per-node 8 # max number of tasks per node
+#SBATCH --mem 32G # memory pool for all cores
+#SBATCH -t 3-00:00 # time (D-HH:MM)
+#SBATCH -o slurm_array.%A-%a.%N.out
+#SBATCH -e slurm_array.%A-%a.%N.err
+#SBATCH --mail-type=ALL
+#SBATCH --mail-user=s.minano@ucl.ac.uk
+#SBATCH --array=0-233%25
+
+
+# NOTE on SBATCH command for array jobs
+# with "SBATCH --array=0-n%m" ---> runs (n+1) separate jobs (task IDs 0 to n), but not more than m at a time.
+# the number of array jobs should match the number of input files
+
+
+# memory
+# see https://pytorch.org/docs/stable/notes/cuda.html#environment-variables
+# (exported so that it is visible to the Python processes launched below)
+export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+
+# -----------------------------
+# Error settings for bash
+# -----------------------------
+# see https://wizardzines.com/comics/bash-errors/
+set -e # do not continue after errors
+set -u # throw error if variable is unset
+set -o pipefail # make the pipe fail if any part of it fails
+
+# ---------------------
+# Define variables
+# ----------------------
+
+# Path to the trained model
+CKPT_PATH="/ceph/zoo/users/sminano/ml-runs-all/ml-runs/317777717624044570/40b1688a76d94bd08175cb380d0a6e0e/checkpoints/last.ckpt"
+
+# Path to the tracking config file
+TRACKING_CONFIG_FILE="/ceph/zoo/users/sminano/cluster_tracking_config.yaml"
+
+# List of videos to run inference on: define VIDEOS_DIR and VIDEO_FILENAME
+# NOTE: if any of the paths contain spaces, wrap the path in quotes,
+# closing and re-opening the quotes around any wildcard, e.g.:
+# "/ceph/zoo/users/sminano/ml-runs-all/ml-runs-scratch/763954951706829194/"*"/checkpoints"
+# "checkpoint-epoch="*".ckpt"
VIDEOS_DIR="/ceph/zoo/users/sminano/escape_clips_all"
+VIDEO_FILENAME="*.mov"
+# quote the wildcard so that it is expanded by find, not by the shell
+mapfile -t LIST_VIDEOS < <(find "$VIDEOS_DIR" -type f -name "$VIDEO_FILENAME")
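+
+# NOTE (optional sketch, added): log how many videos matched the wildcard,
+# to make a mismatch with the number of array tasks (checked below) easier to debug
+echo "Found ${#LIST_VIDEOS[@]} video(s) matching $VIDEO_FILENAME in $VIDEOS_DIR"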
+
+
+# Set output directory name
+# by default under current working directory
+OUTPUT_DIR_NAME="tracking_output_slurm_$SLURM_ARRAY_JOB_ID"
+
+# Select optional outputs
+SAVE_VIDEO=true
+SAVE_FRAMES=false
+
+
+# version of the codebase
+GIT_BRANCH=main
+
+# --------------------
+# Check inputs
+# --------------------
+# Check the number of input videos matches SLURM_ARRAY_TASK_COUNT
+# if not, exit
+if [[ $SLURM_ARRAY_TASK_COUNT -ne ${#LIST_VIDEOS[@]} ]]; then
+    echo "The number of array tasks does not match the number of input videos"
+    exit 1
+fi
+
+# -----------------------------
+# Create virtual environment
+# -----------------------------
+module load miniconda
+
+# Define an environment for each job in the
+# temporary directory of the compute node
+ENV_NAME=crabs-dev-$SLURM_ARRAY_JOB_ID-$SLURM_ARRAY_TASK_ID
+ENV_PREFIX=$TMPDIR/$ENV_NAME
+
+# create environment
+conda create \
+    --prefix $ENV_PREFIX \
+    -y \
+    python=3.10
+
+# activate environment
+source activate $ENV_PREFIX
+
+# install crabs package in virtual env
+python -m pip install git+https://github.com/SainsburyWellcomeCentre/crabs-exploration.git@$GIT_BRANCH
+
+# log pip and python locations
+echo $ENV_PREFIX
+which python
+which pip
+
+# print the version of the crabs package (the last number is the commit hash)
+echo "Git branch: $GIT_BRANCH"
+conda list crabs
+echo "-----"
+
+# ------------------------------------
+# GPU specs
+# ------------------------------------
+echo "Memory used per GPU before inference"
+echo $(nvidia-smi --query-gpu=name,memory.total,memory.free,memory.used --format=csv)
+echo "-----"
+
+
+# -------------------------
+# Run detect+track script
+# -------------------------
+# video used in this job
+INPUT_VIDEO=${LIST_VIDEOS[${SLURM_ARRAY_TASK_ID}]}
+
+echo "Running inference on $INPUT_VIDEO using trained model at $CKPT_PATH"
+
+# Set flags based on boolean variables
+if [ "$SAVE_FRAMES" = "true" ]; then
+    SAVE_FRAMES_FLAG="--save_frames"
+else
+    SAVE_FRAMES_FLAG=""
+fi
+
+if [ "$SAVE_VIDEO" = "true" ]; then
+    SAVE_VIDEO_FLAG="--save_video"
+else
+    SAVE_VIDEO_FLAG=""
+fi
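+
+# NOTE (explanatory, added): the *_FLAG variables are deliberately left
+# unquoted in the command below, so that an empty flag expands to no
+# argument at all (quoting "" would pass an empty string as an argument)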
+
+# run detect-and-track command
+# - to save all results from the array job in the same output directory
+#   we use --output_dir_no_timestamp
+# - the output directory is created under SLURM_SUBMIT_DIR by default
+detect-and-track-video \
+    --trained_model_path $CKPT_PATH \
+    --video_path $INPUT_VIDEO \
+    --config_file $TRACKING_CONFIG_FILE \
+    --output_dir $OUTPUT_DIR_NAME \
+    --output_dir_no_timestamp \
+    --accelerator gpu \
+    $SAVE_FRAMES_FLAG \
+    $SAVE_VIDEO_FLAG
+
+
+
+# copy tracking config to output directory
+shopt -s extglob # Enable extended globbing
+
+# get input video filename without extension
+INPUT_VIDEO_NO_EXT="${INPUT_VIDEO##*/}"
+INPUT_VIDEO_NO_EXT="${INPUT_VIDEO_NO_EXT%.*}"
+
+cp "$TRACKING_CONFIG_FILE" "$SLURM_SUBMIT_DIR"/"$OUTPUT_DIR_NAME"/"$INPUT_VIDEO_NO_EXT"_config.yaml
+
+
+echo "Copied $TRACKING_CONFIG_FILE to $OUTPUT_DIR_NAME"
+
+
+# -----------------------------
+# Delete virtual environment
+# ----------------------------
+conda deactivate
+conda remove \
+    --prefix $ENV_PREFIX \
+    --all \
+    -y
diff --git a/bash_scripts/run_evaluation_array.sh b/bash_scripts/run_evaluation_array.sh
index c4245d80..5237f1ba 100644
--- a/bash_scripts/run_evaluation_array.sh
+++ b/bash_scripts/run_evaluation_array.sh
@@ -17,12 +17,6 @@
 # with "SBATCH --array=0-n%m" ---> runs n separate jobs, but not more than m at a time.
 # the number of array jobs should match the number of input files
 
-# ---------------------
-# Source bashrc
-# ----------------------
-# Otherwise `which python` points to the miniconda module's Python
-# source ~/.bashrc
-
 
 # memory
 # see https://pytorch.org/docs/stable/notes/cuda.html#environment-variables
@@ -144,3 +138,13 @@ evaluate-detector \
     --mlflow_folder $MLFLOW_FOLDER \
     $USE_TEST_SET_FLAG
 echo "-----"
+
+
+# -----------------------------
+# Delete virtual environment
+# ----------------------------
+conda deactivate
+conda remove \
+    --prefix $ENV_PREFIX \
+    --all \
+    -y
diff --git a/bash_scripts/run_training_array.sh b/bash_scripts/run_training_array.sh
index ec8d42f5..4ceed425 100644
--- a/bash_scripts/run_training_array.sh
+++ b/bash_scripts/run_training_array.sh
@@ -17,12 +17,6 @@
 # with "SBATCH --array=0-n%m" ---> runs n separate jobs, but not more than m at a time.
 # the number of array jobs should match the number of input files
 
-# ---------------------
-# Source bashrc
-# ----------------------
-# Otherwise `which python` points to the miniconda module's Python
-# source ~/.bashrc
-
 
 # memory
 # see https://pytorch.org/docs/stable/notes/cuda.html#environment-variables
@@ -115,3 +109,12 @@ train-detector \
     --experiment_name $EXPERIMENT_NAME \
     --seed_n $SPLIT_SEED \
     --mlflow_folder $MLFLOW_FOLDER \
+
+# -----------------------------
+# Delete virtual environment
+# ----------------------------
+conda deactivate
+conda remove \
+    --prefix $ENV_PREFIX \
+    --all \
+    -y
diff --git a/bash_scripts/run_training_single.sh b/bash_scripts/run_training_single.sh
index f3ef94e0..30a792f3 100644
--- a/bash_scripts/run_training_single.sh
+++ b/bash_scripts/run_training_single.sh
@@ -12,13 +12,6 @@
 #SBATCH --mail-user=s.minano@ucl.ac.uk
 
-# ---------------------
-# Source bashrc
-# ----------------------
-# Otherwise `which python` points to the miniconda module's Python
-# source ~/.bashrc
-
-
 # memory
 # see https://pytorch.org/docs/stable/notes/cuda.html#environment-variables
 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
@@ -101,3 +94,12 @@ train-detector \
     --experiment_name $EXPERIMENT_NAME \
     --seed_n $SPLIT_SEED \
     --mlflow_folder $MLFLOW_FOLDER \
+
+# -----------------------------
+# Delete virtual environment
+# ----------------------------
+conda deactivate
+conda remove \
+    --prefix $ENV_PREFIX \
+    --all \
+    -y
diff --git a/guides/DetectAndTrackHPC.md b/guides/DetectAndTrackHPC.md
new file mode 100644
index 00000000..4276b4e9
--- /dev/null
+++ b/guides/DetectAndTrackHPC.md
@@ -0,0 +1,114 @@
+# Run detection and tracking over a set of videos in the cluster
+
+1. **Preparatory steps**
+
+   - If you are not connected to the SWC network: connect to the SWC VPN.
+
+2. **Connect to the SWC HPC cluster**
+
+   ```
+   ssh <username>@ssh.swc.ucl.ac.uk
+   ssh hpc-gw1
+   ```
+
+   It may ask for your password twice. To set up SSH keys for the SWC cluster, see [this guide](https://howto.neuroinformatics.dev/programming/SSH-SWC-cluster.html#ssh-keys).
+3. **Download the detect+track script from the 🦀 repository**
+
+   To do so, run the following command, which will download a bash script called `run_detect_and_track_array.sh` to the current working directory.
+
+   ```
+   curl https://raw.githubusercontent.com/SainsburyWellcomeCentre/crabs-exploration/main/bash_scripts/run_detect_and_track_array.sh > run_detect_and_track_array.sh
+   ```
+
+   This bash script launches a SLURM array job that runs detection and tracking on an array of videos. The version of the bash script downloaded is the one at the tip of the `main` branch in the [🦀 repository](https://github.com/SainsburyWellcomeCentre/crabs-exploration).
+
+> [!TIP]
+> To retrieve a version of the file that is different from the file at the tip of `main`, edit the remote file path in the `curl` command:
+>
+> - For example, to download the version of the file at the tip of a branch called `<branch-name>`, edit the path above to replace `main` with `<branch-name>`:
+>   ```
+>   https://raw.githubusercontent.com/SainsburyWellcomeCentre/crabs-exploration/<branch-name>/bash_scripts/run_detect_and_track_array.sh
+>   ```
+> - To download the version of the file at a specific commit, replace `main` with the commit hash `<commit-hash>`:
+>   ```
+>   https://raw.githubusercontent.com/SainsburyWellcomeCentre/crabs-exploration/<commit-hash>/bash_scripts/run_detect_and_track_array.sh
+>   ```
+
+4. **Edit the bash script if required**
+
+   Ideally, we won't make major edits to the bash scripts. If we find that we do, we may want to consider moving the relevant parameters to the config file, or making them CLI arguments.
+
+   When launching an array job, we may want to edit the following variables in the detect+track bash script:
+
+   - The `CKPT_PATH` variable, which is the path to the trained detector model.
+   - The `VIDEOS_DIR` variable, which defines the path to the videos directory.
+   - The `VIDEO_FILENAME` variable, which allows us to define a wildcard expression to select a subset of videos in the directory. See the examples in the bash script comments for the syntax.
+   - Remember that the number of videos to run inference on needs to match the number of jobs in the array. To change the number of jobs in the array job, edit the line that starts with `#SBATCH --array=0-n%m` and set `n` to the total number of jobs minus 1. The variable `m` refers to the number of jobs that can be run at a time, as in the example below.
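+
+   For example, the header of the downloaded script currently requests 234 array tasks (IDs 0 to 233), with at most 25 running at any one time:
+
+   ```
+   #SBATCH --array=0-233%25
+   ```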
+
+   Less frequently, one may need to edit:
+
+   - the `TRACKING_CONFIG_FILE`, which is the path to the tracking config to use. Usually we point to the file at `/ceph/zoo/users/sminano/cluster_tracking_config.yaml`, which we can edit.
+   - the `OUTPUT_DIR_NAME`, the name of the output directory in which to save the results. By default it is created under the current working directory and named `tracking_output_slurm_<SLURM_ARRAY_JOB_ID>` (with `SLURM_ARRAY_JOB_ID` being the job ID of the array job).
+   - the `SAVE_VIDEO` variable, which can be `true` or `false` depending on whether we want to save the tracked videos or not. Usually set to `true`.
+   - the `SAVE_FRAMES` variable, which can be `true` or `false` depending on whether we want to save the full set of untracked frames per video or not. Usually set to `false`.
+   - the `GIT_BRANCH`, if we want to use a specific version of the 🦀 package. Usually we will run the version of the 🦀 package in `main`.
+
+   Currently, there is no option to pass a list of ground truth annotations that matches the set of videos analysed.
+
+> [!CAUTION]
+>
+> If we launch a job and then modify the config file _before_ the job has been able to read it, we may be using an undesired version of the config in our job! To avoid this, it is best to wait until you can verify that the job has the expected config parameters (and then edit the file to launch a new job if needed).
+
+5. **Run the job using the SLURM scheduler**
+
+   To launch a job, use the `sbatch` command with the relevant script:
+
+   ```
+   sbatch <path-to-bash-script>
+   ```
+
+6. **Check the status of the job**
+
+   To do this, we can:
+
+   - Check the SLURM logs: these should be created automatically in the directory from which the `sbatch` command is run. See the example log filename below.
+   - Run supporting SLURM commands (see [below](#some-useful-slurm-commands)).
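+
+   For example, given the `#SBATCH -o slurm_array.%A-%a.%N.out` directive in the downloaded script, the log of task 2 of array job 3813494 would be written to a file named like `slurm_array.3813494-2.<node-name>.out`, with `<node-name>` being the compute node the task ran on (the job ID and node name here are illustrative).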
+
+### Some useful SLURM commands
+
+To check the status of your jobs in the queue
+
+```
+squeue -u <username>
+```
+
+To show details of the latest jobs (including completed or cancelled jobs)
+
+```
+sacct -X -u <username>
+```
+
+To specify columns to display use `--format` (e.g., `Elapsed`)
+
+```
+sacct -X --format="JobID, JobName, Partition, Account, State, Elapsed" -u <username>
+```
+
+To check specific jobs by ID
+
+```
+sacct -X -j 3813494,3813184
+```
+
+To check the time limit of the jobs submitted by a user (for example, `sminano`)
+
+```
+squeue -u sminano --format="%i %P %j %u %T %l %C %S"
+```
+
+To cancel a job
+
+```
+scancel <job-id>
+```
diff --git a/guides/EvaluatingModelsHPC.md b/guides/EvaluatingModelsHPC.md
index 3220ef28..955ca155 100644
--- a/guides/EvaluatingModelsHPC.md
+++ b/guides/EvaluatingModelsHPC.md
@@ -24,7 +24,7 @@
 
 
 > [!TIP]
-> To retrieve a version of these files that is different from the files at the tip of `main`, edit the remote file path in the curl command:
+> To retrieve a version of the file that is different from the file at the tip of `main`, edit the remote file path in the `curl` command:
 >
 > - For example, to download the version of the file at the tip of a branch called `<branch-name>`, edit the path above to replace `main` with `<branch-name>`:
 >   ```
@@ -42,7 +42,7 @@
    When launching an array job, we may want to edit the following variables in the bash script:
 
    - The `MLFLOW_CKPTS_FOLDER` and the `CKPT_FILENAME` variables, which define which trained models we would like to evaluate. See the examples in the bash script comments for the syntax.
-   - The number of trained models to evaluate needs to match the number of jobs in the array. To change the number of jobs in the array job, edit the line that start with `#SBATCH --array=0-n%m`. That command specifies to run `n` separate jobs, but not more than `m` at a time.
+   - The number of trained models to evaluate needs to match the number of jobs in the array. To change the number of jobs in the array job, edit the line that starts with `#SBATCH --array=0-n%m` and set `n` to the total number of jobs minus 1. The variable `m` refers to the number of jobs that can be run at a time.
    - The `MLFLOW_FOLDER`. By default, we point to the "scratch" folder at `/ceph/zoo/users/sminano/ml-runs-all/ml-runs-scratch`. This folder holds runs that we don't need to keep. For runs we would like to keep, we will instead point to the folder at `/ceph/zoo/users/sminano/ml-runs-all/ml-runs`.
 
    Less frequently, one may need to edit:
diff --git a/guides/TrainingModelsHPC.md b/guides/TrainingModelsHPC.md
index bfb248cb..990d7aff 100644
--- a/guides/TrainingModelsHPC.md
+++ b/guides/TrainingModelsHPC.md
@@ -65,7 +65,7 @@
 
    Additionally for an array job, one may want to edit the number of jobs in the array (by default set to 3):
 
-   - this would mean editing the line that start with `#SBATCH --array=0-n%m` in the `run_training_array.sh` script. That command specifies to run `n` separate jobs, but not more than `m` at a time.
+   - this would mean editing the line that starts with `#SBATCH --array=0-n%m` in the `run_training_array.sh` script. You will need to set `n` to the total number of jobs minus 1. The variable `m` refers to the number of jobs that can be run at a time.
   - if the number of jobs in the array is edited, the variable `LIST_SEEDS` needs to be modified accordingly, otherwise we will get an error when launching the job.
 
 1. **Edit the config YAML file if required**