diff --git a/aeon/dj_pipeline/README.md b/aeon/dj_pipeline/README.md
index 7d898168..130ee387 100644
--- a/aeon/dj_pipeline/README.md
+++ b/aeon/dj_pipeline/README.md
@@ -89,9 +89,9 @@ animals, cameras, food patches setup, etc.
 + This information is either entered by hand, or parsed and inserted from configuration yaml files.
 + For experiments, this information can be inserted by running
-    + [create_experiment_01](create_experiments/create_experiment_01.py)
     + [create_socialexperiment_0](create_experiments/create_socialexperiment_0.py)
     + [create_experiment_02](create_experiments/create_experiment_02.py)
+    + [create_socialexperiment](create_experiments/create_socialexperiment.py)
 (just need to do this once)

 Tables in DataJoint are written with a `make()` function -
@@ -99,10 +99,15 @@ instruction to generate and insert new records to itself, based on data from ups
 Triggering the auto ingestion and processing/computation routine is essentially calling the `.populate()` method for all relevant tables.
-These routines are prepared in this [auto-processing script](populate/process.py).
+These routines are prepared in this [auto-processing script](populate/worker.py).
 Essentially, turning on the auto-processing routine amounts to running the
-following 2 commands (in different processing threads)
+following 4 commands, either in sequence or in parallel (with different processing threads).
+Data ingestion/populate with DataJoint is idempotent, so it is safe to run the same command multiple times.

-    aeon_ingest high
+    aeon_ingest pyrat_worker

-    aeon_ingest mid
+    aeon_ingest acquisition_worker
+
+    aeon_ingest streams_worker
+
+    aeon_ingest analysis_worker
diff --git a/aeon/dj_pipeline/create_experiments/__init__.py b/aeon/dj_pipeline/create_experiments/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/aeon/dj_pipeline/docs/PIPELINE_LOCAL_DEPLOYMENT.md b/aeon/dj_pipeline/docs/PIPELINE_LOCAL_DEPLOYMENT.md
new file mode 100644
index 00000000..ad108848
--- /dev/null
+++ b/aeon/dj_pipeline/docs/PIPELINE_LOCAL_DEPLOYMENT.md
@@ -0,0 +1,68 @@
+# Pipeline Deployment (On-Premises)
+
+This page describes the processes and required resources to deploy the Project Aeon data pipeline on-premises.
+
+## Prerequisites
+
+At the most basic level, in order to deploy and operate a DataJoint pipeline, you will need:
+
+1. A MySQL database server (version 8.0) configured to be DataJoint-compatible
+   - see [here](https://github.com/datajoint/mysql-docker/blob/master/config/my.cnf) for the MySQL server configuration required for DataJoint compatibility
+2. If you want to use a preconfigured Docker container ([install Docker](https://docs.docker.com/engine/install/)), run the following command (a quick connection check follows this list):
+   ```bash
+   docker run -d \
+        --name db \
+        -p 3306:3306 \
+        -e MYSQL_ROOT_PASSWORD=simple \
+        -v ./mysql/data:/var/lib/mysql \
+        datajoint/mysql:8.0 \
+        mysqld --default-authentication-plugin=mysql_native_password
+   ```
+
+   A new MySQL server will be launched in a Docker container with the following credentials:
+   - host: `localhost`
+   - username: `root`
+   - password: `simple`
+
+   To stop the container, run the following command:
+
+   ```bash
+   docker stop db
+   ```
+
+3. A GitHub repository with the [codebase](https://github.com/SainsburyWellcomeCentre/aeon_mecha) of the DataJoint pipeline
+   - this repository is the codebase; no additional modifications are needed to deploy it locally
+4. File storage
+   - the pipeline requires a location to access/store the data files (this can be a local directory or mounted network storage)
+5. Compute
+   - you need some form of compute environment with the right software installed to run the pipeline (this could be a laptop, a local workstation, or an HPC cluster)
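+
+As a quick sanity check, the following minimal sketch (assuming `datajoint` is installed and the Docker container above is running with the example credentials) confirms that the database server is reachable before moving on:
+
+```python
+import datajoint as dj
+
+# Credentials from the Docker example above (adjust if your server differs).
+dj.config["database.host"] = "localhost"
+dj.config["database.user"] = "root"
+dj.config["database.password"] = "simple"
+
+conn = dj.conn()          # establishes (or reuses) a connection to the MySQL server
+print(conn)               # prints the connection target and status
+print(dj.list_schemas())  # lists schemas visible to this user (may be empty on a fresh server)
+```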
+
+## Download the data
+
+The released data for Project Aeon can be downloaded from the data repository [here](https://zenodo.org/records/13881885).
+
+
+## Pipeline Installation & Configuration
+
+### Installation Instructions
+
+In order to run the pipeline, follow the instructions in the [Local set-up](../../../README.md#local-set-up) section to install this codebase.
+
+### Configuration Instructions
+
+DataJoint requires a configuration file named `dj_local_conf.json`. This file should be located in the root directory of the codebase.
+
+1. Generate the `dj_local_conf.json` file:
+   - Make a copy of the `sample_dj_local_conf.json` file with the exact name `dj_local_conf.json`.
+   - Update the file with your database credentials (username, password, and database host).
+   - Keep this file secure and do not share or commit it, since it contains your credentials.
+2. In the `custom` section, specify the `database.prefix` - you can keep the default `aeon_`.
+3. In the `custom` section, update the value of `ceph_aeon` (under `repository_config`) to the root directory of the downloaded data.
+For example, if you download the data to `D:/data/project-aeon/aeon/data/raw/AEON3/...`, then update `ceph_aeon` to `D:/data/project-aeon/aeon/data`.
+
+
+## Data Ingestion & Processing
+
+Now that the pipeline is installed and configured, you can start ingesting and processing the downloaded data.
+
+Follow the instructions in the [Data Ingestion & Processing](./notebooks/Data_Ingestion_and_Processing.ipynb) guide to learn how to ingest and process the downloaded data.
diff --git a/aeon/dj_pipeline/docs/notebooks/Data_Ingestion_and_Processing.ipynb b/aeon/dj_pipeline/docs/notebooks/Data_Ingestion_and_Processing.ipynb
new file mode 100644
index 00000000..cb1518e1
--- /dev/null
+++ b/aeon/dj_pipeline/docs/notebooks/Data_Ingestion_and_Processing.ipynb
@@ -0,0 +1,532 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "d4216edb-2492-4c63-b5ea-b7071b7b2b78",
+   "metadata": {},
+   "source": [
+    "# Data Ingestion & Processing\n",
+    "\n",
+    "This notebook provides step-by-step instructions for ingesting the data from the source and processing it so that it is ready for querying and further analysis.\n",
+    "\n",
+    "Prerequisites: complete the pipeline installation and configuration. See the instructions here: [Pipeline Deployment](../PIPELINE_LOCAL_DEPLOYMENT.md)\n",
+    "\n",
+    "There are 3 main steps in this notebook:\n",
+    "1. Create a new experiment\n",
+    "2. Manually insert subject information for the experiment\n",
+    "3.
Run automated ingestion & processing" + ] + }, + { + "cell_type": "markdown", + "id": "9dab1734-83ea-4306-83db-8f6f18d214be", + "metadata": {}, + "source": [ + "## Step 1 - Create a new experiment\n", + "\n", + "This step assumes that you have downloaded the data for this experiment and configured the path correctly (see prerequisites above)\n", + "\n", + "The released data is for experiment named: `social0.2-aeon3`\n", + "\n", + "The following command will insert a new entry for `social0.2-aeon3` experiment into `acquisition.Experiment` table as well as other relevant meta information\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f84f3d3b-a8a3-49a5-aed2-917be953ca7b", + "metadata": {}, + "outputs": [], + "source": [ + "import datajoint as dj\n", + "\n", + "from aeon.dj_pipeline import subject, acquisition\n", + "from aeon.dj_pipeline.create_experiments import create_socialexperiment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9eb99df8-4058-4014-8885-d9a393019e2d", + "metadata": {}, + "outputs": [], + "source": [ + "experiment_name = \"social0.2-aeon3\"\n", + "\n", + "create_socialexperiment(experiment_name)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "819b0d10-1a0b-480f-9762-5a2ccdfa3152", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + "
+       "[HTML rendering of the acquisition.Experiment table omitted]\n",
+       "[experiment_name | experiment_start_time | experiment_description | arena_name | lab | location | experiment_type]\n",
+       "[social0.2-aeon3 | 2024-03-01 16:46:12 | Social0.2 experiment on AEON3 machine | circle-2m | SWC | AEON3 | social] (Total: 1)\n",
\n", + " " + ], + "text/plain": [ + "*experiment_na experiment_sta experiment_des arena_name lab location experiment_typ\n", + "+------------+ +------------+ +------------+ +------------+ +-----+ +----------+ +------------+\n", + "social0.2-aeon 2024-03-01 16: Social0.2 expe circle-2m SWC AEON3 social \n", + " (Total: 1)" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Check `Experiment` table\n", + "acquisition.Experiment()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "61787328-d3bd-4427-8be5-0f387d98f50d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + "
+       "[HTML rendering of the acquisition.Experiment.Directory table omitted]\n",
+       "[experiment_name | directory_type | repository_name | directory_path | load_order]\n",
+       "[social0.2-aeon3 | processed | ceph_aeon | aeon/data/processed/AEON3/social0.2 | 0]\n",
+       "[social0.2-aeon3 | raw | ceph_aeon | aeon/data/raw/AEON3/social0.2 | 1] (Total: 2)\n",
\n", + " " + ], + "text/plain": [ + "*experiment_na *directory_typ repository_nam directory_path load_order \n", + "+------------+ +------------+ +------------+ +------------+ +------------+\n", + "social0.2-aeon processed ceph_aeon aeon/data/proc 0 \n", + "social0.2-aeon raw ceph_aeon aeon/data/raw/ 1 \n", + " (Total: 2)" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "acquisition.Experiment.Directory()" + ] + }, + { + "cell_type": "markdown", + "id": "61a29dfc-5f25-49d0-a376-ab70d4853d01", + "metadata": {}, + "source": [ + "## Step 2 - Insert Subjects\n", + "\n", + "The experiment \"social0.2-aeon3\" features two participating animals:\n", + "- BAA-1104045\n", + "- BAA-1104047\n", + "\n", + "Let's add them" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "fbac420f-d0b8-4226-a47c-8e409f9e361b", + "metadata": {}, + "outputs": [], + "source": [ + "subject_list = [\n", + " {'subject': 'BAA-1104045',\n", + " 'sex': 'U',\n", + " 'subject_birth_date': '2024-01-01',\n", + " 'subject_description': 'Subject for Social 0.2 experiment'},\n", + " {'subject': 'BAA-1104047',\n", + " 'sex': 'U',\n", + " 'subject_birth_date': '2024-01-01',\n", + " 'subject_description': 'Subject for Social 0.2 experiment'}\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "b1dbcb90-11eb-4233-8a2a-f1a0124c3c69", + "metadata": {}, + "outputs": [], + "source": [ + "subject.Subject.insert(subject_list, skip_duplicates=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "92589a7f-cf36-4de9-b36c-02dceafbf341", + "metadata": {}, + "outputs": [], + "source": [ + "subject_experiment_list = [\n", + " {'experiment_name': 'social0.2-aeon3', 'subject': 'BAA-1104045'},\n", + " {'experiment_name': 'social0.2-aeon3', 'subject': 'BAA-1104047'}\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "5a25cc2d-1ade-445c-be90-5b73c79699d3", + "metadata": {}, + "outputs": [], + "source": [ + "acquisition.Experiment.Subject.insert(subject_experiment_list, skip_duplicates=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "b9d37c3c-8f52-4f66-9253-c33fe1437d69", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " the subjects participating in this experiment\n", + "
+       "[HTML rendering of the acquisition.Experiment.Subject table omitted]\n",
+       "[experiment_name | subject]\n",
+       "[social0.2-aeon3 | BAA-1104045]\n",
+       "[social0.2-aeon3 | BAA-1104047] (Total: 2)\n",
\n", + " " + ], + "text/plain": [ + "*experiment_na *subject \n", + "+------------+ +------------+\n", + "social0.2-aeon BAA-1104045 \n", + "social0.2-aeon BAA-1104047 \n", + " (Total: 2)" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Check Experiment.Subject table\n", + "acquisition.Experiment.Subject()" + ] + }, + { + "cell_type": "markdown", + "id": "9af39820-95b4-42b7-b648-32ae5f47d9a6", + "metadata": {}, + "source": [ + "## Step 3 - Data Ingestion & Processing\n", + "\n", + "Data ingestion and processing is fully automated in a few prepared routines below\n", + "\n", + "Data ingestion/populate with DataJoint is idempotent, so it is safe to run the same command multiple times." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ef7f913a-2fd7-4f6e-a278-59f70b953fcf", + "metadata": {}, + "outputs": [], + "source": [ + "from aeon.dj_pipeline.populate.worker import AutomatedExperimentIngestion, acquisition_worker, streams_worker, analysis_worker" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "39309e8a-634f-4ed0-9bc2-b3535df1a44d", + "metadata": {}, + "outputs": [], + "source": [ + "AutomatedExperimentIngestion.insert1({'experiment_name': 'social0.2-aeon3'}, skip_duplicates=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee8f48df-c355-4900-b99e-7a25c34cc651", + "metadata": {}, + "outputs": [], + "source": [ + "acquisition_worker.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff41d578-6a3c-45f5-89e7-11ec95c084fd", + "metadata": {}, + "outputs": [], + "source": [ + "streams_worker.run()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3aaed1a0-f98d-453a-a26a-131834a7d154", + "metadata": {}, + "outputs": [], + "source": [ + "analysis_worker.run()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/sample_dj_local_conf.json b/sample_dj_local_conf.json new file mode 100644 index 00000000..c663fcf5 --- /dev/null +++ b/sample_dj_local_conf.json @@ -0,0 +1,18 @@ +{ + "database.host": "localhost", + "database.user": "******", + "database.password": "******", + "database.port": 3306, + "connection.init_function": null, + "database.reconnect": true, + "enable_python_native_blobs": true, + "loglevel": "INFO", + "safemode": true, + "display.limit": 20, + "display.width": 40, + "display.show_tuple_count": true, + "custom": { + "database.prefix": "aeon_", + "repository_config": {"ceph_aeon": "/ceph/aeon"} + } +}
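
After copying `sample_dj_local_conf.json` to `dj_local_conf.json` and filling in real credentials, a minimal sketch like the following (assuming `datajoint` is installed and you launch Python from the root directory of the codebase) can confirm that the configuration is picked up before running the ingestion workers:

```python
import datajoint as dj

# DataJoint picks up dj_local_conf.json from the current working directory on
# import; load it explicitly here so a misplaced file fails loudly.
dj.config.load("dj_local_conf.json")

custom = dj.config["custom"]
print(custom["database.prefix"])                 # e.g. "aeon_"
print(custom["repository_config"]["ceph_aeon"])  # root directory of the downloaded data

dj.conn()  # raises an error if the database credentials are wrong
```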