Skip to content

Commit

Permalink
Move DAG bundle config into config, not db
Browse files Browse the repository at this point in the history
This moves the DAG bundle config into the Airflow config, instead of
being in the db. This:

- makes it much easier to configure a fresh Airflow instance - no
  api/cli calls required
- avoids some security concerns by ensuring only deployment managers,
  with direct access to the instance, can configure these

The primary downside is that you cannot reconfigure an existing bundle
in a running Airflow instance.
  • Loading branch information
jedcunningham committed Dec 13, 2024
1 parent 464e9ee commit 5b263b7
Show file tree
Hide file tree
Showing 14 changed files with 2,196 additions and 1,922 deletions.
13 changes: 13 additions & 0 deletions airflow/config_templates/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2654,3 +2654,16 @@ usage_data_collection:
example: ~
default: "True"
see_also: ":ref:`Usage data collection FAQ <usage-data-collection>`"
dag_bundles:
description: |
Configuration for the DAG bundles. This allows Airflow to load DAGs from different sources.
options:
dags_folder:
description: |
This is the default DAG bundle that loads DAGs from the traditional `[core] dags_folder`.
It can be disabled by setting it to an empty string.
version_added: ~
type: string
example: ~
default: |
{{"classpath": "airflow.dag_processing.bundles.dagfolder.DagsFolderDagBundle", "kwargs": {{}}}}
3 changes: 2 additions & 1 deletion airflow/dag_processing/bundles/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,10 @@ class BaseDagBundle(ABC):

supports_versioning: bool = False

def __init__(self, *, name: str, version: str | None = None) -> None:
def __init__(self, *, name: str, refresh_interval: int, version: str | None = None) -> None:
self.name = name
self.version = version
self.refresh_interval = refresh_interval

@property
def _dag_bundle_root_storage_path(self) -> Path:
Expand Down
12 changes: 10 additions & 2 deletions airflow/dag_processing/bundles/dagfolder.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,19 @@
from __future__ import annotations

from airflow import settings
from airflow.configuration import conf
from airflow.dag_processing.bundles.local import LocalDagBundle


class DagsFolderDagBundle(LocalDagBundle):
"""A bundle for the DAGs folder."""

def __init__(self, **kwargs):
super().__init__(local_folder=settings.DAGS_FOLDER, **kwargs)
def __init__(self, refresh_interval: int | None = None, **kwargs):
if refresh_interval is None:
refresh_interval = conf.getint("scheduler", "dag_dir_list_interval")

super().__init__(
local_folder=settings.DAGS_FOLDER,
refresh_interval=refresh_interval,
**kwargs,
)
94 changes: 94 additions & 0 deletions airflow/dag_processing/bundles/manager.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from __future__ import annotations

from typing import TYPE_CHECKING

from airflow.configuration import conf
from airflow.exceptions import AirflowConfigException
from airflow.models.dagbundle import DagBundleModel
from airflow.utils.log.logging_mixin import LoggingMixin
from airflow.utils.module_loading import import_string
from airflow.utils.session import NEW_SESSION, provide_session

if TYPE_CHECKING:
from sqlalchemy.orm import Session

from airflow.dag_processing.bundles.base import BaseDagBundle


class DagBundlesManager(LoggingMixin):
    """
    Manager for DAG bundles declared in the ``[dag_bundles]`` config section.

    Each option in that section names one bundle. Its value is a JSON object with a
    ``classpath`` (import path of the bundle class) and, optionally, ``kwargs``
    (keyword arguments passed to that class when it is instantiated).
    """

    @property
    def bundle_configs(self) -> dict[str, dict]:
        """
        Get all DAG bundle configurations.

        The config section is re-read and re-parsed on every access, so callers that
        need the result more than once should keep a local reference to it.

        :return: mapping of bundle name to its parsed JSON configuration.
        :raises AirflowConfigException: if a bundle's value does not parse to a dict.
        """
        configured_bundles = conf.getsection("dag_bundles")

        if not configured_bundles:
            return {}

        # If dags_folder is an empty string (or absent), we drop it. This allows the
        # default dags_folder bundle to be disabled without tripping a KeyError.
        if not configured_bundles.get("dags_folder"):
            configured_bundles.pop("dags_folder", None)

        dict_bundles: dict[str, dict] = {}
        for key in configured_bundles:
            config = conf.getjson("dag_bundles", key)
            if not isinstance(config, dict):
                raise AirflowConfigException(f"Bundle config for {key} is not a dict: {config}")
            dict_bundles[key] = config

        return dict_bundles

    @provide_session
    def sync_bundles_to_db(self, *, session: Session = NEW_SESSION) -> None:
        """
        Reconcile the ``DagBundleModel`` rows with the bundles found in config.

        Bundles present in config are (re-)enabled, or added if they have no row yet.
        Rows whose bundle is no longer in config are marked as disabled, not deleted.

        :param session: A database session.
        """
        known_bundles = {b.name: b for b in session.query(DagBundleModel).all()}

        # ``bundle_configs`` re-parses the whole config section on each access, so
        # read it exactly once instead of once per loop iteration.
        bundle_configs = self.bundle_configs

        for name in bundle_configs:
            if bundle := known_bundles.get(name):
                bundle.enabled = True
            else:
                session.add(DagBundleModel(name=name))
                self.log.info("Added new DAG bundle %s to the database", name)

        for name, bundle in known_bundles.items():
            if name not in bundle_configs:
                bundle.enabled = False
                self.log.warning("DAG bundle %s is no longer found in config and has been disabled", name)

    def get_all_dag_bundles(self) -> list[BaseDagBundle]:
        """
        Get all DAG bundles.

        :return: list of DAG bundles, one per configured bundle, each created with ``version=None``.
        """
        return [self.get_bundle(name, version=None) for name in self.bundle_configs]

    def get_bundle(self, name: str, version: str | None = None) -> BaseDagBundle:
        """
        Get a DAG bundle by name.

        :param name: The name of the DAG bundle.
        :param version: The version of the DAG bundle you need (optional). If not provided, ``None`` is
            passed to the bundle class (presumably it then follows its `tracking_ref` - confirm per bundle).
        :return: The DAG bundle.
        :raises KeyError: if no bundle called ``name`` is configured, or its config has no ``classpath``.
        """
        # TODO: proper validation of the bundle configuration so we have better error messages
        bundle_config = self.bundle_configs[name]
        bundle_class = import_string(bundle_config["classpath"])
        # "kwargs" may be omitted (or null) when the bundle class needs no extra arguments.
        bundle_kwargs = bundle_config.get("kwargs") or {}
        return bundle_class(name=name, version=version, **bundle_kwargs)
4 changes: 4 additions & 0 deletions airflow/dag_processing/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -399,6 +399,10 @@ def start(self):
"Checking for new files in %s every %s seconds", self._dag_directory, self.dag_dir_list_interval
)

from airflow.dag_processing.bundles.manager import DagBundlesManager

DagBundlesManager().sync_bundles_to_db()

return self._run_parsing_loop()

def _scan_stale_dags(self):
Expand Down
25 changes: 10 additions & 15 deletions airflow/migrations/versions/0050_3_0_0_add_dagbundlemodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,8 @@

import sqlalchemy as sa
from alembic import op
from sqlalchemy_utils import UUIDType

from airflow.models.base import StringID
from airflow.utils.sqlalchemy import ExtendedJSON, UtcDateTime
from airflow.utils.sqlalchemy import UtcDateTime

revision = "e229247a6cb1"
down_revision = "eed27faa34e3"
Expand All @@ -43,27 +41,24 @@
def upgrade():
op.create_table(
"dag_bundle",
sa.Column("id", UUIDType(binary=False), nullable=False),
sa.Column("name", StringID(), nullable=False),
sa.Column("classpath", sa.String(length=1000), nullable=False),
sa.Column("kwargs", ExtendedJSON(), nullable=True),
sa.Column("refresh_interval", sa.Integer(), nullable=True),
sa.Column("name", sa.String(length=250), nullable=False),
sa.Column("enabled", sa.Boolean(), nullable=True),
sa.Column("latest_version", sa.String(length=200), nullable=True),
sa.Column("last_refreshed", UtcDateTime(timezone=True), nullable=True),
sa.PrimaryKeyConstraint("id", name=op.f("dag_bundle_pkey")),
sa.UniqueConstraint("name", name=op.f("dag_bundle_name_uq")),
sa.PrimaryKeyConstraint("name", name=op.f("dag_bundle_pkey")),
)
with op.batch_alter_table("dag", schema=None) as batch_op:
batch_op.add_column(sa.Column("bundle_id", UUIDType(binary=False), nullable=True))
batch_op.add_column(sa.Column("bundle_name", sa.String(length=250), nullable=True))
batch_op.add_column(sa.Column("latest_bundle_version", sa.String(length=200), nullable=True))
batch_op.create_foreign_key(batch_op.f("dag_bundle_id_fkey"), "dag_bundle", ["bundle_id"], ["id"])
batch_op.create_foreign_key(
batch_op.f("dag_bundle_name_fkey"), "dag_bundle", ["bundle_name"], ["name"]
)


def downgrade():
"""Unapply Add DagBundleModel."""
with op.batch_alter_table("dag", schema=None) as batch_op:
batch_op.drop_constraint(batch_op.f("dag_bundle_id_fkey"), type_="foreignkey")
batch_op.drop_constraint(batch_op.f("dag_bundle_name_fkey"), type_="foreignkey")
batch_op.drop_column("latest_bundle_version")
batch_op.drop_column("bundle_id")
batch_op.drop_column("bundle_name")

op.drop_table("dag_bundle")
3 changes: 1 addition & 2 deletions airflow/models/dag.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,6 @@
from sqlalchemy.ext.hybrid import hybrid_property
from sqlalchemy.orm import backref, relationship
from sqlalchemy.sql import Select, expression
from sqlalchemy_utils import UUIDType

from airflow import settings, utils
from airflow.configuration import conf as airflow_conf, secrets_backend_list
Expand Down Expand Up @@ -2028,7 +2027,7 @@ class DagModel(Base):
fileloc = Column(String(2000))
# The base directory used by Dag Processor that parsed this dag.
processor_subdir = Column(String(2000), nullable=True)
bundle_id = Column(UUIDType(binary=False), ForeignKey("dag_bundle.id"), nullable=True)
bundle_name = Column(StringID(), ForeignKey("dag_bundle.name"), nullable=True)
# The version of the bundle the last time the DAG was parsed
latest_bundle_version = Column(String(200), nullable=True)
# String representing the owners
Expand Down
50 changes: 6 additions & 44 deletions airflow/models/dagbundle.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,58 +16,20 @@
# under the License.
from __future__ import annotations

from typing import TYPE_CHECKING

import uuid6
from sqlalchemy import Column, Integer, String
from sqlalchemy_utils import UUIDType
from sqlalchemy import Boolean, Column, String

from airflow.models.base import Base, StringID
from airflow.utils.module_loading import import_string
from airflow.utils.session import NEW_SESSION, provide_session
from airflow.utils.sqlalchemy import ExtendedJSON, UtcDateTime

if TYPE_CHECKING:
from sqlalchemy.orm import Session

from airflow.dag_processing.bundles.base import BaseDagBundle
from airflow.utils.sqlalchemy import UtcDateTime


class DagBundleModel(Base):
"""A table for DAG Bundle config."""
"""A table for DAG Bundle information."""

__tablename__ = "dag_bundle"
id = Column(UUIDType(binary=False), primary_key=True, default=uuid6.uuid7)
name = Column(StringID(), nullable=False, unique=True)
classpath = Column(String(1000), nullable=False)
kwargs = Column(ExtendedJSON, nullable=True)
refresh_interval = Column(Integer, nullable=True)
name = Column(StringID(), primary_key=True)
enabled = Column(Boolean, default=True)
latest_version = Column(String(200), nullable=True)
last_refreshed = Column(UtcDateTime, nullable=True)

def __init__(self, *, name, classpath, kwargs, refresh_interval):
def __init__(self, *, name: str):
self.name = name
self.classpath = classpath
self.kwargs = kwargs
self.refresh_interval = refresh_interval

@classmethod
@provide_session
def get_all_dag_bundles(
cls, *, session: Session = NEW_SESSION
) -> list[tuple[DagBundleModel, BaseDagBundle]]:
"""
Get all DAG bundles.
:param session: A database session.
:return: list of DAG bundles.
"""
bundle_configs = session.query(cls).all()

bundles = []
for bundle_config in bundle_configs:
bundle_class = import_string(bundle_config.classpath)
bundle = bundle_class(name=bundle_config.name, **bundle_config.kwargs)
bundles.append((bundle_config, bundle))

return bundles
2 changes: 1 addition & 1 deletion docs/apache-airflow/img/airflow_erd.sha256
Original file line number Diff line number Diff line change
@@ -1 +1 @@
8f2fd91375c546b297490e701dc3853d7ba53c7cd1422ed7f7e57b9ac86f6eca
e26bbb2fb5251cc828463aa98e43bd2abbf1273e31f204687a897a1d3cf7db58
Loading

0 comments on commit 5b263b7

Please sign in to comment.