Move DAG bundle config into config, not db #44924

Open. Wants to merge 5 commits into base: main.

Changes from 4 commits
31 changes: 31 additions & 0 deletions airflow/config_templates/config.yml
@@ -2654,3 +2654,34 @@ usage_data_collection:
example: ~
default: "True"
see_also: ":ref:`Usage data collection FAQ <usage-data-collection>`"
dag_bundles:
description: |
Configuration for the DAG bundles. This allows Airflow to load DAGs from different sources.
Member Author:

Describe that the section is important, and that Airflow will consume any new option you add.

Add examples on how to define them.

Member Author:

@dstandish updated

Airflow will consume all options added to this section. Below you will see only the default,
``dags_folder``. The option name is the bundle name and the value is a json object with the following
keys:

* classpath: The classpath of the bundle class
* kwargs: The keyword arguments to pass to the bundle class
* refresh_interval: The interval in seconds to refresh the bundle from its source.

For example, to add a new bundle named ``hello`` to my Airflow instance, add the following to your
airflow.cfg (this is just an example, the classpath and kwargs are not real):

.. code-block:: ini

[dag_bundles]
hello: {"classpath": "airflow.some.classpath", "kwargs": {"hello": "world"}, "refresh_interval": 60}
options:
dags_folder:
description: |
This is the default DAG bundle that loads DAGs from the traditional ``[core] dags_folder``.
By default, ``refresh_interval`` is set to ``[scheduler] dag_dir_list_interval``, but that can be
overridden here if desired.
Parsing DAGs from the DAG folder can be disabled by setting this option to an empty string.
version_added: ~
type: string
example: ~
default: '{{"classpath": "airflow.dag_processing.bundles.dagfolder.DagsFolderDagBundle",
"kwargs": {{}}}}'
3 changes: 2 additions & 1 deletion airflow/dag_processing/bundles/base.py
@@ -45,9 +45,10 @@ class BaseDagBundle(ABC):

supports_versioning: bool = False

def __init__(self, *, name: str, version: str | None = None) -> None:
def __init__(self, *, name: str, refresh_interval: int, version: str | None = None) -> None:
self.name = name
self.version = version
self.refresh_interval = refresh_interval

@property
def _dag_bundle_root_storage_path(self) -> Path:
12 changes: 10 additions & 2 deletions airflow/dag_processing/bundles/dagfolder.py
@@ -18,11 +18,19 @@
from __future__ import annotations

from airflow import settings
from airflow.configuration import conf
from airflow.dag_processing.bundles.local import LocalDagBundle


class DagsFolderDagBundle(LocalDagBundle):
"""A bundle for the DAGs folder."""

def __init__(self, **kwargs):
super().__init__(local_folder=settings.DAGS_FOLDER, **kwargs)
def __init__(self, refresh_interval: int | None = None, **kwargs):
if refresh_interval is None:
refresh_interval = conf.getint("scheduler", "dag_dir_list_interval")

super().__init__(
local_folder=settings.DAGS_FOLDER,
refresh_interval=refresh_interval,
**kwargs,
)
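A small illustration of the fallback above (not part of the diff): with no explicit ``refresh_interval`` the bundle inherits ``[scheduler] dag_dir_list_interval``, while an explicit value takes precedence.

```python
from airflow.dag_processing.bundles.dagfolder import DagsFolderDagBundle

default_bundle = DagsFolderDagBundle(name="dags_folder")
print(default_bundle.refresh_interval)  # e.g. 300 with the stock scheduler default

custom_bundle = DagsFolderDagBundle(name="dags_folder", refresh_interval=60)
print(custom_bundle.refresh_interval)   # 60, the explicit value wins
```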
96 changes: 96 additions & 0 deletions airflow/dag_processing/bundles/manager.py
@@ -0,0 +1,96 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from __future__ import annotations

from typing import TYPE_CHECKING

from airflow.configuration import conf
from airflow.exceptions import AirflowConfigException
from airflow.models.dagbundle import DagBundleModel
from airflow.utils.log.logging_mixin import LoggingMixin
from airflow.utils.module_loading import import_string
from airflow.utils.session import NEW_SESSION, provide_session

if TYPE_CHECKING:
from sqlalchemy.orm import Session

from airflow.dag_processing.bundles.base import BaseDagBundle


class DagBundlesManager(LoggingMixin):
"""Manager for DAG bundles."""

@property
def bundle_configs(self) -> dict[str, dict]:
"""Get all DAG bundle configurations."""
configured_bundles = conf.getsection("dag_bundles")

if not configured_bundles:
return {}

# If dags_folder is empty string, we remove it. This allows the default dags_folder bundle to be disabled.
if not configured_bundles["dags_folder"]:
del configured_bundles["dags_folder"]

dict_bundles: dict[str, dict] = {}
for key in configured_bundles.keys():
config = conf.getjson("dag_bundles", key)
if not isinstance(config, dict):
raise AirflowConfigException(f"Bundle config for {key} is not a dict: {config}")
dict_bundles[key] = config

return dict_bundles

@provide_session
def sync_bundles_to_db(self, *, session: Session = NEW_SESSION) -> None:
known_bundles = {b.name: b for b in session.query(DagBundleModel).all()}

for name in self.bundle_configs.keys():
if bundle := known_bundles.get(name):
bundle.enabled = True
else:
session.add(DagBundleModel(name=name))
self.log.info("Added new DAG bundle %s to the database", name)

for name, bundle in known_bundles.items():
if name not in self.bundle_configs:
bundle.enabled = False
self.log.warning("DAG bundle %s is no longer found in config and has been disabled", name)

def get_all_dag_bundles(self) -> list[BaseDagBundle]:
"""
Get all DAG bundles.

:return: list of DAG bundles.
"""
return [self.get_bundle(name, version=None) for name in self.bundle_configs.keys()]

def get_bundle(self, name: str, version: str | None = None) -> BaseDagBundle:
"""
Get a DAG bundle by name.

:param name: The name of the DAG bundle.
:param version: The version of the DAG bundle you need (optional). If not provided, ``tracking_ref`` will be used instead.

:return: The DAG bundle.
"""
# TODO: proper validation of the bundle configuration so we have better error messages
bundle_config = self.bundle_configs[name]
bundle_class = import_string(bundle_config["classpath"])
return bundle_class(name=name, version=version, **bundle_config["kwargs"])
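For context, a rough usage sketch of the new manager (not part of the diff); it assumes a configured metadata database and the ``[dag_bundles]`` section described earlier.

```python
from airflow.dag_processing.bundles.manager import DagBundlesManager

manager = DagBundlesManager()
manager.sync_bundles_to_db()  # add newly configured bundles, disable ones removed from config

for bundle in manager.get_all_dag_bundles():
    print(bundle.name, bundle.refresh_interval)

dags_folder = manager.get_bundle("dags_folder")  # instantiate a single bundle by name
```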
4 changes: 4 additions & 0 deletions airflow/dag_processing/manager.py
@@ -399,6 +399,10 @@ def start(self):
"Checking for new files in %s every %s seconds", self._dag_directory, self.dag_dir_list_interval
)

from airflow.dag_processing.bundles.manager import DagBundlesManager

DagBundlesManager().sync_bundles_to_db()

return self._run_parsing_loop()

def _scan_stale_dags(self):
25 changes: 10 additions & 15 deletions airflow/migrations/versions/0050_3_0_0_add_dagbundlemodel.py
@@ -28,10 +28,8 @@

import sqlalchemy as sa
from alembic import op
from sqlalchemy_utils import UUIDType

from airflow.models.base import StringID
from airflow.utils.sqlalchemy import ExtendedJSON, UtcDateTime
from airflow.utils.sqlalchemy import UtcDateTime

revision = "e229247a6cb1"
down_revision = "eed27faa34e3"
@@ -43,27 +41,24 @@
def upgrade():
op.create_table(
"dag_bundle",
sa.Column("id", UUIDType(binary=False), nullable=False),
sa.Column("name", StringID(), nullable=False),
sa.Column("classpath", sa.String(length=1000), nullable=False),
sa.Column("kwargs", ExtendedJSON(), nullable=True),
sa.Column("refresh_interval", sa.Integer(), nullable=True),
sa.Column("name", sa.String(length=250), nullable=False),
sa.Column("enabled", sa.Boolean(), nullable=True),
sa.Column("latest_version", sa.String(length=200), nullable=True),
sa.Column("last_refreshed", UtcDateTime(timezone=True), nullable=True),
sa.PrimaryKeyConstraint("id", name=op.f("dag_bundle_pkey")),
sa.UniqueConstraint("name", name=op.f("dag_bundle_name_uq")),
sa.PrimaryKeyConstraint("name", name=op.f("dag_bundle_pkey")),
)
with op.batch_alter_table("dag", schema=None) as batch_op:
batch_op.add_column(sa.Column("bundle_id", UUIDType(binary=False), nullable=True))
batch_op.add_column(sa.Column("bundle_name", sa.String(length=250), nullable=True))
batch_op.add_column(sa.Column("latest_bundle_version", sa.String(length=200), nullable=True))
batch_op.create_foreign_key(batch_op.f("dag_bundle_id_fkey"), "dag_bundle", ["bundle_id"], ["id"])
batch_op.create_foreign_key(
batch_op.f("dag_bundle_name_fkey"), "dag_bundle", ["bundle_name"], ["name"]
)


def downgrade():
"""Unapply Add DagBundleModel."""
with op.batch_alter_table("dag", schema=None) as batch_op:
batch_op.drop_constraint(batch_op.f("dag_bundle_id_fkey"), type_="foreignkey")
batch_op.drop_constraint(batch_op.f("dag_bundle_name_fkey"), type_="foreignkey")
batch_op.drop_column("latest_bundle_version")
batch_op.drop_column("bundle_id")
batch_op.drop_column("bundle_name")

op.drop_table("dag_bundle")
3 changes: 1 addition & 2 deletions airflow/models/dag.py
@@ -66,7 +66,6 @@
from sqlalchemy.ext.hybrid import hybrid_property
from sqlalchemy.orm import backref, relationship
from sqlalchemy.sql import Select, expression
from sqlalchemy_utils import UUIDType

from airflow import settings, utils
from airflow.configuration import conf as airflow_conf, secrets_backend_list
@@ -2028,7 +2027,7 @@ class DagModel(Base):
fileloc = Column(String(2000))
# The base directory used by Dag Processor that parsed this dag.
processor_subdir = Column(String(2000), nullable=True)
bundle_id = Column(UUIDType(binary=False), ForeignKey("dag_bundle.id"), nullable=True)
bundle_name = Column(StringID(), ForeignKey("dag_bundle.name"), nullable=True)
Contributor:

I think to preserve history, we should use an association table so that when a dag object is assigned a new bundle object, the history will be preserved. For example: if dag 'A' is in dag-bundle 'DA', and 'DA' is no longer configured or its name is changed, a new dag-bundle object, say 'DB', is created that now contains dag 'A'. The DAG's bundle_name will update to the new bundle 'DB', causing us to lose the previous bundle name. With an association table, we can have an is_active flag that tells whether the bundle has been removed. However, there will be more complex queries.

Another thing I thought of is using a history table like in TIH, but DAG changes more often.

# The version of the bundle the last time the DAG was parsed
latest_bundle_version = Column(String(200), nullable=True)
# String representing the owners
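To make the suggestion in the review comment above concrete, here is a purely hypothetical sketch of an association table; ``DagBundleAssociation`` and its columns are illustrative and not part of this PR.

```python
from sqlalchemy import Boolean, Column, ForeignKey

from airflow.models.base import Base, StringID
from airflow.utils.sqlalchemy import UtcDateTime


class DagBundleAssociation(Base):
    """Hypothetical link between a DAG and every bundle it has ever belonged to."""

    __tablename__ = "dag_bundle_association"

    dag_id = Column(StringID(), ForeignKey("dag.dag_id"), primary_key=True)
    bundle_name = Column(StringID(), ForeignKey("dag_bundle.name"), primary_key=True)
    is_active = Column(Boolean, default=True)  # False once the DAG moves to another bundle
    first_seen = Column(UtcDateTime, nullable=True)
```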
58 changes: 14 additions & 44 deletions airflow/models/dagbundle.py
@@ -16,58 +16,28 @@
# under the License.
from __future__ import annotations

from typing import TYPE_CHECKING

import uuid6
from sqlalchemy import Column, Integer, String
from sqlalchemy_utils import UUIDType
from sqlalchemy import Boolean, Column, String

from airflow.models.base import Base, StringID
from airflow.utils.module_loading import import_string
from airflow.utils.session import NEW_SESSION, provide_session
from airflow.utils.sqlalchemy import ExtendedJSON, UtcDateTime

if TYPE_CHECKING:
from sqlalchemy.orm import Session

from airflow.dag_processing.bundles.base import BaseDagBundle
from airflow.utils.sqlalchemy import UtcDateTime


class DagBundleModel(Base):
"""A table for DAG Bundle config."""
"""
A table for storing DAG bundle metadata.

We track the following information about each bundle, as it can be useful for
informational purposes and for debugging:
- enabled: Is the bundle currently found in configuration?
- latest_version: The latest version Airflow has seen for the bundle.
- last_refreshed: When the bundle was last refreshed.
"""

__tablename__ = "dag_bundle"
id = Column(UUIDType(binary=False), primary_key=True, default=uuid6.uuid7)
name = Column(StringID(), nullable=False, unique=True)
classpath = Column(String(1000), nullable=False)
kwargs = Column(ExtendedJSON, nullable=True)
refresh_interval = Column(Integer, nullable=True)
name = Column(StringID(), primary_key=True)
enabled = Column(Boolean, default=True)
latest_version = Column(String(200), nullable=True)
last_refreshed = Column(UtcDateTime, nullable=True)

def __init__(self, *, name, classpath, kwargs, refresh_interval):
def __init__(self, *, name: str):
self.name = name
self.classpath = classpath
self.kwargs = kwargs
self.refresh_interval = refresh_interval

@classmethod
@provide_session
def get_all_dag_bundles(
cls, *, session: Session = NEW_SESSION
) -> list[tuple[DagBundleModel, BaseDagBundle]]:
"""
Get all DAG bundles.

:param session: A database session.
:return: list of DAG bundles.
"""
bundle_configs = session.query(cls).all()

bundles = []
for bundle_config in bundle_configs:
bundle_class = import_string(bundle_config.classpath)
bundle = bundle_class(name=bundle_config.name, **bundle_config.kwargs)
bundles.append((bundle_config, bundle))

return bundles
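For completeness, a small illustrative query against the slimmed-down model (assuming a standard Airflow session); ``enabled`` reflects whether a bundle is still present in config after ``sync_bundles_to_db`` runs.

```python
from airflow.models.dagbundle import DagBundleModel
from airflow.utils.session import create_session

with create_session() as session:
    for bundle in session.query(DagBundleModel).filter(DagBundleModel.enabled).all():
        print(bundle.name, bundle.latest_version, bundle.last_refreshed)
```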
2 changes: 1 addition & 1 deletion docs/apache-airflow/img/airflow_erd.sha256
@@ -1 +1 @@
ccb8ef5583b2a6b3ee3ab4212139c112b92953675655010a6775fffb4945b206
dc1ed8fb08456efddbcfcb0a1665b90091b5157432f11654fc4d0744baa90cdb