diff --git a/api-python-sql.yaml b/api-python-sql.yaml index a974853..c764ed2 100644 --- a/api-python-sql.yaml +++ b/api-python-sql.yaml @@ -32,7 +32,7 @@ tasks: FROM read_csv_auto('{{ workingDir }}/in.csv', header=True) GROUP BY brand ORDER BY avg_price DESC; - store: true + fetchType: STORE extend: title: Extract data from a REST API, process it in Python with Polars in a diff --git a/business-automation.yaml b/business-automation.yaml index 9aa578f..1aa5b57 100644 --- a/business-automation.yaml +++ b/business-automation.yaml @@ -39,7 +39,7 @@ tasks: - id: query type: io.kestra.plugin.jdbc.sqlite.Query url: jdbc:sqlite:kestra.db - store: true + fetchType: STORE sql: | SELECT * FROM features ORDER BY release_version; diff --git a/data-engineering-pipeline.yaml b/data-engineering-pipeline.yaml index ca76b76..977e8c8 100644 --- a/data-engineering-pipeline.yaml +++ b/data-engineering-pipeline.yaml @@ -53,7 +53,7 @@ tasks: FROM read_json_auto('{{ workingDir }}/products.json') GROUP BY brand ORDER BY avg_price DESC; - store: true + fetchType: STORE extend: title: Getting started with Kestra — a Data Engineering Pipeline example diff --git a/dremio-sql-python.yaml b/dremio-sql-python.yaml index ebae8ff..d37f990 100644 --- a/dremio-sql-python.yaml +++ b/dremio-sql-python.yaml @@ -11,9 +11,8 @@ tasks: url: jdbc:dremio:direct=sql.dremio.cloud:443;ssl=true;PROJECT_ID={{vars.project_id}};schema=postgres.public username: $token password: "{{ secret('DREMIO_TOKEN') }}" - sql: SELECT first_name, last_name, hire_date, salary FROM - postgres.public.employees LIMIT 100; - store: true + sql: SELECT first_name, last_name, hire_date, salary FROM postgres.public.employees LIMIT 100; + fetchType: STORE - id: python type: io.kestra.plugin.scripts.python.Script diff --git a/http-check.yaml b/http-check.yaml index 1138f23..c1a8f47 100644 --- a/http-check.yaml +++ b/http-check.yaml @@ -1,13 +1,16 @@ id: http-check namespace: company.team + inputs: - id: uri type: URI defaults: https://kestra.io + tasks: - id: api type: io.kestra.plugin.core.http.Request uri: "{{ inputs.uri }}" + - id: check_status type: io.kestra.plugin.core.flow.If condition: "{{ outputs.api.code != 200 }}" @@ -15,6 +18,7 @@ tasks: - id: unhealthy type: io.kestra.plugin.core.log.Log message: Server unhealthy!!! Response {{ outputs.api.body }} + - id: send_slack_alert type: io.kestra.plugin.notifications.slack.SlackIncomingWebhook url: "{{ secret('SLACK_WEBHOOK') }}" @@ -27,10 +31,12 @@ tasks: - id: healthy type: io.kestra.plugin.core.log.Log message: Everything is fine! + triggers: - id: daily type: io.kestra.plugin.core.trigger.Schedule cron: 0 9 * * * + extend: title: Monitor availability of an HTTP endpoint and send a Slack alert if a service is unhealthy diff --git a/hubspot-to-bigquery.yaml b/hubspot-to-bigquery.yaml index 00e0022..1476b04 100644 --- a/hubspot-to-bigquery.yaml +++ b/hubspot-to-bigquery.yaml @@ -1,5 +1,6 @@ id: hubspot-to-bigquery namespace: company.team + tasks: - id: sync type: io.kestra.plugin.cloudquery.Sync @@ -32,10 +33,12 @@ tasks: - "*" spec: max_requests_per_second: 5 + triggers: - id: schedule type: io.kestra.plugin.core.trigger.Schedule cron: 0 6 * * * + extend: title: Sync Hubspot CRM data to BigQuery on a schedule description: >- @@ -43,21 +46,18 @@ extend: `sync` task from the CloudQuery plugin uses the `hubspot` source and the `bigquery` destination. - Note how we use the `sa.json` credentials file to authenticate with GCP and the `HUBSPOT_APP_TOKEN` environment variable to authenticate with Hubspot CRM. 
- To avoid rate limiting issues, you can set the `max_requests_per_second` parameter in the `hubspot` source configuration. In this example, we set it to 5 requests per second. + To avoid rate limiting issues, you can set the `max_requests_per_second` parameter in the `hubspot` source configuration. In this example, we set it to 5 requests per second. The `schedule` trigger runs the flow every day at 6:00 AM. - Additionally, you can [generate an API key](https://docs.cloudquery.io/docs/deployment/generate-api-key) to use premium plugins. You can add the API key as an environment variable: - ```yaml - id: hn_to_duckdb type: io.kestra.plugin.cloudquery.Sync diff --git a/infrastructure-automation.yaml b/infrastructure-automation.yaml index ed6dc60..ab79866 100644 --- a/infrastructure-automation.yaml +++ b/infrastructure-automation.yaml @@ -1,10 +1,12 @@ id: infrastructure-automation namespace: tutorial description: Infrastructure Automation + inputs: - id: docker_image type: STRING defaults: kestra/myimage:latest + tasks: - id: build_image type: io.kestra.plugin.docker.Build @@ -18,6 +20,7 @@ tasks: registry: https://index.docker.io/v1/ username: "{{ secret('DOCKERHUB_USERNAME') }}" password: "{{ secret('DOCKERHUB_PASSWORD') }}" + - id: run_container type: io.kestra.plugin.docker.Run pullPolicy: NEVER @@ -26,6 +29,7 @@ tasks: - pip - show - kestra + - id: run_terraform type: io.kestra.plugin.terraform.cli.TerraformCLI beforeCommands: @@ -48,24 +52,19 @@ tasks: } } - provider "http" {} - provider "local" {} - variable "pokemon_names" { type = list(string) default = ["pikachu", "psyduck", "charmander", "bulbasaur"] } - data "http" "pokemon" { count = length(var.pokemon_names) url = "https://pokeapi.co/api/v2/pokemon/${var.pokemon_names[count.index]}" } - locals { pokemon_details = [for i in range(length(var.pokemon_names)) : { name = jsondecode(data.http.pokemon[i].response_body)["name"] @@ -75,19 +74,19 @@ tasks: file_content = join("\n\n", [for detail in local.pokemon_details : "Name: ${detail.name}\nTypes: ${detail.types}"]) } - resource "local_file" "pokemon_details_file" { filename = "${path.module}/pokemon.txt" content = local.file_content } - output "file_path" { value = local_file.pokemon_details_file.filename } + - id: log_pokemon type: io.kestra.plugin.core.log.Log message: "{{ read(outputs.run_terraform.outputFiles['pokemon.txt']) }}" + extend: title: Getting started with Kestra — an Infrastructure Automation workflow example description: >- @@ -98,12 +97,8 @@ extend: The flow has four tasks: 1. The first task builds a Docker image. - 2. The second task runs a container using the image. - - 3. The third task uses Terraform to create a file with details about - Pokémon. - + 3. The third task uses Terraform to create a file with details about Pokémon. 4. The fourth task logs the details about Pokémon. 
tags: - Getting Started diff --git a/ingest-to-datalake-event-driven.yaml b/ingest-to-datalake-event-driven.yaml index 6da51ae..de7e8db 100644 --- a/ingest-to-datalake-event-driven.yaml +++ b/ingest-to-datalake-event-driven.yaml @@ -14,6 +14,7 @@ tasks: - id: clone_repository type: io.kestra.plugin.git.Clone url: https://github.com/kestra-io/scripts + - id: etl type: io.kestra.plugin.scripts.python.Commands warningOnStdErr: false @@ -27,6 +28,7 @@ tasks: commands: - python etl/aws_iceberg_fruit.py {{ vars.destination_prefix }}/{{ trigger.objects | jq('.[].key') | first }} + - id: merge_query type: io.kestra.plugin.aws.athena.Query accessKeyId: "{{ secret('AWS_ACCESS_KEY_ID') }}" @@ -43,6 +45,7 @@ tasks: WHEN NOT MATCHED THEN INSERT (id, fruit, berry, update_timestamp) VALUES(r.id, r.fruit, r.berry, current_timestamp); + - id: optimize type: io.kestra.plugin.aws.athena.Query accessKeyId: "{{ secret('AWS_ACCESS_KEY_ID') }}" @@ -68,13 +71,13 @@ triggers: region: "{{ secret('AWS_DEFAULT_REGION') }}" accessKeyId: "{{ secret('AWS_ACCESS_KEY_ID') }}" secretKeyId: "{{ secret('AWS_SECRET_ACCESS_KEY') }}" + extend: title: Event-driven data ingestion to AWS S3 data lake managed by Apache Iceberg, AWS Glue and Amazon Athena description: >- This workflow ingests data to an S3 data lake using a Python script. - This script is stored in a public GitHub repository so you can directly use this workflow as long as you adjust your AWS credentials, S3 bucket name and the Amazon Athena table name. The script takes the detected S3 object key diff --git a/ingest-to-datalake-git.yaml b/ingest-to-datalake-git.yaml index 12c3425..0906ffe 100644 --- a/ingest-to-datalake-git.yaml +++ b/ingest-to-datalake-git.yaml @@ -1,9 +1,11 @@ id: ingest-to-datalake-git namespace: company.team + variables: bucket: kestraio prefix: inbox database: default + tasks: - id: list_objects type: io.kestra.plugin.aws.s3.List @@ -12,6 +14,7 @@ tasks: secretKeyId: "{{ secret('AWS_SECRET_ACCESS_KEY') }}" region: "{{ secret('AWS_DEFAULT_REGION') }}" bucket: "{{ vars.bucket }}" + - id: check type: io.kestra.plugin.core.flow.If condition: "{{ outputs.list_objects.objects }}" @@ -23,6 +26,7 @@ tasks: type: io.kestra.plugin.git.Clone url: https://github.com/kestra-io/scripts branch: main + - id: ingest_to_datalake type: io.kestra.plugin.scripts.python.Commands warningOnStdErr: false @@ -35,6 +39,7 @@ tasks: containerImage: ghcr.io/kestra-io/aws:latest commands: - python etl/aws_iceberg_fruit.py + - id: merge_query type: io.kestra.plugin.aws.athena.Query accessKeyId: "{{ secret('AWS_ACCESS_KEY_ID') }}" @@ -51,6 +56,7 @@ tasks: WHEN NOT MATCHED THEN INSERT (id, fruit, berry, update_timestamp) VALUES(r.id, r.fruit, r.berry, current_timestamp); + - id: optimize type: io.kestra.plugin.aws.athena.Query accessKeyId: "{{ secret('AWS_ACCESS_KEY_ID') }}" @@ -60,6 +66,7 @@ tasks: outputLocation: s3://{{ vars.bucket }}/query_results/ query: | OPTIMIZE fruits REWRITE DATA USING BIN_PACK; + - id: move_to_archive type: io.kestra.plugin.aws.cli.AwsCLI accessKeyId: "{{ secret('AWS_ACCESS_KEY_ID') }}" @@ -68,11 +75,13 @@ tasks: commands: - aws s3 mv s3://{{ vars.bucket }}/{{ vars.prefix }}/ s3://{{ vars.bucket }}/archive/{{ vars.prefix }}/ --recursive + triggers: - id: hourly_schedule type: io.kestra.plugin.core.trigger.Schedule disabled: true cron: "@hourly" + extend: title: Ingest data to AWS S3 with Git, Python, Apache Iceberg, AWS Glue and Amazon Athena diff --git a/ingest-to-datalake-inline-python.yaml b/ingest-to-datalake-inline-python.yaml index 
089921a..4b363a2 100644 --- a/ingest-to-datalake-inline-python.yaml +++ b/ingest-to-datalake-inline-python.yaml @@ -1,9 +1,11 @@ id: ingest-to-datalake-inline-python namespace: company.team + variables: bucket: kestraio prefix: inbox database: default + tasks: - id: list_objects type: io.kestra.plugin.aws.s3.List @@ -12,6 +14,7 @@ tasks: secretKeyId: "{{ secret('AWS_SECRET_ACCESS_KEY') }}" region: "{{ secret('AWS_DEFAULT_REGION') }}" bucket: "{{ vars.bucket }}" + - id: check type: io.kestra.plugin.core.flow.If condition: "{{outputs.list_objects.objects}}" @@ -28,51 +31,33 @@ tasks: containerImage: ghcr.io/kestra-io/aws:latest script: > import awswrangler as wr - from kestra import Kestra - # Iceberg table - BUCKET_NAME = "{{ vars.bucket }}" - DATABASE = "{{ vars.database }}" - TABLE = "raw_fruits" - # Iceberg table's location - S3_PATH = f"s3://{BUCKET_NAME}/{TABLE}" - S3_PATH_TMP = f"{S3_PATH}_tmp" - # File to ingest - PREFIX = "{{ vars.prefix }}" - INGEST_S3_KEY_PATH = f"s3://{BUCKET_NAME}/{PREFIX}/" - df = wr.s3.read_csv(INGEST_S3_KEY_PATH) - nr_rows = df.id.nunique() - print(f"Ingesting {nr_rows} rows") Kestra.counter("nr_rows", nr_rows, {"table": TABLE}) - df = df[~df["fruit"].isin(["Blueberry", "Banana"])] - df = df.drop_duplicates(subset=["fruit"], ignore_index=True, keep="first") - wr.catalog.delete_table_if_exists(database=DATABASE, table=TABLE) - wr.athena.to_iceberg( df=df, database=DATABASE, @@ -84,6 +69,7 @@ tasks: ) print(f"New data successfully ingested into {S3_PATH}") + - id: merge_query type: io.kestra.plugin.aws.athena.Query accessKeyId: "{{ secret('AWS_ACCESS_KEY_ID') }}" @@ -100,6 +86,7 @@ tasks: WHEN NOT MATCHED THEN INSERT (id, fruit, berry, update_timestamp) VALUES(r.id, r.fruit, r.berry, current_timestamp); + - id: optimize type: io.kestra.plugin.aws.athena.Query accessKeyId: "{{ secret('AWS_ACCESS_KEY_ID') }}" @@ -109,19 +96,21 @@ tasks: outputLocation: s3://{{ vars.bucket }}/query_results/ query: | OPTIMIZE fruits REWRITE DATA USING BIN_PACK; + - id: move_to_archive type: io.kestra.plugin.aws.cli.AwsCLI accessKeyId: "{{ secret('AWS_ACCESS_KEY_ID') }}" secretKeyId: "{{ secret('AWS_SECRET_ACCESS_KEY') }}" region: "{{ secret('AWS_DEFAULT_REGION') }}" commands: - - aws s3 mv s3://{{ vars.bucket }}/{{ vars.prefix }}/ s3://{{ - vars.bucket }}/archive/{{ vars.prefix }}/ --recursive + - aws s3 mv s3://{{ vars.bucket }}/{{ vars.prefix }}/ s3://{{ vars.bucket }}/archive/{{ vars.prefix }}/ --recursive + triggers: - id: hourly_schedule type: io.kestra.plugin.core.trigger.Schedule cron: "@hourly" disabled: true + extend: title: Ingest data to AWS S3 with Python, Apache Iceberg, AWS Glue and Athena description: >- diff --git a/input-file-upload-gcs.yaml b/input-file-upload-gcs.yaml index 74be228..d7b37d6 100644 --- a/input-file-upload-gcs.yaml +++ b/input-file-upload-gcs.yaml @@ -1,21 +1,23 @@ id: input-file-upload-gcs namespace: company.team + inputs: - id: file type: FILE - id: rename type: STRING + tasks: - id: upload type: io.kestra.plugin.gcp.gcs.Upload from: "{{ inputs.file }}" to: gs://kestra-demo/{{ inputs.rename }} + extend: title: Read a file from inputs and upload it to GCS description: >- This blueprint shows how to read a file from inputs and upload it to GCS. - > Note: Authentication to GCP can be done by setting the `GOOGLE_APPLICATION_CREDENTIALS` variable in environment (via a service account for example). 
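Authentication can also be configured directly on the task instead of through the environment; a minimal sketch of the same upload using the `serviceAccount` property, assuming the service account JSON key is stored as a secret named `GCP_CREDS` (the secret name is illustrative):

```yaml
- id: upload
  type: io.kestra.plugin.gcp.gcs.Upload
  # illustrative secret holding the service account JSON key
  serviceAccount: "{{ secret('GCP_CREDS') }}"
  from: "{{ inputs.file }}"
  to: gs://kestra-demo/{{ inputs.rename }}
```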
diff --git a/input-file.yaml b/input-file.yaml index 7345879..1da9b3c 100644 --- a/input-file.yaml +++ b/input-file.yaml @@ -1,8 +1,10 @@ id: input-file namespace: company.team + inputs: - id: text_file type: FILE + tasks: - id: read_file type: io.kestra.plugin.scripts.shell.Commands @@ -10,6 +12,7 @@ tasks: type: io.kestra.plugin.core.runner.Process commands: - cat "{{ inputs.text_file }}" + extend: title: Read a file from inputs - a parametrized workflow with files input parameters description: This example shows how to read a file from flow inputs. diff --git a/json-from-api-to-mongodb.yaml b/json-from-api-to-mongodb.yaml index ad39bb8..cdbf4e4 100644 --- a/json-from-api-to-mongodb.yaml +++ b/json-from-api-to-mongodb.yaml @@ -1,5 +1,6 @@ id: json-from-api-to-mongodb namespace: company.team + tasks: - id: generate_json type: io.kestra.plugin.scripts.python.Script @@ -20,6 +21,7 @@ tasks: json.dump(data, output_file) Kestra.outputs({'data': data, 'status': response.status_code}) + - id: load_to_mongodb type: io.kestra.plugin.mongodb.Load connection: @@ -27,6 +29,7 @@ tasks: database: local collection: github from: "{{ outputs.generate_json.outputFiles['output.json'] }}" + extend: title: Scrape API in a Python task running in a Docker container and load the JSON document to a MongoDB collection diff --git a/kafka-realtime-trigger.yaml b/kafka-realtime-trigger.yaml index e18a674..6f7cb7f 100644 --- a/kafka-realtime-trigger.yaml +++ b/kafka-realtime-trigger.yaml @@ -1,5 +1,6 @@ id: kafka-realtime-trigger namespace: company.team + tasks: - id: insert_into_mongodb type: io.kestra.plugin.mongodb.InsertOne @@ -14,6 +15,7 @@ tasks: "category": "{{ trigger.value | jq('.product_category') | first }}", "brand": "{{ trigger.value | jq('.brand') | first }}" } + triggers: - id: realtime_trigger type: io.kestra.plugin.kafka.RealtimeTrigger @@ -23,76 +25,57 @@ triggers: serdeProperties: valueDeserializer: JSON groupId: kestraConsumer + extend: title: Use Kafka Realtime Trigger to push events into MongoDB description: > This flow will: - 1. Get - [triggered](https://kestra.io/plugins/plugin-kafka/triggers/io.kestra.plugin.kafka.realtimetrigger) + 1. Get [triggered](https://kestra.io/plugins/plugin-kafka/triggers/io.kestra.plugin.kafka.realtimetrigger) every time the event lands in Kafka 2. The flow will push the data onto a collection in MongoDB using the [InsertOne task](https://kestra.io/plugins/plugin-mongodb/tasks/io.kestra.plugin.mongodb.insertone) - To setup Apache Kafka locally, follow the instructions mentioned in the [official documentation](https://kafka.apache.org/quickstart). 
Once Apache Kafka is installed, you can create the `products` topic, and start producing data into the topic using the following commands: ``` - # Create topic - $ bin/kafka-topics.sh --create --topic products --bootstrap-server localhost:9092 - # Produce data into Kafka topic - $ bin/kafka-console-producer.sh --topic products --bootstrap-server localhost:9092 > {"product_id": 1, "product_name": "streamline turn-key systems", "product_category": "Electronics", "brand": "gomez"} - ``` - To setup MongoDB server locally, you can use the following docker command: - ``` - docker run -d --name my-mongo \ - - -e MONGO_INITDB_ROOT_USERNAME=mongoadmin \ - - -e MONGO_INITDB_ROOT_PASSWORD=secret \ - - -p 27017:27017 mongo - + -e MONGO_INITDB_ROOT_USERNAME=mongoadmin \ + -e MONGO_INITDB_ROOT_PASSWORD=secret \ + -p 27017:27017 mongo ``` - - You can use [MongoDB - Compass](https://www.mongodb.com/products/tools/compass) as the UI client to + You can use [MongoDB Compass](https://www.mongodb.com/products/tools/compass) as the UI client to work with MongoDB. - We are using the product JSON records generated from [products.csv](https://huggingface.co/datasets/kestra/datasets/raw/main/csv/products.csv) - in this blueprint. A sample event that can be produced into Kafka topic - `products` can be: - + in this blueprint. A sample event that can be produced into Kafka topic `products` can be: ``` - {"product_id": 1, "product_name": "streamline turn-key systems", "product_category": "Electronics", "brand": "gomez"} - ``` + tags: - Realtime Trigger - Queue diff --git a/kubernetes-script-runner.yaml b/kubernetes-script-runner.yaml index aac782f..0748fcf 100644 --- a/kubernetes-script-runner.yaml +++ b/kubernetes-script-runner.yaml @@ -1,5 +1,6 @@ id: kubernetes-script-runner namespace: company.team + tasks: - id: send_data type: io.kestra.plugin.scripts.python.Script @@ -34,7 +35,6 @@ tasks: platform = platform.platform() os_arch = f"{sys.platform}/{platform.machine()}" - def print_environment_info(): print(f"Host's network name: {host}") print(f"Python version: {py_version}") @@ -53,9 +53,9 @@ tasks: with open(filename, "w") as json_file: json.dump(env_info, json_file, indent=4) - if __name__ == '__main__': print_environment_info() + extend: title: Run a Python script in a Kubernetes pod description: >- diff --git a/limit-memory.yaml b/limit-memory.yaml index 5c1df88..f3aef5a 100644 --- a/limit-memory.yaml +++ b/limit-memory.yaml @@ -1,5 +1,6 @@ id: limit-memory namespace: company.team + tasks: - id: docker_memory type: io.kestra.plugin.scripts.python.Script @@ -11,6 +12,7 @@ tasks: script: | import time time.sleep(2) + extend: title: Limit Docker container memory to 500MB for a Python script description: The example below will use no more than 500MB of memory for the diff --git a/listen-debezium.yaml b/listen-debezium.yaml index 95da421..c47843e 100644 --- a/listen-debezium.yaml +++ b/listen-debezium.yaml @@ -1,5 +1,6 @@ id: listen-debezium namespace: company.team + tasks: - id: slack_notificaiton type: io.kestra.plugin.notifications.slack.SlackIncomingWebhook @@ -9,9 +10,11 @@ tasks: "channel": "U052JMPLBM3", "text": "{{ trigger.size }} new rows have been added to the database" } + - id: json type: io.kestra.plugin.serdes.json.IonToJson from: "{{ trigger.uris['postgres.order'] }}" + - id: python type: io.kestra.plugin.scripts.python.Script script: | @@ -21,6 +24,7 @@ tasks: data = json.load(fopen) print(data) + triggers: - id: listen_debezium type: io.kestra.plugin.debezium.postgres.Trigger @@ -33,6 +37,7 @@ 
triggers: snapshotMode: INITIAL format: INLINE interval: PT30S + extend: title: Use Debezium to trigger a flow whenever new entries hit a Postgres database, then send notification to Slack and process data in Python @@ -44,16 +49,11 @@ extend: notification through Slack with the number of rows ingested and then execute a Python script that read the corresponding data in json. - - This blueprint can be reproduced with the following `docker-compose.yml` setup - ``` - services: - db: image: debezium/postgres:latest restart: always @@ -67,11 +67,11 @@ extend: restart: always ports: - 8082:8080 - ``` + ``` - You can access localhost:8082 to create and edit databases or tables via the adminer interface. The database is accessible on `5433` port. + You can access localhost:8082 to create and edit databases or tables via the adminer interface. The database is accessible on `5433` port. - Note that depending of your database installation, you might need to change the `pluginName` property of the debezium plugin. Other options can be seen in corresponding documentation. + Note that depending of your database installation, you might need to change the `pluginName` property of the debezium plugin. Other options can be seen in corresponding documentation. tags: - Postgres - Trigger diff --git a/load-multiple-csv-files-into-excel.yaml b/load-multiple-csv-files-into-excel.yaml index e5c677a..2cb33ed 100644 --- a/load-multiple-csv-files-into-excel.yaml +++ b/load-multiple-csv-files-into-excel.yaml @@ -1,23 +1,29 @@ id: load-multiple-csv-files-into-excel namespace: company.team + tasks: - id: dataset1 type: io.kestra.plugin.core.http.Download uri: https://huggingface.co/datasets/kestra/datasets/raw/main/csv/products.csv + - id: dataset2 type: io.kestra.plugin.core.http.Download uri: https://huggingface.co/datasets/kestra/datasets/raw/main/csv/fruit.csv + - id: convert1 type: io.kestra.plugin.serdes.csv.CsvToIon from: "{{ outputs.dataset1.uri }}" + - id: convert2 type: io.kestra.plugin.serdes.csv.CsvToIon from: "{{ outputs.dataset2.uri }}" + - id: write_to_excel type: io.kestra.plugin.serdes.excel.IonToExcel from: Sheet_1: "{{ outputs.convert1.uri }}" Sheet_2: "{{ outputs.convert2.uri }}" + extend: title: Load multiple CSV files from an HTTP API into Excel Sheets description: > @@ -26,20 +32,14 @@ extend: `IonToExcel` plugins to download the CSV files, convert them to Ion format, and write them to an Excel file. - The flow is composed of the following tasks: 1. Download the first CSV file from an HTTP API. - 2. Download the second CSV file from an HTTP API. - 3. Convert the first CSV file to Ion format. - 4. Convert the second CSV file to Ion format. - 5. Write the Ion data to an Excel file with two sheets. - The flow can be used to download multiple CSV files from an HTTP API and load them into separate sheets in an Excel file. It's useful if you need to get data from external sources and store them in Excel format. 
diff --git a/load-pokemon.yaml b/load-pokemon.yaml index 6f68076..c101c6e 100644 --- a/load-pokemon.yaml +++ b/load-pokemon.yaml @@ -1,14 +1,17 @@ id: load-pokemon namespace: company.team + inputs: - id: pokemon type: STRING defaults: psyduck + tasks: - id: fetch_pokemon type: io.kestra.plugin.core.http.Request uri: https://pokeapi.co/api/v2/pokemon/{{ inputs.pokemon }} method: GET + - id: load type: io.kestra.plugin.mongodb.InsertOne connection: @@ -16,6 +19,7 @@ tasks: database: local collection: pokemon document: "{{ outputs.fetch_pokemon.body }}" + extend: title: Extract JSON data from an API and load it as a document to MongoDB description: >- diff --git a/load-to-cloud-storage.yaml b/load-to-cloud-storage.yaml index b588344..f92a3be 100644 --- a/load-to-cloud-storage.yaml +++ b/load-to-cloud-storage.yaml @@ -1,20 +1,22 @@ id: load-to-cloud-storage namespace: company.team + tasks: - id: data type: io.kestra.plugin.core.http.Download uri: https://huggingface.co/datasets/kestra/datasets/raw/main/csv/orders.csv + - id: cloud_storage type: io.kestra.plugin.gcp.gcs.Upload from: "{{ outputs.data.uri }}" to: gs://kestra-demo/data.csv + extend: title: Download data and upload to Google Cloud Storage description: >- This blueprint shows how to download a CSV file via http Download, and upload it to GCS. - > Note: Authentication to GCP can be done by setting the `GOOGLE_APPLICATION_CREDENTIALS` variable in environment (via a service account for example). diff --git a/log-flow.yaml b/log-flow.yaml index 7a6c590..d5eb136 100644 --- a/log-flow.yaml +++ b/log-flow.yaml @@ -1,9 +1,11 @@ id: log-flow namespace: company.team + tasks: - id: log type: io.kestra.plugin.core.log.Log message: Hello world! + extend: title: Log content in the console description: A simple example to show how to display message in the console. diff --git a/loguru.yaml b/loguru.yaml index d8d98ed..ae5a8cf 100644 --- a/loguru.yaml +++ b/loguru.yaml @@ -1,9 +1,11 @@ id: loguru namespace: company.team + inputs: - id: nr_logs type: INT defaults: 100 + tasks: - id: reproducer type: io.kestra.plugin.scripts.python.Script @@ -13,22 +15,14 @@ tasks: containerImage: ghcr.io/kestra-io/pydata:latest script: > from loguru import logger - from faker import Faker - import time - import sys - logger.remove() - logger.add(sys.stdout, level="INFO") - logger.add(sys.stderr, level="WARNING") - - def generate_logs(fake, num_logs): logger.debug("This message will not show up as the log level is set to INFO") logger.warning("Starting to generate log messages") @@ -38,10 +32,10 @@ tasks: time.sleep(0.01) logger.warning("Finished generating log messages") - if __name__ == "__main__": faker_ = Faker() generate_logs(faker_, int("{{ inputs.nr_logs }}")) + extend: title: Logging configuration in a Python script using Loguru description: >- @@ -50,7 +44,6 @@ extend: by default, 100 random log messages, but this number of logs can be changed at runtime using the input parameter `nr_logs`. - - The `warningOnStdErr` property is set to `false` to prevent the `Script` task from failing when the `logger.warning` method is used. @@ -61,7 +54,6 @@ extend: - The `script` property contains the Python code that will be executed by the `Script` task. - The log level is set to `INFO` in the `Script` task. Therefore, the `logger.debug` message will NOT show up in the logs. The `logger.warning` messages will be translated to WARN-level logs in Kestra. 
The `logger.info` diff --git a/manage-aiven-resources-from-cli.yaml b/manage-aiven-resources-from-cli.yaml index 04d6ba8..38990a5 100644 --- a/manage-aiven-resources-from-cli.yaml +++ b/manage-aiven-resources-from-cli.yaml @@ -1,5 +1,6 @@ id: manage-aiven-resources-from-cli namespace: company.team + tasks: - id: cli type: io.kestra.plugin.scripts.python.Commands @@ -17,10 +18,12 @@ tasks: --power-on env: AVN_AUTH_TOKEN: "{{ secret('AVN_AUTH_TOKEN') }}" + triggers: - id: every_morning type: io.kestra.plugin.core.trigger.Schedule cron: 0 9 * * * + extend: title: Manage Aiven resources from the CLI — start and stop services or databases on schedule @@ -32,25 +35,20 @@ extend: API token. It's recommended to use Secrets to store sensitive data such as API tokens. - Once you've configured the Aiven secret, you can reproduce this flow without any changes. - The first command is great to test the setup — the command will just list your Aiven projects. However, there is a lot more you can do with the Aiven CLI. Check out the [Aiven CLI guide](https://aiven.io/developer/aiven-cmdline) for more information. - For example, you can use it to start and stop specific services in your Aiven projects using scheduled flows in Kestra. This is useful if you want to save money by stopping your services when you don't need them, e.g. at night or during the weekend. - - You can also use the CLI to create and delete services or databases on - demand. + You can also use the CLI to create and delete services or databases on demand. tags: - CLI - Python diff --git a/metrics-from-shell-commands.yaml b/metrics-from-shell-commands.yaml index d9330de..7ebb60c 100644 --- a/metrics-from-shell-commands.yaml +++ b/metrics-from-shell-commands.yaml @@ -1,16 +1,17 @@ id: metrics-from-shell-commands namespace: company.team + tasks: - id: process type: io.kestra.plugin.scripts.shell.Commands commands: - echo '::{"metrics":[{"name":"count","type":"counter","value":1}]}::' + extend: title: Expose custom metrics from a Shell script description: > This blueprint shows how to expose metrics within a Shell script. - Metrics are intended to track custom numeric (metric `type: counter`) or duration (metric `type: timer`) attributes that you may want to visualize across task runs and flow executions. diff --git a/microservices-and-apis.yaml b/microservices-and-apis.yaml index 31c4e84..ee6d781 100644 --- a/microservices-and-apis.yaml +++ b/microservices-and-apis.yaml @@ -1,6 +1,7 @@ id: microservices-and-apis namespace: tutorial description: Microservices and APIs + inputs: - id: server_uri type: URI @@ -8,6 +9,7 @@ inputs: - id: slack_webhook_uri type: URI defaults: https://reqres.in/api/slack + tasks: - id: http_status_check type: io.kestra.plugin.core.flow.AllowFailure @@ -15,6 +17,7 @@ tasks: - id: http_request type: io.kestra.plugin.core.http.Request uri: "{{ inputs.server_uri }}" + - id: check_status type: io.kestra.plugin.core.flow.If condition: "{{ outputs.http_request.code != 200 }}" @@ -22,6 +25,7 @@ tasks: - id: unhealthy type: io.kestra.plugin.core.log.Log message: Server is unhealthy! Response {{ outputs.http_request.body }} + - id: send_slack_alert type: io.kestra.plugin.notifications.slack.SlackIncomingWebhook url: "{{ inputs.slack_webhook_uri }}" @@ -34,6 +38,7 @@ tasks: - id: healthy type: io.kestra.plugin.core.log.Log message: Everything is fine! 
+ errors: - id: server_unreachable type: io.kestra.plugin.notifications.slack.SlackIncomingWebhook @@ -43,25 +48,23 @@ tasks: "channel": "#alerts", "text": "The server {{ inputs.server_uri }} is unreachable!" } + triggers: - id: daily type: io.kestra.plugin.core.trigger.Schedule disabled: true cron: 0 9 * * * + extend: title: Getting started with Kestra — a Microservices and APIs workflow example description: >- This flow is a simple example of a microservices and APIs use case. It checks the health of a server and sends a Slack alert if the server is down. - The flow has two tasks: - 1. The first task checks the health of a server. - 2. The second task sends a Slack alert if the server is down. - The flow also has a trigger that runs the flow daily at 9:00 AM to check the server's health regularly. tags: diff --git a/monthly-sales-report.yaml b/monthly-sales-report.yaml index a7a9c01..ad01a44 100644 --- a/monthly-sales-report.yaml +++ b/monthly-sales-report.yaml @@ -1,7 +1,9 @@ id: monthly-sales-report namespace: company.team + variables: bucket: kestraio + tasks: - id: raw_data_to_s3 type: io.kestra.plugin.scripts.python.Script @@ -14,16 +16,11 @@ tasks: AWS_DEFAULT_REGION: "{{ secret('AWS_DEFAULT_REGION') }}" script: > import requests - import boto3 - from kestra import Kestra - BUCKET = "{{ vars.bucket }}" - - def extract_and_upload(file): url = f"https://huggingface.co/datasets/kestra/datasets/blob/main/{file}" @@ -37,6 +34,7 @@ tasks: filename = f"monthly_orders/2023_{str(month).zfill(2)}.csv" extract_and_upload(filename) Kestra.outputs({f"{filename}": f"s3://{BUCKET}/{filename}"}) + - id: query type: io.kestra.plugin.jdbc.duckdb.Query sql: | @@ -49,11 +47,13 @@ tasks: FROM read_csv_auto('s3://kestraio/monthly_orders/*.csv', FILENAME = 1) GROUP BY 1 ORDER BY 2 desc; - store: true + fetchType: STORE timeout: PT30S + - id: csv type: io.kestra.plugin.serdes.csv.IonToCsv from: "{{ outputs.query.uri }}" + - id: email type: io.kestra.plugin.notifications.mail.MailSend subject: The monthly sales report is ready @@ -71,10 +71,12 @@ tasks: Please find attached the current sales report.

Best regards,
Data Team + triggers: - id: monthly type: io.kestra.plugin.core.trigger.Schedule cron: 0 9 1 * * + extend: title: Upload data to S3 in Python using boto3, transform it in a SQL query with DuckDB and send a CSV report via email every first day of the month @@ -82,19 +84,14 @@ extend: Replace the S3 bucket `kestraio` with your bucket name to reproduce the example. - This flow assumes: - - an in-process DuckDB - - AWS credentials with S3 access permissions stored using Kestra Secret. - If you use [MotherDuck](https://motherduck.com/) and [MotherDuck's managed S3 secrets](https://motherduck.com/docs/authenticating-to-s3), you can replace the `query` task with the following simpler configuration: - ```yaml - id: query type: io.kestra.plugin.jdbc.duckdb.Query @@ -103,7 +100,7 @@ extend: FROM read_csv_auto('s3://{{vars.bucket}}/monthly_orders/*.csv', FILENAME = 1) GROUP BY 1 ORDER BY 2 desc; - store: true + fetchType: STORE timeout: PT30S url: "jdbc:duckdb:md:my_db?motherduck_token={{ secret('MOTHERDUCK_TOKEN') }}" ``` diff --git a/motherduck.yaml b/motherduck.yaml index dad0429..e73354d 100644 --- a/motherduck.yaml +++ b/motherduck.yaml @@ -8,7 +8,7 @@ tasks: FROM sample_data.hn.hacker_news GROUP BY by ORDER BY nr_comments DESC; - store: true + fetchType: STORE - id: csv type: io.kestra.plugin.serdes.csv.IonToCsv from: "{{ outputs.query.uri }}" diff --git a/new-shell.yaml b/new-shell.yaml index 7cbf05b..6811ebb 100644 --- a/new-shell.yaml +++ b/new-shell.yaml @@ -1,8 +1,10 @@ id: new-shell namespace: company.team + variables: project_id: myProjectId region: eu-west-2 + tasks: - id: shell type: io.kestra.plugin.scripts.shell.Commands @@ -13,25 +15,20 @@ tasks: serviceAccount: "{{ secret('GOOGLE_SA') }}" commands: - echo "Hello World" + extend: title: Run a Shell script on Google Cloud with Cloud Run description: >- This flow runs a simple Shell command in a Cloud Run container. - The `containerImage` property is required because Cloud Run executes tasks as containers. You can use any image from a public or private registry. 
- Your service account needs to have the following IAM roles attached to use the service: - - Cloud Run Developer - - Logs Viewer - - Storage Admin (to upload files to GCS and download files from GCS) - - Owner/Editor of the Compute Engine default service account (to be able to provision compute resources for the Cloud Run container) tags: diff --git a/node-custom-package.yaml b/node-custom-package.yaml index 8dbe8bb..b5bcd11 100644 --- a/node-custom-package.yaml +++ b/node-custom-package.yaml @@ -1,5 +1,6 @@ id: node-custom-package namespace: company.team + tasks: - id: script type: io.kestra.plugin.scripts.node.Script @@ -18,6 +19,7 @@ tasks: script: | import colors from 'colors'; console.log(colors.red("Hello")); + extend: title: Install custom Node packages from package.json before running a Node.js script diff --git a/notify-about-github-stars-via-slack.yaml b/notify-about-github-stars-via-slack.yaml index 290b04e..7f83c7e 100644 --- a/notify-about-github-stars-via-slack.yaml +++ b/notify-about-github-stars-via-slack.yaml @@ -1,9 +1,11 @@ id: notify-about-github-stars-via-slack namespace: company.team + inputs: - id: repo type: STRING defaults: kestra-io/kestra + tasks: - id: api_query type: io.kestra.plugin.core.http.Request @@ -11,17 +13,17 @@ tasks: headers: User-Agent: kestra uri: https://api.github.com/repos/{{inputs.repo}} + - id: get_stars type: io.kestra.plugin.core.log.Log - message: ✨✨✨ Total GitHub stars {{json(outputs.api_query.body).stargazers_count - }} ✨✨✨ + message: ✨✨✨ Total GitHub stars {{ json(outputs.api_query.body).stargazers_count }} ✨✨✨ + extend: title: Extract field from JSON object in API call and pass to subsequent task description: >+ A common use-case may be to retrieve a specific field from a JSON payload in an API request and use that further downstream. - In this simple example we will query the number of stars for a given GitHub repo and then output it as a message. diff --git a/on-demand-cluster-job.yaml b/on-demand-cluster-job.yaml index 1cd4bc7..f036d6c 100644 --- a/on-demand-cluster-job.yaml +++ b/on-demand-cluster-job.yaml @@ -1,5 +1,6 @@ id: on-demand-cluster-job namespace: company.team + tasks: - id: create_cluster type: io.kestra.plugin.databricks.cluster.CreateCluster @@ -10,6 +11,7 @@ tasks: nodeTypeId: n2-highmem-4 numWorkers: 1 sparkVersion: 13.0.x-scala2.12 + - id: allow_failure type: io.kestra.plugin.core.flow.AllowFailure tasks: @@ -25,12 +27,14 @@ tasks: pythonFile: /Shared/hello.py sparkPythonTaskSource: WORKSPACE waitForCompletion: PT5M + - id: delete_cluster type: io.kestra.plugin.databricks.cluster.DeleteCluster authentication: token: "{{ secret('DATABRICKS_TOKEN') }}" host: "{{ secret('DATABRICKS_HOST') }}" clusterId: "{{ outputs.create_cluster.clusterId }}" + extend: title: Run a task on an on-demand Databricks cluster description: >- @@ -44,7 +48,6 @@ extend: minutes (as declared on the `waitForCompletion` property) for the task to complete. - Even if the job fails, the `AllowFailure` tasks ensures that Databricks cluster will be deleted in the end. 
tags: diff --git a/on-failure-alert.yaml b/on-failure-alert.yaml index 44054a7..6f91011 100644 --- a/on-failure-alert.yaml +++ b/on-failure-alert.yaml @@ -1,5 +1,6 @@ id: on-failure-alert namespace: company.team + tasks: - id: fail type: io.kestra.plugin.scripts.shell.Commands @@ -7,6 +8,7 @@ tasks: type: io.kestra.plugin.core.runner.Process commands: - exit 1 + errors: - id: slack type: io.kestra.plugin.notifications.slack.SlackIncomingWebhook @@ -16,6 +18,7 @@ errors: "channel": "#alerts", "text": "Failure alert for flow {{ flow.namespace }}.{{ flow.id }} with ID {{ execution.id }}" } + extend: title: "Error handling: send Slack alert on failure" description: This flow will fail and the `errors` section declares tasks that diff --git a/openai-dall-e-create-image.yaml b/openai-dall-e-create-image.yaml index 16944f0..29fbf88 100644 --- a/openai-dall-e-create-image.yaml +++ b/openai-dall-e-create-image.yaml @@ -1,5 +1,6 @@ id: openai-dall-e-create-image namespace: company.team + tasks: - id: puppy type: io.kestra.plugin.openai.CreateImage @@ -7,6 +8,7 @@ tasks: n: 1 download: true prompt: the cutest little happy smiling puppy + extend: title: Create an image using OpenAI's DALL-E description: >- @@ -14,10 +16,8 @@ extend: you set the download attribute to `true`, the image will be available for download from the Outputs tab on the Executions page. - Example result: - ![dog](https://storage.googleapis.com/strapi--kestra-prd/dog_bf751be6a4/dog_bf751be6a4.png) tags: - AI diff --git a/openai.yaml b/openai.yaml index 30fc11f..9fddb76 100644 --- a/openai.yaml +++ b/openai.yaml @@ -1,21 +1,23 @@ id: openai namespace: company.team + tasks: - id: prompt type: io.kestra.plugin.openai.ChatCompletion apiKey: "{{ secret('OPENAI_API_KEY') }}" model: gpt-4 prompt: Explain in one sentence why data engineers build data pipelines + - id: use_output type: io.kestra.plugin.core.log.Log message: "{{ outputs.prompt.choices | jq('.[].message.content') | first }}" + extend: title: Send a prompt to OpenAI's ChatCompletion API description: >- This flow will send a prompt to OpenAI. You can select the desired model and additional configuration such as temperature. - The next task shows how you can retrieve the message content from the API response. tags: diff --git a/opsgenie-notify-on-failure.yaml b/opsgenie-notify-on-failure.yaml index b6a5ee3..6bded87 100644 --- a/opsgenie-notify-on-failure.yaml +++ b/opsgenie-notify-on-failure.yaml @@ -1,5 +1,6 @@ id: opsgenie-notify-on-failure namespace: company.team + tasks: - id: send_notification type: io.kestra.plugin.notifications.opsgenie.OpsgenieExecution @@ -23,6 +24,7 @@ tasks: - Execution authorizationToken: sampleAuthorizationToken executionId: "{{ trigger.executionId }}" + triggers: - id: on_failure type: io.kestra.plugin.core.trigger.Flow @@ -34,40 +36,33 @@ triggers: - type: io.kestra.plugin.core.condition.ExecutionNamespaceCondition namespace: company comparison: PREFIX + extend: title: Send a notification via Opsgenie when a workflow fails description: >- This system flow will send a notification via Opsgenie anytime a workflow in a `company` namespace (or any nested child namespace) fails. - Using this pattern, you can send notifications for Kestra workflow execution failures alongside other notifications. - You can customize that system flow by modifying the task, adding more tasks to the flow or adjusting the trigger conditions. Read more about that pattern in the [Administrator Guide](https://kestra.io/docs/administrator-guide/monitoring). 
- Let's create a flow in the namespace with prefix `company` that will always fail. - ```yaml - id: failure_flow - namespace: company.team - tasks: - id: always_fails type: io.kestra.plugin.core.execution.Fail ``` - Whenever you run the `failure_flow`, it will trigger an execution of the `opsgenie_notify_on_failure` flow. As a result, a notification will be sent using Opsgenie so that prompt action can be taken. diff --git a/outputs-from-shell-commands.yaml b/outputs-from-shell-commands.yaml index 9e54cda..616c59b 100644 --- a/outputs-from-shell-commands.yaml +++ b/outputs-from-shell-commands.yaml @@ -1,19 +1,21 @@ id: outputs-from-shell-commands namespace: company.team + tasks: - id: process type: io.kestra.plugin.scripts.shell.Commands commands: - echo '::{"outputs":{"test":"value","int":2,"bool":true,"float":3.65}}::' + - id: return type: io.kestra.plugin.core.debug.Return format: "{{ outputs.process.vars.test }}" + extend: title: Create custom outputs from a Shell script description: >- This blueprint shows how to expose custom outputs from a shell script. - The `::{"outputs":{"test":"value"}}::` allow to expose your data in task output. Those outputs are accessible through the `{{ outputs..vars. }}` command in other tasks. diff --git a/papermill-notebook.yaml b/papermill-notebook.yaml index d950bae..b731887 100644 --- a/papermill-notebook.yaml +++ b/papermill-notebook.yaml @@ -1,5 +1,6 @@ id: papermill-notebook namespace: company.team + tasks: - id: python type: io.kestra.plugin.scripts.python.Commands @@ -12,13 +13,13 @@ tasks: - papermill src/example.ipynb.py output.ipynb -k python3.12.0 outputFiles: - output.ipynb + extend: title: Run a Papermill notebook description: >- This blueprint shows how to execute a Jupyter Notebook within a Kestra flow using the Papermill library. - Here we use Namespace Files where we created the `src/example.ipynb.py` notebook. We expose the outputs of the notebook execution into the `output.ipynb` file. diff --git a/parallel-files.yaml b/parallel-files.yaml index e011c01..1e0a727 100644 --- a/parallel-files.yaml +++ b/parallel-files.yaml @@ -1,5 +1,6 @@ id: parallel-files namespace: company.team + tasks: - id: bash type: io.kestra.plugin.scripts.shell.Commands @@ -13,6 +14,7 @@ tasks: - echo "Hello from 2" >> out/output2.txt - echo "Hello from 3" >> out/output3.txt - echo "Hello from 4" >> out/output4.txt + - id: each type: io.kestra.plugin.core.flow.EachParallel value: "{{ outputs.bash.outputFiles | jq('.[]') }}" @@ -20,18 +22,19 @@ tasks: - id: path type: io.kestra.plugin.core.debug.Return format: "{{ taskrun.value }}" + - id: contents type: io.kestra.plugin.scripts.shell.Commands taskRunner: type: io.kestra.plugin.core.runner.Process commands: - cat "{{ taskrun.value }}" + extend: title: Process files in parallel description: > This example demonstrates how to process files in parallel. - In the `bash` task, we generate multiple files, and store them in the internal storage. @@ -39,7 +42,6 @@ extend: tasks `path` and `contents` run for each of the 4 output files, resulting in 8 parallel task runs. - Instead of the `bash` script, you may have a Python/R/Node.js script that generates such files. 
tags: diff --git a/parallel-python.yaml b/parallel-python.yaml index 3b7e4d0..e31104e 100644 --- a/parallel-python.yaml +++ b/parallel-python.yaml @@ -1,5 +1,6 @@ id: parallel-python namespace: company.team + tasks: - id: parallel type: io.kestra.plugin.core.flow.EachParallel @@ -23,6 +24,7 @@ tasks: containerImage: ghcr.io/kestra-io/pydata:latest commands: - python parametrized.py --num {{ taskrun.value }} + extend: title: Add a parametrized Python script as a Namespace File and run it in parallel in Docker containers @@ -31,30 +33,18 @@ extend: in parallel with different parameter `values` using a Python script added as a Namespace File. - Here is the content of the `parametrized.py` script: - ```python - import argparse - parser = argparse.ArgumentParser() - - parser.add_argument("--num", type=int, default=42, help="Enter an integer") - - args = parser.parse_args() - result = args.num * 2 - print(result) - ``` - You can add that file directly from the embedded Visual Studio Code Editor in the Kestra UI. tags: diff --git a/parallel-sequences.yaml b/parallel-sequences.yaml index 0de1e6d..579b316 100644 --- a/parallel-sequences.yaml +++ b/parallel-sequences.yaml @@ -1,5 +1,6 @@ id: parallel-sequences namespace: company.team + tasks: - id: parallel type: io.kestra.plugin.core.flow.Parallel @@ -10,24 +11,27 @@ tasks: - id: task1 type: io.kestra.plugin.core.debug.Return format: "{{ task.id }}" + - id: task2 type: io.kestra.plugin.core.debug.Return format: "{{ task.id }}" + - id: sequence2 type: io.kestra.plugin.core.flow.Sequential tasks: - id: task3 type: io.kestra.plugin.core.debug.Return format: "{{ task.id }}" + - id: task4 type: io.kestra.plugin.core.debug.Return format: "{{ task.id }}" + extend: title: Run two sequences in parallel description: >- This blueprint shows how to run two independent task sequences in parallel. - The two sequences, sequence1 and sequence2, start in parallel. The tasks in these sequences however run one after the other serially, i.e. task2 starts after task1 finishes, and task4 starts after task3 finishes. diff --git a/parallel-tasks.yaml b/parallel-tasks.yaml index c73a221..89d87aa 100644 --- a/parallel-tasks.yaml +++ b/parallel-tasks.yaml @@ -1,5 +1,6 @@ id: parallel-tasks namespace: company.team + tasks: - id: parallel type: io.kestra.plugin.core.flow.Parallel @@ -7,9 +8,11 @@ tasks: - id: task1 type: io.kestra.plugin.core.debug.Return format: "{{ task.id }}" + - id: task2 type: io.kestra.plugin.core.debug.Return format: "{{ task.id }}" + extend: title: Run two tasks in parallel description: This blueprints show how to run two tasks in parallel. 
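The number of task runs executing at the same time can be capped with the `concurrent` property; a minimal sketch, assuming an illustrative limit of two concurrent task runs:

```yaml
id: parallel-tasks-limited
namespace: company.team

tasks:
  - id: parallel
    type: io.kestra.plugin.core.flow.Parallel
    # illustrative cap: at most two task runs execute at once
    concurrent: 2
    tasks:
      - id: task1
        type: io.kestra.plugin.core.debug.Return
        format: "{{ task.id }}"
      - id: task2
        type: io.kestra.plugin.core.debug.Return
        format: "{{ task.id }}"
      - id: task3
        type: io.kestra.plugin.core.debug.Return
        format: "{{ task.id }}"
```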
diff --git a/parallelSubflows.yaml b/parallelSubflows.yaml index 772ac28..dedd298 100644 --- a/parallelSubflows.yaml +++ b/parallelSubflows.yaml @@ -1,5 +1,6 @@ id: parallelSubflows namespace: company.team + tasks: - id: parallel type: io.kestra.plugin.core.flow.Parallel @@ -8,76 +9,68 @@ tasks: type: io.kestra.plugin.core.flow.Subflow flowId: flow1 namespace: company.team + - id: flow2 type: io.kestra.plugin.core.flow.Subflow flowId: flow2 namespace: company.team + - id: flow3 type: io.kestra.plugin.core.flow.Subflow flowId: flow3 namespace: company.team + pluginDefaults: - type: io.kestra.plugin.core.flow.Flow values: namespace: company.team wait: true transmitFailed: true + extend: title: Run multiple subflows in parallel and wait for their completion - use taskDefaults to avoid boilerplate code description: >- Add the child flows first: - First flow: ```yaml - id: flow1 - namespace: company.team tasks: - - - id: get - type: io.kestra.plugin.core.debug.Return - format: hi from {{ flow.id }} + - id: get + type: io.kestra.plugin.core.debug.Return + format: hi from {{ flow.id }} ``` - Second flow: ```yaml - id: flow2 - namespace: company.team tasks: - - - id: get - type: io.kestra.plugin.core.debug.Return - format: hi from {{ flow.id }} + - id: get + type: io.kestra.plugin.core.debug.Return + format: hi from {{ flow.id }} ``` - Third flow: ```yaml - id: flow3 - namespace: company.team tasks: - - - id: get - type: io.kestra.plugin.core.debug.Return - format: hi from {{ flow.id }} + - id: get + type: io.kestra.plugin.core.debug.Return + format: hi from {{ flow.id }} ``` - - Then run the parent flow `parallelSubflows` to trigger multiple subflows in - parallel. + + Then run the parent flow `parallelSubflows` to trigger multiple subflows in parallel. + tags: - Parallel ee: false diff --git a/parametrized-flow-with-multiple-schedules.yaml b/parametrized-flow-with-multiple-schedules.yaml index 3fd228b..a17cc84 100644 --- a/parametrized-flow-with-multiple-schedules.yaml +++ b/parametrized-flow-with-multiple-schedules.yaml @@ -1,14 +1,17 @@ id: parametrized-flow-with-multiple-schedules namespace: company.team + inputs: - id: user type: STRING defaults: Data Engineer required: false + tasks: - id: hello type: io.kestra.plugin.core.log.Log message: Hello {{ inputs.user }} from Kestra! + triggers: - id: quarter_hourly type: io.kestra.plugin.core.trigger.Schedule @@ -16,6 +19,7 @@ triggers: cron: "*/15 * * * *" inputs: name: user + - id: every_minute type: io.kestra.plugin.core.trigger.Schedule disabled: true @@ -23,6 +27,7 @@ triggers: inputs: name: user value: custom value + extend: title: Parametrized workflow with multiple schedules description: > @@ -30,12 +35,9 @@ extend: console. The flow has two scheduled attached to it: - - one that runs every 15 minutes with the default input parameter value - - another one that runs every 1 minute with a custom input parameter value - Note that both schedules are currently disabled. 
To start scheduling the flow, set the `disabled` property to `false` or diff --git a/parquet-duckdb-to-excel.yaml b/parquet-duckdb-to-excel.yaml index 479e985..76c1b4f 100644 --- a/parquet-duckdb-to-excel.yaml +++ b/parquet-duckdb-to-excel.yaml @@ -1,27 +1,24 @@ id: parquet-duckdb-to-excel namespace: company.team + tasks: - id: parquet_duckdb type: io.kestra.plugin.jdbc.duckdb.Query sql: > INSTALL parquet; - LOAD parquet; - INSTALL httpfs; - LOAD httpfs; - SELECT * - FROM read_parquet('https://huggingface.co/datasets/kestra/datasets/resolve/main/jaffle-large/raw_items.parquet?download=true') - LIMIT 1000000; - store: true + fetchType: STORE + - id: duckdb_to_excel type: io.kestra.plugin.serdes.excel.IonToExcel from: "{{ outputs.parquet_duckdb.uri }}" + extend: title: Extract and transform a Parquet file using DuckDB and export it in Excel format diff --git a/parse-image-metadata-using-apache-tika.yaml b/parse-image-metadata-using-apache-tika.yaml index 9a66175..658a61a 100644 --- a/parse-image-metadata-using-apache-tika.yaml +++ b/parse-image-metadata-using-apache-tika.yaml @@ -1,9 +1,11 @@ id: parse-image-metadata-using-apache-tika namespace: company.team + tasks: - id: get_image type: io.kestra.plugin.core.http.Download uri: https://kestra.io/blogs/2023-05-31-beginner-guide-kestra.jpg + - id: tika type: io.kestra.plugin.tika.Parse from: "{{ outputs.get_image.uri }}" @@ -11,6 +13,7 @@ tasks: contentType: TEXT ocrOptions: strategy: OCR_AND_TEXT_EXTRACTION + extend: title: Extract image metadata using Apache Tika description: This flow extracts metadata from an image using Apache Tika. diff --git a/parse-pdf.yaml b/parse-pdf.yaml index bd25f72..a6c10ae 100644 --- a/parse-pdf.yaml +++ b/parse-pdf.yaml @@ -1,17 +1,21 @@ id: parse-pdf namespace: company.team + tasks: - id: download_pdf type: io.kestra.plugin.core.http.Download uri: https://huggingface.co/datasets/kestra/datasets/resolve/main/pdf/app_store.pdf + - id: parse_text type: io.kestra.plugin.tika.Parse from: "{{ outputs.download_pdf.uri }}" contentType: TEXT store: false + - id: log_extracted_text type: io.kestra.plugin.core.log.Log message: "{{ outputs.parse_text.result.content }}" + extend: title: Download a PDF file and extract text from it using Apache Tika description: |- diff --git a/parse-twitter-json-payload.yaml b/parse-twitter-json-payload.yaml index f612c56..97667e8 100644 --- a/parse-twitter-json-payload.yaml +++ b/parse-twitter-json-payload.yaml @@ -1,5 +1,6 @@ id: parse-twitter-json-payload namespace: company.team + inputs: - id: json type: JSON @@ -13,27 +14,32 @@ inputs: "previous_token": "77qp8" } } + tasks: - id: jq_filter type: io.kestra.plugin.core.log.Log message: | {{ inputs.json | jq('.meta | has("next_token")') | first }} + - id: contains type: io.kestra.plugin.core.debug.Return format: | - {{inputs.json["meta"] contains "next_token"}} + {{ inputs.json["meta"] contains "next_token" }} + - id: contains_if_else_operator type: io.kestra.plugin.core.debug.Return format: | {% if inputs.json["meta"] contains "next_token" %} true {% else %} false {% endif %} + - id: is_not_null_operator type: io.kestra.plugin.core.debug.Return format: | {% if inputs.json["meta"]["next_token"] is not null %} true {% endif %} + extend: title: Check if a given key exists in a JSON REST API payload description: >- @@ -42,7 +48,6 @@ extend: accomplishing that — a JQuery filter, a `contains` operator, and an `is not null` operator. - Check the [Expressions](https://kestra.io/docs/concepts/expression) documentation for more examples. 
tags: diff --git a/pass-data-between-subflows.yaml b/pass-data-between-subflows.yaml index 4f752f1..d3ef0b6 100644 --- a/pass-data-between-subflows.yaml +++ b/pass-data-between-subflows.yaml @@ -1,31 +1,30 @@ id: pass-data-between-subflows namespace: company.team + tasks: - id: call_child_flow type: io.kestra.plugin.core.flow.Subflow namespace: company.team flowId: child_flow wait: true + - id: log type: io.kestra.plugin.core.log.Log message: "{{ outputs.call_child_flow.outputs.data_from_child_flow }}" + extend: title: Pass data between subflows — use outputs from the child flow in a parent flow description: >- First, create a child flow: - ```yaml - id: child_flow - namespace: company.team tasks: - id: return_data type: io.kestra.plugin.core.debug.Return format: this is a secret message returned from {{ flow.id }} - outputs: - id: data_from_child_flow @@ -33,7 +32,6 @@ extend: value: "{{ outputs.return_data.value }}" ``` - Then, you can run this parent flow that will retrieve data from the subflow and store it under a specified key. In this example, the subflow uses the key `data_from_child_flow`. diff --git a/pass-data-between-tasks.yaml b/pass-data-between-tasks.yaml index eacf78a..9bdf157 100644 --- a/pass-data-between-tasks.yaml +++ b/pass-data-between-tasks.yaml @@ -1,9 +1,11 @@ id: pass-data-between-tasks namespace: company.team + tasks: - id: pass_output type: io.kestra.plugin.core.debug.Return format: hello + - id: py_outputs type: io.kestra.plugin.scripts.python.Script taskRunner: @@ -20,17 +22,20 @@ tasks: with open('myoutput.json', 'w') as f: json.dump(my_kv_pair, f) + - id: take_inputs type: io.kestra.plugin.core.log.Log message: > data from previous tasks: {{ outputs.pass_output.value }} and {{ outputs.py_outputs.vars.mykey }} + - id: check_output_file type: io.kestra.plugin.scripts.shell.Commands taskRunner: type: io.kestra.plugin.core.runner.Process commands: - cat {{ outputs.py_outputs.outputFiles['myoutput.json'] }} + extend: title: Pass data between Python script tasks and Shell tasks using Outputs description: >+ @@ -39,19 +44,16 @@ extend: The first two tasks return some outputs and the next 2 tasks read those values for further processing. - Check the "Outputs" section in each task documentation to see what outputs it returns, and check the "Outputs" tab on the Execution page to validate what outputs are generated by this flow. - In case of the `Return` task, it returns a value under the `value` key. - - All script tasks, including Python, return a map of outputs under the `vars` key. To access outputs in the downstream tasks, use the format `{{ outputs.task_name.vars.key_name }}`. Additionally, script tasks can return files as shown with the `myoutput.json` file. - tags: - Python - Outputs diff --git a/pip-packages-docker.yaml b/pip-packages-docker.yaml index eef4570..336bcb1 100644 --- a/pip-packages-docker.yaml +++ b/pip-packages-docker.yaml @@ -1,5 +1,6 @@ id: pip-packages-docker namespace: company.team + tasks: - id: run_python type: io.kestra.plugin.scripts.python.Script @@ -13,6 +14,7 @@ tasks: response = requests.get("https://api.github.com") data = response.json() print(data) + extend: title: Docker container installing pip packages before starting a Python Script task description: > @@ -21,7 +23,6 @@ extend: box and you can add several `beforeCommands` to install custom Pip packages, and prepare the environment for the task. - Adding `warningOnStdErr: false` ensures that warnings raised during pip package installation don't set the task to a `WARNING` state. 
However, by default, any warning raised during the setup process (i.e. when executing diff --git a/postgres-s3-python-git.yaml b/postgres-s3-python-git.yaml index 26c7194..1915092 100644 --- a/postgres-s3-python-git.yaml +++ b/postgres-s3-python-git.yaml @@ -1,5 +1,6 @@ id: postgres-s3-python-git namespace: company.team + tasks: - id: wdir type: io.kestra.plugin.core.flow.WorkingDirectory @@ -8,6 +9,7 @@ tasks: type: io.kestra.plugin.git.Clone url: https://github.com/kestra-io/scripts branch: main + - id: get_users type: io.kestra.plugin.scripts.python.Commands taskRunner: @@ -16,6 +18,7 @@ tasks: warningOnStdErr: false commands: - python etl/get_users_from_api.py + - id: save_users_pg type: io.kestra.plugin.scripts.python.Commands beforeCommands: @@ -28,6 +31,7 @@ tasks: DB_PASSWORD: "{{ secret('DB_PASSWORD') }}" DB_HOST: host.docker.internal DB_PORT: "5432" + extend: title: Extract data from an API and load it to Postgres using Python, Git and Docker (passing custom environment variables to the container) @@ -40,7 +44,6 @@ extend: 2. The second Python script reads that extracted raw data file and loads it to Postgres using Python and Pandas. - **The benefits of this approach:** - your **orchestration logic** (YAML) is decoupled from your **business diff --git a/postgres-s3-python-script.yaml b/postgres-s3-python-script.yaml index b8f9394..2522e97 100644 --- a/postgres-s3-python-script.yaml +++ b/postgres-s3-python-script.yaml @@ -1,5 +1,6 @@ id: postgres-s3-python-script namespace: company.team + tasks: - id: api_to_postgres type: io.kestra.plugin.scripts.python.Script @@ -27,6 +28,7 @@ tasks: df_users.to_sql("users", engine, if_exists="append", index=False) df_users.to_json("users.json") + - id: s3_upload type: io.kestra.plugin.aws.s3.Upload from: "{{ outputs.api_to_postgres.outputFiles['users.json'] }}" @@ -35,6 +37,7 @@ tasks: region: eu-central-1 accessKeyId: "{{ secret('AWS_ACCESS_KEY_ID') }}" secretKeyId: "{{ secret('AWS_SECRET_ACCESS_KEY') }}" + extend: title: Extract data from an API using Python, then load it to Postgres and S3 description: >- @@ -45,7 +48,6 @@ extend: 2. Loads that extracted data to Postgres and a local JSON file. The local file is then uploaded to S3 in the following task. - The Python task runs in a Docker container. Before starting the script, Kestra will install custom package dependencies, as defined by the `beforeCommands` property. 
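A minimal sketch of that `beforeCommands` pattern; the container image and package here are illustrative:

```yaml
- id: python_with_packages
  type: io.kestra.plugin.scripts.python.Script
  warningOnStdErr: false            # pip warnings won't set the task to WARNING
  containerImage: python:3.11-slim  # illustrative image
  beforeCommands:
    - pip install requests          # dependencies installed before the script runs
  script: |
    import requests
    print(requests.get("https://api.github.com").json())
```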
diff --git a/postgres-to-bigquery.yaml b/postgres-to-bigquery.yaml index 9162526..b2a762c 100644 --- a/postgres-to-bigquery.yaml +++ b/postgres-to-bigquery.yaml @@ -1,5 +1,6 @@ id: postgres-to-bigquery namespace: company.team + tasks: - id: extract type: io.kestra.plugin.singer.taps.PipelinewisePostgres @@ -13,6 +14,7 @@ tasks: streamsConfigurations: - replicationMethod: FULL_TABLE selected: true + - id: load type: io.kestra.plugin.singer.targets.AdswerveBigQuery addMetadataColumns: true @@ -23,6 +25,7 @@ tasks: location: US projectId: yourProjectName serviceAccount: "{{ secret('GCP_CREDS') }}" + extend: title: Load data from Postgres to BigQuery using Singer description: >- diff --git a/postgres-to-pandas-dataframes.yaml b/postgres-to-pandas-dataframes.yaml index 105edf0..41b13b0 100644 --- a/postgres-to-pandas-dataframes.yaml +++ b/postgres-to-pandas-dataframes.yaml @@ -1,7 +1,9 @@ id: postgres-to-pandas-dataframes namespace: company.team + variables: db_host: host.docker.internal + tasks: - id: get_tables type: io.kestra.plugin.core.flow.Parallel @@ -10,9 +12,11 @@ tasks: - id: products type: io.kestra.plugin.jdbc.postgresql.CopyOut sql: SELECT * FROM products + - id: orders type: io.kestra.plugin.jdbc.postgresql.CopyOut sql: SELECT * FROM orders + - id: pandas type: io.kestra.plugin.scripts.python.Script warningOnStdErr: false @@ -38,6 +42,7 @@ tasks: ) top.to_json("bestsellers_pandas.json", orient="records") + pluginDefaults: - type: io.kestra.plugin.jdbc.postgresql.CopyOut values: @@ -47,10 +52,12 @@ pluginDefaults: format: CSV header: true delimiter: "," + triggers: - id: every_morning type: io.kestra.plugin.core.trigger.Schedule cron: 0 9 * * * + extend: title: Extract multiple tables from Postgres using SQL queries and process those as Pandas dataframes on schedule @@ -59,7 +66,6 @@ extend: limit of how many tasks will run at the same time is defined using the `concurrent` property. - The flow extracts data from a Postgres database. That data is then passed to a Python task using `inputFiles`. The Python task reads the input files, and performs operations on the data using Pandas. diff --git a/process-s3-file-if-changed.yaml b/process-s3-file-if-changed.yaml index 8ce8c79..45c07ce 100644 --- a/process-s3-file-if-changed.yaml +++ b/process-s3-file-if-changed.yaml @@ -1,8 +1,10 @@ id: process-s3-file-if-changed namespace: company.team + variables: bucket: kestraio object: hello.txt + tasks: - id: process_file_if_changed type: io.kestra.plugin.scripts.python.Commands @@ -18,25 +20,22 @@ tasks: AWS_ACCESS_KEY_ID: "{{ secret('AWS_ACCESS_KEY_ID') }}" AWS_SECRET_ACCESS_KEY: "{{ secret('AWS_SECRET_ACCESS_KEY') }}" AWS_DEFAULT_REGION: "{{ secret('AWS_DEFAULT_REGION') }}" + triggers: - id: schedule type: io.kestra.plugin.core.trigger.Schedule cron: "*/5 * * * *" + extend: title: Process a file from S3 only if it changed since the last execution description: >- Add the following Python script named `s3_modified.py` in the Editor: - ```python - import boto3 - from datetime import datetime - import argparse - def parse_date(date_str): if date_str.endswith('Z'): return datetime.fromisoformat(date_str.replace('Z', '+00:00')) @@ -68,7 +67,6 @@ extend: main() ``` - Make sure to add Secrets for your AWS credentials and adjust the variables to point to your S3 bucket and object. 
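As a rough sketch of how the pieces described above could be wired together: the script saved in the Editor is exposed to the task via namespace files, credentials come from secrets, and the bucket and object come from flow variables. The `namespaceFiles` toggle and the `--bucket`/`--object` flag names are assumptions for illustration only; check the actual blueprint task for the exact wiring.

```yaml
  - id: process_file_if_changed
    type: io.kestra.plugin.scripts.python.Commands
    namespaceFiles:
      enabled: true  # makes s3_modified.py from the Editor available to the task
    env:
      AWS_ACCESS_KEY_ID: "{{ secret('AWS_ACCESS_KEY_ID') }}"
      AWS_SECRET_ACCESS_KEY: "{{ secret('AWS_SECRET_ACCESS_KEY') }}"
      AWS_DEFAULT_REGION: "{{ secret('AWS_DEFAULT_REGION') }}"
    commands:
      # flag names are hypothetical; adjust them to whatever s3_modified.py parses with argparse
      - python s3_modified.py --bucket {{ vars.bucket }} --object {{ vars.object }}
```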
tags: diff --git a/process-script-runner.yaml b/process-script-runner.yaml index 1621b87..7ed405f 100644 --- a/process-script-runner.yaml +++ b/process-script-runner.yaml @@ -1,5 +1,6 @@ id: process-script-runner namespace: company.team + tasks: - id: shell type: io.kestra.plugin.scripts.shell.Commands @@ -7,13 +8,13 @@ tasks: type: io.kestra.plugin.core.runner.Process commands: - echo "Hello World!" + extend: title: Run a Shell script as a subprocess on the Kestra host description: >- Here is an example of a Shell script configured with the Process task runner which runs a Shell command as a child process within the Kestra host. - The Process task runner doesn’t have any additional configuration beyond the `type` property. tags: diff --git a/produce-kafka-message.yaml b/produce-kafka-message.yaml index 01c4378..d09a819 100644 --- a/produce-kafka-message.yaml +++ b/produce-kafka-message.yaml @@ -1,9 +1,11 @@ id: produce-kafka-message namespace: company.team + tasks: - id: api type: io.kestra.plugin.core.http.Request uri: https://dummyjson.com/products + - id: produce type: io.kestra.plugin.kafka.Produce from: @@ -17,6 +19,7 @@ tasks: topic: mytopic properties: bootstrap.servers: my.kafka.k8s.com:9094 + extend: title: Extract data from a REST API and send it to a Kafka topic using the Kafka producer task @@ -26,12 +29,10 @@ extend: running, and that you created a topic named `mytopic`. Make sure to replace the `bootstrap.servers` value with your Kafka cluster URL. - The `from` argument expects a map or a list of maps with key-value pairs. The allowed keys are: `key`, `value`, `partition`, `timestamp`, and `headers`. - In this example, we're using the `outputs.api.body` value, which is a JSON-formatted response body from the `api` task. This is why the `valueSerializer` argument is set to `JSON`. diff --git a/produce-to-rabbitmq.yaml b/produce-to-rabbitmq.yaml index 4f06bd4..20895b7 100644 --- a/produce-to-rabbitmq.yaml +++ b/produce-to-rabbitmq.yaml @@ -1,8 +1,10 @@ id: produce-to-rabbitmq namespace: company.team + inputs: - id: order type: STRING + tasks: - id: publish_to_rabbitmq type: io.kestra.plugin.amqp.Publish @@ -10,29 +12,24 @@ tasks: exchange: test-queue from: - data: "{{ read(inputs.order) }}" + extend: title: Read a CSV file and load each row into RabbitMQ description: >- This blueprint has two flows: `read_orders` and `produce_to_rabbitmq`. - 1. `read_orders` reads the CSV file from a URL, converts it into ION, and generates an execution of `produce_to_rabbitmq` flow for each row of the ION file. 2. `produce_to_rabbitmq` publishes the record into RabbitMQ. - Here is the code of the parent flow `read_orders`: - ```yaml - id: read_orders - namespace: company.team - tasks: - id: csv type: io.kestra.plugin.fs.http.Download @@ -55,21 +52,15 @@ extend: order: "{{ taskrun.items }}" ``` - Execute the `read_orders` flow. This flow execution will trigger the `produce_to_rabbitmq` flow for each record. - You can run RabbitMQ locally using Docker with the following command: ```bash - - docker run -it --rm --name rabbitmq -p 5672:5672 -p 15672:15672 - rabbitmq:latest - + docker run -it --rm --name rabbitmq -p 5672:5672 -p 15672:15672 rabbitmq:latest ``` - You can open the RabbitMQ UI locally on `http://localhost:15672/` and login using `guest`/`guest`. 
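If you prefer to verify the published records from within Kestra rather than the RabbitMQ UI, the AMQP plugin also provides a consume task. The sketch below is a best-effort example: it assumes the `io.kestra.plugin.amqp.Consume` task with `url`, `queue`, and `maxRecords` properties, and that a queue is bound to the `test-queue` exchange; double-check the plugin documentation before using it.

```yaml
  - id: consume_from_rabbitmq
    type: io.kestra.plugin.amqp.Consume
    url: amqp://guest:guest@localhost:5672/  # credentials and vhost are illustrative
    queue: test-queue
    maxRecords: 10
```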
tags: [] diff --git a/pubsub-realtime-trigger.yaml b/pubsub-realtime-trigger.yaml index 7388ed2..8586d98 100644 --- a/pubsub-realtime-trigger.yaml +++ b/pubsub-realtime-trigger.yaml @@ -1,5 +1,6 @@ id: pubsub-realtime-trigger namespace: company.team + tasks: - id: insert_into_firestore type: io.kestra.plugin.gcp.firestore.Set @@ -13,6 +14,7 @@ tasks: price: "{{ trigger.data | jq('.price') | first }}" quantity: "{{ trigger.data | jq('.quantity') | first }}" total: "{{ trigger.data | jq('.total') | first }}" + triggers: - id: realtime_trigger type: io.kestra.plugin.gcp.pubsub.RealtimeTrigger @@ -20,37 +22,30 @@ triggers: topic: orders subscription: kestra-subscription serdeType: JSON + extend: title: Use GCP Pub/Sub Realtime Trigger to push events into Firestore description: >- This flow will: - - 1. Get - [triggered](https://kestra.io/plugins/plugin-gcp/triggers/io.kestra.plugin.gcp.pubsub.realtimetrigger) + 1. Get [triggered](https://kestra.io/plugins/plugin-gcp/triggers/io.kestra.plugin.gcp.pubsub.realtimetrigger) every time the event lands in the Pub/Sub topic 2. The flow will push the data into Firestore table - For this, create a Pub/Sub topic named `orders`. We will be producing JSON messages into the Pub/Sub topic generated from the [orders.csv](https://huggingface.co/datasets/kestra/datasets/raw/main/csv/orders.csv). One sample produced message can be: - ``` - {"order_id": "1", "customer_name": "Kelly Olsen", "customer_email": "jenniferschneider@example.com", "product_id": "20", "price": "166.89", "quantity": "1", "total": "166.89"} - ``` - Create `orders` table in Firestore. - When you produce the message onto Pub/Sub topic, the flow will get triggered, and you can see that a corresponding new record gets into the Firestore table. diff --git a/pulsar-realtime-trigger.yaml b/pulsar-realtime-trigger.yaml index 54ad242..40cb79f 100644 --- a/pulsar-realtime-trigger.yaml +++ b/pulsar-realtime-trigger.yaml @@ -1,5 +1,6 @@ id: pulsar-realtime-trigger namespace: company.team + tasks: - id: create_mysql_table type: io.kestra.plugin.jdbc.mysql.Query @@ -10,62 +11,57 @@ tasks: created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, PRIMARY KEY(log_id) ) + - id: insert_into_logs_table type: io.kestra.plugin.jdbc.mysql.Query sql: insert into logs(message) values("{{ trigger.value }}") + triggers: - id: realtime_trigger type: io.kestra.plugin.pulsar.RealtimeTrigger topic: apache/pulsar/logs uri: pulsar://localhost:26650 subscriptionName: kestra_trigger_sub + pluginDefaults: - type: io.kestra.plugin.jdbc.mysql.Query values: url: jdbc:mysql://localhost:3306/kestra username: mysql_user password: mysql_passwd + extend: title: Use Pulsar Realtime Trigger to push events into MySQL description: > This flow will: - - 1. Get - [triggered](https://kestra.io/plugins/plugin-pulsar/triggers/io.kestra.plugin.pulsar.realtimetrigger) + 1. Get [triggered](https://kestra.io/plugins/plugin-pulsar/triggers/io.kestra.plugin.pulsar.realtimetrigger) every time the event lands in Apache Pulsar topic 2. The flow will push the data into a table in MySQL database - To setup Apache Pulsar locally, you can install the [standalone cluster](https://pulsar.apache.org/docs/next/getting-started-standalone/) or - [docker - cluster](https://pulsar.apache.org/docs/next/getting-started-docker/) for + [docker cluster](https://pulsar.apache.org/docs/next/getting-started-docker/) for Apache Pulsar. 
You can run the following commands to create the topic, and produce data to the topic: - 1) Setup a tenant `bin/pulsar-admin tenants create apache` - 2) Create a namespace `bin/pulsar-admin namespaces create apache/pulsar` - 3) Create a topic `bin/pulsar-admin topics create-partitioned-topic apache/pulsar/logs -p 4` - 4) Produce data to topic `bin/pulsar-client produce apache/pulsar/logs -m '--Hello World--' -n 1` - To setup MySQL server locally, follow the official installation steps using [docker](https://hub.docker.com/r/mysql/mysql-server/). tags: diff --git a/purge.yaml b/purge.yaml index 3ba6eb6..f0ad810 100644 --- a/purge.yaml +++ b/purge.yaml @@ -1,5 +1,6 @@ id: purge namespace: system + tasks: - id: purge_executions type: io.kestra.plugin.core.execution.PurgeExecutions @@ -7,14 +8,17 @@ tasks: purgeLog: false states: - SUCCESS + - id: purge_logs type: io.kestra.plugin.core.log.PurgeLogs endDate: "{{ now() | dateAdd(-1, 'MONTHS') }}" + triggers: - id: daily type: io.kestra.plugin.core.trigger.Schedule disabled: true cron: 0 9 * * * + extend: title: Purge execution data including logs, metrics and outputs on a schedule description: > @@ -26,22 +30,18 @@ extend: empty to purge all execution data, regardless of the execution status, or adjust it to your needs. - Given that logs often consistute the largest chunk of data that needs to be purged, we use a dedicated task to purge logs (so that you can run it independently or rerun only this step in case something fails). Keep in mind though that, by default, the `PurgeExecutions` task would also automatically purge the logs. - It is recommended to run this flow daily to keep your Kestra instance clean and save storage space. - **Before using this flow, make sure to set the `disabled` property to false (or remove that line entirely).** - Note that this flow will not purge the flow definitions or the namespace files — your code will be safe. Only the execution-related data will be purged. diff --git a/push-to-git.yaml b/push-to-git.yaml index 223d3e6..0eaaa0b 100644 --- a/push-to-git.yaml +++ b/push-to-git.yaml @@ -32,12 +32,12 @@ triggers: - id: every_full_hour type: io.kestra.plugin.core.trigger.Schedule cron: "*/15 * * * *" + extend: title: Push code to Git at regular intervals description: >- This flow will push code to Git every 15 minutes. - We will be using PushFlows and PushNamespaceFiles task to push flows and namespace files respectively. tags: diff --git a/python-aws-ecr.yaml b/python-aws-ecr.yaml index 2713a40..1dca636 100644 --- a/python-aws-ecr.yaml +++ b/python-aws-ecr.yaml @@ -1,11 +1,13 @@ id: python-aws-ecr namespace: company.team + tasks: - id: ecr type: io.kestra.plugin.aws.ecr.GetAuthToken accessKeyId: "{{ secret('AWS_ACCESS_KEY_ID') }}" secretKeyId: "{{ secret('AWS_SECRET_ACCESS_KEY') }}" region: eu-central-1 + - id: py type: io.kestra.plugin.scripts.python.Commands taskRunner: @@ -16,6 +18,7 @@ tasks: containerImage: 123456789.dkr.ecr.eu-central-1.amazonaws.com/data-infastructure:latest commands: - python --version + extend: title: Pull a container image from Amazon ECR registry and run a Python script description: >- @@ -23,7 +26,6 @@ extend: Amazon ECR. Then, it will pull the specified image and will run a Python script (or whichever command you wish) in a Docker container. - This flow assumes AWS credentials stored as secrets `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`. 
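One detail that is easy to miss: the token returned by the `ecr` task has to reach the Docker task runner as registry credentials. The sketch below shows one way this could look; it assumes the token is exposed as `outputs.ecr.token` and that the Docker task runner accepts a `credentials` block (the actual blueprint may instead pass a Docker config JSON), so verify both against the plugin documentation.

```yaml
  - id: py
    type: io.kestra.plugin.scripts.python.Commands
    taskRunner:
      type: io.kestra.plugin.scripts.runner.docker.Docker
      credentials:
        registry: 123456789.dkr.ecr.eu-central-1.amazonaws.com
        username: AWS  # ECR uses the literal username AWS together with a token password
        password: "{{ outputs.ecr.token }}"
    containerImage: 123456789.dkr.ecr.eu-central-1.amazonaws.com/data-infastructure:latest
    commands:
      - python --version
```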
tags: diff --git a/python-csv-each-parallel.yaml b/python-csv-each-parallel.yaml index ea26edf..24243e7 100644 --- a/python-csv-each-parallel.yaml +++ b/python-csv-each-parallel.yaml @@ -1,5 +1,6 @@ id: python-csv-each-parallel namespace: company.team + tasks: - id: csv type: io.kestra.plugin.core.flow.EachParallel @@ -18,6 +19,7 @@ tasks: import pandas as pd df = pd.read_csv("{{ taskrun.value }}") df.info() + extend: title: Transform data from CSV files with Pandas in Python containers (in parallel) description: This flow reads a list of CSV files and processes each file in diff --git a/python-docker-artifact-registry-gcp.yaml b/python-docker-artifact-registry-gcp.yaml index 10a783c..5c3b827 100644 --- a/python-docker-artifact-registry-gcp.yaml +++ b/python-docker-artifact-registry-gcp.yaml @@ -1,5 +1,6 @@ id: python-docker-artifact-registry-gcp namespace: company.team + tasks: - id: wdir type: io.kestra.plugin.core.flow.WorkingDirectory @@ -7,10 +8,12 @@ tasks: - id: download_csv type: io.kestra.plugin.core.http.Download uri: https://huggingface.co/datasets/kestra/datasets/raw/main/csv/orders.csv + - id: fetch_auth_token type: io.kestra.plugin.gcp.auth.OauthAccessToken projectId: YOUR_GCP_PROJECT_NAME serviceAccount: "{{ secret('GCP_CREDS') }}" + - id: analyze_sales type: io.kestra.plugin.scripts.python.Script inputFiles: @@ -39,6 +42,7 @@ tasks: } } containerImage: yourGcpRegion-docker.pkg.dev/YOUR_GCP_PROJECT_NAME/REPO_NAME/python:latest + extend: title: Run Python script in a Docker container based on Google Artifact Registry container image @@ -50,23 +54,12 @@ extend: The Docker image is stored in Google Artifact Registry. - To push an image to Google Artifact Registry, you need to: - - - Create a Google Cloud Platform service account with the `Artifact Registry - Writer` role. - + - Create a Google Cloud Platform service account with the `Artifact Registry Writer` role. - Create a JSON key for the service account. - - Create a secret with the contents of the JSON key. 
- - - Build a Docker image: `docker build -t - yourGcpRegion-docker.pkg.dev/YOUR_GCP_PROJECT_NAME/REPO_NAME/python:latest - .` - - - Push the image to Google Artifact Registry: `docker push - yourGcpRegion-docker.pkg.dev/YOUR_GCP_PROJECT_NAME/REPO_NAME/python:latest` - + - Build a Docker image: `docker build -t yourGcpRegion-docker.pkg.dev/YOUR_GCP_PROJECT_NAME/REPO_NAME/python:latest .` + - Push the image to Google Artifact Registry: `docker push yourGcpRegion-docker.pkg.dev/YOUR_GCP_PROJECT_NAME/REPO_NAME/python:latest` Note that the `OauthAccessToken` task is necessary to securely fetch a short-lived [access diff --git a/python-generate-logs.yaml b/python-generate-logs.yaml index 8d109dc..8371d66 100644 --- a/python-generate-logs.yaml +++ b/python-generate-logs.yaml @@ -27,6 +27,7 @@ tasks: time.sleep(0.5) logger.critical("CRITICAL means a severe failure.") + extend: title: Run a Python script and capture logs description: >- diff --git a/python-generate-output-file.yaml b/python-generate-output-file.yaml index 8622eea..6bef2eb 100644 --- a/python-generate-output-file.yaml +++ b/python-generate-output-file.yaml @@ -14,6 +14,7 @@ tasks: - id: log_file_contents type: io.kestra.plugin.core.log.Log message: "{{ read(outputs.generate_output_file.outputFiles['my_file.txt']) }}" + extend: title: Generate an output file using Python script description: >- diff --git a/python-generate-outputs-simple.yaml b/python-generate-outputs-simple.yaml index cab083c..c9cfe7a 100644 --- a/python-generate-outputs-simple.yaml +++ b/python-generate-outputs-simple.yaml @@ -17,6 +17,7 @@ tasks: message: - "Total Marks: {{ outputs.generate_output.vars.total_marks }}" - "Average Marks: {{ outputs.generate_output.vars.average_marks }}" + extend: title: Run a simple Python script to generate outputs and log them description: >- @@ -25,7 +26,6 @@ extend: The flow has two tasks: 1. Generate outputs using Python script - 2. 
Log the outputs generated in the prior task tags: - Python diff --git a/python-generate-outputs.yaml b/python-generate-outputs.yaml index 701eee6..b5412c2 100644 --- a/python-generate-outputs.yaml +++ b/python-generate-outputs.yaml @@ -14,23 +14,16 @@ tasks: - "*.csv" script: > import csv - import random - import time - from faker import Faker - from kestra import Kestra - start_time = time.time() fake = Faker() - # list of columns for the CSV file - columns = [ "order_id", "customer_name", @@ -42,14 +35,10 @@ tasks: ] filename = "{{ vars.file }}" - tags = {'file': filename} - # Generate 100 random orders - orders = [] - for i in range(100): order_id = i + 1 customer_name = fake.name() @@ -63,37 +52,27 @@ tasks: ) # Write the orders to a CSV file - with open(filename, "w", newline="") as file: writer = csv.writer(file) writer.writerow(columns) writer.writerows(orders) # Calculate and print the sum and average of the "total" column - total_sum = sum(order[6] for order in orders) - average_order = round(total_sum / len(orders), 2) - print(f"Total sum: {total_sum}") - print(f"Average Order value: {average_order}") - Kestra.outputs({"total_sum": total_sum, "average_order": average_order}) - Kestra.counter('total_sum', total_sum, tags) - Kestra.counter('average_order', average_order, tags) - end_time = time.time() - processing_time = end_time - start_time - Kestra.timer('processing_time', processing_time, tags) print(f"The script execution took: {processing_time} seconds") + extend: title: Run a Python script and generate outputs, metrics and files specified with a variable description: >- @@ -101,13 +80,11 @@ extend: the sum and average of the "total" column. It then reports the results as outputs and metrics. - The CSV file generated by a Python task is set as `outputFiles`, allowing you to download the file from the UI's Execution page. It is helpful to share the results of your workflow with business stakeholders who can download the file from the UI and use it in their processes. - To avoid hardcoding values, the filename `orders.csv` is specified as a variable. 
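For context, here is a trimmed-down sketch of how those outputs and the `vars.file` variable are consumed downstream. The task ids are illustrative, not necessarily the ones used in the blueprint.

```yaml
variables:
  file: orders.csv

tasks:
  - id: generate_orders
    type: io.kestra.plugin.scripts.python.Script
    containerImage: ghcr.io/kestra-io/pydata:latest
    outputFiles:
      - "*.csv"
    script: |
      from kestra import Kestra

      # stand-in for the full order-generating script shown above
      with open("{{ vars.file }}", "w") as f:
          f.write("order_id,total\n1,100\n")
      Kestra.outputs({"total_sum": 100, "average_order": 100})

  - id: log_results
    type: io.kestra.plugin.core.log.Log
    message: |
      Total sum: {{ outputs.generate_orders.vars.total_sum }}
      Average order value: {{ outputs.generate_orders.vars.average_order }}
```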
tags: diff --git a/python-partitions-metrics.yaml b/python-partitions-metrics.yaml index d15d5a4..e874ccb 100644 --- a/python-partitions-metrics.yaml +++ b/python-partitions-metrics.yaml @@ -1,6 +1,7 @@ id: python-partitions-metrics namespace: company.team description: Process partitions in parallel + tasks: - id: get_partitions type: io.kestra.plugin.scripts.python.Script @@ -11,6 +12,7 @@ tasks: from kestra import Kestra partitions = [f"file_{nr}.parquet" for nr in range(1, 10)] Kestra.outputs({'partitions': partitions}) + - id: process_partitions type: io.kestra.plugin.core.flow.EachParallel value: "{{ outputs.get_partitions.vars.partitions }}" @@ -22,22 +24,14 @@ tasks: containerImage: ghcr.io/kestra-io/pydata:latest script: > import random - import time - from kestra import Kestra - filename = '{{ taskrun.value }}' - print(f"Reading and processing partition {filename}") - nr_rows = random.randint(1, 1000) - processing_time = random.randint(1, 20) - time.sleep(processing_time) - Kestra.counter('nr_rows', nr_rows, {'partition': filename}) Kestra.timer('processing_time', processing_time, {'partition': diff --git a/python-subflow-component.yaml b/python-subflow-component.yaml index 4ee82e7..891824e 100644 --- a/python-subflow-component.yaml +++ b/python-subflow-component.yaml @@ -1,10 +1,12 @@ id: python-subflow-component namespace: company.team + inputs: - id: arg1 type: INT - id: arg2 type: INT + tasks: - id: python type: io.kestra.plugin.scripts.python.Commands @@ -14,10 +16,8 @@ tasks: inputFiles: main.py: > import argparse - from kestra import Kestra - def multiply_arguments(arg1, arg2): return arg1 * arg2 @@ -32,13 +32,13 @@ tasks: Kestra.outputs({'result': result}) commands: - python main.py --arg1 {{ inputs.arg1 }} --arg2 {{ inputs.arg2 }} + extend: title: Create a Python subflow, acting like an abstracted component description: >- This flow shows how you can create a templated flow (subflow) to run a custom script. - This flow can be used in another flow, acting like a separated component. We can imagine having a complex flow, with many tasks but abstracted with inputs and outputs, so users can only deal with a simple interface. @@ -46,12 +46,8 @@ extend: Here is an example of calling this flow, giving inputs and retrieve the desired outputs. 
- - ``` - id: call_python_component - namespace: company.team tasks: diff --git a/query-clickhouse.yaml b/query-clickhouse.yaml index 1d69f73..6c30255 100644 --- a/query-clickhouse.yaml +++ b/query-clickhouse.yaml @@ -1,9 +1,11 @@ id: query-clickhouse namespace: company.team + tasks: - id: create_database type: io.kestra.plugin.jdbc.clickhouse.Query sql: CREATE DATABASE IF NOT EXISTS helloworld + - id: create_table type: io.kestra.plugin.jdbc.clickhouse.Query sql: | @@ -16,6 +18,7 @@ tasks: ) ENGINE = MergeTree() PRIMARY KEY (user_id, timestamp) + - id: insert_data type: io.kestra.plugin.jdbc.clickhouse.Query sql: > @@ -25,15 +28,18 @@ tasks: (102, 'Insert a lot of rows per batch', yesterday(), 1.41421 ), (102, 'Sort your data based on your commonly-used queries', today(), 2.718 ), (101, 'Granules are the smallest chunks of data read', now() + 5, 3.14159 ) + - id: query_and_store_as_json type: io.kestra.plugin.jdbc.clickhouse.Query sql: SELECT user_id, message FROM helloworld.my_first_table - store: true + fetchType: STORE + pluginDefaults: - type: io.kestra.plugin.jdbc.clickhouse.Query values: url: jdbc:clickhouse://host.docker.internal:8123/ username: default + extend: title: Ingest data to and query data from ClickHouse description: > @@ -41,15 +47,11 @@ extend: already exist. It will then insert some data into the table and finally query the table to show the data. - To test this flow, you can start ClickHouse in a Docker container: - ``` - docker run -d -p 8123:8123 -p 9000:9000 --name myclickhouse --ulimit nofile=262144:262144 clickhouse/clickhouse-server - ``` tags: - Ingest diff --git a/r-script.yaml b/r-script.yaml index 23737f1..f340ca3 100644 --- a/r-script.yaml +++ b/r-script.yaml @@ -1,5 +1,6 @@ id: r-script namespace: company.team + tasks: - id: r_script type: io.kestra.plugin.scripts.r.Script @@ -31,6 +32,7 @@ tasks: print(df) write_parquet(df, "women.parquet") write_csv_arrow(df, "women.csv") + extend: title: Run R script in a Docker container and output downloadable artifacts description: >- @@ -38,7 +40,6 @@ extend: using the `dplyr` package. Finally, it stores the result as both CSV and Parquet files, which both can be downloaded from the Execution Outputs tab. - The R script is executed in a Docker container, providing isolated environment for the task and avoiding any dependency conflicts. All dependencies for the task are baked into a publicly available Docker image, @@ -46,9 +47,7 @@ extend: image with your own, or install custom dependencies at runtime using the `beforeCommands` property, for example: - ``` - beforeCommands: - Rscript -e "install.packages(c('httr', 'RSQLite'))" > /dev/null 2>&1 ``` diff --git a/react-to-sqs-trigger.yaml b/react-to-sqs-trigger.yaml index c2676d6..3383e55 100644 --- a/react-to-sqs-trigger.yaml +++ b/react-to-sqs-trigger.yaml @@ -1,5 +1,6 @@ id: react-to-sqs-trigger namespace: company.team + tasks: - id: print_message type: io.kestra.plugin.scripts.shell.Commands @@ -7,6 +8,7 @@ tasks: type: io.kestra.plugin.core.runner.Process commands: - cat "{{ trigger.uri }}" + triggers: - id: sqs type: io.kestra.plugin.aws.sqs.Trigger @@ -15,18 +17,17 @@ triggers: region: "{{ secret('AWS_DEFAULT_REGION') }}" queueUrl: https://sqs.eu-central-1.amazonaws.com/123456789/kestra maxRecords: 1 + extend: title: React to an SQS trigger description: >- This flow reacts to an SQS trigger. Any time there is a new message in the queue, the flow is triggered. - The queue URL points to an already existing queue. 
The `{{ trigger.uri }}` points to a file in Kestra's internal storage containing the content of the SQS message. You can read the contents of that file in any task. - This flow assumes AWS credentials stored as secrets `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY` and `AWS_DEFAULT_REGION`. tags: diff --git a/redis-key-value-store.yaml b/redis-key-value-store.yaml index 2c5f9c0..edbe029 100644 --- a/redis-key-value-store.yaml +++ b/redis-key-value-store.yaml @@ -1,5 +1,6 @@ id: redis-key-value-store namespace: company.team + inputs: - id: key type: STRING @@ -22,6 +23,7 @@ inputs: "isPremium": true, "interests": ["programming", "reading", "traveling"] } + tasks: - id: set type: io.kestra.plugin.redis.string.Set @@ -29,25 +31,23 @@ tasks: serdeType: JSON key: "{{ inputs.key }}" value: "{{ inputs.value }}" + - id: get type: io.kestra.plugin.redis.string.Get url: redis://host.docker.internal:6379/0 serdeType: JSON key: "{{ inputs.key }}" + extend: title: Store and retrieve JSON data using Redis description: >- This flow will set a key-value pair in Redis and then retrieve it. The key-value pair will be set using inputs which can be provided at runtime. - To test this flow, you can start Redis in a Docker container: - ``` - docker run --name myredis -p 6379:6379 -d redis - ``` tags: - Ingest diff --git a/redis-list-realtime-trigger.yaml b/redis-list-realtime-trigger.yaml index 93ae8f3..a41a60c 100644 --- a/redis-list-realtime-trigger.yaml +++ b/redis-list-realtime-trigger.yaml @@ -1,5 +1,6 @@ id: redis-list-realtime-trigger namespace: company.team + tasks: - id: insert_into_cassandra type: io.kestra.plugin.cassandra.Query @@ -9,59 +10,45 @@ tasks: port: 9042 localDatacenter: datacenter1 cql: > - INSERT INTO kestra.products (product_id, product_name, product_category, - brand) + INSERT INTO kestra.products (product_id, product_name, product_category, brand) + + VALUES ({{ trigger.value | jq(".product_id") | first }}, '{{ trigger.value | jq(".product_name") | first }}', - VALUES ({{ trigger.value | jq(".product_id") | first }}, '{{ trigger.value - | jq(".product_name") | first }}', + '{{ trigger.value | jq(".product_category") | first }}', '{{ trigger.value | jq(".brand") | first }}') - '{{ trigger.value | jq(".product_category") | first }}', '{{ trigger.value - | jq(".brand") | first }}') triggers: - id: realtime_trigger type: io.kestra.plugin.redis.list.RealtimeTrigger url: redis://localhost:6379/0 key: products + extend: title: Use Redis List Realtime Trigger to push events into Cassandra description: >- This flow will: - - 1. Get - [triggered](https://kestra.io/plugins/plugin-redis/triggers/io.kestra.plugin.redis.list.realtimetrigger) + 1. Get [triggered](https://kestra.io/plugins/plugin-redis/triggers/io.kestra.plugin.redis.list.realtimetrigger) every time you push data onto Redis List 2. 
The flow will push the data into a table in Cassandra - To setup Cassandra server locally, use the following docker command: ``` - docker run --name my-cassandra -p 9042:9042 -d cassandra - ``` - - You can use the cqlsh in the Cassandra docker container, and run the - following commands: + You can use the cqlsh in the Cassandra docker container, and run the following commands: ``` - # Create the keyspace - > create keyspace if not exists kestra with replication = {'class' : 'SimpleStrategy', 'replication_factor' : 1}; - # Use the keyspace - > use kestra; - # Create the table - > CREATE TABLE kestra.products ( product_id int, product_name text, @@ -70,35 +57,26 @@ extend: PRIMARY KEY (product_id)); ``` - To setup Redis locally, use the following Docker command: ``` - docker run --name my-redis -p 6379:6379 -d redis - ``` - You can use the redis-cli in the Redis docker container, and push data onto Redis using: ``` - > LPUSH products '{"product_id": 1, "product_name": "streamline turn-key systems", "product_category": "Electronics", "brand": "gomez"}' - ``` - We will be using the JSON records generated from the data in [products.csv](https://huggingface.co/datasets/kestra/datasets/raw/main/csv/products.csv). - Whenever you push the data onto Redis List, the flow will be triggered immediately, and will insert the data from the trigger into Cassandra table. - Whenever tags: - Realtime Trigger - Trigger diff --git a/redis-list.yaml b/redis-list.yaml index 2ed12a7..061a576 100644 --- a/redis-list.yaml +++ b/redis-list.yaml @@ -1,13 +1,16 @@ id: redis-list namespace: company.team + variables: key: favorite_plugins + tasks: - id: clear_list type: io.kestra.plugin.redis.list.ListPop url: redis://host.docker.internal:6379/0 key: "{{ vars.key }}" maxRecords: 1 + - id: publish_list type: io.kestra.plugin.redis.list.ListPush url: redis://host.docker.internal:6379/0 @@ -17,6 +20,7 @@ tasks: - duckdb - gcp - aws + extend: title: Add a list of strings to Redis description: >- @@ -25,14 +29,10 @@ extend: entries. To prevent this, the `ListPop` task is used to empty the list before `ListPush` adds the new values. - To test this flow, you can start Redis in a Docker container: - ``` - docker run --name myredis -p 6379:6379 -d redis - ``` tags: - Ingest diff --git a/redis-set-parallel.yaml b/redis-set-parallel.yaml index 98de04f..e69a0e3 100644 --- a/redis-set-parallel.yaml +++ b/redis-set-parallel.yaml @@ -1,5 +1,6 @@ id: redis-set-parallel namespace: company.team + inputs: - id: values type: JSON @@ -10,6 +11,7 @@ inputs: {"aws": ["s3", "sqs", "sns", "athena"]}, {"gcp": ["big-query", "gcs", "cloudrun"]} ] + tasks: - id: parallel type: io.kestra.plugin.core.flow.EachParallel @@ -22,20 +24,17 @@ tasks: key: "{{ json(taskrun.value) | keys | first }}" value: | {{ taskrun.value | jq('.[]') | first }} + extend: title: Add multiple Redis keys in parallel from JSON input description: >- This flow adds multiple keys in parallel to a Redis data store based on JSON input provided by the user at runtime. 
- To test this flow, you can start Redis in a Docker container: - ``` - docker run --name myredis -p 6379:6379 -d redis - ``` tags: - Ingest diff --git a/regex-input.yaml b/regex-input.yaml index d40835e..5b8f19d 100644 --- a/regex-input.yaml +++ b/regex-input.yaml @@ -1,5 +1,6 @@ id: regex-input namespace: company.team + inputs: - id: age type: INT @@ -7,55 +8,46 @@ inputs: required: false min: 18 max: 64 + - id: user type: STRING defaults: student required: false validator: ^student(\d+)?$ + tasks: - id: validator type: io.kestra.plugin.core.log.Log message: User {{ inputs.user }}, age {{ inputs.age }} + extend: title: Parametrized flow with custom validators to ensure correct integer value range and Regex-based string pattern validation description: > This flow uses several input validators. - The `age` property must be within a valid range between `min` and `max` integer value. - The Regex expression `^student(\d+)?$` is used to validate that the input argument `user` is of type STRING and that it follows a given pattern: - `^`: Asserts the start of the string. - - `student`: Matches the word "student". - - `\d`: Matches any digit (0-9). - - `+`: Asserts that there is one or more of the preceding token (i.e., one or more digits). - - `()?`: The parentheses group the digits together, and the question mark makes the entire group optional. - - `$`: Asserts the end of the string. This ensures that the string doesn't contain any characters after the optional digits. - With this pattern: - - "student" would be a match. - - "student123" would be a match. - - "studentabc" would not be a match because "abc" isn't a sequence of digits. - Try running this flow with various inputs or adjust the Regex pattern to see how the input validation works. tags: diff --git a/request-resources.yaml b/request-resources.yaml index 0d85dea..573f108 100644 --- a/request-resources.yaml +++ b/request-resources.yaml @@ -1,5 +1,6 @@ id: request-resources namespace: company.team + inputs: - id: resource_type displayName: Resource Type @@ -10,6 +11,7 @@ inputs: - SaaS application - Development tool - Cloud VM + - id: access_permissions displayName: Access Permissions type: SELECT @@ -19,6 +21,7 @@ inputs: inputs: - resource_type condition: "{{ inputs.resource_type equals 'Access permissions' }}" + - id: saas_applications displayName: SaaS Application type: MULTISELECT @@ -28,6 +31,7 @@ inputs: inputs: - resource_type condition: "{{ inputs.resource_type equals 'SaaS application' }}" + - id: development_tools displayName: Development Tool type: SELECT @@ -37,6 +41,7 @@ inputs: inputs: - resource_type condition: "{{ inputs.resource_type equals 'Development tool' }}" + - id: cloud_provider displayName: Cloud Provider type: SELECT @@ -48,6 +53,7 @@ inputs: inputs: - resource_type condition: "{{ inputs.resource_type equals 'Cloud VM' }}" + - id: cloud_vms displayName: Cloud VM type: SELECT @@ -58,6 +64,7 @@ inputs: - resource_type - cloud_provider condition: "{{ inputs.resource_type equals 'Cloud VM' }}" + - id: region displayName: Cloud Region type: SELECT @@ -68,6 +75,7 @@ inputs: - cloud_provider - cloud_vms condition: "{{ inputs.resource_type equals 'Cloud VM' }}" + variables: slack_message: > Validate resource request. @@ -75,6 +83,7 @@ variables: To approve the request, click on the Resume button here http://localhost:28080/ui/executions/{{flow.namespace}}/{{flow.id}}/{{execution.id}}. 
+ tasks: - id: send_approval_request type: io.kestra.plugin.notifications.slack.SlackIncomingWebhook @@ -84,6 +93,7 @@ tasks: "channel": "#devops", "text": {{ render(vars.slack_message) | json }} } + - id: wait_for_approval type: io.kestra.plugin.core.flow.Pause onResume: @@ -95,132 +105,89 @@ tasks: description: Extra comments about the provisioned resources type: STRING defaults: All requested resources are approved + - id: approve type: io.kestra.plugin.core.http.Request uri: https://reqres.in/api/resources method: POST contentType: application/json body: "{{ inputs }}" + - id: log type: io.kestra.plugin.core.log.Log message: | Status of the request {{ outputs.wait_for_approval.onResume.comment }}. Process finished with {{ outputs.approve.body }}. + extend: title: Use conditional inputs to request compute resources and wait for approval description: "This flow shows how to use conditional inputs to build dynamic approval workflows. The workflow takes user input and sends those in a Slack message for approval — the execution is paused until manually resumed. - Using the `dependsOn` input property, you can set up a chain of dependencies, where one input depends on other inputs or conditions. In this example, the `access_permissions`, `saas_applications`, `development_tools`, and `cloud_vms` inputs are conditionally displayed based on the chosen `resource_type` input value. - Before running this flow, make sure to add the required KV pairs e.g. by using the following flow: - ```yaml - id: add_kv_pairs - namespace: company.team - tasks: - \ - id: access_permissions - \ type: io.kestra.plugin.core.kv.Set - \ key: \"{{ task.id }}\" - \ kvType: JSON - \ value: | - \ [\"Admin\", \"Developer\", \"Editor\", \"Launcher\", \"Viewer\"] - - + \ \ - id: saas_applications - \ type: io.kestra.plugin.core.kv.Set - \ key: \"{{ task.id }}\" - \ kvType: JSON - \ value: | - \ [\"Slack\", \"Notion\", \"HubSpot\", \"GitHub\", \"Jira\"] - - + \ \ - id: development_tools - \ type: io.kestra.plugin.core.kv.Set - \ key: \"{{ task.id }}\" - \ kvType: JSON - \ value: | - \ [\"Cursor\", \"IntelliJ IDEA\", \"PyCharm Professional\", \"Datagrip\"] - - + \ \ - id: cloud_vms - \ type: io.kestra.plugin.core.kv.Set - \ key: \"{{ task.id }}\" - \ kvType: JSON - \ value: | - \ { - \ \"AWS\": [\"t2.micro\", \"t2.small\", \"t2.medium\", \"t2.large\"], - \ \"GCP\": [\"f1-micro\", \"g1-small\", \"n1-standard-1\", \"n1-standard-2\"], - \ \"Azure\": [\"Standard_B1s\", \"Standard_B1ms\", \"Standard_B2s\", \"Standard_B2ms\"] - \ } - - + \ \ - id: cloud_regions - \ type: io.kestra.plugin.core.kv.Set - \ key: \"{{ task.id }}\" - \ kvType: JSON - \ value: | - \ { - \ \"AWS\": [\"us-east-1\", \"us-west-1\", \"us-west-2\", \"eu-west-1\"], - \ \"GCP\": [\"us-central1\", \"us-east1\", \"us-west1\", \"europe-west1\"], - \ \"Azure\": [\"eastus\", \"westus\", \"centralus\", \"northcentralus\"] - \ } - ``` - \ " tags: - Inputs diff --git a/retries.yaml b/retries.yaml index a35552e..f95c35e 100644 --- a/retries.yaml +++ b/retries.yaml @@ -1,5 +1,6 @@ id: retries namespace: company.team + tasks: - id: fail_4_times type: io.kestra.plugin.scripts.shell.Commands @@ -13,10 +14,12 @@ tasks: maxAttempt: 5 maxDuration: PT1M warningOnRetry: false + errors: - id: will_never_happen type: io.kestra.plugin.core.debug.Return format: This will never be executed as retries will fix the issue + extend: title: Retry a failing task up to 4 times (allowing up to 5 attempts with up to 4 retries) diff --git a/run-airflow-dag-from-kestra.yaml 
b/run-airflow-dag-from-kestra.yaml index 1c8d54b..fb77736 100644 --- a/run-airflow-dag-from-kestra.yaml +++ b/run-airflow-dag-from-kestra.yaml @@ -1,5 +1,6 @@ id: run-airflow-dag-from-kestra namespace: company.team + tasks: - id: run_dag type: io.kestra.plugin.airflow.dags.TriggerDagRun @@ -17,6 +18,7 @@ tasks: flow: "{{ flow.id }}" task: "{{ task.id }}" execution: "{{ execution.id }}" + extend: title: Trigger an Apache Airflow DAG run from Kestra and wait for its completion description: >- @@ -24,11 +26,9 @@ extend: waits for its completion. Under the hood, the plugin uses the Airflow REST API to trigger the DAG run and check its status. - The `conf` field in the request body can be used to pass extra metadata about the execution that triggered the Airflow DAG run. - The flow can be useful for users migrating to Kestra from Airflow or orchestrating workflows across both platforms. tags: diff --git a/run-tasks-on-databricks.yaml b/run-tasks-on-databricks.yaml index 5251c81..b0e7feb 100644 --- a/run-tasks-on-databricks.yaml +++ b/run-tasks-on-databricks.yaml @@ -1,5 +1,6 @@ id: run-tasks-on-databricks namespace: company.team + tasks: - id: submit_run type: io.kestra.plugin.databricks.job.SubmitRun @@ -13,9 +14,11 @@ tasks: pythonFile: /Shared/hello.py sparkPythonTaskSource: WORKSPACE waitForCompletion: PT5M + - id: log_status type: io.kestra.plugin.core.log.Log message: The job finished, all done! + extend: title: Execute a Spark or Python script on an existing Databricks cluster and wait for its completion diff --git a/s3-map-over-objects.yaml b/s3-map-over-objects.yaml index 0bd482d..5530e66 100644 --- a/s3-map-over-objects.yaml +++ b/s3-map-over-objects.yaml @@ -1,9 +1,11 @@ id: s3-map-over-objects namespace: company.team + inputs: - id: bucket type: STRING defaults: declarative-data-orchestration + tasks: - id: list_objects type: io.kestra.plugin.aws.s3.List @@ -12,24 +14,25 @@ tasks: accessKeyId: "{{ secret('AWS_ACCESS_KEY_ID') }}" secretKeyId: "{{ secret('AWS_SECRET_ACCESS_KEY') }}" region: "{{ secret('AWS_DEFAULT_REGION') }}" + - id: print_objects type: io.kestra.plugin.core.log.Log - message: found objects {{ outputs.list_objects.objects }} + message: "Found objects {{ outputs.list_objects.objects }}" + - id: map_over_s3_objects type: io.kestra.plugin.core.flow.EachParallel value: "{{ outputs.list_objects.objects }}" tasks: - id: filename type: io.kestra.plugin.core.log.Log - message: filename {{ json(taskrun.value).key }} with size {{ - json(taskrun.value).size }} + message: "Filename {{ json(taskrun.value).key }} with size {{ json(taskrun.value).size }}" + extend: title: List objects in an S3 bucket and process them in parallel description: > This flow lists objects with a specific prefix in an S3 bucket and then processes each object in parallel. - This flow assumes AWS credentials stored as secrets `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY` and `AWS_DEFAULT_REGION`. 
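If an iteration needs the object contents rather than just the metadata, it could download its object before processing. Here is a sketch assuming the `io.kestra.plugin.aws.s3.Download` task (verify its exact properties in the plugin docs):

```yaml
  - id: map_over_s3_objects
    type: io.kestra.plugin.core.flow.EachParallel
    value: "{{ outputs.list_objects.objects }}"
    tasks:
      - id: download
        type: io.kestra.plugin.aws.s3.Download
        bucket: "{{ inputs.bucket }}"
        key: "{{ json(taskrun.value).key }}"
        accessKeyId: "{{ secret('AWS_ACCESS_KEY_ID') }}"
        secretKeyId: "{{ secret('AWS_SECRET_ACCESS_KEY') }}"
        region: "{{ secret('AWS_DEFAULT_REGION') }}"

      - id: log_details
        type: io.kestra.plugin.core.log.Log
        message: "Downloaded {{ json(taskrun.value).key }}, size {{ json(taskrun.value).size }} bytes"
```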
tags: diff --git a/s3-parallel-uploads.yaml b/s3-parallel-uploads.yaml index cea6e09..bc63c06 100644 --- a/s3-parallel-uploads.yaml +++ b/s3-parallel-uploads.yaml @@ -1,17 +1,21 @@ id: s3-parallel-uploads namespace: company.team + inputs: - id: bucket type: STRING defaults: declarative-data-orchestration + tasks: - id: get_zip_file type: io.kestra.plugin.core.http.Download uri: https://wri-dataportal-prod.s3.amazonaws.com/manual/global_power_plant_database_v_1_3.zip + - id: unzip type: io.kestra.plugin.compress.ArchiveDecompress algorithm: ZIP from: "{{ outputs.get_zip_file.uri }}" + - id: parallel_upload_to_s3 type: io.kestra.plugin.core.flow.Parallel tasks: @@ -19,16 +23,17 @@ tasks: type: io.kestra.plugin.aws.s3.Upload from: "{{ outputs.unzip.files['global_power_plant_database.csv'] }}" key: powerplant/global_power_plant_database.csv + - id: pdf type: io.kestra.plugin.aws.s3.Upload - from: "{{ - outputs.unzip.files['Estimating_Power_Plant_Generation_in_the_Global_P\ - ower_Plant_Database.pdf'] }}" + from: "{{ outputs.unzip.files['Estimating_Power_Plant_Generation_in_the_Global_Power_Plant_Database.pdf'] }}" key: powerplant/Estimating_Power_Plant_Generation_in_the_Global_Power_Plant_Database.pdf + - id: txt type: io.kestra.plugin.aws.s3.Upload from: "{{ outputs.unzip.files['RELEASE_NOTES.txt'] }}" key: powerplant/RELEASE_NOTES.txt + pluginDefaults: - type: io.kestra.plugin.aws.s3.Upload values: @@ -36,19 +41,17 @@ pluginDefaults: secretKeyId: "{{ secret('AWS_SECRET_ACCESS_KEY') }}" region: "{{ secret('AWS_DEFAULT_REGION') }}" bucket: "{{ inputs.bucket }}" + extend: title: Download a zip file, unzip it and upload all files in parallel to AWS S3 - using pluginDefaults to avoid boilerplate code description: >- - This flow downloads a zip file, unzips it, and uploads all files to S3 in - parallel. - + This flow downloads a zip file, unzips it, and uploads all files to S3 in parallel. This flow assumes AWS credentials stored as secrets `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY` and `AWS_DEFAULT_REGION` configured using `pluginDefaults` to avoid boilerplate configuration. - The flow does not create the S3 bucket, and assumes that the bucket name provided in the inputs already exists in the `AWS_DEFAULT_REGION`. tags: diff --git a/s3-trigger-duckdb.yaml b/s3-trigger-duckdb.yaml index 35bc54a..799cc62 100644 --- a/s3-trigger-duckdb.yaml +++ b/s3-trigger-duckdb.yaml @@ -1,35 +1,32 @@ id: s3-trigger-duckdb namespace: company.team + variables: bucket: kestraio source_prefix: monthly_orders destination_prefix: stage_orders + tasks: - id: query type: io.kestra.plugin.jdbc.duckdb.Query description: Validate new file for anomalies sql: > INSTALL httpfs; - LOAD httpfs; - SET s3_region='{{ secret('AWS_DEFAULT_REGION') }}'; - SET s3_access_key_id='{{ secret('AWS_ACCESS_KEY_ID') }}'; - SET s3_secret_access_key='{{ secret('AWS_SECRET_ACCESS_KEY') }}'; - SELECT * - FROM read_csv_auto('s3://{{ vars.bucket }}/{{ vars.destination_prefix }}/{{ trigger.objects | jq('.[].key') | first }}') - WHERE price * quantity != total; - store: true + fetchType: STORE + - id: csv type: io.kestra.plugin.serdes.csv.IonToCsv description: Create CSV file from query results from: "{{ outputs.query.uri }}" + - id: if_anomalies_detected type: io.kestra.plugin.core.flow.If condition: "{{ outputs.query.size }}" @@ -58,6 +55,7 @@ tasks: Best regards,
Data Team + triggers: - id: poll_for_new_s3_files type: io.kestra.plugin.aws.s3.Trigger @@ -72,6 +70,7 @@ triggers: region: "{{ secret('AWS_DEFAULT_REGION') }}" accessKeyId: "{{ secret('AWS_ACCESS_KEY_ID') }}" secretKeyId: "{{ secret('AWS_SECRET_ACCESS_KEY') }}" + extend: title: Anomaly detection using DuckDB SQL query and S3 file event trigger, sending a CSV file attachment via email if anomalies are detected @@ -79,24 +78,20 @@ extend: This flow will be triggered any time a new file arrives in a given S3 `bucket` and `source_prefix` folder. - The flow will check for anomalies in the data from that file using a DuckDB query, and will move the file to the same S3 bucket below the `destination_prefix` folder. - If anomalies are detected, the flow will send an email to the recipients specified on the `to` property, and will send anomalous rows as a CSV file attachment in the same email. - If you use [MotherDuck](https://motherduck.com/), use Kestra Secret to store the [MotherDuck service token](https://motherduck.com/docs/authenticating-to-motherduck). Then, modify the `query` task as follows to point the task to your MotherDuck database: - ```yaml - id: query type: io.kestra.plugin.jdbc.duckdb.Query @@ -105,10 +100,9 @@ extend: SELECT * FROM read_csv_auto('s3://{{ vars.bucket }}/{{ vars.destination_prefix }}/{{ trigger.objects | jq('.[].key') | first }}') WHERE price * quantity != total; - store: true + fetchType: STORE url: "jdbc:duckdb:md:my_db?motherduck_token={{ secret('MOTHERDUCK_TOKEN') }}" ``` - tags: - S3 - Trigger diff --git a/s3-trigger-python.yaml b/s3-trigger-python.yaml index a099e99..8369a32 100644 --- a/s3-trigger-python.yaml +++ b/s3-trigger-python.yaml @@ -33,6 +33,7 @@ triggers: moveTo: key: archive/ maxKeys: 1 + extend: title: "Detect New Files in S3 and process them in Python" description: >- @@ -44,11 +45,9 @@ extend: The Python code will read the file as an `inputFile` called `input.csv` and processing it to generate a new file called `data.csv`. - It's recommended to set the `accessKeyId` and `secretKeyId` properties as secrets. - This flow assumes AWS credentials stored as secrets `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`. tags: diff --git a/s3-trigger.yaml b/s3-trigger.yaml index 279a751..465056f 100644 --- a/s3-trigger.yaml +++ b/s3-trigger.yaml @@ -1,5 +1,6 @@ id: s3-trigger namespace: company.team + tasks: - id: each type: io.kestra.plugin.core.flow.EachParallel @@ -8,6 +9,7 @@ tasks: type: io.kestra.plugin.core.debug.Return format: "{{ taskrun.value }}" value: "{{ trigger.objects | jq('.[].uri') }}" + triggers: - id: wait_for_s3_object type: io.kestra.plugin.aws.s3.Trigger @@ -21,6 +23,7 @@ triggers: region: "{{ secret('AWS_DEFAULT_REGION') }}" accessKeyId: "{{ secret('AWS_ACCESS_KEY_ID') }}" secretKeyId: "{{ secret('AWS_SECRET_ACCESS_KEY') }}" + extend: title: "AWS S3 Event Trigger " description: >- @@ -29,13 +32,9 @@ extend: internal storage and move the S3 objects to an `archive` folder (i.e. S3 object prefix with the name `archive`). - The `EachParallel` task will iterate over the objects and print their URIs. - - It's recommended to set the `accessKeyId` and `secretKeyId` properties as - secrets. - + It's recommended to set the `accessKeyId` and `secretKeyId` properties as secrets. This flow assumes AWS credentials stored as secrets `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY` and `AWS_DEFAULT_REGION`. 
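For reference, the part of the trigger that implements the move-to-archive behavior described above looks roughly like the sketch below. The `action: MOVE` value and the polling `interval` are assumptions based on the S3 trigger documentation; the bucket and prefix values are illustrative.

```yaml
triggers:
  - id: wait_for_s3_object
    type: io.kestra.plugin.aws.s3.Trigger
    interval: PT1M          # how often to poll for new objects
    bucket: declarative-data-orchestration
    prefix: input/          # only react to objects under this prefix
    action: MOVE
    moveTo:
      key: archive/         # processed objects end up under the archive prefix
    region: "{{ secret('AWS_DEFAULT_REGION') }}"
    accessKeyId: "{{ secret('AWS_ACCESS_KEY_ID') }}"
    secretKeyId: "{{ secret('AWS_SECRET_ACCESS_KEY') }}"
```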
diff --git a/scan-dynamodb-table.yaml b/scan-dynamodb-table.yaml index 7bceaeb..9480736 100644 --- a/scan-dynamodb-table.yaml +++ b/scan-dynamodb-table.yaml @@ -1,5 +1,6 @@ id: scan-dynamodb-table namespace: company.team + tasks: - id: extract_data type: io.kestra.plugin.aws.dynamodb.Scan @@ -8,25 +9,23 @@ tasks: region: "{{ secret('AWS_DEFAULT_REGION') }}" accessKeyId: "{{ secret('AWS_ACCESS_KEY_ID') }}" secretKeyId: "{{ secret('AWS_SECRET_ACCESS_KEY') }}" + - id: process_data type: io.kestra.plugin.scripts.shell.Commands taskRunner: type: io.kestra.plugin.core.runner.Process commands: - echo {{ outputs.extract_data.rows }} + extend: title: Extract and process data from DynamoDB description: >- This flow scans a DynamoDB table and outputs the extracted data as a JSON string. The subsequent task processes that data. - The `tableName` property must point to an already existing DynamoDB table. - - It's recommended to set the `accessKeyId` and `secretKeyId` properties as - secrets. - + It's recommended to set the `accessKeyId` and `secretKeyId` properties as secrets. This flow assumes AWS credentials stored as secrets `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY` and `AWS_DEFAULT_REGION`. diff --git a/send-email-with-attachment.yaml b/send-email-with-attachment.yaml index 47d6e49..0e24980 100644 --- a/send-email-with-attachment.yaml +++ b/send-email-with-attachment.yaml @@ -1,9 +1,11 @@ id: send-email-with-attachment namespace: company.team + tasks: - id: dataset1 type: io.kestra.plugin.core.http.Download uri: https://huggingface.co/datasets/kestra/datasets/raw/main/csv/products.csv + - id: send_email type: io.kestra.plugin.notifications.mail.MailSend from: onboardin@resend.dev @@ -17,6 +19,7 @@ tasks: - name: data.csv uri: "{{ outputs.dataset1.uri }}" htmlTextContent: Please find attached your dataset as a CSV filie + extend: title: Send an email and corresponding attachments with Resend description: >- diff --git a/send-sms.yaml b/send-sms.yaml index e39acd6..e8b53d7 100644 --- a/send-sms.yaml +++ b/send-sms.yaml @@ -1,9 +1,11 @@ id: send-sms namespace: company.team + inputs: - id: sms_text type: STRING defaults: Hello from Kestra and AWS SNS! + tasks: - id: send_sms type: io.kestra.plugin.aws.sns.Publish @@ -14,6 +16,7 @@ tasks: from: data: | {{ inputs.sms_text }} + extend: title: Send an SMS message using AWS SNS based on a runtime-specific input description: >- @@ -21,7 +24,6 @@ extend: number must be registered when creating an AWS SNS topic. You can override the SMS message text at runtime by leveraging the input argument `sms_text`. - This flow assumes AWS credentials stored as secrets `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY` and `AWS_DEFAULT_REGION`. 
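Because the message text is an input, other flows can reuse this one as a simple notification building block, for example by calling it as a subflow and overriding `sms_text`. The calling flow below is a hypothetical sketch:

```yaml
id: notify_on_demand
namespace: company.team

tasks:
  - id: send_alert_sms
    type: io.kestra.plugin.core.flow.Subflow
    namespace: company.team
    flowId: send-sms
    wait: true
    inputs:
      sms_text: "Nightly load finished, sent from {{ flow.id }}"
```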
tags: diff --git a/sengrid-notify-on-failure.yaml b/sengrid-notify-on-failure.yaml index caaff8c..4e3f90e 100644 --- a/sengrid-notify-on-failure.yaml +++ b/sengrid-notify-on-failure.yaml @@ -1,5 +1,6 @@ id: sengrid-notify-on-failure namespace: company.team + tasks: - id: send_email_notification type: io.kestra.plugin.notifications.sendgrid.SendGridMailExecution @@ -10,6 +11,7 @@ tasks: taskrun.startDate }}" sendgridApiKey: "{{ secret('SENDGRID_API_KEY') }}" executionId: "{{ trigger.executionId }}" + triggers: - id: on_failure type: io.kestra.plugin.core.trigger.Flow @@ -21,40 +23,31 @@ triggers: - type: io.kestra.plugin.core.condition.ExecutionNamespaceCondition namespace: company comparison: PREFIX + extend: title: Send an SendGrid email notification when a workflow fails description: >- This system flow will send an SendGrid email notification anytime a workflow in a `company` namespace (or any nested child namespace) fails. - Using this pattern, you can send email notifications for Kestra workflow execution failures alongside other notifications. - You can customize that system flow by modifying the task, adding more tasks to the flow or adjusting the trigger conditions. Read more about that - pattern in the [Administrator - Guide](https://kestra.io/docs/administrator-guide/monitoring). - - - Let's create a flow in the namespace with prefix `company` that will always - fail. + pattern in the [Administrator Guide](https://kestra.io/docs/administrator-guide/monitoring). + Let's create a flow in the namespace with prefix `company` that will always fail. ```yaml - id: failure_flow - namespace: company.team - tasks: - id: always_fails type: io.kestra.plugin.core.execution.Fail ``` - Whenever you run the `failure_flow`, it will trigger an execution of the `sengrid_notify_on_failure` flow. As a result, an email notification will be created using SendGrid so that prompt action can be taken. diff --git a/sensitive-data.yaml b/sensitive-data.yaml index 41db322..4932fca 100644 --- a/sensitive-data.yaml +++ b/sensitive-data.yaml @@ -1,9 +1,11 @@ id: sensitive-data namespace: company.team + tasks: - id: extract type: io.kestra.plugin.core.http.Download uri: https://huggingface.co/datasets/kestra/datasets/raw/main/csv/orders.csv + - id: transform type: io.kestra.plugin.jdbc.duckdb.Query inputFiles: @@ -24,6 +26,7 @@ tasks: DELIMITER ','); outputFiles: - "*.csv" + - id: load type: io.kestra.plugin.gcp.bigquery.Load from: "{{ outputs.transform.outputFiles.csv }}" @@ -34,12 +37,12 @@ tasks: autodetect: true csvOptions: fieldDelimiter: "," + extend: title: Extract data, mask sensitive columns using DuckDB and load it to BigQuery description: >- This flow has three tasks: `extract`, `transform` and `load`. - 1. The `extract` task here is a simple HTTP Download task, but you can replace it with any task or custom script that extracts data. @@ -49,13 +52,11 @@ extend: 3. The `load` task loads that extracted and transformed data to BigQuery. - If you use [MotherDuck](https://motherduck.com/), use Kestra Secret to store the [MotherDuck service token](https://motherduck.com/docs/authenticating-to-motherduck). Then, add the `url` property to point the task to your MotherDuck database. 
- ```yaml - id: transform type: io.kestra.plugin.jdbc.duckdb.Query diff --git a/sentry-alert.yaml b/sentry-alert.yaml index 4761d47..4c762d4 100644 --- a/sentry-alert.yaml +++ b/sentry-alert.yaml @@ -1,8 +1,10 @@ id: sentry-alert namespace: company.team + tasks: - id: fail type: io.kestra.plugin.core.execution.Fail + errors: - id: alert_on_failure type: io.kestra.plugin.notifications.sentry.SentryAlert @@ -21,31 +23,28 @@ errors: "Link": "http://localhost:8080/ui/executions/{{flow.namespace}}/{{flow.id}}/{{execution.id}}" } } + extend: title: Send an alert to Sentry when a flow fails description: >- This flow shows how to send an alert to Sentry when a flow fails. - The only required input is a DSN string value, which you can find when you go to your Sentry project settings and go to the section "Client Keys (DSN)". You can find more detailed description of how to find your DSN in the [following Sentry documentation](https://docs.sentry.io/product/sentry-basics/concepts/dsn-explainer/#where-to-find-your-dsn). - You can customize the alert `payload`, which is a JSON object, or you can skip it and use the default payload created by kestra. For more information about the payload, check the [Sentry Event Payloads documentation](https://develop.sentry.dev/sdk/event-payloads/). - The `event_id` is an optional payload attribute that you can use to override the default event ID. If you don't specify it (recommended), kestra will generate a random UUID. You can use this attribute to group events together, but note that this must be a UUID type. For more information, check the - [Sentry - documentation](https://docs.sentry.io/product/issues/grouping-and-fingerprints/). + [Sentry documentation](https://docs.sentry.io/product/issues/grouping-and-fingerprints/). tags: - Notifications ee: false diff --git a/shell-scripts.yaml b/shell-scripts.yaml index b0ad121..c6ce2b1 100644 --- a/shell-scripts.yaml +++ b/shell-scripts.yaml @@ -1,5 +1,6 @@ id: shell-scripts namespace: company.team + tasks: - id: working_directory type: io.kestra.plugin.core.flow.WorkingDirectory @@ -15,12 +16,14 @@ tasks: do echo "$i,$RANDOM,$RANDOM" >> file.csv done + - id: inspect_file type: io.kestra.plugin.scripts.shell.Commands taskRunner: type: io.kestra.plugin.core.runner.Process commands: - cat file.csv + - id: filter_file type: io.kestra.plugin.scripts.shell.Commands description: select only the first five rows of the second column @@ -28,6 +31,7 @@ tasks: type: io.kestra.plugin.core.runner.Process commands: - cut -d ',' -f 2 file.csv | head -n 6 + extend: title: Run Shell Scripts and Shell commands in a working directory using a Process Task Runner diff --git a/slack-failure-alert.yaml b/slack-failure-alert.yaml index 5f18e57..179b559 100644 --- a/slack-failure-alert.yaml +++ b/slack-failure-alert.yaml @@ -1,11 +1,13 @@ id: slack-failure-alert namespace: company.monitoring + tasks: - id: send type: io.kestra.plugin.notifications.slack.SlackExecution url: "{{ secret('SLACK_WEBHOOK') }}" channel: "#general" executionId: "{{ trigger.executionId }}" + triggers: - id: listen type: io.kestra.plugin.core.trigger.Flow @@ -17,6 +19,7 @@ triggers: - type: io.kestra.plugin.core.condition.ExecutionNamespaceCondition namespace: company.analytics prefix: true + extend: title: Failure notifications to Slack to monitor the health of production workflows description: >- @@ -24,7 +27,6 @@ extend: namespace finishes with errors or warnings. Thanks to the `executionId` variable, the alert includes a link to the failed flow's execution page. 
- Given that this flow runs on a Flow trigger, there is no need for boilerplate code to define alert logic in each flow separately. Instead, the Flow trigger allows you to define that logic only once. The trigger will @@ -32,7 +34,6 @@ extend: namespace, including all child namespaces, and will automatically send Slack messages on failure. - This flow assumes that you stored the Slack webhook URL as a secret. tags: - Notifications diff --git a/slack-incoming-webhook.yaml b/slack-incoming-webhook.yaml index 59ee115..27b9814 100644 --- a/slack-incoming-webhook.yaml +++ b/slack-incoming-webhook.yaml @@ -1,5 +1,6 @@ id: slack-incoming-webhook namespace: company.team + tasks: - id: slack type: io.kestra.plugin.notifications.slack.SlackIncomingWebhook @@ -9,10 +10,10 @@ tasks: "channel": "#alerts", "text": "Flow {{ flow.namespace }}.{{ flow.id }} started with execution {{ execution.id }}" } + extend: title: Send a Slack message via incoming webhook - description: Send messages through [Slack Incoming - Webhook](https://api.slack.com/messaging/webhooks). + description: Send messages through [Slack Incoming Webhook](https://api.slack.com/messaging/webhooks). tags: - Notifications - Software Engineering diff --git a/snowflake-query.yaml b/snowflake-query.yaml index b4cf9f1..dc5b929 100644 --- a/snowflake-query.yaml +++ b/snowflake-query.yaml @@ -1,23 +1,24 @@ id: snowflake-query namespace: company.team + tasks: - id: query type: io.kestra.plugin.jdbc.snowflake.Query url: jdbc:snowflake://accountID.snowflakecomputing.com?warehouse=COMPUTE_WH username: yourSnowflakeUser password: "{{ secret('SNOWFLAKE_PASSWORD') }}" - fetchOne: true + fetchType: FETCH_ONE sql: | SELECT * FROM SNOWFLAKE_SAMPLE_DATA.TPCH_SF1.CUSTOMER + extend: title: Query data in Snowflake description: >- - This flow runs a query within a Snowflake data warehouse. The `fetchOne` - property will retrieve only the first row, while using the `fetch` property - will retrieve all rows. Setting `store: true` will provide the results as a + This flow runs a query within a Snowflake data warehouse. The `fetchType` + property value `FETCH_ONE` will retrieve only the first row, while using the `FETCH` value + will retrieve all rows. Setting `fetchType: STORE` will provide the results as a downloadable file. - The flow assumes the password is stored as a secret. 
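For reference, here is a minimal sketch of pairing `fetchType: STORE` with the `IonToCsv` task to turn the stored query result into a downloadable CSV file (the task ids `query_store` and `csv_report` are illustrative; the task types and properties mirror the other Snowflake blueprints in this document):

```yaml
  - id: query_store
    type: io.kestra.plugin.jdbc.snowflake.Query
    url: jdbc:snowflake://accountID.snowflakecomputing.com?warehouse=COMPUTE_WH
    username: yourSnowflakeUser
    password: "{{ secret('SNOWFLAKE_PASSWORD') }}"
    fetchType: STORE
    sql: |
      SELECT * FROM SNOWFLAKE_SAMPLE_DATA.TPCH_SF1.CUSTOMER

  - id: csv_report
    type: io.kestra.plugin.serdes.csv.IonToCsv
    from: "{{ outputs.query_store.uri }}"
```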
tags: - Snowflake diff --git a/snowflake.yaml b/snowflake.yaml index 783122c..d531aa4 100644 --- a/snowflake.yaml +++ b/snowflake.yaml @@ -1,9 +1,11 @@ id: snowflake namespace: company.team + tasks: - id: create_database type: io.kestra.plugin.jdbc.snowflake.Query sql: CREATE OR REPLACE DATABASE kestra; + - id: create_table type: io.kestra.plugin.jdbc.snowflake.Query sql: | @@ -15,9 +17,11 @@ tasks: city STRING , start_date DATE ); + - id: extract type: io.kestra.plugin.core.http.Download uri: https://huggingface.co/datasets/kestra/datasets/raw/main/employees/employees00.csv + - id: load_to_internal_stage type: io.kestra.plugin.jdbc.snowflake.Upload from: "{{ outputs.extract.uri }}" @@ -25,46 +29,43 @@ tasks: prefix: raw stageName: "@kestra.public.%employees" compress: true + - id: load_from_stage_to_table type: io.kestra.plugin.jdbc.snowflake.Query sql: > COPY INTO KESTRA.PUBLIC.EMPLOYEES - FROM @kestra.public.%employees - - FILE_FORMAT = (type = csv field_optionally_enclosed_by='"' skip_header = - 1) - + FILE_FORMAT = (type = csv field_optionally_enclosed_by='"' skip_header = 1) PATTERN = '.*employees0[0-9].csv.gz' - ON_ERROR = 'skip_file'; + - id: analyze type: io.kestra.plugin.jdbc.snowflake.Query description: Growth of new hires per month sql: > - SELECT year(START_DATE) as year, monthname(START_DATE) as month, count(*) - as nr_employees - + SELECT year(START_DATE) as year, monthname(START_DATE) as month, count(*) as nr_employees FROM kestra.public.EMPLOYEES - GROUP BY year(START_DATE), monthname(START_DATE) - ORDER BY nr_employees desc; - store: true + fetchType: STORE + - id: csv_report type: io.kestra.plugin.serdes.csv.IonToCsv from: "{{ outputs.analyze.uri }}" + pluginDefaults: - type: io.kestra.plugin.jdbc.snowflake.Query values: url: jdbc:snowflake://accountID.snowflakecomputing.com?warehouse=COMPUTE_WH username: yourSnowflakeUser password: "{{ secret('SNOWFLAKE_PASSWORD') }}" + - type: io.kestra.plugin.jdbc.snowflake.Upload values: url: jdbc:snowflake://accountID.snowflakecomputing.com?warehouse=COMPUTE_WH username: yourSnowflakeUser password: "{{ secret('SNOWFLAKE_PASSWORD') }}" + extend: title: "Snowflake ETL: load files to internal stage, copy from stage to a table and run analytical SQL queries " @@ -73,10 +74,8 @@ extend: and a table. It then extracts data from an external source, and loads that data as a CSV file into Snowflake's internal stage. - The CSV file uploaded to stage is then loaded into the table. - Finally, we do some analytics by aggregating (imaginary) new hires at Kestra over time. 
The final result is fetched into Kestra's internal storage and converted to a CSV file that you can download from the Outputs tab on diff --git a/sqlmesh.yaml b/sqlmesh.yaml index a2ee1a8..6beeb30 100644 --- a/sqlmesh.yaml +++ b/sqlmesh.yaml @@ -1,6 +1,7 @@ id: sqlmesh namespace: company.team description: Clone a SQLMesh project, run it, and query the results with DuckDB + tasks: - id: working_dir type: io.kestra.plugin.core.flow.WorkingDirectory @@ -9,6 +10,7 @@ tasks: type: io.kestra.plugin.git.Clone url: https://github.com/TobikoData/sqlmesh-examples.git branch: main + - id: sqlmesh type: io.kestra.plugin.sqlmesh.cli.SQLMeshCLI beforeCommands: @@ -17,16 +19,16 @@ tasks: - sqlmesh plan --auto-apply outputFiles: - 001_sushi/1_simple/db/sushi-example.db + - id: query type: io.kestra.plugin.jdbc.duckdb.Query inputFiles: - data.db: "{{ - outputs.sqlmesh.outputFiles['001_sushi/1_simple/db/sushi-example.db'] - }}" + data.db: "{{ outputs.sqlmesh.outputFiles['001_sushi/1_simple/db/sushi-example.db'] }}" sql: | ATTACH '{{ workingDir }}/data.db'; SELECT * FROM sushisimple.top_waiters; - store: true + fetchType: STORE + extend: title: Orchestrate SQLMesh and DuckDB engine description: This blueprint shows how you can pull a SQLMesh project from a Git diff --git a/sqs-publish-message.yaml b/sqs-publish-message.yaml index a4b7f00..790c8aa 100644 --- a/sqs-publish-message.yaml +++ b/sqs-publish-message.yaml @@ -1,9 +1,11 @@ id: sqs-publish-message namespace: company.team + inputs: - id: message type: STRING defaults: Hi from Kestra! + tasks: - id: publish_message type: io.kestra.plugin.aws.sqs.Publish @@ -13,13 +15,13 @@ tasks: queueUrl: https://sqs.eu-central-1.amazonaws.com/123456789/kestra from: data: "{{ inputs.message }}" + extend: title: Publish a message to an SQS queue description: >- This flow publishes a message to an SQS queue. The queue URL points to an already existing queue. - This flow assumes AWS credentials stored as secrets `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY` and `AWS_DEFAULT_REGION`. tags: diff --git a/sqs-realtime-trigger.yaml b/sqs-realtime-trigger.yaml index 7e63ca4..373d7e8 100644 --- a/sqs-realtime-trigger.yaml +++ b/sqs-realtime-trigger.yaml @@ -1,5 +1,6 @@ id: sqs-realtime-trigger namespace: company.team + tasks: - id: insert_into_dynamoDB type: io.kestra.plugin.aws.dynamodb.PutItem @@ -15,6 +16,7 @@ tasks: price: "{{ trigger.data | jq('.price') | first }}" quantity: "{{ trigger.data | jq('.quantity') | first }}" total: "{{ trigger.data | jq('.total') | first }}" + triggers: - id: realtime_trigger type: io.kestra.plugin.aws.sqs.RealtimeTrigger @@ -23,41 +25,33 @@ triggers: region: eu-central-1 queueUrl: https://sqs.eu-central-1.amazonaws.com/000000000000/orders serdeType: JSON + extend: title: Use AWS SQS Realtime Trigger to push events into DynamoDB description: >- This flow will: - - 1. Get - [triggered](https://kestra.io/plugins/plugin-aws/triggers/io.kestra.plugin.aws.sqs.realtimetrigger) + 1. Get [triggered](https://kestra.io/plugins/plugin-aws/triggers/io.kestra.plugin.aws.sqs.realtimetrigger) every time an event lands in the AWS SQS queue 2. Push the data into a DynamoDB table - For this, create an SQS queue named `orders`. We will be producing JSON messages into the queue, generated from the [orders.csv](https://huggingface.co/datasets/kestra/datasets/raw/main/csv/orders.csv). 
One sample produced message can be: - ``` - {"order_id": "1", "customer_name": "Kelly Olsen", "customer_email": "jenniferschneider@example.com", "product_id": "20", "price": "166.89", "quantity": "1", "total": "166.89"} - ``` - Create an `orders` table in DynamoDB. - We get the AWS access key and secret key from the secrets `AWS_ACCESS_KEY_ID` and `AWS_SECRET_KEY_ID`. - When you produce a message onto the SQS queue, the flow will get triggered, and you can see that a corresponding new record gets inserted into the DynamoDB table. diff --git a/start-job-on-existing-cluster.yaml b/start-job-on-existing-cluster.yaml index 515efa9..53e8594 100644 --- a/start-job-on-existing-cluster.yaml +++ b/start-job-on-existing-cluster.yaml @@ -1,5 +1,6 @@ id: start-job-on-existing-cluster namespace: company.team + tasks: - id: create_job type: io.kestra.plugin.databricks.job.CreateJob @@ -13,9 +14,11 @@ tasks: pythonFile: /Shared/hello.py sparkPythonTaskSource: WORKSPACE waitForCompletion: PT5H + - id: log_status type: io.kestra.plugin.core.log.Log message: The job finished, all done! + extend: title: Create a Spark job on a Databricks cluster and wait for its completion description: This flow will start a job on an existing Databricks cluster. The diff --git a/subflow-for-each-value.yaml b/subflow-for-each-value.yaml index 447f4ba..020ca32 100644 --- a/subflow-for-each-value.yaml +++ b/subflow-for-each-value.yaml @@ -1,5 +1,6 @@ id: subflow-for-each-value namespace: company.team + tasks: - id: parallel type: io.kestra.plugin.core.flow.ForEach @@ -12,20 +13,17 @@ tasks: namespace: company.team inputs: my_input: "{{ taskrun.value }}" + extend: title: Run a subflow for each value in parallel and wait for their completion — recommended pattern to iterate over hundreds or thousands of list items description: >- First, create the following flow that we'll use as a parametrized subflow: - ```yaml - id: my_subflow - namespace: company.team - inputs: - id: my_input type: STRING @@ -37,17 +35,14 @@ extend: format: hi from {{ flow.id }} using input {{ inputs.my_input }} ``` - Then, create the flow `subflow_for_each_value`. - This flow will trigger multiple executions of the flow `my_subflow`. Each execution will be triggered in parallel using input from the list of values. In this example, you should see three executions of the subflow, one with the input user1, another with the input user2 and yet another execution with the input user3. - This pattern is particularly useful if the list of values you iterate over is large. 
As explained in the [Flow best practices documentation](https://kestra.io/docs/developer-guide/best-practice), it's diff --git a/surreal-db-slack.yaml b/surreal-db-slack.yaml index bab7f86..dd0586b 100644 --- a/surreal-db-slack.yaml +++ b/surreal-db-slack.yaml @@ -1,5 +1,6 @@ id: surreal-db-slack namespace: company.team + tasks: - id: company type: io.kestra.plugin.surrealdb.Query @@ -7,9 +8,11 @@ tasks: CREATE company SET name = 'Kestra', created_at = time::now() + - id: delete_anna type: io.kestra.plugin.surrealdb.Query query: DELETE author:anna; + - id: add_author_tbl type: io.kestra.plugin.surrealdb.Query disabled: true @@ -19,29 +22,30 @@ tasks: name.last = 'Geller', name.full = string::join(' ', name.first, name.last), admin = true + - id: fix_admin_permission type: io.kestra.plugin.surrealdb.Query query: UPDATE author:anna SET admin = false WHERE name.last = 'Geller'; + - id: create_article_tbl type: io.kestra.plugin.surrealdb.Query query: > CREATE article SET - created_at = time::now(), - author = author:anna, - title = 'Kestra 0.12 simplifies building modular, event-driven and containerized workflows', - company = (SELECT VALUE id FROM company WHERE name = 'Kestra' LIMIT 1)[0] + - id: query type: io.kestra.plugin.surrealdb.Query query: SELECT title FROM article; fetchType: FETCH_ONE + - id: log_query_results type: io.kestra.plugin.core.log.Log message: "{{ outputs.query.row }}" + - id: slack type: io.kestra.plugin.notifications.slack.SlackIncomingWebhook url: "{{ secret('SLACK_WEBHOOK') }}" @@ -50,6 +54,7 @@ tasks: "channel": "#general", "text": "{{ outputs.query.row.title }}" } + pluginDefaults: - type: io.kestra.plugin.surrealdb.Query values: @@ -58,6 +63,7 @@ pluginDefaults: namespace: test username: root password: root + extend: title: "CRUD operations in SurrealQL: run multiple SurrealDB queries and send the query results via Slack" @@ -66,7 +72,6 @@ extend: as well as insert, update and delete data. The flow parses the final query result and sends it in a Slack message. - This flow assumes that the Slack Incoming Webhook URL is stored as a secret named `SLACK_WEBHOOK`. tags: [] diff --git a/surreal-db.yaml b/surreal-db.yaml index 9a48880..1f2db89 100644 --- a/surreal-db.yaml +++ b/surreal-db.yaml @@ -1,5 +1,6 @@ id: surreal-db namespace: company.team + tasks: - id: article type: io.kestra.plugin.surrealdb.Query @@ -11,6 +12,7 @@ tasks: text = 'Donec eleifend, nunc vitae commodo accumsan, mauris est fringilla.', account = (SELECT VALUE id FROM account WHERE name = 'ACME Inc' LIMIT 1)[0] ; + - id: account type: io.kestra.plugin.surrealdb.Query query: | @@ -18,14 +20,17 @@ tasks: name = 'ACME Inc', created_at = time::now() ; + - id: query type: io.kestra.plugin.surrealdb.Query query: SELECT * FROM article, account; fetchType: STORE + - id: query_condition type: io.kestra.plugin.surrealdb.Query query: SELECT * FROM article WHERE author.age < 30 FETCH author, account; fetchType: STORE + pluginDefaults: - type: io.kestra.plugin.surrealdb.Query values: @@ -34,6 +39,7 @@ pluginDefaults: namespace: test username: root password: root + extend: title: Query SurrealDB and store the result as a downloadable artifact description: >- @@ -41,7 +47,6 @@ extend: storage. The result of the query can be used by other tasks in the same flow using the syntax `{{ outputs.query.uri }}`. - To install and run SurrealDB, follow the instructions in the [SurrealDB documentation](https://surrealdb.com/docs/introduction/start). 
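For example, here is a minimal sketch of a follow-up task that consumes `{{ outputs.query.uri }}` and converts the stored result into a CSV file (the task id `csv_export` is illustrative; the `IonToCsv` task type is the same one used by other blueprints in this document):

```yaml
  - id: csv_export
    type: io.kestra.plugin.serdes.csv.IonToCsv
    from: "{{ outputs.query.uri }}"
```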
tags: [] diff --git a/switch.yaml b/switch.yaml index 64f25b6..02bd6de 100644 --- a/switch.yaml +++ b/switch.yaml @@ -1,8 +1,10 @@ id: switch namespace: company.team + inputs: - id: string type: STRING + tasks: - id: switch type: io.kestra.plugin.core.flow.Switch @@ -20,6 +22,7 @@ tasks: - id: default type: io.kestra.plugin.core.debug.Return format: This is the default case + extend: title: Switch tasks depending on a specific value description: The `switch` task will drive the flow depending on the input value. diff --git a/sync-from-git.yaml b/sync-from-git.yaml index 68bc6e6..5efc13b 100644 --- a/sync-from-git.yaml +++ b/sync-from-git.yaml @@ -29,12 +29,12 @@ triggers: - id: every_full_hour type: io.kestra.plugin.core.trigger.Schedule cron: "*/15 * * * *" + extend: title: Sync code from Git at regular intervals description: >- This flow will sync code from Git every 15 minutes. - We will be using the SyncFlows and SyncNamespaceFiles tasks to sync flows and namespace files, respectively. tags: diff --git a/task-outputs.yaml b/task-outputs.yaml index 5b49267..5e65311 100644 --- a/task-outputs.yaml +++ b/task-outputs.yaml @@ -1,12 +1,15 @@ id: task-outputs namespace: company.team + tasks: - id: task1 type: io.kestra.plugin.core.debug.Return format: Hello + - id: task2 type: io.kestra.plugin.core.log.Log message: "{{ outputs.task1.value }} World!" + extend: title: Task outputs description: This blueprint shows how to pass outputs from one task to another. diff --git a/taxi-trip-data.yaml b/taxi-trip-data.yaml index 2b9e475..d43fc8b 100644 --- a/taxi-trip-data.yaml +++ b/taxi-trip-data.yaml @@ -1,11 +1,13 @@ id: taxi-trip-data namespace: company.team + tasks: - id: log type: io.kestra.plugin.core.log.Log message: running backfill for file https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{{ trigger.date ?? execution.startDate | date("yyyy-MM") }}.parquet + triggers: - id: schedule type: io.kestra.plugin.core.trigger.Schedule @@ -13,13 +15,13 @@ triggers: timezone: US/Eastern backfill: start: 2023-01-01T00:00:00Z + extend: title: Backfill a flow for the Yellow Taxi Trip dataset description: >- To backfill a flow, add a `start` date from which Kestra should backfill the past runs before running the regular schedule. - The expression `{{ trigger.date ?? execution.startDate | date("yyyy-MM") }}` will always give you the execution date regardless of whether you triggered the flow ad-hoc or whether it ran based on a schedule. 
The date function diff --git a/telegram-notify-on-failure.yaml b/telegram-notify-on-failure.yaml index ad8d625..0bcfe28 100644 --- a/telegram-notify-on-failure.yaml +++ b/telegram-notify-on-failure.yaml @@ -1,13 +1,14 @@ id: telegram-notify-on-failure namespace: company.team + tasks: - id: send_notification type: io.kestra.plugin.notifications.telegram.TelegramExecution token: "{{ secret('TELEGRAM_TOKEN') }}" channel: "2072728690" - payload: "Kestra Workflow Failure: {{ trigger.executionId }} has failed on {{ - taskrun.startDate }}" + payload: "Kestra Workflow Failure: {{ trigger.executionId }} has failed on {{ taskrun.startDate }}" executionId: "{{ trigger.executionId }}" + triggers: - id: on_failure type: io.kestra.plugin.core.trigger.Flow @@ -19,40 +20,31 @@ triggers: - type: io.kestra.plugin.core.condition.ExecutionNamespaceCondition namespace: company comparison: PREFIX + extend: title: Send a Telegram notification when a workflow fails description: >- This system flow will send a Telegram notification anytime a workflow in a `company` namespace (or any nested child namespace) fails. - Using this pattern, you can send Telegram notifications for Kestra workflow execution failures alongside other notifications. - You can customize that system flow by modifying the task, adding more tasks to the flow or adjusting the trigger conditions. Read more about that - pattern in the [Administrator - Guide](https://kestra.io/docs/administrator-guide/monitoring). - - - Let's create a flow in the namespace with prefix `company` that will always - fail. + pattern in the [Administrator Guide](https://kestra.io/docs/administrator-guide/monitoring). + Let's create a flow in the namespace with prefix `company` that will always fail. ```yaml - id: failure_flow - namespace: company.team - tasks: - id: always_fails type: io.kestra.plugin.core.execution.Fail ``` - Whenever you run the `failure_flow`, it will trigger an execution of the `telegram_notify_on_failure` flow. As a result, a Telegram notification will be sent so that prompt action can be taken. diff --git a/trigger-subflow.yaml b/trigger-subflow.yaml index f030346..b5925f4 100644 --- a/trigger-subflow.yaml +++ b/trigger-subflow.yaml @@ -1,9 +1,11 @@ id: trigger-subflow namespace: company.team + tasks: - id: task_a type: io.kestra.plugin.core.debug.Return format: "{{ task.id }} - flow_a" + - id: flow_b type: io.kestra.plugin.core.flow.Subflow description: This task triggers the flow `subflow` with corresponding inputs. @@ -11,6 +13,7 @@ tasks: flowId: subflow inputs: data: "{{ outputs.task_a.value }}" + extend: title: Trigger a subflow description: >- @@ -18,16 +21,12 @@ extend: great pattern to reuse common flows. Thanks to inputs, you can parameterize your flows and reuse them easily. - You can create this subflow first: - ``` - + ```yaml id: subflow - namespace: company.team - inputs: - id: data type: STRING @@ -38,7 +37,6 @@ extend: format: "{{ task.id }} - subflow - {{ inputs.data }}" ``` - The flow will trigger this subflow in the task `flow_b`. 
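Because the subflow is parameterized by its inputs, you can reuse it from any flow. As a minimal sketch (the task id `flow_c` and the input value are illustrative), a second call to the same subflow with a static input could look like this:

```yaml
  - id: flow_c
    type: io.kestra.plugin.core.flow.Subflow
    namespace: company.team
    flowId: subflow
    inputs:
      data: a static value passed to the subflow
```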
tags: - Trigger diff --git a/trino-query.yaml b/trino-query.yaml index 7380a1f..915500e 100644 --- a/trino-query.yaml +++ b/trino-query.yaml @@ -1,5 +1,6 @@ id: trino-query namespace: company.team + tasks: - id: analyze_orders type: io.kestra.plugin.jdbc.trino.Query @@ -10,24 +11,22 @@ tasks: from tpch.tiny.orders group by orderpriority order by orderpriority - store: true + fetchType: STORE + - id: csv_report type: io.kestra.plugin.serdes.csv.IonToCsv from: "{{ outputs.analyze_orders.uri }}" + extend: title: Generate a CSV file report from a SQL query using Trino description: >- This flow queries data using Trino SQL and generates a downloadable CSV report. - To test this integration, you can start Trino in a Docker container: - ```bash - docker run -d -p 8090:8080 --name trino trinodb/trino - ``` tags: - SQL diff --git a/twilio-notify-on-failure.yaml b/twilio-notify-on-failure.yaml index 110d3c5..54aff15 100644 --- a/twilio-notify-on-failure.yaml +++ b/twilio-notify-on-failure.yaml @@ -1,5 +1,6 @@ id: twilio-notify-on-failure namespace: company.team + tasks: - id: send_twilio_notification type: io.kestra.plugin.notifications.twilio.TwilioExecution @@ -10,6 +11,7 @@ tasks: authToken: "{{ secret('TWILIO_AUTH_TOKEN') }}" body: "Kestra Workflow Failure: {{ trigger.executionId }} has failed on {{ taskrun.startDate }}" + triggers: - id: on_failure type: io.kestra.plugin.core.trigger.Flow @@ -21,40 +23,31 @@ triggers: - type: io.kestra.plugin.core.condition.ExecutionNamespaceCondition namespace: company comparison: PREFIX + extend: title: Send Twilio notification when a workflow fails description: >- This system flow will send a notification anytime a workflow in a `company` namespace (or any nested child namespace) fails. - Using this pattern, you can get notification for Kestra workflow execution failures alongside other notifications. - You can customize that system flow by modifying the task, adding more tasks to the flow or adjusting the trigger conditions. Read more about that - pattern in the [Administrator - Guide](https://kestra.io/docs/administrator-guide/monitoring). - - - Let's create a flow in the namespace with prefix `company` that will always - fail. + pattern in the [Administrator Guide](https://kestra.io/docs/administrator-guide/monitoring). + Let's create a flow in the namespace with prefix `company` that will always fail. ```yaml - id: failure_flow - namespace: company.team - tasks: - id: always_fails type: io.kestra.plugin.core.execution.Fail ``` - Whenever you run the `failure_flow`, it will trigger an execution of the `twilio_notify_on_failure` flow. As a result, a notification will be sent using Twilio so that prompt action can be taken. 
diff --git a/unreliable-flow.yaml b/unreliable-flow.yaml index c99c65a..654f3b5 100644 --- a/unreliable-flow.yaml +++ b/unreliable-flow.yaml @@ -1,10 +1,12 @@ id: unreliable-flow namespace: company.team + tasks: - id: fail type: io.kestra.plugin.scripts.shell.Commands commands: - exit 1 + errors: - id: alert_on_failure type: io.kestra.plugin.notifications.zenduty.ZendutyAlert @@ -28,6 +30,7 @@ errors: } ] } + extend: title: Send an alert to Zenduty when a flow fails description: >- diff --git a/upload-file-to-s3.yaml b/upload-file-to-s3.yaml index b53f5e4..ec04ec5 100644 --- a/upload-file-to-s3.yaml +++ b/upload-file-to-s3.yaml @@ -1,16 +1,20 @@ id: upload-file-to-s3 namespace: company.team + inputs: - id: bucket type: STRING defaults: declarative-data-orchestration + - id: file_url type: STRING defaults: https://wri-dataportal-prod.s3.amazonaws.com/manual/global_power_plant_database_v_1_3.zip + tasks: - id: download_file type: io.kestra.plugin.core.http.Download uri: "{{ inputs.file_url }}" + - id: upload_to_s3 type: io.kestra.plugin.aws.s3.Upload from: "{{ outputs.download_file.uri }}" @@ -19,12 +23,12 @@ tasks: region: "{{ secret('AWS_DEFAULT_REGION') }}" accessKeyId: "{{ secret('AWS_ACCESS_KEY_ID') }}" secretKeyId: "{{ secret('AWS_SECRET_ACCESS_KEY') }}" + extend: title: Download a file and upload it to S3 description: > This flow downloads a single file and uploads it to an S3 bucket. - This flow assumes AWS credentials stored as secrets `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY` and `AWS_DEFAULT_REGION`. tags: diff --git a/upload-google-drive.yaml b/upload-google-drive.yaml index 0eb107a..dded72a 100644 --- a/upload-google-drive.yaml +++ b/upload-google-drive.yaml @@ -1,9 +1,11 @@ id: upload-google-drive namespace: company.team + tasks: - id: download type: io.kestra.plugin.core.http.Download uri: https://huggingface.co/datasets/kestra/datasets/raw/main/csv/orders.csv + - id: upload type: io.kestra.plugin.googleworkspace.drive.Upload from: "{{ outputs.download.uri }}" @@ -12,18 +14,17 @@ tasks: name: Orders contentType: text/csv mimeType: application/vnd.google-apps.spreadsheet + extend: title: Upload file to Google Drive description: >- In this blueprint, we upload a file to Google Drive. - > Note: The `parents` property here refers to an existing Google Drive directory. To be able to use Google Drive you will have to enable the API in your Google Cloud Platform project and share the folder with your service account email address. - Make sure to add the `GOOGLE_APPLICATION_CREDENTIALS` environment variable with a value of the path to the JSON keyfile (note this must be a path to a file, not the contents of the Service Account). diff --git a/upload-parquet-to-databricks.yaml b/upload-parquet-to-databricks.yaml index 6f2d960..46749f7 100644 --- a/upload-parquet-to-databricks.yaml +++ b/upload-parquet-to-databricks.yaml @@ -1,8 +1,10 @@ id: upload-parquet-to-databricks namespace: company.team + inputs: - id: my_file type: FILE + tasks: - id: upload_file type: io.kestra.plugin.databricks.dbfs.Upload @@ -11,6 +13,7 @@ tasks: host: "{{ secret('DATABRICKS_HOST') }}" from: "{{ inputs.my_file }}" to: /Shared/myFile.parquet + extend: title: Upload a Parquet file to Databricks description: This flow will upload a local Parquet file to Databricks File System (DBFS). 
diff --git a/upload-to-s3.yaml b/upload-to-s3.yaml index 8c5935c..720c941 100644 --- a/upload-to-s3.yaml +++ b/upload-to-s3.yaml @@ -1,27 +1,32 @@ id: upload-to-s3 namespace: company.team + inputs: - id: bucket type: STRING defaults: declarative-data-orchestration + tasks: - id: get_zip_file type: io.kestra.plugin.core.http.Download uri: https://wri-dataportal-prod.s3.amazonaws.com/manual/global_power_plant_database_v_1_3.zip + - id: unzip type: io.kestra.plugin.compress.ArchiveDecompress algorithm: ZIP - from: "{{outputs.get_zip_file.uri}}" + from: "{{ outputs.get_zip_file.uri }}" + - id: csv_upload type: io.kestra.plugin.aws.s3.Upload from: "{{ outputs.unzip.files['global_power_plant_database.csv'] }}" bucket: "{{ inputs.bucket }}" - key: powerplant/{{ trigger.date ?? execution.startDate | - date('yyyy_MM_dd__HH_mm_ss') }}.csv + key: powerplant/{{ trigger.date ?? execution.startDate | date('yyyy_MM_dd__HH_mm_ss') }}.csv + triggers: - id: hourly type: io.kestra.plugin.core.trigger.Schedule cron: "@hourly" + extend: title: Extract a CSV file via HTTP API and upload it to S3 by using scheduled date as a filename diff --git a/wdir-pandas-python-outputs.yaml b/wdir-pandas-python-outputs.yaml index ee95fba..c0b02fb 100644 --- a/wdir-pandas-python-outputs.yaml +++ b/wdir-pandas-python-outputs.yaml @@ -1,5 +1,6 @@ id: wdir-pandas-python-outputs namespace: company.team + tasks: - id: etl type: io.kestra.plugin.core.flow.WorkingDirectory @@ -20,6 +21,7 @@ tasks: df = pd.DataFrame(data) print(df.head()) df.to_csv("raw_data.csv", index=False) + - id: transform_and_load_csv type: io.kestra.plugin.scripts.python.Script warningOnStdErr: false @@ -34,6 +36,7 @@ tasks: df['Column4'] = df['Column2'] + df['Column3'] print(df.head()) df.to_csv("final.csv", index=False) + extend: title: Pandas ETL - passing data between Python script tasks running in separate containers @@ -41,12 +44,10 @@ extend: This flow demonstrates how to use the `WorkingDirectory` task to persist data between multiple Python script tasks running in separate containers. - The first task stores the data as a CSV file called "raw_data.csv". The second task loads the CSV file, transforms it and outputs the final CSV file to Kestra's internal storage. You can download that file from the Outputs - tab on the Execution's page. - + tab on the Execution's page. Kestra's internal storage allows you to use the output in other tasks in the flow, even if those tasks are processed in different containers. 
The final diff --git a/weaviate-csv.yaml b/weaviate-csv.yaml index ad72fd2..a557d78 100644 --- a/weaviate-csv.yaml +++ b/weaviate-csv.yaml @@ -1,21 +1,26 @@ id: weaviate-csv namespace: company.team + variables: host: https://demo-ito81rf6.weaviate.network secret: YOUR_WEAVIATE_API_KEY + tasks: - id: csv type: io.kestra.plugin.core.http.Download uri: https://huggingface.co/datasets/kestra/datasets/raw/main/csv/trivia_questions.csv + - id: csv_to_ion type: io.kestra.plugin.serdes.csv.CsvToIon from: "{{ outputs.csv.uri }}" + - id: batch_load type: io.kestra.plugin.weaviate.BatchCreate url: "{{ vars.host }}" apiKey: "{{ vars.secret }}" className: QuestionsCsv objects: "{{ outputs.csv_to_ion.uri }}" + - id: query type: io.kestra.plugin.weaviate.Query url: "{{ vars.host }}" @@ -30,6 +35,7 @@ tasks: } } } + extend: title: Extract data from a CSV file, load it in batch to Weaviate and query it with GraphQL @@ -37,14 +43,12 @@ extend: This flow shows how to extract data from a CSV file using the HTTP API, load it to a Weaviate cluster and query it with GraphQL. - This flow assumes that you have a [Weaviate cluster](https://console.weaviate.cloud/) running, and that you created an API key. Make sure to replace the `url` and `apiKey` values in the tasks with your Weaviate credentials. It's recommended to use Secrets to store your API key. - Once you've configured the Weaviate secret, you can reproduce this flow without any changes. It will load the data from the Kaggle Jeopardy dataset to Weaviate, and then query it with GraphQL. diff --git a/weaviate-load-and-query.yaml b/weaviate-load-and-query.yaml index 9c61fb2..b56289c 100644 --- a/weaviate-load-and-query.yaml +++ b/weaviate-load-and-query.yaml @@ -1,19 +1,23 @@ id: weaviate-load-and-query namespace: company.team + tasks: - id: json type: io.kestra.plugin.core.http.Download uri: https://raw.githubusercontent.com/weaviate-tutorials/quickstart/main/data/jeopardy_tiny.json + - id: json_to_ion type: io.kestra.plugin.serdes.json.JsonToIon from: "{{ outputs.json.uri }}" newLine: false + - id: batch_load type: io.kestra.plugin.weaviate.BatchCreate url: https://demo-oczq9ryw.weaviate.network apiKey: "{{ secret('WEAVIATE_API_KEY') }}" className: Questions objects: "{{ outputs.json_to_ion.uri }}" + - id: batch_load_map type: io.kestra.plugin.weaviate.BatchCreate url: demo-oczq9ryw.weaviate.network @@ -32,6 +36,7 @@ tasks: - company: initech user: Bob Slydell city: Austin + - id: query_users type: io.kestra.plugin.weaviate.Query url: demo-oczq9ryw.weaviate.network @@ -46,6 +51,7 @@ tasks: } } } + - id: generative_search type: io.kestra.plugin.weaviate.Query disabled: true @@ -63,26 +69,24 @@ tasks: } } } + extend: title: Extract data from a REST API, load it to Weaviate and query it with GraphQL description: > This flow shows how to extract data from an HTTP API, load it to a Weaviate cluster and query it with GraphQL. - This flow assumes that you have a [Weaviate cluster](https://console.weaviate.cloud/) running, and that you created an API key. Make sure to replace the `url` and `apiKey` values in the tasks with your Weaviate credentials. It's recommended to use Secrets to store your API key. - Once you've configured the Weaviate secret, you can reproduce this flow without any changes. It will load the data from the [Jeopardy dataset](https://www.kaggle.com/tunguz/200000-jeopardy-questions) to Weaviate, and then query it with GraphQL. 
- You can ingest data to Weaviate from a Kestra flow using one of the following options: @@ -93,7 +97,6 @@ extend: is recommended when you want to load data from a previous task in the same flow, e.g. after extracting it from a database or a file. - The last task performing a [Generative Search](https://weaviate.io/developers/weaviate/starter-guides/generative#what-is-generative-search) is currently disabled, as it requires an OpenAI API key and following the diff --git a/whatsapp-notify-on-failure.yaml b/whatsapp-notify-on-failure.yaml index a1f9690..af6bd43 100644 --- a/whatsapp-notify-on-failure.yaml +++ b/whatsapp-notify-on-failure.yaml @@ -1,5 +1,6 @@ id: whatsapp-notify-on-failure namespace: company.team + tasks: - id: send_notification type: io.kestra.plugin.notifications.whatsapp.WhatsAppExecution @@ -12,6 +13,7 @@ tasks: payload: "Kestra Workflow Failure: {{ trigger.executionId }} has failed on {{ taskrun.startDate }}" executionId: "{{ trigger.executionId }}" + triggers: - id: on_failure type: io.kestra.plugin.core.trigger.Flow @@ -23,40 +25,31 @@ triggers: - type: io.kestra.plugin.core.condition.ExecutionNamespaceCondition namespace: company comparison: PREFIX + extend: title: Send a WhatsApp notification when a workflow fails description: >- This system flow will send a WhatsApp notification anytime a workflow in a `company` namespace (or any nested child namespace) fails. - Using this pattern, you can send WhatsApp notifications for Kestra workflow execution failures alongside other notifications. - You can customize that system flow by modifying the task, adding more tasks to the flow or adjusting the trigger conditions. Read more about that - pattern in the [Administrator - Guide](https://kestra.io/docs/administrator-guide/monitoring). - - - Let's create a flow in the namespace with prefix `company` that will always - fail. + pattern in the [Administrator Guide](https://kestra.io/docs/administrator-guide/monitoring). + Let's create a flow in the namespace with prefix `company` that will always fail. ```yaml - id: failure_flow - namespace: company.team - tasks: - id: always_fails type: io.kestra.plugin.core.execution.Fail ``` - Whenever you run the `failure_flow`, it will trigger an execution of the `whatsapp_notify_on_failure` flow. As a result, a WhatsApp notification will be sent so that prompt action can be taken. diff --git a/wikipedia-top10-python-pandas.yaml b/wikipedia-top10-python-pandas.yaml index 8abb23b..a2de1aa 100644 --- a/wikipedia-top10-python-pandas.yaml +++ b/wikipedia-top10-python-pandas.yaml @@ -1,24 +1,24 @@ id: wikipedia-top10-python-pandas namespace: company.team description: Analyze top 10 Wikipedia pages + tasks: - id: query type: io.kestra.plugin.gcp.bigquery.Query sql: > SELECT DATETIME(datehour) as date, title, views FROM `bigquery-public-data.wikipedia.pageviews_2024` - WHERE DATE(datehour) = current_date() and wiki = 'en' - ORDER BY datehour desc, views desc - LIMIT 10 store: true projectId: test-project serviceAccount: "{{ secret('GCP_SERVICE_ACCOUNT_JSON') }}" + - id: write_csv type: io.kestra.plugin.serdes.csv.IonToCsv from: "{{ outputs.query.uri }}" + - id: pandas type: io.kestra.plugin.scripts.python.Script warningOnStdErr: false @@ -35,39 +35,30 @@ tasks: df.head(10) views = df['views'].max() Kestra.outputs({'views': int(views)}) + extend: title: Use BigQuery and Python script running in Docker to analyze Wikipedia page views description: >- This flow will do the following: - 1. 
Use the `bigquery.Query` task to query the top 10 Wikipedia pages for the current day - 2. Use `IonToCsv` to store the results in a CSV file. - 3. Use the `python.Script` task to read the CSV file and use pandas to find the maximum number of views. - 4. Use Kestra `outputs` to track the maximum number of views over time. - The Python script will run in a Docker container based on the public image `ghcr.io/kestra-io/pydata:latest`. - The BigQuery task exposes (by default) a variety of **metrics** such as: - total.bytes.billed - - total.partitions.processed - - number of rows processed - - query duration - You can view those metrics on the Execution page in the Metrics tab. tags: - Python diff --git a/write-mongo.yaml b/write-mongo.yaml index 6c10802..d070262 100644 --- a/write-mongo.yaml +++ b/write-mongo.yaml @@ -1,5 +1,6 @@ id: write-mongo namespace: company.team + tasks: - id: write type: io.kestra.plugin.mongodb.InsertOne @@ -12,6 +13,7 @@ tasks: $oid: 60930c39a982931c20ef6cd6 name: John Doe city: Paris + extend: title: Write data in MongoDB description: This blueprint shows how to insert a document into a MongoDB database. diff --git a/zenduty-failure-alert.yaml b/zenduty-failure-alert.yaml index 3c56735..b435182 100644 --- a/zenduty-failure-alert.yaml +++ b/zenduty-failure-alert.yaml @@ -1,13 +1,14 @@ id: zenduty-failure-alert namespace: system + tasks: - id: send_alert type: io.kestra.plugin.notifications.zenduty.ZendutyExecution url: https://www.zenduty.com/api/events/{{ secret('ZENDUTY_INTEGRATION_KEY') }}/ executionId: "{{ trigger.executionId }}" - message: Kestra workflow execution {{ trigger.executionId }} of a flow {{ - trigger.flowId }} in the namespace {{ trigger.namespace }} changed status - to {{ trigger.state }} + message: Kestra workflow execution {{ trigger.executionId }} of a flow {{ trigger.flowId }} + in the namespace {{ trigger.namespace }} changed status to {{ trigger.state }} + triggers: - id: failed_prod_workflows type: io.kestra.plugin.core.trigger.Flow @@ -19,6 +20,7 @@ triggers: - type: io.kestra.plugin.core.condition.ExecutionNamespaceCondition namespace: company prefix: true + extend: title: Send an alert to Zenduty when any flow fails in the company namespace description: >- @@ -28,24 +30,17 @@ extend: API integration and generate the key. The API integration will send an API call that follows the format: - - ```bash - curl -X POST https://www.zenduty.com/api/events/[integration-key]/ -H 'Content-Type: application/json' -d '{"alert_type":"critical", "message":"Some message", "summary":"some summary", "entity_id":"some_entity_id"}' - ``` - - The `message` and `summary` parameters are required. The `alert_type` parameter is the severity of the issue, including `info`, `warning`, `error`, or `critical`. - This Kestra task abstracts away raw API calls and only requires the integration key, which you can store as a Secret. The default `alert_type` value is `error`. 
Visit the Zenduty [Events API diff --git a/zip-to-parquet.yaml b/zip-to-parquet.yaml index a9736d0..66a061a 100644 --- a/zip-to-parquet.yaml +++ b/zip-to-parquet.yaml @@ -1,16 +1,19 @@ id: zip-to-parquet namespace: company.team + variables: file_id: "{{ execution.startDate | dateAdd(-3, 'MONTHS') | date('yyyyMM') }}" + tasks: - id: get_zipfile type: io.kestra.plugin.core.http.Download - uri: https://divvy-tripdata.s3.amazonaws.com/{{ render(vars.file_id) - }}-divvy-tripdata.zip + uri: https://divvy-tripdata.s3.amazonaws.com/{{ render(vars.file_id) }}-divvy-tripdata.zip + - id: unzip type: io.kestra.plugin.compress.ArchiveDecompress algorithm: ZIP from: "{{ outputs.get_zipfile.uri }}" + - id: parquet_output type: io.kestra.plugin.scripts.python.Script warningOnStdErr: false @@ -31,6 +34,7 @@ tasks: df.to_parquet(f"{file_id}.parquet") outputFiles: - "*.parquet" + extend: title: Extract a zip file, decompress it, and convert CSV to parquet format in Python