diff --git a/acceptance/bundle/templates/lakeflow-pipelines/python/input.json b/acceptance/bundle/templates/lakeflow-pipelines/python/input.json new file mode 100644 index 0000000000..e3f799e1c7 --- /dev/null +++ b/acceptance/bundle/templates/lakeflow-pipelines/python/input.json @@ -0,0 +1,6 @@ +{ + "project_name": "my_lakeflow_pipelines", + "default_catalog": "main", + "personal_schemas": "yes", + "language": "python" +} diff --git a/acceptance/bundle/templates/lakeflow-pipelines/python/output.txt b/acceptance/bundle/templates/lakeflow-pipelines/python/output.txt new file mode 100644 index 0000000000..954a2a8409 --- /dev/null +++ b/acceptance/bundle/templates/lakeflow-pipelines/python/output.txt @@ -0,0 +1,29 @@ + +>>> [CLI] bundle init lakeflow-pipelines --config-file ./input.json --output-dir output + +Welcome to the template for Lakeflow Declarative Pipelines! + + +Your new project has been created in the 'my_lakeflow_pipelines' directory! + +Refer to the README.md file for "getting started" instructions! + +>>> [CLI] bundle validate -t dev +Name: my_lakeflow_pipelines +Target: dev +Workspace: + Host: [DATABRICKS_URL] + User: [USERNAME] + Path: /Workspace/Users/[USERNAME]/.bundle/my_lakeflow_pipelines/dev + +Validation OK! + +>>> [CLI] bundle validate -t prod +Name: my_lakeflow_pipelines +Target: prod +Workspace: + Host: [DATABRICKS_URL] + User: [USERNAME] + Path: /Workspace/Users/[USERNAME]/.bundle/my_lakeflow_pipelines/prod + +Validation OK! 
diff --git a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/.vscode/__builtins__.pyi b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/.vscode/__builtins__.pyi new file mode 100644 index 0000000000..0edd5181bc --- /dev/null +++ b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/.vscode/__builtins__.pyi @@ -0,0 +1,3 @@ +# Typings for Pylance in Visual Studio Code +# see https://github.com/microsoft/pyright/blob/main/docs/builtins.md +from databricks.sdk.runtime import * diff --git a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/.vscode/extensions.json b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/.vscode/extensions.json new file mode 100644 index 0000000000..5d15eba363 --- /dev/null +++ b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/.vscode/extensions.json @@ -0,0 +1,7 @@ +{ + "recommendations": [ + "databricks.databricks", + "ms-python.vscode-pylance", + "redhat.vscode-yaml" + ] +} diff --git a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/.vscode/settings.json b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/.vscode/settings.json new file mode 100644 index 0000000000..3e76d20bd8 --- /dev/null +++ b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/.vscode/settings.json @@ -0,0 +1,21 @@ +{ + "python.analysis.stubPath": ".vscode", + "databricks.python.envFile": "${workspaceFolder}/.env", + "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", + "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------", + "python.testing.pytestArgs": [ + "." 
+ ], + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true, + "python.analysis.extraPaths": ["resources/my_lakeflow_pipelines_pipeline"], + "files.exclude": { + "**/*.egg-info": true, + "**/__pycache__": true, + ".pytest_cache": true, + }, + "[python]": { + "editor.defaultFormatter": "ms-python.black-formatter", + "editor.formatOnSave": true, + }, +} diff --git a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/README.md b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/README.md new file mode 100644 index 0000000000..49d493b854 --- /dev/null +++ b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/README.md @@ -0,0 +1,41 @@ +# my_lakeflow_pipelines + +The 'my_lakeflow_pipelines' project was generated by using the Lakeflow Pipelines template. + +## Setup + +1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html + +2. Authenticate to your Databricks workspace, if you have not done so already: + ``` + $ databricks auth login + ``` + +3. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from + https://docs.databricks.com/dev-tools/vscode-ext.html. Or the PyCharm plugin from + https://www.databricks.com/blog/announcing-pycharm-integration-databricks. + + +## Deploying resources + +1. To deploy a development copy of this project, type: + ``` + $ databricks bundle deploy --target dev + ``` + (Note that "dev" is the default target, so the `--target` parameter + is optional here.) + +2. Similarly, to deploy a production copy, type: + ``` + $ databricks bundle deploy --target prod + ``` + +3. Use the "summary" comand to review everything that was deployed: + ``` + $ databricks bundle summary + ``` + +4. 
To run a job or pipeline, use the "run" command: + ``` + $ databricks bundle run + ``` diff --git a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/databricks.yml b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/databricks.yml new file mode 100644 index 0000000000..ded4a8470d --- /dev/null +++ b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/databricks.yml @@ -0,0 +1,47 @@ +# This is a Databricks asset bundle definition for my_lakeflow_pipelines. +# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. +bundle: + name: my_lakeflow_pipelines + uuid: [UUID] + +include: + - resources/*.yml + - resources/*/*.yml + +# Variable declarations. These variables are assigned in the dev/prod targets below. +variables: + catalog: + description: The catalog to use + schema: + description: The schema to use + notifications: + description: The email addresses to use for failure notifications + +targets: + dev: + # The default target uses 'mode: development' to create a development copy. + # - Deployed resources get prefixed with '[dev my_user_name]' + # - Any job schedules and triggers are paused by default. + # See also https://docs.databricks.com/dev-tools/bundles/deployment-modes.html. + mode: development + default: true + workspace: + host: [DATABRICKS_URL] + variables: + catalog: main + schema: ${workspace.current_user.short_name} + notifications: [] + + prod: + mode: production + workspace: + host: [DATABRICKS_URL] + # We explicitly deploy to /Workspace/Users/[USERNAME] to make sure we only have a single copy. 
+ root_path: /Workspace/Users/[USERNAME]/.bundle/${bundle.name}/${bundle.target} + permissions: + - user_name: [USERNAME] + level: CAN_MANAGE + variables: + catalog: main + schema: default + notifications: [[USERNAME]] diff --git a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/out.gitignore b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/out.gitignore new file mode 100644 index 0000000000..f6a3b5ff93 --- /dev/null +++ b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/out.gitignore @@ -0,0 +1,8 @@ +.databricks/ +build/ +dist/ +__pycache__/ +*.egg-info +.venv/ +**/explorations/** +**/!explorations/README.md diff --git a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/README.md b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/README.md new file mode 100644 index 0000000000..6caf95d48a --- /dev/null +++ b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/README.md @@ -0,0 +1,22 @@ +# my_lakeflow_pipelines_pipeline + +This folder defines all source code for the my_lakeflow_pipelines_pipeline pipeline: + +- `explorations`: Ad-hoc notebooks used to explore the data processed by this pipeline. +- `transformations`: All dataset definitions and transformations. +- `utilities` (optional): Utility functions and Python modules used in this pipeline. +- `data_sources` (optional): View definitions describing the source data for this pipeline. + +## Getting Started + +To get started, go to the `transformations` folder -- most of the relevant source code lives there: + +* By convention, every dataset under `transformations` is in a separate file. +* Take a look at the sample under "sample_trips_my_lakeflow_pipelines.py" to get familiar with the syntax. 
+ Read more about the syntax at https://docs.databricks.com/dlt/python-ref.html. +* Use `Run file` to run and preview a single transformation. +* Use `Run pipeline` to run _all_ transformations in the entire pipeline. +* Use `+ Add` in the file browser to add a new data set definition. +* Use `Schedule` to run the pipeline on a schedule! + +For more tutorials and reference material, see https://docs.databricks.com/dlt. diff --git a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/explorations/sample_exploration.ipynb b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/explorations/sample_exploration.ipynb new file mode 100644 index 0000000000..0187c0c95f --- /dev/null +++ b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/explorations/sample_exploration.ipynb @@ -0,0 +1,63 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "[UUID]", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### Example Exploratory Notebook\n", + "\n", + "Use this notebook to explore the data generated by the pipeline in your preferred programming language.\n", + "\n", + "**Note**: This notebook is not executed as part of the pipeline." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "[UUID]", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "# !!! Before performing any data analysis, make sure to run the pipeline to materialize the sample datasets. 
The tables referenced in this notebook depend on that step.\n", + "\n", + "display(spark.sql(\"SELECT * FROM main.[USERNAME].my_lakeflow_pipelines\"))" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "computePreferences": null, + "dashboards": [], + "environmentMetadata": null, + "inputWidgetPreferences": null, + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, + "notebookName": "sample_exploration", + "widgets": {} + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/my_lakeflow_pipelines.job.yml b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/my_lakeflow_pipelines.job.yml new file mode 100644 index 0000000000..f07a973780 --- /dev/null +++ b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/my_lakeflow_pipelines.job.yml @@ -0,0 +1,19 @@ +# The job that triggers my_lakeflow_pipelines_pipeline. 
+resources: + jobs: + my_lakeflow_pipelines_job: + name: my_lakeflow_pipelines_job + + trigger: + # Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger + periodic: + interval: 1 + unit: DAYS + + email_notifications: + on_failure: ${var.notifications} + + tasks: + - task_key: refresh_pipeline + pipeline_task: + pipeline_id: ${resources.pipelines.my_lakeflow_pipelines_pipeline.id} diff --git a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/my_lakeflow_pipelines.pipeline.yml b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/my_lakeflow_pipelines.pipeline.yml new file mode 100644 index 0000000000..499ddad0ca --- /dev/null +++ b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/my_lakeflow_pipelines.pipeline.yml @@ -0,0 +1,12 @@ +resources: + pipelines: + my_lakeflow_pipelines_pipeline: + name: my_lakeflow_pipelines_pipeline + serverless: true + channel: "PREVIEW" + catalog: ${var.catalog} + schema: ${var.schema} + root_path: "." 
+ libraries: + - glob: + include: transformations/** diff --git a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/transformations/sample_trips_my_lakeflow_pipelines.py b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/transformations/sample_trips_my_lakeflow_pipelines.py new file mode 100644 index 0000000000..804ac6ea25 --- /dev/null +++ b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/transformations/sample_trips_my_lakeflow_pipelines.py @@ -0,0 +1,16 @@ +import dlt +from pyspark.sql.functions import col +from utilities import utils + + +# This file defines a sample transformation. +# Edit the sample below or add new transformations +# using "+ Add" in the file browser. + + +@dlt.table +def sample_trips_my_lakeflow_pipelines(): + return ( + spark.read.table("samples.nyctaxi.trips") + .withColumn("trip_distance_km", utils.distance_km(col("trip_distance"))) + ) diff --git a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/transformations/sample_zones_my_lakeflow_pipelines.py b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/transformations/sample_zones_my_lakeflow_pipelines.py new file mode 100644 index 0000000000..4c2798d1b1 --- /dev/null +++ b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/transformations/sample_zones_my_lakeflow_pipelines.py @@ -0,0 +1,19 @@ +import dlt +from pyspark.sql.functions import col, sum + + +# This file defines a sample transformation. +# Edit the sample below or add new transformations +# using "+ Add" in the file browser. 
+ + +@dlt.table +def sample_zones_my_lakeflow_pipelines(): + # Read from the "sample_trips" table, then sum all the fares + return ( + spark.read.table("sample_trips_my_lakeflow_pipelines") + .groupBy(col("pickup_zip")) + .agg( + sum("fare_amount").alias("total_fare") + ) + ) diff --git a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/utilities/utils.py b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/utilities/utils.py new file mode 100644 index 0000000000..ff039898f0 --- /dev/null +++ b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/utilities/utils.py @@ -0,0 +1,8 @@ +from pyspark.sql.functions import udf +from pyspark.sql.types import FloatType + + +@udf(returnType=FloatType()) +def distance_km(distance_miles): + """Convert distance from miles to kilometers (1 mile = 1.60934 km).""" + return distance_miles * 1.60934 diff --git a/acceptance/bundle/templates/lakeflow-pipelines/python/script b/acceptance/bundle/templates/lakeflow-pipelines/python/script new file mode 100755 index 0000000000..90735aebef --- /dev/null +++ b/acceptance/bundle/templates/lakeflow-pipelines/python/script @@ -0,0 +1,9 @@ +trace $CLI bundle init lakeflow-pipelines --config-file ./input.json --output-dir output + +cd output/my_lakeflow_pipelines +trace $CLI bundle validate -t dev +trace $CLI bundle validate -t prod + +# Do not affect this repository's git behaviour #2318 +mv .gitignore out.gitignore +rm .databricks/.gitignore diff --git a/acceptance/bundle/templates/lakeflow-pipelines/sql/input.json b/acceptance/bundle/templates/lakeflow-pipelines/sql/input.json new file mode 100644 index 0000000000..36c5cb6995 --- /dev/null +++ b/acceptance/bundle/templates/lakeflow-pipelines/sql/input.json @@ -0,0 +1,6 @@ +{ + "project_name": "my_lakeflow_pipelines", + 
"default_catalog": "main", + "personal_schemas": "yes", + "language": "sql" +} diff --git a/acceptance/bundle/templates/lakeflow-pipelines/sql/output.txt b/acceptance/bundle/templates/lakeflow-pipelines/sql/output.txt new file mode 100644 index 0000000000..954a2a8409 --- /dev/null +++ b/acceptance/bundle/templates/lakeflow-pipelines/sql/output.txt @@ -0,0 +1,29 @@ + +>>> [CLI] bundle init lakeflow-pipelines --config-file ./input.json --output-dir output + +Welcome to the template for Lakeflow Declarative Pipelines! + + +Your new project has been created in the 'my_lakeflow_pipelines' directory! + +Refer to the README.md file for "getting started" instructions! + +>>> [CLI] bundle validate -t dev +Name: my_lakeflow_pipelines +Target: dev +Workspace: + Host: [DATABRICKS_URL] + User: [USERNAME] + Path: /Workspace/Users/[USERNAME]/.bundle/my_lakeflow_pipelines/dev + +Validation OK! + +>>> [CLI] bundle validate -t prod +Name: my_lakeflow_pipelines +Target: prod +Workspace: + Host: [DATABRICKS_URL] + User: [USERNAME] + Path: /Workspace/Users/[USERNAME]/.bundle/my_lakeflow_pipelines/prod + +Validation OK! 
diff --git a/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/.vscode/__builtins__.pyi b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/.vscode/__builtins__.pyi new file mode 100644 index 0000000000..0edd5181bc --- /dev/null +++ b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/.vscode/__builtins__.pyi @@ -0,0 +1,3 @@ +# Typings for Pylance in Visual Studio Code +# see https://github.com/microsoft/pyright/blob/main/docs/builtins.md +from databricks.sdk.runtime import * diff --git a/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/.vscode/extensions.json b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/.vscode/extensions.json new file mode 100644 index 0000000000..5d15eba363 --- /dev/null +++ b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/.vscode/extensions.json @@ -0,0 +1,7 @@ +{ + "recommendations": [ + "databricks.databricks", + "ms-python.vscode-pylance", + "redhat.vscode-yaml" + ] +} diff --git a/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/.vscode/settings.json b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/.vscode/settings.json new file mode 100644 index 0000000000..3e76d20bd8 --- /dev/null +++ b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/.vscode/settings.json @@ -0,0 +1,21 @@ +{ + "python.analysis.stubPath": ".vscode", + "databricks.python.envFile": "${workspaceFolder}/.env", + "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", + "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------", + "python.testing.pytestArgs": [ + "." 
+ ], + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true, + "python.analysis.extraPaths": ["resources/my_lakeflow_pipelines_pipeline"], + "files.exclude": { + "**/*.egg-info": true, + "**/__pycache__": true, + ".pytest_cache": true, + }, + "[python]": { + "editor.defaultFormatter": "ms-python.black-formatter", + "editor.formatOnSave": true, + }, +} diff --git a/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/README.md b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/README.md new file mode 100644 index 0000000000..49d493b854 --- /dev/null +++ b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/README.md @@ -0,0 +1,41 @@ +# my_lakeflow_pipelines + +The 'my_lakeflow_pipelines' project was generated by using the Lakeflow Pipelines template. + +## Setup + +1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html + +2. Authenticate to your Databricks workspace, if you have not done so already: + ``` + $ databricks auth login + ``` + +3. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from + https://docs.databricks.com/dev-tools/vscode-ext.html. Or the PyCharm plugin from + https://www.databricks.com/blog/announcing-pycharm-integration-databricks. + + +## Deploying resources + +1. To deploy a development copy of this project, type: + ``` + $ databricks bundle deploy --target dev + ``` + (Note that "dev" is the default target, so the `--target` parameter + is optional here.) + +2. Similarly, to deploy a production copy, type: + ``` + $ databricks bundle deploy --target prod + ``` + +3. Use the "summary" comand to review everything that was deployed: + ``` + $ databricks bundle summary + ``` + +4. 
To run a job or pipeline, use the "run" command: + ``` + $ databricks bundle run + ``` diff --git a/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/databricks.yml b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/databricks.yml new file mode 100644 index 0000000000..ded4a8470d --- /dev/null +++ b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/databricks.yml @@ -0,0 +1,47 @@ +# This is a Databricks asset bundle definition for my_lakeflow_pipelines. +# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. +bundle: + name: my_lakeflow_pipelines + uuid: [UUID] + +include: + - resources/*.yml + - resources/*/*.yml + +# Variable declarations. These variables are assigned in the dev/prod targets below. +variables: + catalog: + description: The catalog to use + schema: + description: The schema to use + notifications: + description: The email addresses to use for failure notifications + +targets: + dev: + # The default target uses 'mode: development' to create a development copy. + # - Deployed resources get prefixed with '[dev my_user_name]' + # - Any job schedules and triggers are paused by default. + # See also https://docs.databricks.com/dev-tools/bundles/deployment-modes.html. + mode: development + default: true + workspace: + host: [DATABRICKS_URL] + variables: + catalog: main + schema: ${workspace.current_user.short_name} + notifications: [] + + prod: + mode: production + workspace: + host: [DATABRICKS_URL] + # We explicitly deploy to /Workspace/Users/[USERNAME] to make sure we only have a single copy. 
+ root_path: /Workspace/Users/[USERNAME]/.bundle/${bundle.name}/${bundle.target} + permissions: + - user_name: [USERNAME] + level: CAN_MANAGE + variables: + catalog: main + schema: default + notifications: [[USERNAME]] diff --git a/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/out.gitignore b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/out.gitignore new file mode 100644 index 0000000000..f6a3b5ff93 --- /dev/null +++ b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/out.gitignore @@ -0,0 +1,8 @@ +.databricks/ +build/ +dist/ +__pycache__/ +*.egg-info +.venv/ +**/explorations/** +**/!explorations/README.md diff --git a/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/README.md b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/README.md new file mode 100644 index 0000000000..d77802d23e --- /dev/null +++ b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/README.md @@ -0,0 +1,21 @@ +# my_lakeflow_pipelines_pipeline + +This folder defines all source code for the 'my_lakeflow_pipelines_pipeline' pipeline: + +- `explorations`: Ad-hoc notebooks used to explore the data processed by this pipeline. +- `transformations`: All dataset definitions and transformations. +- `data_sources` (optional): View definitions describing the source data for this pipeline. + +## Getting Started + +To get started, go to the `transformations` folder -- most of the relevant source code lives there: + +* By convention, every dataset under `transformations` is in a separate file. +* Take a look at the sample under "sample_trips_my_lakeflow_pipelines.sql" to get familiar with the syntax. + Read more about the syntax at https://docs.databricks.com/dlt/sql-ref.html. 
+* Use `Run file` to run and preview a single transformation. +* Use `Run pipeline` to run _all_ transformations in the entire pipeline. +* Use `+ Add` in the file browser to add a new data set definition. +* Use `Schedule` to run the pipeline on a schedule! + +For more tutorials and reference material, see https://docs.databricks.com/dlt. diff --git a/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/explorations/sample_exploration.ipynb b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/explorations/sample_exploration.ipynb new file mode 100644 index 0000000000..a3db8fdf08 --- /dev/null +++ b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/explorations/sample_exploration.ipynb @@ -0,0 +1,64 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "[UUID]", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### Example Exploratory Notebook\n", + "\n", + "Use this notebook to explore the data generated by the pipeline in your preferred programming language.\n", + "\n", + "**Note**: This notebook is not executed as part of the pipeline." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "[UUID]", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "-- !!! Before performing any data analysis, make sure to run the pipeline to materialize the sample datasets. 
The tables referenced in this notebook depend on that step.\n", + "\n", + "USE CATALOG `main`;\n", + "USE SCHEMA `[USERNAME]`;\n", + "\n", + "SELECT * from my_lakeflow_pipelines;" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "computePreferences": null, + "dashboards": [], + "environmentMetadata": null, + "inputWidgetPreferences": null, + "language": "sql", + "notebookMetadata": {}, + "notebookName": "sample_exploration", + "widgets": {} + }, + "language_info": { + "name": "sql" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/my_lakeflow_pipelines.job.yml b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/my_lakeflow_pipelines.job.yml new file mode 100644 index 0000000000..f07a973780 --- /dev/null +++ b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/my_lakeflow_pipelines.job.yml @@ -0,0 +1,19 @@ +# The job that triggers my_lakeflow_pipelines_pipeline. 
+resources: + jobs: + my_lakeflow_pipelines_job: + name: my_lakeflow_pipelines_job + + trigger: + # Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger + periodic: + interval: 1 + unit: DAYS + + email_notifications: + on_failure: ${var.notifications} + + tasks: + - task_key: refresh_pipeline + pipeline_task: + pipeline_id: ${resources.pipelines.my_lakeflow_pipelines_pipeline.id} diff --git a/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/my_lakeflow_pipelines.pipeline.yml b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/my_lakeflow_pipelines.pipeline.yml new file mode 100644 index 0000000000..499ddad0ca --- /dev/null +++ b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/my_lakeflow_pipelines.pipeline.yml @@ -0,0 +1,12 @@ +resources: + pipelines: + my_lakeflow_pipelines_pipeline: + name: my_lakeflow_pipelines_pipeline + serverless: true + channel: "PREVIEW" + catalog: ${var.catalog} + schema: ${var.schema} + root_path: "." + libraries: + - glob: + include: transformations/** diff --git a/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/transformations/sample_trips_my_lakeflow_pipelines.sql b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/transformations/sample_trips_my_lakeflow_pipelines.sql new file mode 100644 index 0000000000..116bb5184b --- /dev/null +++ b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/transformations/sample_trips_my_lakeflow_pipelines.sql @@ -0,0 +1,9 @@ +-- This file defines a sample transformation. 
+-- Edit the sample below or add new transformations +-- using "+ Add" in the file browser. + +CREATE MATERIALIZED VIEW sample_trips_my_lakeflow_pipelines AS +SELECT + pickup_zip, + fare_amount +FROM samples.nyctaxi.trips diff --git a/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/transformations/sample_zones_my_lakeflow_pipelines.sql b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/transformations/sample_zones_my_lakeflow_pipelines.sql new file mode 100644 index 0000000000..79cfe04ae5 --- /dev/null +++ b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/transformations/sample_zones_my_lakeflow_pipelines.sql @@ -0,0 +1,10 @@ +-- This file defines a sample transformation. +-- Edit the sample below or add new transformations +-- using "+ Add" in the file browser. + +CREATE MATERIALIZED VIEW sample_zones_my_lakeflow_pipelines AS +SELECT + pickup_zip, + SUM(fare_amount) AS total_fare +FROM sample_trips_my_lakeflow_pipelines +GROUP BY pickup_zip diff --git a/acceptance/bundle/templates/lakeflow-pipelines/sql/script b/acceptance/bundle/templates/lakeflow-pipelines/sql/script new file mode 100755 index 0000000000..90735aebef --- /dev/null +++ b/acceptance/bundle/templates/lakeflow-pipelines/sql/script @@ -0,0 +1,9 @@ +trace $CLI bundle init lakeflow-pipelines --config-file ./input.json --output-dir output + +cd output/my_lakeflow_pipelines +trace $CLI bundle validate -t dev +trace $CLI bundle validate -t prod + +# Do not affect this repository's git behaviour #2318 +mv .gitignore out.gitignore +rm .databricks/.gitignore diff --git a/libs/template/template.go b/libs/template/template.go index 423770a54e..43569050e2 100644 --- a/libs/template/template.go +++ b/libs/template/template.go @@ -26,6 +26,7 @@ type TemplateName string const ( DefaultPython 
TemplateName = "default-python" DefaultSql TemplateName = "default-sql" + LakeflowPipelines TemplateName = "lakeflow-pipelines" DbtSql TemplateName = "dbt-sql" MlopsStacks TemplateName = "mlops-stacks" DefaultPydabs TemplateName = "default-pydabs" @@ -46,6 +47,13 @@ var databricksTemplates = []Template{ Reader: &builtinReader{name: string(DefaultSql)}, Writer: &writerWithFullTelemetry{defaultWriter: defaultWriter{name: DefaultSql}}, }, + { + name: LakeflowPipelines, + hidden: true, + description: "The default template for Lakeflow Declarative Pipelines", + Reader: &builtinReader{name: string(LakeflowPipelines)}, + Writer: &writerWithFullTelemetry{defaultWriter: defaultWriter{name: LakeflowPipelines}}, + }, { name: DbtSql, description: "The dbt SQL template (databricks.com/blog/delivering-cost-effective-data-real-time-dbt-and-databricks)", diff --git a/libs/template/templates/experimental-jobs-as-code/template/__preamble.tmpl b/libs/template/templates/experimental-jobs-as-code/template/__preamble.tmpl index 708a299385..bd284b0252 100644 --- a/libs/template/templates/experimental-jobs-as-code/template/__preamble.tmpl +++ b/libs/template/templates/experimental-jobs-as-code/template/__preamble.tmpl @@ -1,6 +1,6 @@ # Preamble -This file only template directives; it is skipped for the actual output. +This file only contains template directives; it is skipped for the actual output. 
{{skip "__preamble"}} diff --git a/libs/template/templates/lakeflow-pipelines/README.md b/libs/template/templates/lakeflow-pipelines/README.md new file mode 100644 index 0000000000..008a23aecd --- /dev/null +++ b/libs/template/templates/lakeflow-pipelines/README.md @@ -0,0 +1,3 @@ +# Lakeflow Pipelines + +Default template for Lakeflow Declarative Pipelines diff --git a/libs/template/templates/lakeflow-pipelines/databricks_template_schema.json b/libs/template/templates/lakeflow-pipelines/databricks_template_schema.json new file mode 100644 index 0000000000..53841d36f4 --- /dev/null +++ b/libs/template/templates/lakeflow-pipelines/databricks_template_schema.json @@ -0,0 +1,57 @@ +{ + "welcome_message": "\nWelcome to the template for Lakeflow Declarative Pipelines!", + "properties": { + "project_name": { + "type": "string", + "default": "my_lakeflow_project", + "description": "Please provide the following details to tailor the template to your preferences.\n\nUnique name for this project\nproject_name", + "order": 1, + "pattern": "^[a-z0-9_]+$", + "pattern_match_failure_message": "Name must consist of lower case letters, numbers, and underscores." + }, + "default_catalog": { + "type": "string", + "default": "{{default_catalog}}", + "pattern": "^\\w*$", + "pattern_match_failure_message": "Invalid catalog name.", + "description": "\nInitial catalog.\ndefault_catalog", + "order": 3 + }, + "personal_schemas": { + "type": "string", + "description": "\nUse a personal schema for each user working on this project? 
(e.g., 'catalog.{{short_name}}')\npersonal_schemas", + "default": "yes", + "enum": [ + "yes", + "no" + ], + "order": 4 + }, + "shared_schema": { + "skip_prompt_if": { + "properties": { + "personal_schemas": { + "const": "yes" + } + } + }, + "type": "string", + "default": "default", + "pattern": "^\\w+$", + "pattern_match_failure_message": "Invalid schema name.", + "description": "\nInitial schema during development:\ndefault_schema", + "order": 5 + }, + "language": { + "type": "string", + "default": "python", + "description": "\nInitial language for this project:\nlanguage", + "enum": [ + "python", + "sql" + ], + "order": 6 + } + }, + "success_message": "\n\nYour new project has been created in the '{{.project_name}}' directory!\n\nRefer to the README.md file for \"getting started\" instructions!" +} diff --git a/libs/template/templates/lakeflow-pipelines/library/variables.tmpl b/libs/template/templates/lakeflow-pipelines/library/variables.tmpl new file mode 100644 index 0000000000..9c5c36b449 --- /dev/null +++ b/libs/template/templates/lakeflow-pipelines/library/variables.tmpl @@ -0,0 +1,33 @@ +{{- define `pipeline_name` -}} + {{ .project_name }}_pipeline +{{- end }} + +{{- define `job_name` -}} + {{ .project_name }}_job +{{- end }} + +{{- define `static_dev_schema` -}} + {{- if (regexp "^yes").MatchString .personal_schemas -}} + {{ short_name }} + {{- else -}} + {{ .shared_schema }} + {{- end}} +{{- end }} + + +{{- define `dev_schema` -}} + {{- if (regexp "^yes").MatchString .personal_schemas -}} + ${workspace.current_user.short_name} + {{- else -}} + {{ .shared_schema }} + {{- end}} +{{- end }} + + +{{- define `prod_schema` -}} + {{- if (regexp "^yes").MatchString .personal_schemas -}} + default + {{- else -}} + {{ .shared_schema }} + {{- end}} +{{- end }} diff --git a/libs/template/templates/lakeflow-pipelines/template/__preamble.tmpl b/libs/template/templates/lakeflow-pipelines/template/__preamble.tmpl new file mode 100644 index 0000000000..c6c0c2321f --- 
/dev/null +++ b/libs/template/templates/lakeflow-pipelines/template/__preamble.tmpl @@ -0,0 +1,16 @@ +# Preamble + +This file only contains template directives; it is skipped for the actual output. + +{{skip "__preamble"}} + +{{$isSQL := eq .language "sql"}} + +{{if $isSQL}} + {{skip "{{.project_name}}/resources/{{.project_name}}_pipeline/utilities/utils.py"}} + {{skip "{{.project_name}}/resources/{{.project_name}}_pipeline/transformations/sample_zones_{{.project_name}}.py"}} + {{skip "{{.project_name}}/resources/{{.project_name}}_pipeline/transformations/sample_trips_{{.project_name}}.py"}} +{{else}} + {{skip "{{.project_name}}/resources/{{.project_name}}_pipeline/transformations/sample_zones_{{.project_name}}.sql"}} + {{skip "{{.project_name}}/resources/{{.project_name}}_pipeline/transformations/sample_trips_{{.project_name}}.sql"}} +{{end}} diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/.gitignore.tmpl b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/.gitignore.tmpl new file mode 100644 index 0000000000..f6a3b5ff93 --- /dev/null +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/.gitignore.tmpl @@ -0,0 +1,8 @@ +.databricks/ +build/ +dist/ +__pycache__/ +*.egg-info +.venv/ +**/explorations/** +!**/explorations/README.md diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/.vscode/__builtins__.pyi b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/.vscode/__builtins__.pyi new file mode 100644 index 0000000000..0edd5181bc --- /dev/null +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/.vscode/__builtins__.pyi @@ -0,0 +1,3 @@ +# Typings for Pylance in Visual Studio Code +# see https://github.com/microsoft/pyright/blob/main/docs/builtins.md +from databricks.sdk.runtime import * diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/.vscode/extensions.json
b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/.vscode/extensions.json new file mode 100644 index 0000000000..5d15eba363 --- /dev/null +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/.vscode/extensions.json @@ -0,0 +1,7 @@ +{ + "recommendations": [ + "databricks.databricks", + "ms-python.vscode-pylance", + "redhat.vscode-yaml" + ] +} diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/.vscode/settings.json.tmpl b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/.vscode/settings.json.tmpl new file mode 100644 index 0000000000..6a87715ae2 --- /dev/null +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/.vscode/settings.json.tmpl @@ -0,0 +1,22 @@ +{ + "python.analysis.stubPath": ".vscode", + "databricks.python.envFile": "${workspaceFolder}/.env", + "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", + "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------", + "python.testing.pytestArgs": [ + "." + ], + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true, + {{- /* Unfortunately extraPaths doesn't support globs!! 
See: https://github.com/microsoft/pylance-release/issues/973 */}} + "python.analysis.extraPaths": ["resources/{{.project_name}}_pipeline"], + "files.exclude": { + "**/*.egg-info": true, + "**/__pycache__": true, + ".pytest_cache": true, + }, + "[python]": { + "editor.defaultFormatter": "ms-python.black-formatter", + "editor.formatOnSave": true, + }, +} diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/README.md.tmpl b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/README.md.tmpl new file mode 100644 index 0000000000..837213a189 --- /dev/null +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/README.md.tmpl @@ -0,0 +1,41 @@ +# {{.project_name}} + +The '{{.project_name}}' project was generated by using the Lakeflow Pipelines template. + +## Setup + +1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html + +2. Authenticate to your Databricks workspace, if you have not done so already: + ``` + $ databricks auth login + ``` + +3. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from + https://docs.databricks.com/dev-tools/vscode-ext.html. Or the PyCharm plugin from + https://www.databricks.com/blog/announcing-pycharm-integration-databricks. + + +## Deploying resources + +1. To deploy a development copy of this project, type: + ``` + $ databricks bundle deploy --target dev + ``` + (Note that "dev" is the default target, so the `--target` parameter + is optional here.) + +2. Similarly, to deploy a production copy, type: + ``` + $ databricks bundle deploy --target prod + ``` + +3. Use the "summary" command to review everything that was deployed: + ``` + $ databricks bundle summary + ``` + +4.
To run a job or pipeline, use the "run" command: + ``` + $ databricks bundle run + ``` diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/databricks.yml.tmpl b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/databricks.yml.tmpl new file mode 100644 index 0000000000..1108b20128 --- /dev/null +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/databricks.yml.tmpl @@ -0,0 +1,47 @@ +# This is a Databricks asset bundle definition for {{.project_name}}. +# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. +bundle: + name: {{.project_name}} + uuid: {{bundle_uuid}} + +include: + - resources/*.yml + - resources/*/*.yml + +# Variable declarations. These variables are assigned in the dev/prod targets below. +variables: + catalog: + description: The catalog to use + schema: + description: The schema to use + notifications: + description: The email addresses to use for failure notifications + +targets: + dev: + # The default target uses 'mode: development' to create a development copy. + # - Deployed resources get prefixed with '[dev my_user_name]' + # - Any job schedules and triggers are paused by default. + # See also https://docs.databricks.com/dev-tools/bundles/deployment-modes.html. + mode: development + default: true + workspace: + host: {{workspace_host}} + variables: + catalog: {{.default_catalog}} + schema: {{template `dev_schema` .}} + notifications: [] + + prod: + mode: production + workspace: + host: {{workspace_host}} + # We explicitly deploy to /Workspace/Users/{{user_name}} to make sure we only have a single copy. 
+ root_path: /Workspace/Users/{{user_name}}/.bundle/${bundle.name}/${bundle.target} + permissions: + - {{if is_service_principal}}service_principal{{else}}user{{end}}_name: {{user_name}} + level: CAN_MANAGE + variables: + catalog: {{.default_catalog}} + schema: {{template `prod_schema` .}} + notifications: [{{user_name}}] diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/README.md.tmpl b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/README.md.tmpl new file mode 100644 index 0000000000..b085a301a6 --- /dev/null +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/README.md.tmpl @@ -0,0 +1,48 @@ +{{- if (eq .language "python") -}} + +# {{template `pipeline_name` .}} + +This folder defines all source code for the {{template `pipeline_name` .}} pipeline: + +- `explorations`: Ad-hoc notebooks used to explore the data processed by this pipeline. +- `transformations`: All dataset definitions and transformations. +- `utilities` (optional): Utility functions and Python modules used in this pipeline. +- `data_sources` (optional): View definitions describing the source data for this pipeline. + +## Getting Started + +To get started, go to the `transformations` folder -- most of the relevant source code lives there: + +* By convention, every dataset under `transformations` is in a separate file. +* Take a look at the sample under "sample_trips_{{ .project_name }}.py" to get familiar with the syntax. + Read more about the syntax at https://docs.databricks.com/dlt/python-ref.html. +* Use `Run file` to run and preview a single transformation. +* Use `Run pipeline` to run _all_ transformations in the entire pipeline. +* Use `+ Add` in the file browser to add a new data set definition. +* Use `Schedule` to run the pipeline on a schedule! 
+ +For more tutorials and reference material, see https://docs.databricks.com/dlt. +{{ else -}} + +# {{template `pipeline_name` .}} + +This folder defines all source code for the '{{template `pipeline_name` .}}' pipeline: + +- `explorations`: Ad-hoc notebooks used to explore the data processed by this pipeline. +- `transformations`: All dataset definitions and transformations. +- `data_sources` (optional): View definitions describing the source data for this pipeline. + +## Getting Started + +To get started, go to the `transformations` folder -- most of the relevant source code lives there: + +* By convention, every dataset under `transformations` is in a separate file. +* Take a look at the sample under "sample_trips_{{ .project_name }}.sql" to get familiar with the syntax. + Read more about the syntax at https://docs.databricks.com/dlt/sql-ref.html. +* Use `Run file` to run and preview a single transformation. +* Use `Run pipeline` to run _all_ transformations in the entire pipeline. +* Use `+ Add` in the file browser to add a new data set definition. +* Use `Schedule` to run the pipeline on a schedule! + +For more tutorials and reference material, see https://docs.databricks.com/dlt. 
+{{ end -}} diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/explorations/sample_exploration.ipynb.tmpl b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/explorations/sample_exploration.ipynb.tmpl new file mode 100644 index 0000000000..967e663fae --- /dev/null +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/explorations/sample_exploration.ipynb.tmpl @@ -0,0 +1,130 @@ +{{- if (eq .language "python") -}} +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "19a992e9-55e0-49e4-abc7-8c92c420dd5b", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### Example Exploratory Notebook\n", + "\n", + "Use this notebook to explore the data generated by the pipeline in your preferred programming language.\n", + "\n", + "**Note**: This notebook is not executed as part of the pipeline." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "1b0a82fa-3c6a-4f29-bb43-ded1c4fd77c6", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "# !!! Before performing any data analysis, make sure to run the pipeline to materialize the sample datasets. 
The tables referenced in this notebook depend on that step.\n", + "\n", + "display(spark.sql(\"SELECT * FROM {{ .default_catalog}}.{{template `static_dev_schema` .}}.sample_trips_{{ .project_name }}\"))" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "computePreferences": null, + "dashboards": [], + "environmentMetadata": null, + "inputWidgetPreferences": null, + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, + "notebookName": "sample_exploration", + "widgets": {} + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} +{{ else -}} +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "3bd3cbb1-1518-4d0a-a8d1-f08da3f8840b", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### Example Exploratory Notebook\n", + "\n", + "Use this notebook to explore the data generated by the pipeline in your preferred programming language.\n", + "\n", + "**Note**: This notebook is not executed as part of the pipeline." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "d30a8e05-bf7a-47e1-982e-b37e64cd6d43", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "-- !!! Before performing any data analysis, make sure to run the pipeline to materialize the sample datasets.
The tables referenced in this notebook depend on that step.\n", + "\n", + "USE CATALOG `{{.default_catalog}}`;\n", + "USE SCHEMA `{{template `static_dev_schema` .}}`;\n", + "\n", + "SELECT * from sample_trips_{{ .project_name }};" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "computePreferences": null, + "dashboards": [], + "environmentMetadata": null, + "inputWidgetPreferences": null, + "language": "sql", + "notebookMetadata": {}, + "notebookName": "sample_exploration", + "widgets": {} + }, + "language_info": { + "name": "sql" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} +{{ end -}} diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/transformations/sample_trips_{{.project_name}}.py.tmpl b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/transformations/sample_trips_{{.project_name}}.py.tmpl new file mode 100644 index 0000000000..a191f88b9f --- /dev/null +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/transformations/sample_trips_{{.project_name}}.py.tmpl @@ -0,0 +1,16 @@ +import dlt +from pyspark.sql.functions import col +from utilities import utils + + +# This file defines a sample transformation. +# Edit the sample below or add new transformations +# using "+ Add" in the file browser.
+ + +@dlt.table +def sample_trips_{{ .project_name }}(): + return ( + spark.read.table("samples.nyctaxi.trips") + .withColumn("trip_distance_km", utils.distance_km(col("trip_distance"))) + ) diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/transformations/sample_trips_{{.project_name}}.sql.tmpl b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/transformations/sample_trips_{{.project_name}}.sql.tmpl new file mode 100644 index 0000000000..b95a95da4d --- /dev/null +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/transformations/sample_trips_{{.project_name}}.sql.tmpl @@ -0,0 +1,9 @@ +-- This file defines a sample transformation. +-- Edit the sample below or add new transformations +-- using "+ Add" in the file browser. + +CREATE MATERIALIZED VIEW sample_trips_{{ .project_name }} AS +SELECT + pickup_zip, + fare_amount +FROM samples.nyctaxi.trips diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/transformations/sample_zones_{{.project_name}}.py.tmpl b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/transformations/sample_zones_{{.project_name}}.py.tmpl new file mode 100644 index 0000000000..64e40036d0 --- /dev/null +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/transformations/sample_zones_{{.project_name}}.py.tmpl @@ -0,0 +1,19 @@ +import dlt +from pyspark.sql.functions import col, sum + + +# This file defines a sample transformation. +# Edit the sample below or add new transformations +# using "+ Add" in the file browser. 
+ + +@dlt.table +def sample_zones_{{ .project_name }}(): + # Read from the "sample_trips" table, then sum all the fares + return ( + spark.read.table("sample_trips_{{ .project_name }}") + .groupBy(col("pickup_zip")) + .agg( + sum("fare_amount").alias("total_fare") + ) + ) diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/transformations/sample_zones_{{.project_name}}.sql.tmpl b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/transformations/sample_zones_{{.project_name}}.sql.tmpl new file mode 100644 index 0000000000..ab84f4066a --- /dev/null +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/transformations/sample_zones_{{.project_name}}.sql.tmpl @@ -0,0 +1,10 @@ +-- This file defines a sample transformation. +-- Edit the sample below or add new transformations +-- using "+ Add" in the file browser. 
+ +CREATE MATERIALIZED VIEW sample_zones_{{ .project_name }} AS +SELECT + pickup_zip, + SUM(fare_amount) AS total_fare +FROM sample_trips_{{ .project_name }} +GROUP BY pickup_zip diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/utilities/utils.py b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/utilities/utils.py new file mode 100644 index 0000000000..ff039898f0 --- /dev/null +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/utilities/utils.py @@ -0,0 +1,8 @@ +from pyspark.sql.functions import udf +from pyspark.sql.types import FloatType + + +@udf(returnType=FloatType()) +def distance_km(distance_miles): + """Convert distance from miles to kilometers (1 mile = 1.60934 km).""" + return distance_miles * 1.60934 diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/{{.project_name}}.job.yml.tmpl b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/{{.project_name}}.job.yml.tmpl new file mode 100644 index 0000000000..1e7a7ca780 --- /dev/null +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/{{.project_name}}.job.yml.tmpl @@ -0,0 +1,19 @@ +# The job that triggers {{template `pipeline_name` .}}. 
+resources: + jobs: + {{template `job_name` .}}: + name: {{template `job_name` .}} + + trigger: + # Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger + periodic: + interval: 1 + unit: DAYS + + email_notifications: + on_failure: ${var.notifications} + + tasks: + - task_key: refresh_pipeline + pipeline_task: + pipeline_id: ${resources.pipelines.{{template `pipeline_name` .}}.id} diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/{{.project_name}}.pipeline.yml.tmpl b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/{{.project_name}}.pipeline.yml.tmpl new file mode 100644 index 0000000000..23df081f00 --- /dev/null +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/{{.project_name}}.pipeline.yml.tmpl @@ -0,0 +1,12 @@ +resources: + pipelines: + {{template `pipeline_name` .}}: + name: {{template `pipeline_name` .}} + serverless: true + channel: "PREVIEW" + catalog: ${var.catalog} + schema: ${var.schema} + root_path: "." + libraries: + - glob: + include: transformations/** diff --git a/ruff.toml b/ruff.toml index b439cc6c00..5838db95ee 100644 --- a/ruff.toml +++ b/ruff.toml @@ -1,4 +1,7 @@ line-length = 150 -# tagging.py is synced from universe in the `openapi/tagging` directory and follows different format rules. -exclude = ["tagging.py"] + +exclude = [ + "tagging.py", # tagging.py is synced from universe in the `openapi/tagging` directory and follows different format rules. + "acceptance/bundle/templates/lakeflow-pipelines/**/*.py" # files are manually formatted +]