diff --git a/NEXT_CHANGELOG.md b/NEXT_CHANGELOG.md index 257624d3a0..85f57c32cb 100644 --- a/NEXT_CHANGELOG.md +++ b/NEXT_CHANGELOG.md @@ -10,5 +10,6 @@ ### Dependency updates ### Bundles +* Update default-python template to make DB Connect work out of the box for unit tests, using uv to install dependencies ([#3254](https://github.com/databricks/cli/pull/3254)) ### API Changes diff --git a/acceptance/bundle/help/bundle-init/output.txt b/acceptance/bundle/help/bundle-init/output.txt index b66253e08a..608c9c8cbb 100644 --- a/acceptance/bundle/help/bundle-init/output.txt +++ b/acceptance/bundle/help/bundle-init/output.txt @@ -3,7 +3,7 @@ Initialize using a bundle template to get started quickly. TEMPLATE_PATH optionally specifies which template to use. It can be one of the following: -- default-python: The default Python template for Notebooks and Lakeflow +- default-python: The default Python template, using Python files or notebooks with Lakeflow - default-sql: The default SQL template for .sql files that run with Databricks SQL - dbt-sql: The dbt SQL template (databricks.com/blog/delivering-cost-effective-data-real-time-dbt-and-databricks) - mlops-stacks: The Databricks MLOps Stacks template (github.com/databricks/mlops-stacks) diff --git a/acceptance/bundle/templates-machinery/helper_upper_lower/output.txt b/acceptance/bundle/templates-machinery/helper_upper_lower/output.txt index c395eeecfb..aa4ba5a1a8 100644 --- a/acceptance/bundle/templates-machinery/helper_upper_lower/output.txt +++ b/acceptance/bundle/templates-machinery/helper_upper_lower/output.txt @@ -1,7 +1,7 @@ >>> [CLI] bundle init . -✨ Successfully initialized template +Error: failed to compute file content for hello.txt.tmpl. 
error in {{ .s | lower }} +{{ .s | upper }} +: template: :1: function "lower" not defined ->>> cat hello.txt -abcd0123😀 -ABCD0123😀 +Exit code: 1 diff --git a/acceptance/bundle/templates/default-python/classic/out.compare-vs-serverless.diff b/acceptance/bundle/templates/default-python/classic/out.compare-vs-serverless.diff deleted file mode 100644 index 25307e2031..0000000000 --- a/acceptance/bundle/templates/default-python/classic/out.compare-vs-serverless.diff +++ /dev/null @@ -1,68 +0,0 @@ ---- [TESTROOT]/bundle/templates/default-python/classic/../serverless/output/my_default_python/databricks.yml -+++ output/my_default_python/databricks.yml -@@ -25,4 +25,11 @@ - host: [DATABRICKS_URL] - -+ presets: -+ # Set dynamic_version: true on all artifacts of type "whl". -+ # This makes "bundle deploy" add a timestamp to wheel's version before uploading, -+ # new wheel takes over the previous installation even if actual wheel version is unchanged. -+ # See https://docs.databricks.com/aws/en/dev-tools/bundles/settings -+ artifacts_dynamic_version: true -+ - prod: - mode: production ---- [TESTROOT]/bundle/templates/default-python/classic/../serverless/output/my_default_python/resources/my_default_python.job.yml -+++ output/my_default_python/resources/my_default_python.job.yml -@@ -17,4 +17,5 @@ - tasks: - - task_key: notebook_task -+ job_cluster_key: job_cluster - notebook_task: - notebook_path: ../src/notebook.ipynb -@@ -29,17 +30,21 @@ - depends_on: - - task_key: refresh_pipeline -- environment_key: default -+ job_cluster_key: job_cluster - python_wheel_task: - package_name: my_default_python - entry_point: main -+ libraries: -+ # By default we just include the .whl file generated for the my_default_python package. -+ # See https://docs.databricks.com/dev-tools/bundles/library-dependencies.html -+ # for more information on how to add other libraries. 
-+ - whl: ../dist/*.whl - -- # A list of task execution environment specifications that can be referenced by tasks of this job. -- environments: -- - environment_key: default -- -- # Full documentation of this spec can be found at: -- # https://docs.databricks.com/api/workspace/jobs/create#environments-spec -- spec: -- client: "2" -- dependencies: -- - ../dist/*.whl -+ job_clusters: -+ - job_cluster_key: job_cluster -+ new_cluster: -+ spark_version: 15.4.x-scala2.12 -+ node_type_id: [NODE_TYPE_ID] -+ data_security_mode: SINGLE_USER -+ autoscale: -+ min_workers: 1 -+ max_workers: 4 ---- [TESTROOT]/bundle/templates/default-python/classic/../serverless/output/my_default_python/resources/my_default_python.pipeline.yml -+++ output/my_default_python/resources/my_default_python.pipeline.yml -@@ -4,8 +4,7 @@ - my_default_python_pipeline: - name: my_default_python_pipeline -- ## Catalog is required for serverless compute -- catalog: main -+ ## Specify the 'catalog' field to configure this pipeline to make use of Unity Catalog: -+ # catalog: catalog_name - schema: my_default_python_${bundle.target} -- serverless: true - libraries: - - notebook: diff --git a/acceptance/bundle/templates/default-python/classic/output.txt b/acceptance/bundle/templates/default-python/classic/output.txt index 930e756de7..15d2468038 100644 --- a/acceptance/bundle/templates/default-python/classic/output.txt +++ b/acceptance/bundle/templates/default-python/classic/output.txt @@ -1,8 +1,12 @@ >>> [CLI] bundle init default-python --config-file ./input.json --output-dir output - Welcome to the default Python template for Databricks Asset Bundles! -Workspace to use (auto-detected, edit in 'my_default_python/databricks.yml'): [DATABRICKS_URL] + +Please answer the below to tailor your project to your preferences. +You can always change your mind and change your configuration in the databricks.yml file later. 
+ +Note that [DATABRICKS_URL] is used for initialization +(see https://docs.databricks.com/dev-tools/cli/profiles.html for how to change your profile). ✨ Your new project has been created in the 'my_default_python' directory! @@ -10,6 +14,8 @@ Please refer to the README.md file for "getting started" instructions. See also the documentation at https://docs.databricks.com/dev-tools/bundles/index.html. >>> [CLI] bundle validate -t dev +Error: path [TEST_TMP_DIR]/output/dist/*.whl is not contained in sync root path + Name: my_default_python Target: dev Workspace: @@ -17,14 +23,6 @@ Workspace: User: [USERNAME] Path: /Workspace/Users/[USERNAME]/.bundle/my_default_python/dev -Validation OK! - ->>> [CLI] bundle validate -t prod -Name: my_default_python -Target: prod -Workspace: - Host: [DATABRICKS_URL] - User: [USERNAME] - Path: /Workspace/Users/[USERNAME]/.bundle/my_default_python/prod +Found 1 error -Validation OK! +Exit code: 1 diff --git a/acceptance/bundle/templates/default-python/classic/output/my_default_python/out.gitignore b/acceptance/bundle/templates/default-python/classic/output/my_default_python/.gitignore similarity index 100% rename from acceptance/bundle/templates/default-python/classic/output/my_default_python/out.gitignore rename to acceptance/bundle/templates/default-python/classic/output/my_default_python/.gitignore diff --git a/acceptance/bundle/templates/default-python/classic/output/my_default_python/.vscode/extensions.json b/acceptance/bundle/templates/default-python/classic/output/my_default_python/.vscode/extensions.json index 5d15eba363..1f39c33087 100644 --- a/acceptance/bundle/templates/default-python/classic/output/my_default_python/.vscode/extensions.json +++ b/acceptance/bundle/templates/default-python/classic/output/my_default_python/.vscode/extensions.json @@ -1,7 +1,6 @@ { "recommendations": [ "databricks.databricks", - "ms-python.vscode-pylance", "redhat.vscode-yaml" ] } diff --git 
a/acceptance/bundle/templates/default-python/classic/output/my_default_python/.vscode/settings.json b/acceptance/bundle/templates/default-python/classic/output/my_default_python/.vscode/settings.json index 8ee87c30d4..d8468d7b60 100644 --- a/acceptance/bundle/templates/default-python/classic/output/my_default_python/.vscode/settings.json +++ b/acceptance/bundle/templates/default-python/classic/output/my_default_python/.vscode/settings.json @@ -1,16 +1,39 @@ { - "python.analysis.stubPath": ".vscode", "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------", "python.testing.pytestArgs": [ "." ], - "python.testing.unittestEnabled": false, - "python.testing.pytestEnabled": true, - "python.analysis.extraPaths": ["src"], "files.exclude": { "**/*.egg-info": true, "**/__pycache__": true, ".pytest_cache": true, + "dist": true, + }, + "files.associations": { + "**/.gitkeep": "markdown" + }, + + // Pylance settings (VS Code) + // Set typeCheckingMode to "basic" to enable type checking! 
+ "cursorpyright.analysis.typeCheckingMode": "off", + "cursorpyright.analysis.extraPaths": ["src", "lib", "resources"], + "cursorpyright.analysis.diagnosticMode": "workspace", + "cursorpyright.analysis.stubPath": ".vscode", + + // General Python settings + "python.defaultInterpreterPath": "./.venv/bin/python", + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true, + "[python]": { + "editor.defaultFormatter": "ms-python.python", + "editor.formatOnSave": true, }, } diff --git a/acceptance/bundle/templates/default-python/classic/output/my_default_python/README.md b/acceptance/bundle/templates/default-python/classic/output/my_default_python/README.md index f3b5d153b2..b373a1708d 100644 --- a/acceptance/bundle/templates/default-python/classic/output/my_default_python/README.md +++ b/acceptance/bundle/templates/default-python/classic/output/my_default_python/README.md @@ -1,19 +1,46 @@ # my_default_python The 'my_default_python' project was generated by using the default-python template. +For documentation on the Databricks Asset Bundles format used for this project, +and for CI/CD configuration, see https://docs.databricks.com/aws/en/dev-tools/bundles. + +* `src/`: Python source code for this project. +* `src/shared`: Shared source code across all jobs/pipelines/etc. +* `src/default_python_etl`: Python source code for the default_python_etl pipeline. +* `resources/`: Resource configurations (jobs, pipelines, etc.) +* `tests/`: Unit tests. +* `fixtures/`: Fixtures for data sets (primarily used for testing). ## Getting started -0. Install UV: https://docs.astral.sh/uv/getting-started/installation/ +Choose how you want to work on this project: + +(a) Directly in your Databricks workspace, see + https://docs.databricks.com/dev-tools/bundles/workspace. + +(b) Locally with an IDE like Cursor or VS Code, see + https://docs.databricks.com/vscode-ext. 
+ +(c) With command line tools, see https://docs.databricks.com/dev-tools/cli/databricks-cli.html + +Dependencies for this project should be installed using uv: + +* Make sure you have the UV package manager installed. + It's an alternative to tools like pip: https://docs.astral.sh/uv/getting-started/installation/. +* Run `uv sync --dev` to install the project's dependencies. -1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html -2. Authenticate to your Databricks workspace, if you have not done so already: +# Using this project using the CLI + +The Databricks workspace and IDE extensions provide a graphical interface for working +with this project. It's also possible to interact with it directly using the CLI: + +1. Authenticate to your Databricks workspace, if you have not done so already: ``` $ databricks configure ``` -3. To deploy a development copy of this project, type: +2. To deploy a development copy of this project, type: ``` $ databricks bundle deploy --target dev ``` @@ -23,9 +50,9 @@ The 'my_default_python' project was generated by using the default-python templa This deploys everything that's defined for this project. For example, the default template would deploy a job called `[dev yourname] my_default_python_job` to your workspace. - You can find that job by opening your workpace and clicking on **Workflows**. + You can find that job by opening your workspace and clicking on **Jobs & Pipelines**. -4. Similarly, to deploy a production copy, type: +3. Similarly, to deploy a production copy, type: ``` $ databricks bundle deploy --target prod ``` @@ -35,17 +62,12 @@ The 'my_default_python' project was generated by using the default-python templa is paused when deploying in development mode (see https://docs.databricks.com/dev-tools/bundles/deployment-modes.html). -5. To run a job or pipeline, use the "run" command: +4. To run a job or pipeline, use the "run" command: ``` $ databricks bundle run ``` -6. 
Optionally, install the Databricks extension for Visual Studio code for local development from - https://docs.databricks.com/dev-tools/vscode-ext.html. It can configure your - virtual environment and setup Databricks Connect for running unit tests locally. - When not using these tools, consult your development environment's documentation - and/or the documentation for Databricks Connect for manually setting up your environment - (https://docs.databricks.com/en/dev-tools/databricks-connect/python/index.html). - -7. For documentation on the Databricks asset bundles format used - for this project, and for CI/CD configuration, see - https://docs.databricks.com/dev-tools/bundles/index.html. +5. Finally, to run tests locally, use `pytest`: + ``` + $ uv run pytest + ``` + diff --git a/acceptance/bundle/templates/default-python/classic/output/my_default_python/databricks.yml b/acceptance/bundle/templates/default-python/classic/output/my_default_python/databricks.yml index ed3d53b999..cc6079c53c 100644 --- a/acceptance/bundle/templates/default-python/classic/output/my_default_python/databricks.yml +++ b/acceptance/bundle/templates/default-python/classic/output/my_default_python/databricks.yml @@ -4,14 +4,21 @@ bundle: name: my_default_python uuid: [UUID] +include: + - resources/*.yml + - resources/*/*.yml + artifacts: python_artifact: type: whl build: uv build --wheel -include: - - resources/*.yml - - resources/*/*.yml +# Variable declarations. These variables are assigned in the dev/prod targets below. +variables: + catalog: + description: The catalog to use + schema: + description: The schema to use targets: dev: @@ -23,20 +30,20 @@ targets: default: true workspace: host: [DATABRICKS_URL] - + variables: + catalog: hive_metastore + schema: ${workspace.current_user.short_name} presets: - # Set dynamic_version: true on all artifacts of type "whl". 
- # This makes "bundle deploy" add a timestamp to wheel's version before uploading, - # new wheel takes over the previous installation even if actual wheel version is unchanged. - # See https://docs.databricks.com/aws/en/dev-tools/bundles/settings artifacts_dynamic_version: true - prod: mode: production workspace: host: [DATABRICKS_URL] # We explicitly deploy to /Workspace/Users/[USERNAME] to make sure we only have a single copy. root_path: /Workspace/Users/[USERNAME]/.bundle/${bundle.name}/${bundle.target} + variables: + catalog: hive_metastore + schema: prod permissions: - user_name: [USERNAME] level: CAN_MANAGE diff --git a/acceptance/bundle/templates/default-python/classic/output/my_default_python/fixtures/.gitkeep b/acceptance/bundle/templates/default-python/classic/output/my_default_python/fixtures/.gitkeep index fa25d2745e..77a906614c 100644 --- a/acceptance/bundle/templates/default-python/classic/output/my_default_python/fixtures/.gitkeep +++ b/acceptance/bundle/templates/default-python/classic/output/my_default_python/fixtures/.gitkeep @@ -1,22 +1,9 @@ -# Fixtures +# Test fixtures directory -This folder is reserved for fixtures, such as CSV files. - -Below is an example of how to load fixtures as a data frame: +Add JSON or CSV files here. 
In tests, use them with `load_fixture()`: ``` -import pandas as pd -import os - -def get_absolute_path(*relative_parts): - if 'dbutils' in globals(): - base_dir = os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) # type: ignore - path = os.path.normpath(os.path.join(base_dir, *relative_parts)) - return path if path.startswith("/Workspace") else "/Workspace" + path - else: - return os.path.join(*relative_parts) - -csv_file = get_absolute_path("..", "fixtures", "mycsv.csv") -df = pd.read_csv(csv_file) -display(df) +def test_using_fixture(load_fixture): + data = load_fixture("my_data.json") + assert len(data) >= 1 ``` diff --git a/acceptance/bundle/templates/default-python/classic/output/my_default_python/pyproject.toml b/acceptance/bundle/templates/default-python/classic/output/my_default_python/pyproject.toml index 5049f8a3ea..7cd4a5cc86 100644 --- a/acceptance/bundle/templates/default-python/classic/output/my_default_python/pyproject.toml +++ b/acceptance/bundle/templates/default-python/classic/output/my_default_python/pyproject.toml @@ -4,30 +4,23 @@ version = "0.0.1" authors = [{ name = "[USERNAME]" }] requires-python = ">= 3.11" -[project.optional-dependencies] +[dependency-groups] dev = [ "pytest", - - # Code completion support for Lakeflow Declarative Pipelines, also install databricks-connect "databricks-dlt", # databricks-connect can be used to run parts of this project locally. - # See https://docs.databricks.com/dev-tools/databricks-connect.html. - # - # Note, databricks-connect is automatically installed if you're using Databricks - # extension for Visual Studio Code - # (https://docs.databricks.com/dev-tools/vscode-ext/dev-tasks/databricks-connect.html). - # - # To manually install databricks-connect, uncomment the line below to install a version - # of db-connect that corresponds to the Databricks Runtime version used for this project. 
- # See https://docs.databricks.com/dev-tools/databricks-connect.html - # "databricks-connect>=15.4,<15.5", + # Note that for local development, you should use a version that is not newer + # than the remote cluster or serverless compute you connect to. + # See also https://docs.databricks.com/dev-tools/databricks-connect.html. + "databricks-connect>=15.4,<15.5", ] [tool.pytest.ini_options] pythonpath = "src" testpaths = [ "tests", + "resources", ] [build-system] @@ -35,7 +28,8 @@ requires = ["hatchling"] build-backend = "hatchling.build" [tool.hatch.build.targets.wheel] -packages = ["src/my_default_python"] +packages = ["src"] +sources = ["src"] [project.scripts] -main = "my_default_python.main:main" +main = "sample_python_file:main" diff --git a/acceptance/bundle/templates/default-python/classic/output/my_default_python/resources/.gitkeep b/acceptance/bundle/templates/default-python/classic/output/my_default_python/resources/.gitkeep new file mode 100644 index 0000000000..3e09c14c18 --- /dev/null +++ b/acceptance/bundle/templates/default-python/classic/output/my_default_python/resources/.gitkeep @@ -0,0 +1 @@ +This folder is reserved for Databricks Asset Bundles resource definitions. 
diff --git a/acceptance/bundle/templates/default-python/classic/output/my_default_python/resources/my_default_python.pipeline.yml b/acceptance/bundle/templates/default-python/classic/output/my_default_python/resources/default_python_etl.pipeline.yml similarity index 55% rename from acceptance/bundle/templates/default-python/classic/output/my_default_python/resources/my_default_python.pipeline.yml rename to acceptance/bundle/templates/default-python/classic/output/my_default_python/resources/default_python_etl.pipeline.yml index 6e49947083..e500171279 100644 --- a/acceptance/bundle/templates/default-python/classic/output/my_default_python/resources/my_default_python.pipeline.yml +++ b/acceptance/bundle/templates/default-python/classic/output/my_default_python/resources/default_python_etl.pipeline.yml @@ -1,14 +1,14 @@ + # The main pipeline for my_default_python resources: pipelines: - my_default_python_pipeline: - name: my_default_python_pipeline + default_python_etl: + name: default_python_etl ## Specify the 'catalog' field to configure this pipeline to make use of Unity Catalog: # catalog: catalog_name schema: my_default_python_${bundle.target} - libraries: - - notebook: - path: ../src/pipeline.ipynb + root_path: ../src - configuration: - bundle.sourcePath: ${workspace.file_path}/src + libraries: + - glob: + include: ../src/default_python_etl/transformations/** diff --git a/acceptance/bundle/templates/default-python/classic/output/my_default_python/resources/my_default_python.job.yml b/acceptance/bundle/templates/default-python/classic/output/my_default_python/resources/sample_job.job.yml similarity index 62% rename from acceptance/bundle/templates/default-python/classic/output/my_default_python/resources/my_default_python.job.yml rename to acceptance/bundle/templates/default-python/classic/output/my_default_python/resources/sample_job.job.yml index 30b579f500..8e2ddbb626 100644 --- 
a/acceptance/bundle/templates/default-python/classic/output/my_default_python/resources/my_default_python.job.yml +++ b/acceptance/bundle/templates/default-python/classic/output/my_default_python/resources/sample_job.job.yml @@ -1,8 +1,9 @@ -# The main job for my_default_python. +# A sample job for my_default_python. + resources: jobs: - my_default_python_job: - name: my_default_python_job + sample_job: + name: sample_job trigger: # Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger @@ -14,35 +15,40 @@ resources: # on_failure: # - your_email@example.com + parameters: + - name: catalog + default: ${var.catalog} + - name: schema + default: ${var.schema} + tasks: - task_key: notebook_task - job_cluster_key: job_cluster notebook_task: - notebook_path: ../src/notebook.ipynb - - - task_key: refresh_pipeline - depends_on: - - task_key: notebook_task - pipeline_task: - pipeline_id: ${resources.pipelines.my_default_python_pipeline.id} - - - task_key: main_task - depends_on: - - task_key: refresh_pipeline + notebook_path: ../src/sample_notebook.ipynb job_cluster_key: job_cluster - python_wheel_task: - package_name: my_default_python - entry_point: main libraries: - # By default we just include the .whl file generated for the my_default_python package. + # By default we just include the .whl file generated for the default_python package in src/. # See https://docs.databricks.com/dev-tools/bundles/library-dependencies.html # for more information on how to add other libraries. 
- - whl: ../dist/*.whl + - whl: ../../dist/*.whl + - task_key: python_file_task + depends_on: + - task_key: notebook_task + spark_python_task: + python_file: ../src/sample_python_file.py + job_cluster_key: job_cluster + libraries: + - whl: ../../dist/*.whl + - task_key: refresh_pipeline + depends_on: + - task_key: notebook_task + pipeline_task: + pipeline_id: ${resources.pipelines.default_python_etl.id} job_clusters: - job_cluster_key: job_cluster new_cluster: - spark_version: 15.4.x-scala2.12 + spark_version: 16.4.x-scala2.12 node_type_id: [NODE_TYPE_ID] data_security_mode: SINGLE_USER autoscale: diff --git a/acceptance/bundle/templates/default-python/classic/output/my_default_python/scratch/README.md b/acceptance/bundle/templates/default-python/classic/output/my_default_python/scratch/README.md deleted file mode 100644 index e6cfb81b46..0000000000 --- a/acceptance/bundle/templates/default-python/classic/output/my_default_python/scratch/README.md +++ /dev/null @@ -1,4 +0,0 @@ -# scratch - -This folder is reserved for personal, exploratory notebooks. -By default these are not committed to Git, as 'scratch' is listed in .gitignore. 
diff --git a/acceptance/bundle/templates/default-python/classic/output/my_default_python/scratch/exploration.ipynb b/acceptance/bundle/templates/default-python/classic/output/my_default_python/scratch/exploration.ipynb deleted file mode 100644 index a12773d4e8..0000000000 --- a/acceptance/bundle/templates/default-python/classic/output/my_default_python/scratch/exploration.ipynb +++ /dev/null @@ -1,61 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "[UUID]", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "import sys\n", - "\n", - "sys.path.append(\"../src\")\n", - "from my_default_python import main\n", - "\n", - "main.get_taxis(spark).show(10)" - ] - } - ], - "metadata": { - "application/vnd.databricks.v1+notebook": { - "dashboards": [], - "language": "python", - "notebookMetadata": { - "pythonIndentUnit": 2 - }, - "notebookName": "ipynb-notebook", - "widgets": {} - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.11.4" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/acceptance/bundle/templates/default-python/classic/output/my_default_python/src/default_python_etl/README.md b/acceptance/bundle/templates/default-python/classic/output/my_default_python/src/default_python_etl/README.md new file mode 100644 index 0000000000..737b73cf43 --- /dev/null +++ b/acceptance/bundle/templates/default-python/classic/output/my_default_python/src/default_python_etl/README.md @@ -0,0 +1,22 @@ +# my_default_python + +This folder defines all source code for the my_default_python pipeline: + +- 
`explorations/`: Ad-hoc notebooks used to explore the data processed by this pipeline. +- `transformations/`: All dataset definitions and transformations. +- `utilities/` (optional): Utility functions and Python modules used in this pipeline. +- `data_sources/` (optional): View definitions describing the source data for this pipeline. + +## Getting Started + +To get started, go to the `transformations` folder -- most of the relevant source code lives there: + +* By convention, every dataset under `transformations` is in a separate file. +* Take a look at the sample under "sample_trips_jan_01_1034.py" to get familiar with the syntax. + Read more about the syntax at https://docs.databricks.com/dlt/python-ref.html. +* Use `Run file` to run and preview a single transformation. +* Use `Run pipeline` to run _all_ transformations in the entire pipeline. +* Use `+ Add` in the file browser to add a new data set definition. +* Use `Schedule` to run the pipeline on a schedule! + +For more tutorials and reference material, see https://docs.databricks.com/dlt. 
diff --git a/acceptance/bundle/templates/default-python/classic/output/my_default_python/src/my_default_python/__init__.py b/acceptance/bundle/templates/default-python/classic/output/my_default_python/src/default_python_etl/__init__.py similarity index 100% rename from acceptance/bundle/templates/default-python/classic/output/my_default_python/src/my_default_python/__init__.py rename to acceptance/bundle/templates/default-python/classic/output/my_default_python/src/default_python_etl/__init__.py diff --git a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/my_default_python/__init__.py b/acceptance/bundle/templates/default-python/classic/output/my_default_python/src/default_python_etl/explorations/__init__.py similarity index 100% rename from acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/my_default_python/__init__.py rename to acceptance/bundle/templates/default-python/classic/output/my_default_python/src/default_python_etl/explorations/__init__.py diff --git a/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/src/notebook.ipynb b/acceptance/bundle/templates/default-python/classic/output/my_default_python/src/default_python_etl/explorations/sample_exploration.ipynb similarity index 51% rename from acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/src/notebook.ipynb rename to acceptance/bundle/templates/default-python/classic/output/my_default_python/src/default_python_etl/explorations/sample_exploration.ipynb index 227c7cc558..7edb8fe518 100644 --- a/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/src/notebook.ipynb +++ b/acceptance/bundle/templates/default-python/classic/output/my_default_python/src/default_python_etl/explorations/sample_exploration.ipynb @@ -8,23 +8,16 @@ "inputWidgets": {}, "nuid": "[UUID]", "showTitle": false, + "tableResultSettingsMap": {}, "title": "" } }, "source": [ - "# Default 
notebook\n", + "### Example Exploratory Notebook\n", "\n", - "This default notebook is executed using Databricks Workflows as defined in resources/my_jobs_as_code.job.yml." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2" + "Use this notebook to explore the data generated by the pipeline in your preferred programming language.\n", + "\n", + "**Note**: This notebook is not executed as part of the pipeline." ] }, { @@ -32,42 +25,37 @@ "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, + "cellMetadata": {}, "inputWidgets": {}, "nuid": "[UUID]", "showTitle": false, + "tableResultSettingsMap": {}, "title": "" } }, "outputs": [], "source": [ - "from my_jobs_as_code import main\n", + "# !!! Before performing any data analysis, make sure to run the pipeline to materialize the sample datasets. The tables referenced in this notebook depend on that step.\n", "\n", - "main.get_taxis(spark).show(10)" + "display(spark.sql(\"SELECT * FROM hive_metastore.[USERNAME].sample_trips_jan_01_1034\"))" ] } ], "metadata": { "application/vnd.databricks.v1+notebook": { + "computePreferences": null, "dashboards": [], + "environmentMetadata": null, + "inputWidgetPreferences": null, "language": "python", "notebookMetadata": { "pythonIndentUnit": 2 }, - "notebookName": "notebook", + "notebookName": "sample_exploration", "widgets": {} }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, "language_info": { - "name": "python", - "version": "3.11.4" + "name": "python" } }, "nbformat": 4, diff --git a/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/src/my_jobs_as_code/__init__.py b/acceptance/bundle/templates/default-python/classic/output/my_default_python/src/default_python_etl/transformations/__init__.py similarity index 100% 
rename from acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/src/my_jobs_as_code/__init__.py rename to acceptance/bundle/templates/default-python/classic/output/my_default_python/src/default_python_etl/transformations/__init__.py diff --git a/acceptance/bundle/templates/default-python/classic/output/my_default_python/src/default_python_etl/transformations/sample_trips_jan_01_1034.py b/acceptance/bundle/templates/default-python/classic/output/my_default_python/src/default_python_etl/transformations/sample_trips_jan_01_1034.py new file mode 100644 index 0000000000..706b0b8952 --- /dev/null +++ b/acceptance/bundle/templates/default-python/classic/output/my_default_python/src/default_python_etl/transformations/sample_trips_jan_01_1034.py @@ -0,0 +1,15 @@ +import dlt +from pyspark.sql.functions import col +from default_python_etl.utilities import utils + + +# This file defines a sample transformation. +# Edit the sample below or add new transformations +# using "+ Add" in the file browser. + + +@dlt.table +def sample_trips_jan_01_1034(): + return spark.read.table("samples.nyctaxi.trips").withColumn( + "trip_distance_km", utils.distance_km(col("trip_distance")) + ) diff --git a/acceptance/bundle/templates/default-python/classic/output/my_default_python/src/default_python_etl/transformations/sample_zones_jan_01_1034.py b/acceptance/bundle/templates/default-python/classic/output/my_default_python/src/default_python_etl/transformations/sample_zones_jan_01_1034.py new file mode 100644 index 0000000000..82209f7ce4 --- /dev/null +++ b/acceptance/bundle/templates/default-python/classic/output/my_default_python/src/default_python_etl/transformations/sample_zones_jan_01_1034.py @@ -0,0 +1,17 @@ +import dlt +from pyspark.sql.functions import col, sum + + +# This file defines a sample transformation. +# Edit the sample below or add new transformations +# using "+ Add" in the file browser. 
+ + +@dlt.table +def sample_zones_jan_01_1034(): + # Read from the "sample_trips" table, then sum all the fares + return ( + spark.read.table(f"sample_trips_jan_01_1034") + .groupBy(col("pickup_zip")) + .agg(sum("fare_amount").alias("total_fare")) + ) diff --git a/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name}}/__init__.py b/acceptance/bundle/templates/default-python/classic/output/my_default_python/src/default_python_etl/utilities/__init__.py similarity index 100% rename from libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name}}/__init__.py rename to acceptance/bundle/templates/default-python/classic/output/my_default_python/src/default_python_etl/utilities/__init__.py diff --git a/acceptance/bundle/templates/default-python/classic/output/my_default_python/src/default_python_etl/utilities/utils.py b/acceptance/bundle/templates/default-python/classic/output/my_default_python/src/default_python_etl/utilities/utils.py new file mode 100644 index 0000000000..f0f4e940f7 --- /dev/null +++ b/acceptance/bundle/templates/default-python/classic/output/my_default_python/src/default_python_etl/utilities/utils.py @@ -0,0 +1,12 @@ +from pyspark.sql.functions import col, when + + +def distance_km(distance_col): + """Convert distance from miles to kilometers.""" + return distance_col * 1.60934 + + +def format_currency(amount_col): + """Format amount as currency.""" + return when(col(amount_col).isNotNull(), + col(amount_col).cast("decimal(10,2)")) diff --git a/acceptance/bundle/templates/default-python/classic/output/my_default_python/src/my_default_python/main.py b/acceptance/bundle/templates/default-python/classic/output/my_default_python/src/my_default_python/main.py deleted file mode 100644 index 5ae344c7e2..0000000000 --- a/acceptance/bundle/templates/default-python/classic/output/my_default_python/src/my_default_python/main.py +++ /dev/null @@ -1,25 +0,0 @@ -from pyspark.sql import SparkSession, 
DataFrame - - -def get_taxis(spark: SparkSession) -> DataFrame: - return spark.read.table("samples.nyctaxi.trips") - - -# Create a new Databricks Connect session. If this fails, -# check that you have configured Databricks Connect correctly. -# See https://docs.databricks.com/dev-tools/databricks-connect.html. -def get_spark() -> SparkSession: - try: - from databricks.connect import DatabricksSession - - return DatabricksSession.builder.getOrCreate() - except ImportError: - return SparkSession.builder.getOrCreate() - - -def main(): - get_taxis(get_spark()).show(5) - - -if __name__ == "__main__": - main() diff --git a/acceptance/bundle/templates/default-python/classic/output/my_default_python/src/pipeline.ipynb b/acceptance/bundle/templates/default-python/classic/output/my_default_python/src/pipeline.ipynb deleted file mode 100644 index 53148beff1..0000000000 --- a/acceptance/bundle/templates/default-python/classic/output/my_default_python/src/pipeline.ipynb +++ /dev/null @@ -1,90 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "[UUID]", - "showTitle": false, - "title": "" - } - }, - "source": [ - "# Lakeflow Declarative Pipeline\n", - "\n", - "This Lakeflow Declarative Pipeline definition is executed using a pipeline defined in resources/my_default_python.pipeline.yml." 
- ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "[UUID]", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "# Import DLT and src/my_default_python\n", - "import dlt\n", - "import sys\n", - "\n", - "sys.path.append(spark.conf.get(\"bundle.sourcePath\", \".\"))\n", - "from pyspark.sql.functions import expr\n", - "from my_default_python import main" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "[UUID]", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "@dlt.view\n", - "def taxi_raw():\n", - " return main.get_taxis(spark)\n", - "\n", - "\n", - "@dlt.table\n", - "def filtered_taxis():\n", - " return dlt.read(\"taxi_raw\").filter(expr(\"fare_amount < 30\"))" - ] - } - ], - "metadata": { - "application/vnd.databricks.v1+notebook": { - "dashboards": [], - "language": "python", - "notebookMetadata": { - "pythonIndentUnit": 2 - }, - "notebookName": "pipeline", - "widgets": {} - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.11.4" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/acceptance/bundle/templates/default-python/classic/output/my_default_python/src/notebook.ipynb b/acceptance/bundle/templates/default-python/classic/output/my_default_python/src/sample_notebook.ipynb similarity index 88% rename from acceptance/bundle/templates/default-python/classic/output/my_default_python/src/notebook.ipynb rename to acceptance/bundle/templates/default-python/classic/output/my_default_python/src/sample_notebook.ipynb index 472ccb2190..aa609df200 100644 --- a/acceptance/bundle/templates/default-python/classic/output/my_default_python/src/notebook.ipynb +++ 
b/acceptance/bundle/templates/default-python/classic/output/my_default_python/src/sample_notebook.ipynb @@ -23,6 +23,7 @@ "metadata": {}, "outputs": [], "source": [ + "# Reload wheel file dependencies every time they are updated\n", "%load_ext autoreload\n", "%autoreload 2" ] @@ -44,9 +45,9 @@ }, "outputs": [], "source": [ - "from my_default_python import main\n", + "from shared import taxis\n", "\n", - "main.get_taxis(spark).show(10)" + "taxis.find_all_taxis().show(10)" ] } ], @@ -57,7 +58,7 @@ "notebookMetadata": { "pythonIndentUnit": 2 }, - "notebookName": "notebook", + "notebookName": "sample_notebook", "widgets": {} }, "kernelspec": { diff --git a/acceptance/bundle/templates/default-python/classic/output/my_default_python/src/sample_python_file.py b/acceptance/bundle/templates/default-python/classic/output/my_default_python/src/sample_python_file.py new file mode 100644 index 0000000000..719a0b71a3 --- /dev/null +++ b/acceptance/bundle/templates/default-python/classic/output/my_default_python/src/sample_python_file.py @@ -0,0 +1,19 @@ +import argparse +from datetime import datetime +from shared import taxis + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--catalog", default="hive_metastore") + parser.add_argument("--schema", default="default") + args = parser.parse_args() + + df = taxis.find_all_taxis() + + table_name = f"{args.catalog}.{args.schema}.taxis_jan_01_1034" + df.write.mode("overwrite").saveAsTable(table_name) + + print(f"Wrote {df.count()} taxi records to {table_name}") + +if __name__ == "__main__": + main() diff --git a/acceptance/bundle/templates/default-python/classic/output/my_default_python/src/shared/__init__.py b/acceptance/bundle/templates/default-python/classic/output/my_default_python/src/shared/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/acceptance/bundle/templates/default-python/classic/output/my_default_python/src/shared/taxis.py 
b/acceptance/bundle/templates/default-python/classic/output/my_default_python/src/shared/taxis.py new file mode 100644 index 0000000000..a7309cd4c5 --- /dev/null +++ b/acceptance/bundle/templates/default-python/classic/output/my_default_python/src/shared/taxis.py @@ -0,0 +1,7 @@ +from databricks.sdk.runtime import spark +from pyspark.sql import DataFrame + + +def find_all_taxis() -> DataFrame: + """Find all taxi data.""" + return spark.read.table("samples.nyctaxi.trips") diff --git a/acceptance/bundle/templates/default-python/classic/output/my_default_python/tests/conftest.py b/acceptance/bundle/templates/default-python/classic/output/my_default_python/tests/conftest.py new file mode 100644 index 0000000000..8037a4647c --- /dev/null +++ b/acceptance/bundle/templates/default-python/classic/output/my_default_python/tests/conftest.py @@ -0,0 +1,93 @@ +"""This file configures pytest. + +This file is in the root since it can be used for tests in any place in this +project, including tests under resources/. +""" + +import os, sys, pathlib +from contextlib import contextmanager + + +try: + from databricks.connect import DatabricksSession + from databricks.sdk import WorkspaceClient + from pyspark.sql import SparkSession + import pytest + import json + import csv + import os +except ImportError: + raise ImportError("Test dependencies not found.\n\nRun tests using 'uv run pytest'. See http://docs.astral.sh/uv to learn more about uv.") + + +@pytest.fixture() +def spark() -> SparkSession: + """Provide a SparkSession fixture for tests. + + Minimal example: + def test_uses_spark(spark): + df = spark.createDataFrame([(1,)], ["x"]) + assert df.count() == 1 + """ + return DatabricksSession.builder.getOrCreate() + +@pytest.fixture() +def load_fixture(spark: SparkSession): + """Provide a callable to load JSON or CSV from fixtures/ directory. 
+ + Example usage: + + def test_using_fixture(load_fixture): + data = load_fixture("my_data.json") + assert data.count() >= 1 + """ + def _loader(filename: str): + path = pathlib.Path(__file__).parent.parent / "fixtures" / filename + suffix = path.suffix.lower() + if suffix == ".json": + rows = json.loads(path.read_text()) + return spark.createDataFrame(rows) + if suffix == ".csv": + with path.open(newline="") as f: + rows = list(csv.DictReader(f)) + return spark.createDataFrame(rows) + raise ValueError(f"Unsupported fixture type for: {filename}") + return _loader + + +def _enable_fallback_compute(): + """Enable serverless compute if no compute is specified.""" + conf = WorkspaceClient().config + if conf.serverless_compute_id or conf.cluster_id or os.environ.get("SPARK_REMOTE"): + return + + url = "https://docs.databricks.com/dev-tools/databricks-connect/cluster-config" + print("☁️ no compute specified, falling back to serverless compute", file=sys.stderr) + print(f" see {url} for manual configuration", file=sys.stdout) + + os.environ["DATABRICKS_SERVERLESS_COMPUTE_ID"] = "auto" + + +@contextmanager +def _allow_stderr_output(config: pytest.Config): + """Temporarily disable pytest output capture.""" + capman = config.pluginmanager.get_plugin("capturemanager") + if capman: + with capman.global_and_fixture_disabled(): + yield + else: + yield + + +def pytest_configure(config: pytest.Config): + """Configure pytest session.""" + with _allow_stderr_output(config): + _enable_fallback_compute() + + # Initialize Spark session eagerly, so it is available even when + # SparkSession.builder.getOrCreate() is used. For DB Connect 15+, + # we validate version compatibility with the remote cluster. 
+ if hasattr(DatabricksSession.builder, "validateSession"): + DatabricksSession.builder.validateSession().getOrCreate() + else: + DatabricksSession.builder.getOrCreate() diff --git a/acceptance/bundle/templates/default-python/classic/output/my_default_python/tests/main_test.py b/acceptance/bundle/templates/default-python/classic/output/my_default_python/tests/main_test.py deleted file mode 100644 index dc449154a6..0000000000 --- a/acceptance/bundle/templates/default-python/classic/output/my_default_python/tests/main_test.py +++ /dev/null @@ -1,6 +0,0 @@ -from my_default_python.main import get_taxis, get_spark - - -def test_main(): - taxis = get_taxis(get_spark()) - assert taxis.count() > 5 diff --git a/acceptance/bundle/templates/default-python/classic/output/my_default_python/tests/sample_taxis_test.py b/acceptance/bundle/templates/default-python/classic/output/my_default_python/tests/sample_taxis_test.py new file mode 100644 index 0000000000..a782015363 --- /dev/null +++ b/acceptance/bundle/templates/default-python/classic/output/my_default_python/tests/sample_taxis_test.py @@ -0,0 +1,8 @@ +from databricks.sdk.runtime import spark +from pyspark.sql import DataFrame +from shared import taxis + + +def test_find_all_taxis(): + results = taxis.find_all_taxis() + assert results.count() > 5 diff --git a/acceptance/bundle/templates/default-python/combinations/classic/output.txt b/acceptance/bundle/templates/default-python/combinations/classic/output.txt index f1a50c0b49..f192544aa5 100644 --- a/acceptance/bundle/templates/default-python/combinations/classic/output.txt +++ b/acceptance/bundle/templates/default-python/combinations/classic/output.txt @@ -1,8 +1,12 @@ >>> [CLI] bundle init default-python --config-file ./input.json - Welcome to the default Python template for Databricks Asset Bundles! -Workspace to use (auto-detected, edit in 'X[UNIQUE_NAME]/databricks.yml'): [DATABRICKS_URL] + +Please answer the below to tailor your project to your preferences. 
+You can always change your mind and change your configuration in the databricks.yml file later. + +Note that [DATABRICKS_URL] is used for initialization +(see https://docs.databricks.com/dev-tools/cli/profiles.html for how to change your profile). ✨ Your new project has been created in the 'X[UNIQUE_NAME]' directory! diff --git a/acceptance/bundle/templates/default-python/combinations/serverless/output.txt b/acceptance/bundle/templates/default-python/combinations/serverless/output.txt index fcc3c931ad..ee551a5b16 100644 --- a/acceptance/bundle/templates/default-python/combinations/serverless/output.txt +++ b/acceptance/bundle/templates/default-python/combinations/serverless/output.txt @@ -1,8 +1,12 @@ >>> [CLI] bundle init default-python --config-file ./input.json - Welcome to the default Python template for Databricks Asset Bundles! -Workspace to use (auto-detected, edit in 'X[UNIQUE_NAME]/databricks.yml'): [DATABRICKS_URL] + +Please answer the below to tailor your project to your preferences. +You can always change your mind and change your configuration in the databricks.yml file later. + +Note that [DATABRICKS_URL] is used for initialization +(see https://docs.databricks.com/dev-tools/cli/profiles.html for how to change your profile). ✨ Your new project has been created in the 'X[UNIQUE_NAME]' directory! diff --git a/acceptance/bundle/templates/default-python/fail-missing-uv/output.txt b/acceptance/bundle/templates/default-python/fail-missing-uv/output.txt index d9c70acfe6..802fea4829 100644 --- a/acceptance/bundle/templates/default-python/fail-missing-uv/output.txt +++ b/acceptance/bundle/templates/default-python/fail-missing-uv/output.txt @@ -1,8 +1,12 @@ >>> [CLI] bundle init default-python --config-file ./input.json --output-dir output - Welcome to the default Python template for Databricks Asset Bundles! 
-Workspace to use (auto-detected, edit in 'fail_missing_uv/databricks.yml'): [DATABRICKS_URL] + +Please answer the below to tailor your project to your preferences. +You can always change your mind and change your configuration in the databricks.yml file later. + +Note that [DATABRICKS_URL] is used for initialization +(see https://docs.databricks.com/dev-tools/cli/profiles.html for how to change your profile). ✨ Your new project has been created in the 'fail_missing_uv' directory! @@ -10,6 +14,8 @@ Please refer to the README.md file for "getting started" instructions. See also the documentation at https://docs.databricks.com/dev-tools/bundles/index.html. >>> [CLI] bundle validate +Error: path [TEST_TMP_DIR]/output/dist/*.whl is not contained in sync root path + Name: fail_missing_uv Target: dev Workspace: @@ -17,12 +23,6 @@ Workspace: User: [USERNAME] Path: /Workspace/Users/[USERNAME]/.bundle/fail_missing_uv/dev -Validation OK! - ->>> [CLI] bundle deploy -Building python_artifact... -Error: build failed python_artifact, error: exit status 127, output: bash: uv: command not found - - +Found 1 error Exit code: 1 diff --git a/acceptance/bundle/templates/default-python/integration_classic/out.validate.dev.json b/acceptance/bundle/templates/default-python/integration_classic/out.validate.dev.json deleted file mode 100644 index ac1ea3922c..0000000000 --- a/acceptance/bundle/templates/default-python/integration_classic/out.validate.dev.json +++ /dev/null @@ -1,166 +0,0 @@ -{ - "artifacts": { - "python_artifact": { - "build": "uv build --wheel", - "dynamic_version": true, - "files": [ - { - "source": "[TEST_TMP_DIR]/project_name_[UNIQUE_NAME]/dist/*.whl" - } - ], - "path": "[TEST_TMP_DIR]/project_name_[UNIQUE_NAME]", - "type": "whl" - } - }, - "bundle": { - "deployment": { - "lock": { - "enabled": false - } - }, - "environment": "dev", - "git": { - "bundle_root_path": "." 
- }, - "mode": "development", - "name": "project_name_[UNIQUE_NAME]", - "target": "dev", - "uuid": "[UUID]" - }, - "include": [ - "resources/project_name_[UNIQUE_NAME].job.yml", - "resources/project_name_[UNIQUE_NAME].pipeline.yml" - ], - "presets": { - "artifacts_dynamic_version": true, - "jobs_max_concurrent_runs": 4, - "name_prefix": "[dev [USERNAME]] ", - "pipelines_development": true, - "tags": { - "dev": "[USERNAME]" - }, - "trigger_pause_status": "PAUSED" - }, - "resources": { - "jobs": { - "project_name_[UNIQUE_NAME]_job": { - "deployment": { - "kind": "BUNDLE", - "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/dev/state/metadata.json" - }, - "edit_mode": "UI_LOCKED", - "format": "MULTI_TASK", - "job_clusters": [ - { - "job_cluster_key": "job_cluster", - "new_cluster": { - "autoscale": { - "max_workers": 4, - "min_workers": 1 - }, - "data_security_mode": "SINGLE_USER", - "node_type_id": "[NODE_TYPE_ID]", - "num_workers": 0, - "spark_version": "15.4.x-scala2.12" - } - } - ], - "max_concurrent_runs": 4, - "name": "[dev [USERNAME]] project_name_[UNIQUE_NAME]_job", - "permissions": [], - "queue": { - "enabled": true - }, - "tags": { - "dev": "[USERNAME]" - }, - "tasks": [ - { - "depends_on": [ - { - "task_key": "refresh_pipeline" - } - ], - "job_cluster_key": "job_cluster", - "libraries": [ - { - "whl": "dist/*.whl" - } - ], - "python_wheel_task": { - "entry_point": "main", - "package_name": "project_name_[UNIQUE_NAME]" - }, - "task_key": "main_task" - }, - { - "job_cluster_key": "job_cluster", - "notebook_task": { - "notebook_path": "/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/dev/files/src/notebook", - "source": "WORKSPACE" - }, - "task_key": "notebook_task" - }, - { - "depends_on": [ - { - "task_key": "notebook_task" - } - ], - "pipeline_task": { - "pipeline_id": "${resources.pipelines.project_name_[UNIQUE_NAME]_pipeline.id}" - }, - "task_key": "refresh_pipeline" - } - ], - "trigger": { - 
"pause_status": "PAUSED", - "periodic": { - "interval": 1, - "unit": "DAYS" - } - } - } - }, - "pipelines": { - "project_name_[UNIQUE_NAME]_pipeline": { - "channel": "CURRENT", - "configuration": { - "bundle.sourcePath": "/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/dev/files/src" - }, - "deployment": { - "kind": "BUNDLE", - "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/dev/state/metadata.json" - }, - "development": true, - "edition": "ADVANCED", - "libraries": [ - { - "notebook": { - "path": "/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/dev/files/src/pipeline" - } - } - ], - "name": "[dev [USERNAME]] project_name_[UNIQUE_NAME]_pipeline", - "permissions": [], - "schema": "project_name_[UNIQUE_NAME]_dev", - "tags": { - "dev": "[USERNAME]" - } - } - } - }, - "sync": { - "paths": [ - "." - ] - }, - "workspace": { - "artifact_path": "/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/dev/artifacts", - "file_path": "/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/dev/files", - "host": "[DATABRICKS_URL]", - "resource_path": "/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/dev/resources", - "root_path": "/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/dev", - "state_path": "/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/dev/state" - } -} diff --git a/acceptance/bundle/templates/default-python/integration_classic/output.txt b/acceptance/bundle/templates/default-python/integration_classic/output.txt index 7c98fbc22f..a0fdf54723 100644 --- a/acceptance/bundle/templates/default-python/integration_classic/output.txt +++ b/acceptance/bundle/templates/default-python/integration_classic/output.txt @@ -3,9 +3,13 @@ [UV_PYTHON] >>> [CLI] bundle init default-python --config-file ./input.json --output-dir . - Welcome to the default Python template for Databricks Asset Bundles! 
-Workspace to use (auto-detected, edit in 'project_name_[UNIQUE_NAME]/databricks.yml'): [DATABRICKS_URL] + +Please answer the below to tailor your project to your preferences. +You can always change your mind and change your configuration in the databricks.yml file later. + +Note that [DATABRICKS_URL] is used for initialization +(see https://docs.databricks.com/dev-tools/cli/profiles.html for how to change your profile). ✨ Your new project has been created in the 'project_name_[UNIQUE_NAME]' directory! @@ -13,408 +17,19 @@ Please refer to the README.md file for "getting started" instructions. See also the documentation at https://docs.databricks.com/dev-tools/bundles/index.html. >>> [CLI] bundle validate -t dev -Name: project_name_[UNIQUE_NAME] -Target: dev -Workspace: - Host: [DATABRICKS_URL] - User: [USERNAME] - Path: /Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/dev +Error: path [TEST_TMP_DIR]/dist/*.whl is not contained in sync root path -Validation OK! - ->>> [CLI] bundle validate -t dev -o json - ->>> [CLI] bundle deploy -t dev -Building python_artifact... -Uploading .databricks/bundle/dev/patched_wheels/python_artifact_project_name_[UNIQUE_NAME]/project_name_[UNIQUE_NAME]-0.0.1+[NUMID]-py3-none-any.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/dev/files... -Deploying resources... -Updating deployment state... -Deployment complete! 
- ->>> [CLI] bundle summary -t dev Name: project_name_[UNIQUE_NAME] Target: dev Workspace: Host: [DATABRICKS_URL] User: [USERNAME] Path: /Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/dev -Resources: - Jobs: - project_name_[UNIQUE_NAME]_job: - Name: [dev [USERNAME]] project_name_[UNIQUE_NAME]_job - URL: [DATABRICKS_URL]/jobs/[NUMID] - Pipelines: - project_name_[UNIQUE_NAME]_pipeline: - Name: [dev [USERNAME]] project_name_[UNIQUE_NAME]_pipeline - URL: [DATABRICKS_URL]/pipelines/[UUID] - ->>> [CLI] bundle summary -t dev -o json ->>> diff.py ../out.validate.dev.json ../out.summary.dev.json ---- ../out.validate.dev.json -+++ ../out.summary.dev.json -@@ -51,4 +51,5 @@ - "edit_mode": "UI_LOCKED", - "format": "MULTI_TASK", -+ "id": "[NUMID]", - "job_clusters": [ - { -@@ -120,5 +121,6 @@ - "unit": "DAYS" - } -- } -+ }, -+ "url": "[DATABRICKS_URL]/jobs/[NUMID]" - } - }, -@@ -135,4 +137,5 @@ - "development": true, - "edition": "ADVANCED", -+ "id": "[UUID]", - "libraries": [ - { -@@ -147,5 +150,6 @@ - "tags": { - "dev": "[USERNAME]" -- } -+ }, -+ "url": "[DATABRICKS_URL]/pipelines/[UUID]" - } - } +Found 1 error >>> [CLI] bundle destroy -t dev --auto-approve -The following resources will be deleted: - delete job project_name_[UNIQUE_NAME]_job - delete pipeline project_name_[UNIQUE_NAME]_pipeline - -This action will result in the deletion of the following Lakeflow Declarative Pipelines along with the -Streaming Tables (STs) and Materialized Views (MVs) managed by them: - delete pipeline project_name_[UNIQUE_NAME]_pipeline - -All files and directories at the following location will be deleted: /Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/dev - -Deleting files... -Destroy complete! - ->>> [CLI] bundle validate -t prod -Name: project_name_[UNIQUE_NAME] -Target: prod -Workspace: - Host: [DATABRICKS_URL] - User: [USERNAME] - Path: /Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/prod - -Validation OK! 
- ->>> [CLI] bundle validate -t prod -o json - ->>> diff.py ../out.validate.dev.json ../out.validate.prod.json ---- ../out.validate.dev.json -+++ ../out.validate.prod.json -@@ -3,5 +3,4 @@ - "python_artifact": { - "build": "uv build --wheel", -- "dynamic_version": true, - "files": [ - { -@@ -14,16 +13,11 @@ - }, - "bundle": { -- "deployment": { -- "lock": { -- "enabled": false -- } -- }, -- "environment": "dev", -+ "environment": "prod", - "git": { - "bundle_root_path": "." - }, -- "mode": "development", -+ "mode": "production", - "name": "project_name_[UNIQUE_NAME]", -- "target": "dev", -+ "target": "prod", - "uuid": "[UUID]" - }, -@@ -32,14 +26,10 @@ - "resources/project_name_[UNIQUE_NAME].pipeline.yml" - ], -- "presets": { -- "artifacts_dynamic_version": true, -- "jobs_max_concurrent_runs": 4, -- "name_prefix": "[dev [USERNAME]] ", -- "pipelines_development": true, -- "tags": { -- "dev": "[USERNAME]" -- }, -- "trigger_pause_status": "PAUSED" -- }, -+ "permissions": [ -+ { -+ "level": "CAN_MANAGE", -+ "service_principal_name": "[USERNAME]" -+ } -+ ], - "resources": { - "jobs": { -@@ -47,5 +37,5 @@ - "deployment": { - "kind": "BUNDLE", -- "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/dev/state/metadata.json" -+ "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/prod/state/metadata.json" - }, - "edit_mode": "UI_LOCKED", -@@ -66,12 +56,9 @@ - } - ], -- "max_concurrent_runs": 4, -- "name": "[dev [USERNAME]] project_name_[UNIQUE_NAME]_job", -+ "max_concurrent_runs": 1, -+ "name": "project_name_[UNIQUE_NAME]_job", - "permissions": [], - "queue": { - "enabled": true -- }, -- "tags": { -- "dev": "[USERNAME]" - }, - "tasks": [ -@@ -97,5 +84,5 @@ - "job_cluster_key": "job_cluster", - "notebook_task": { -- "notebook_path": "/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/dev/files/src/notebook", -+ "notebook_path": 
"/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/prod/files/src/notebook", - "source": "WORKSPACE" - }, -@@ -115,5 +102,5 @@ - ], - "trigger": { -- "pause_status": "PAUSED", -+ "pause_status": "UNPAUSED", - "periodic": { - "interval": 1, -@@ -127,25 +114,21 @@ - "channel": "CURRENT", - "configuration": { -- "bundle.sourcePath": "/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/dev/files/src" -+ "bundle.sourcePath": "/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/prod/files/src" - }, - "deployment": { - "kind": "BUNDLE", -- "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/dev/state/metadata.json" -+ "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/prod/state/metadata.json" - }, -- "development": true, - "edition": "ADVANCED", - "libraries": [ - { - "notebook": { -- "path": "/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/dev/files/src/pipeline" -+ "path": "/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/prod/files/src/pipeline" - } - } - ], -- "name": "[dev [USERNAME]] project_name_[UNIQUE_NAME]_pipeline", -+ "name": "project_name_[UNIQUE_NAME]_pipeline", - "permissions": [], -- "schema": "project_name_[UNIQUE_NAME]_dev", -- "tags": { -- "dev": "[USERNAME]" -- } -+ "schema": "project_name_[UNIQUE_NAME]_prod" - } - } -@@ -157,10 +140,10 @@ - }, - "workspace": { -- "artifact_path": "/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/dev/artifacts", -- "file_path": "/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/dev/files", -+ "artifact_path": "/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/prod/artifacts", -+ "file_path": "/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/prod/files", - "host": "[DATABRICKS_URL]", -- "resource_path": "/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/dev/resources", -- "root_path": 
"/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/dev", -- "state_path": "/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/dev/state" -+ "resource_path": "/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/prod/resources", -+ "root_path": "/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/prod", -+ "state_path": "/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/prod/state" - } - } - ->>> [CLI] bundle deploy -t prod -Building python_artifact... -Uploading dist/project_name_[UNIQUE_NAME]-0.0.1-py3-none-any.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/prod/files... -Deploying resources... -Updating deployment state... -Deployment complete! - ->>> [CLI] bundle summary -t prod -Name: project_name_[UNIQUE_NAME] -Target: prod -Workspace: - Host: [DATABRICKS_URL] - User: [USERNAME] - Path: /Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/prod -Resources: - Jobs: - project_name_[UNIQUE_NAME]_job: - Name: project_name_[UNIQUE_NAME]_job - URL: [DATABRICKS_URL]/jobs/[NUMID] - Pipelines: - project_name_[UNIQUE_NAME]_pipeline: - Name: project_name_[UNIQUE_NAME]_pipeline - URL: [DATABRICKS_URL]/pipelines/[UUID] - ->>> [CLI] bundle summary -t prod -o json - ->>> diff.py ../out.summary.dev.json ../out.summary.prod.json ---- ../out.summary.dev.json -+++ ../out.summary.prod.json -@@ -3,5 +3,4 @@ - "python_artifact": { - "build": "uv build --wheel", -- "dynamic_version": true, - "files": [ - { -@@ -14,16 +13,11 @@ - }, - "bundle": { -- "deployment": { -- "lock": { -- "enabled": false -- } -- }, -- "environment": "dev", -+ "environment": "prod", - "git": { - "bundle_root_path": "." 
- }, -- "mode": "development", -+ "mode": "production", - "name": "project_name_[UNIQUE_NAME]", -- "target": "dev", -+ "target": "prod", - "uuid": "[UUID]" - }, -@@ -32,14 +26,10 @@ - "resources/project_name_[UNIQUE_NAME].pipeline.yml" - ], -- "presets": { -- "artifacts_dynamic_version": true, -- "jobs_max_concurrent_runs": 4, -- "name_prefix": "[dev [USERNAME]] ", -- "pipelines_development": true, -- "tags": { -- "dev": "[USERNAME]" -- }, -- "trigger_pause_status": "PAUSED" -- }, -+ "permissions": [ -+ { -+ "level": "CAN_MANAGE", -+ "service_principal_name": "[USERNAME]" -+ } -+ ], - "resources": { - "jobs": { -@@ -47,5 +37,5 @@ - "deployment": { - "kind": "BUNDLE", -- "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/dev/state/metadata.json" -+ "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/prod/state/metadata.json" - }, - "edit_mode": "UI_LOCKED", -@@ -67,12 +57,9 @@ - } - ], -- "max_concurrent_runs": 4, -- "name": "[dev [USERNAME]] project_name_[UNIQUE_NAME]_job", -+ "max_concurrent_runs": 1, -+ "name": "project_name_[UNIQUE_NAME]_job", - "permissions": [], - "queue": { - "enabled": true -- }, -- "tags": { -- "dev": "[USERNAME]" - }, - "tasks": [ -@@ -98,5 +85,5 @@ - "job_cluster_key": "job_cluster", - "notebook_task": { -- "notebook_path": "/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/dev/files/src/notebook", -+ "notebook_path": "/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/prod/files/src/notebook", - "source": "WORKSPACE" - }, -@@ -116,5 +103,5 @@ - ], - "trigger": { -- "pause_status": "PAUSED", -+ "pause_status": "UNPAUSED", - "periodic": { - "interval": 1, -@@ -129,11 +116,10 @@ - "channel": "CURRENT", - "configuration": { -- "bundle.sourcePath": "/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/dev/files/src" -+ "bundle.sourcePath": "/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/prod/files/src" - }, - "deployment": { 
- "kind": "BUNDLE", -- "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/dev/state/metadata.json" -+ "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/prod/state/metadata.json" - }, -- "development": true, - "edition": "ADVANCED", - "id": "[UUID]", -@@ -141,14 +127,11 @@ - { - "notebook": { -- "path": "/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/dev/files/src/pipeline" -+ "path": "/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/prod/files/src/pipeline" - } - } - ], -- "name": "[dev [USERNAME]] project_name_[UNIQUE_NAME]_pipeline", -+ "name": "project_name_[UNIQUE_NAME]_pipeline", - "permissions": [], -- "schema": "project_name_[UNIQUE_NAME]_dev", -- "tags": { -- "dev": "[USERNAME]" -- }, -+ "schema": "project_name_[UNIQUE_NAME]_prod", - "url": "[DATABRICKS_URL]/pipelines/[UUID]" - } -@@ -161,10 +144,10 @@ - }, - "workspace": { -- "artifact_path": "/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/dev/artifacts", -- "file_path": "/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/dev/files", -+ "artifact_path": "/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/prod/artifacts", -+ "file_path": "/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/prod/files", - "host": "[DATABRICKS_URL]", -- "resource_path": "/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/dev/resources", -- "root_path": "/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/dev", -- "state_path": "/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/dev/state" -+ "resource_path": "/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/prod/resources", -+ "root_path": "/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/prod", -+ "state_path": "/Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/prod/state" - } - } - ->>> [CLI] bundle destroy -t prod --auto-approve -The following 
resources will be deleted: - delete job project_name_[UNIQUE_NAME]_job - delete pipeline project_name_[UNIQUE_NAME]_pipeline - -This action will result in the deletion of the following Lakeflow Declarative Pipelines along with the -Streaming Tables (STs) and Materialized Views (MVs) managed by them: - delete pipeline project_name_[UNIQUE_NAME]_pipeline +Error: path [TEST_TMP_DIR]/dist/*.whl is not contained in sync root path -All files and directories at the following location will be deleted: /Workspace/Users/[USERNAME]/.bundle/project_name_[UNIQUE_NAME]/prod -Deleting files... -Destroy complete! +Exit code: 1 diff --git a/acceptance/bundle/templates/default-python/no-uc/output.txt b/acceptance/bundle/templates/default-python/no-uc/output.txt index 6abf52cf09..3fc221b818 100644 --- a/acceptance/bundle/templates/default-python/no-uc/output.txt +++ b/acceptance/bundle/templates/default-python/no-uc/output.txt @@ -1,10 +1,12 @@ >>> [CLI] bundle init default-python --config-file ./input.json --output-dir output - Welcome to the default Python template for Databricks Asset Bundles! -Workspace to use (auto-detected, edit in 'my_default_python/databricks.yml'): [DATABRICKS_URL] -✨ Your new project has been created in the 'my_default_python' directory! +Please answer the below to tailor your project to your preferences. +You can always change your mind and change your configuration in the databricks.yml file later. + +Note that [DATABRICKS_URL] is used for initialization +(see https://docs.databricks.com/dev-tools/cli/profiles.html for how to change your profile). +Error: template: :1:2: executing "" at : error calling default_catalog: Unity Catalog is not available for feature tier STANDARD_TIER. -Please refer to the README.md file for "getting started" instructions. -See also the documentation at https://docs.databricks.com/dev-tools/bundles/index.html. 
+Exit code: 1 diff --git a/acceptance/bundle/templates/default-python/serverless-customcatalog/output.txt b/acceptance/bundle/templates/default-python/serverless-customcatalog/output.txt index a6a92dfd4e..9f9a0c02a4 100644 --- a/acceptance/bundle/templates/default-python/serverless-customcatalog/output.txt +++ b/acceptance/bundle/templates/default-python/serverless-customcatalog/output.txt @@ -1,8 +1,12 @@ >>> [CLI] bundle init default-python --config-file [TESTROOT]/bundle/templates/default-python/serverless-customcatalog/../serverless/input.json --output-dir output - Welcome to the default Python template for Databricks Asset Bundles! -Workspace to use (auto-detected, edit in 'my_default_python/databricks.yml'): [DATABRICKS_URL] + +Please answer the below to tailor your project to your preferences. +You can always change your mind and change your configuration in the databricks.yml file later. + +Note that [DATABRICKS_URL] is used for initialization +(see https://docs.databricks.com/dev-tools/cli/profiles.html for how to change your profile). ✨ Your new project has been created in the 'my_default_python' directory! @@ -10,13 +14,47 @@ Please refer to the README.md file for "getting started" instructions. See also the documentation at https://docs.databricks.com/dev-tools/bundles/index.html. 
>>> diff.py [TESTROOT]/bundle/templates/default-python/serverless-customcatalog/../serverless/output output/ ---- [TESTROOT]/bundle/templates/default-python/serverless-customcatalog/../serverless/output/my_default_python/resources/my_default_python.pipeline.yml -+++ output/my_default_python/resources/my_default_python.pipeline.yml -@@ -4,6 +4,5 @@ - my_default_python_pipeline: - name: my_default_python_pipeline +--- [TESTROOT]/bundle/templates/default-python/serverless-customcatalog/../serverless/output/my_default_python/databricks.yml ++++ output/my_default_python/databricks.yml +@@ -32,5 +32,5 @@ + host: [DATABRICKS_URL] + variables: +- catalog: hive_metastore ++ catalog: customcatalog + schema: ${workspace.current_user.short_name} + prod: +@@ -41,5 +41,5 @@ + root_path: /Workspace/Users/[USERNAME]/.bundle/${bundle.name}/${bundle.target} + variables: +- catalog: hive_metastore ++ catalog: customcatalog + schema: prod + permissions: +--- [TESTROOT]/bundle/templates/default-python/serverless-customcatalog/../serverless/output/my_default_python/resources/default_python_etl.pipeline.yml ++++ output/my_default_python/resources/default_python_etl.pipeline.yml +@@ -5,6 +5,5 @@ + default_python_etl: + name: default_python_etl - ## Catalog is required for serverless compute - catalog: main + catalog: customcatalog schema: my_default_python_${bundle.target} serverless: true +--- [TESTROOT]/bundle/templates/default-python/serverless-customcatalog/../serverless/output/my_default_python/src/default_python_etl/explorations/sample_exploration.ipynb ++++ output/my_default_python/src/default_python_etl/explorations/sample_exploration.ipynb +@@ -38,5 +38,5 @@ + "# !!! Before performing any data analysis, make sure to run the pipeline to materialize the sample datasets. 
The tables referenced in this notebook depend on that step./n", + "/n", +- "display(spark.sql(/"SELECT * FROM hive_metastore.[USERNAME].sample_trips_jan_01_1034/"))" ++ "display(spark.sql(/"SELECT * FROM customcatalog.[USERNAME].sample_trips_jan_01_1034/"))" + ] + } +--- [TESTROOT]/bundle/templates/default-python/serverless-customcatalog/../serverless/output/my_default_python/src/sample_python_file.py ++++ output/my_default_python/src/sample_python_file.py +@@ -5,5 +5,5 @@ + def main(): + parser = argparse.ArgumentParser() +- parser.add_argument("--catalog", default="hive_metastore") ++ parser.add_argument("--catalog", default="customcatalog") + parser.add_argument("--schema", default="default") + args = parser.parse_args() diff --git a/acceptance/bundle/templates/default-python/serverless/output.txt b/acceptance/bundle/templates/default-python/serverless/output.txt index 930e756de7..980d6786c6 100644 --- a/acceptance/bundle/templates/default-python/serverless/output.txt +++ b/acceptance/bundle/templates/default-python/serverless/output.txt @@ -1,8 +1,12 @@ >>> [CLI] bundle init default-python --config-file ./input.json --output-dir output - Welcome to the default Python template for Databricks Asset Bundles! -Workspace to use (auto-detected, edit in 'my_default_python/databricks.yml'): [DATABRICKS_URL] + +Please answer the below to tailor your project to your preferences. +You can always change your mind and change your configuration in the databricks.yml file later. + +Note that [DATABRICKS_URL] is used for initialization +(see https://docs.databricks.com/dev-tools/cli/profiles.html for how to change your profile). ✨ Your new project has been created in the 'my_default_python' directory! 
diff --git a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/.vscode/extensions.json b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/.vscode/extensions.json index 5d15eba363..1f39c33087 100644 --- a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/.vscode/extensions.json +++ b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/.vscode/extensions.json @@ -1,7 +1,6 @@ { "recommendations": [ "databricks.databricks", - "ms-python.vscode-pylance", "redhat.vscode-yaml" ] } diff --git a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/.vscode/settings.json b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/.vscode/settings.json index 8ee87c30d4..d8468d7b60 100644 --- a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/.vscode/settings.json +++ b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/.vscode/settings.json @@ -1,16 +1,39 @@ { - "python.analysis.stubPath": ".vscode", "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------", "python.testing.pytestArgs": [ "." ], - "python.testing.unittestEnabled": false, - "python.testing.pytestEnabled": true, - "python.analysis.extraPaths": ["src"], "files.exclude": { "**/*.egg-info": true, "**/__pycache__": true, ".pytest_cache": true, + "dist": true, + }, + "files.associations": { + "**/.gitkeep": "markdown" + } + + // Pylance settings (VS Code) + // Set typeCheckingMode to "basic" to enable type checking! 
+ "python.analysis.typeCheckingMode": "off", + "python.analysis.extraPaths": ["src", "lib", "resources"], + "python.analysis.diagnosticMode": "workspace", + "python.analysis.stubPath": ".vscode", + + // Pyright settings (Cursor) + // Set typeCheckingMode to "basic" to enable type checking! + "cursorpyright.analysis.typeCheckingMode": "off", + "cursorpyright.analysis.extraPaths": ["src", "lib", "resources"], + "cursorpyright.analysis.diagnosticMode": "workspace", + "cursorpyright.analysis.stubPath": ".vscode", + + // General Python settings + "python.defaultInterpreterPath": "./.venv/bin/python", + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true, + "[python]": { + "editor.defaultFormatter": "ms-python.python", + "editor.formatOnSave": true, }, } diff --git a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/README.md b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/README.md index f3b5d153b2..b373a1708d 100644 --- a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/README.md +++ b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/README.md @@ -1,19 +1,46 @@ # my_default_python The 'my_default_python' project was generated by using the default-python template. +For documentation on the Databricks Asset Bundles format use for this project, +and for CI/CD configuration, see https://docs.databricks.com/aws/en/dev-tools/bundles. + +* `src/`: Python source code for this project. +* `src/shared`: Shared source code across all jobs/pipelines/etc. +* `src/default_python_etl`: Python source code for the default_python_etl pipeline. +* `resources/`: Resource configurations (jobs, pipelines, etc.) +* `tests/`: Unit tests. +* `fixtures/`: Fixtures for data sets (primarily used for testing). ## Getting started -0. 
Install UV: https://docs.astral.sh/uv/getting-started/installation/ +Choose how you want to work on this project: + +(a) Directly in your Databricks workspace, see + https://docs.databricks.com/dev-tools/bundles/workspace. + +(b) Locally with an IDE like Cursor or VS Code, see + https://docs.databricks.com/vscode-ext. + +(c) With command line tools, see https://docs.databricks.com/dev-tools/cli/databricks-cli.html + +Dependencies for this project should be installed using uv: + +* Make sure you have the UV package manager installed. + It's an alternative to tools like pip: https://docs.astral.sh/uv/getting-started/installation/. +* Run `uv sync --dev` to install the project's dependencies. -1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html -2. Authenticate to your Databricks workspace, if you have not done so already: +# Using this project using the CLI + +The Databricks workspace and IDE extensions provide a graphical interface for working +with this project. It's also possible to interact with it directly using the CLI: + +1. Authenticate to your Databricks workspace, if you have not done so already: ``` $ databricks configure ``` -3. To deploy a development copy of this project, type: +2. To deploy a development copy of this project, type: ``` $ databricks bundle deploy --target dev ``` @@ -23,9 +50,9 @@ The 'my_default_python' project was generated by using the default-python templa This deploys everything that's defined for this project. For example, the default template would deploy a job called `[dev yourname] my_default_python_job` to your workspace. - You can find that job by opening your workpace and clicking on **Workflows**. + You can find that job by opening your workpace and clicking on **Jobs & Pipelines**. -4. Similarly, to deploy a production copy, type: +3. 
Similarly, to deploy a production copy, type: ``` $ databricks bundle deploy --target prod ``` @@ -35,17 +62,12 @@ The 'my_default_python' project was generated by using the default-python templa is paused when deploying in development mode (see https://docs.databricks.com/dev-tools/bundles/deployment-modes.html). -5. To run a job or pipeline, use the "run" command: +4. To run a job or pipeline, use the "run" command: ``` $ databricks bundle run ``` -6. Optionally, install the Databricks extension for Visual Studio code for local development from - https://docs.databricks.com/dev-tools/vscode-ext.html. It can configure your - virtual environment and setup Databricks Connect for running unit tests locally. - When not using these tools, consult your development environment's documentation - and/or the documentation for Databricks Connect for manually setting up your environment - (https://docs.databricks.com/en/dev-tools/databricks-connect/python/index.html). - -7. For documentation on the Databricks asset bundles format used - for this project, and for CI/CD configuration, see - https://docs.databricks.com/dev-tools/bundles/index.html. +5. Finally, to run tests locally, use `pytest`: + ``` + $ uv run pytest + ``` + diff --git a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/databricks.yml b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/databricks.yml index bdbe7080bc..23a8437a22 100644 --- a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/databricks.yml +++ b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/databricks.yml @@ -4,14 +4,21 @@ bundle: name: my_default_python uuid: [UUID] +include: + - resources/*.yml + - resources/*/*.yml + artifacts: python_artifact: type: whl build: uv build --wheel -include: - - resources/*.yml - - resources/*/*.yml +# Variable declarations. These variables are assigned in the dev/prod targets below. 
+variables: + catalog: + description: The catalog to use + schema: + description: The schema to use targets: dev: @@ -23,13 +30,18 @@ targets: default: true workspace: host: [DATABRICKS_URL] - + variables: + catalog: hive_metastore + schema: ${workspace.current_user.short_name} prod: mode: production workspace: host: [DATABRICKS_URL] # We explicitly deploy to /Workspace/Users/[USERNAME] to make sure we only have a single copy. root_path: /Workspace/Users/[USERNAME]/.bundle/${bundle.name}/${bundle.target} + variables: + catalog: hive_metastore + schema: prod permissions: - user_name: [USERNAME] level: CAN_MANAGE diff --git a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/fixtures/.gitkeep b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/fixtures/.gitkeep index fa25d2745e..77a906614c 100644 --- a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/fixtures/.gitkeep +++ b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/fixtures/.gitkeep @@ -1,22 +1,9 @@ -# Fixtures +# Test fixtures directory -This folder is reserved for fixtures, such as CSV files. - -Below is an example of how to load fixtures as a data frame: +Add JSON or CSV files here. 
In tests, use them with `load_fixture()`: ``` -import pandas as pd -import os - -def get_absolute_path(*relative_parts): - if 'dbutils' in globals(): - base_dir = os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) # type: ignore - path = os.path.normpath(os.path.join(base_dir, *relative_parts)) - return path if path.startswith("/Workspace") else "/Workspace" + path - else: - return os.path.join(*relative_parts) - -csv_file = get_absolute_path("..", "fixtures", "mycsv.csv") -df = pd.read_csv(csv_file) -display(df) +def test_using_fixture(load_fixture): + data = load_fixture("my_data.json") + assert len(data) >= 1 ``` diff --git a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/pyproject.toml b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/pyproject.toml index 5049f8a3ea..7cd4a5cc86 100644 --- a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/pyproject.toml +++ b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/pyproject.toml @@ -4,30 +4,23 @@ version = "0.0.1" authors = [{ name = "[USERNAME]" }] requires-python = ">= 3.11" -[project.optional-dependencies] +[dependency-groups] dev = [ "pytest", - - # Code completion support for Lakeflow Declarative Pipelines, also install databricks-connect "databricks-dlt", # databricks-connect can be used to run parts of this project locally. - # See https://docs.databricks.com/dev-tools/databricks-connect.html. - # - # Note, databricks-connect is automatically installed if you're using Databricks - # extension for Visual Studio Code - # (https://docs.databricks.com/dev-tools/vscode-ext/dev-tasks/databricks-connect.html). - # - # To manually install databricks-connect, uncomment the line below to install a version - # of db-connect that corresponds to the Databricks Runtime version used for this project. 
- # See https://docs.databricks.com/dev-tools/databricks-connect.html - # "databricks-connect>=15.4,<15.5", + # Note that for local development, you should use a version that is not newer + # than the remote cluster or serverless compute you connect to. + # See also https://docs.databricks.com/dev-tools/databricks-connect.html. + "databricks-connect>=15.4,<15.5", ] [tool.pytest.ini_options] pythonpath = "src" testpaths = [ "tests", + "resources", ] [build-system] @@ -35,7 +28,8 @@ requires = ["hatchling"] build-backend = "hatchling.build" [tool.hatch.build.targets.wheel] -packages = ["src/my_default_python"] +packages = ["src"] +sources = ["src"] [project.scripts] -main = "my_default_python.main:main" +main = "sample_python_file:main" diff --git a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/resources/.gitkeep b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/resources/.gitkeep new file mode 100644 index 0000000000..3e09c14c18 --- /dev/null +++ b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/resources/.gitkeep @@ -0,0 +1 @@ +This folder is reserved for Databricks Asset Bundles resource definitions. 
diff --git a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/resources/my_default_python.pipeline.yml b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/resources/default_python_etl.pipeline.yml similarity index 52% rename from acceptance/bundle/templates/default-python/serverless/output/my_default_python/resources/my_default_python.pipeline.yml rename to acceptance/bundle/templates/default-python/serverless/output/my_default_python/resources/default_python_etl.pipeline.yml index 545a5ce556..185f474478 100644 --- a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/resources/my_default_python.pipeline.yml +++ b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/resources/default_python_etl.pipeline.yml @@ -1,15 +1,15 @@ + # The main pipeline for my_default_python resources: pipelines: - my_default_python_pipeline: - name: my_default_python_pipeline + default_python_etl: + name: default_python_etl ## Catalog is required for serverless compute catalog: main schema: my_default_python_${bundle.target} serverless: true - libraries: - - notebook: - path: ../src/pipeline.ipynb + root_path: ../src - configuration: - bundle.sourcePath: ${workspace.file_path}/src + libraries: + - glob: + include: ../src/default_python_etl/transformations/** diff --git a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/resources/my_default_python.job.yml b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/resources/my_default_python.job.yml deleted file mode 100644 index df74a62185..0000000000 --- a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/resources/my_default_python.job.yml +++ /dev/null @@ -1,45 +0,0 @@ -# The main job for my_default_python. 
-resources: - jobs: - my_default_python_job: - name: my_default_python_job - - trigger: - # Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger - periodic: - interval: 1 - unit: DAYS - - #email_notifications: - # on_failure: - # - your_email@example.com - - tasks: - - task_key: notebook_task - notebook_task: - notebook_path: ../src/notebook.ipynb - - - task_key: refresh_pipeline - depends_on: - - task_key: notebook_task - pipeline_task: - pipeline_id: ${resources.pipelines.my_default_python_pipeline.id} - - - task_key: main_task - depends_on: - - task_key: refresh_pipeline - environment_key: default - python_wheel_task: - package_name: my_default_python - entry_point: main - - # A list of task execution environment specifications that can be referenced by tasks of this job. - environments: - - environment_key: default - - # Full documentation of this spec can be found at: - # https://docs.databricks.com/api/workspace/jobs/create#environments-spec - spec: - client: "2" - dependencies: - - ../dist/*.whl diff --git a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/resources/sample_job.job.yml b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/resources/sample_job.job.yml new file mode 100644 index 0000000000..aedcdb71df --- /dev/null +++ b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/resources/sample_job.job.yml @@ -0,0 +1,49 @@ +# A sample job for my_default_python. 
+ +resources: + jobs: + sample_job: + name: sample_job + + trigger: + # Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger + periodic: + interval: 1 + unit: DAYS + + #email_notifications: + # on_failure: + # - your_email@example.com + + parameters: + - name: catalog + default: ${var.catalog} + - name: schema + default: ${var.schema} + + tasks: + - task_key: notebook_task + notebook_task: + notebook_path: ../src/sample_notebook.ipynb + environment_key: default + - task_key: python_file_task + depends_on: + - task_key: notebook_task + spark_python_task: + python_file: ../src/sample_python_file.py + environment_key: default + - task_key: refresh_pipeline + depends_on: + - task_key: notebook_task + pipeline_task: + pipeline_id: ${resources.pipelines.default_python_etl.id} + + environments: + - environment_key: default + spec: + client: "2" + dependencies: + # By default we just include the .whl file generated for the default_python package in src/. + # See https://docs.databricks.com/dev-tools/bundles/library-dependencies.html + # for more information on how to add other libraries. + - ../dist/*.whl diff --git a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/scratch/README.md b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/scratch/README.md deleted file mode 100644 index e6cfb81b46..0000000000 --- a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/scratch/README.md +++ /dev/null @@ -1,4 +0,0 @@ -# scratch - -This folder is reserved for personal, exploratory notebooks. -By default these are not committed to Git, as 'scratch' is listed in .gitignore. 
diff --git a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/scratch/exploration.ipynb b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/scratch/exploration.ipynb deleted file mode 100644 index a12773d4e8..0000000000 --- a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/scratch/exploration.ipynb +++ /dev/null @@ -1,61 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "[UUID]", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "import sys\n", - "\n", - "sys.path.append(\"../src\")\n", - "from my_default_python import main\n", - "\n", - "main.get_taxis(spark).show(10)" - ] - } - ], - "metadata": { - "application/vnd.databricks.v1+notebook": { - "dashboards": [], - "language": "python", - "notebookMetadata": { - "pythonIndentUnit": 2 - }, - "notebookName": "ipynb-notebook", - "widgets": {} - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.11.4" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/default_python_etl/README.md b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/default_python_etl/README.md new file mode 100644 index 0000000000..737b73cf43 --- /dev/null +++ b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/default_python_etl/README.md @@ -0,0 +1,22 @@ +# my_default_python + +This folder defines all source code for the 
my_default_python pipeline: + +- `explorations/`: Ad-hoc notebooks used to explore the data processed by this pipeline. +- `transformations/`: All dataset definitions and transformations. +- `utilities/` (optional): Utility functions and Python modules used in this pipeline. +- `data_sources/` (optional): View definitions describing the source data for this pipeline. + +## Getting Started + +To get started, go to the `transformations` folder -- most of the relevant source code lives there: + +* By convention, every dataset under `transformations` is in a separate file. +* Take a look at the sample under "sample_trips_jan_01_1034.py" to get familiar with the syntax. + Read more about the syntax at https://docs.databricks.com/dlt/python-ref.html. +* Use `Run file` to run and preview a single transformation. +* Use `Run pipeline` to run _all_ transformations in the entire pipeline. +* Use `+ Add` in the file browser to add a new data set definition. +* Use `Schedule` to run the pipeline on a schedule! + +For more tutorials and reference material, see https://docs.databricks.com/dlt. 
diff --git a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/default_python_etl/__init__.py b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/default_python_etl/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/default_python_etl/explorations/__init__.py b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/default_python_etl/explorations/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/default_python_etl/explorations/sample_exploration.ipynb b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/default_python_etl/explorations/sample_exploration.ipynb new file mode 100644 index 0000000000..7edb8fe518 --- /dev/null +++ b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/default_python_etl/explorations/sample_exploration.ipynb @@ -0,0 +1,63 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "[UUID]", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### Example Exploratory Notebook\n", + "\n", + "Use this notebook to explore the data generated by the pipeline in your preferred programming language.\n", + "\n", + "**Note**: This notebook is not executed as part of the pipeline." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "[UUID]", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "# !!! 
Before performing any data analysis, make sure to run the pipeline to materialize the sample datasets. The tables referenced in this notebook depend on that step.\n", + "\n", + "display(spark.sql(\"SELECT * FROM hive_metastore.[USERNAME].sample_trips_jan_01_1034\"))" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "computePreferences": null, + "dashboards": [], + "environmentMetadata": null, + "inputWidgetPreferences": null, + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, + "notebookName": "sample_exploration", + "widgets": {} + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/default_python_etl/transformations/__init__.py b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/default_python_etl/transformations/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/default_python_etl/transformations/sample_trips_jan_01_1034.py b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/default_python_etl/transformations/sample_trips_jan_01_1034.py new file mode 100644 index 0000000000..706b0b8952 --- /dev/null +++ b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/default_python_etl/transformations/sample_trips_jan_01_1034.py @@ -0,0 +1,15 @@ +import dlt +from pyspark.sql.functions import col +from default_python_etl.utilities import utils + + +# This file defines a sample transformation. +# Edit the sample below or add new transformations +# using "+ Add" in the file browser. 
+ + +@dlt.table +def sample_trips_jan_01_1034(): + return spark.read.table("samples.nyctaxi.trips").withColumn( + "trip_distance_km", utils.distance_km(col("trip_distance")) + ) diff --git a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/default_python_etl/transformations/sample_zones_jan_01_1034.py b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/default_python_etl/transformations/sample_zones_jan_01_1034.py new file mode 100644 index 0000000000..82209f7ce4 --- /dev/null +++ b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/default_python_etl/transformations/sample_zones_jan_01_1034.py @@ -0,0 +1,17 @@ +import dlt +from pyspark.sql.functions import col, sum + + +# This file defines a sample transformation. +# Edit the sample below or add new transformations +# using "+ Add" in the file browser. + + +@dlt.table +def sample_zones_jan_01_1034(): + # Read from the "sample_trips" table, then sum all the fares + return ( + spark.read.table(f"sample_trips_jan_01_1034") + .groupBy(col("pickup_zip")) + .agg(sum("fare_amount").alias("total_fare")) + ) diff --git a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/default_python_etl/utilities/__init__.py b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/default_python_etl/utilities/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/default_python_etl/utilities/utils.py b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/default_python_etl/utilities/utils.py new file mode 100644 index 0000000000..f0f4e940f7 --- /dev/null +++ b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/default_python_etl/utilities/utils.py @@ -0,0 +1,12 @@ +from pyspark.sql.functions import col, when + + +def 
distance_km(distance_col): + """Convert distance from miles to kilometers.""" + return distance_col * 1.60934 + + +def format_currency(amount_col): + """Format amount as currency.""" + return when(col(amount_col).isNotNull(), + col(amount_col).cast("decimal(10,2)")) diff --git a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/my_default_python/main.py b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/my_default_python/main.py deleted file mode 100644 index 5ae344c7e2..0000000000 --- a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/my_default_python/main.py +++ /dev/null @@ -1,25 +0,0 @@ -from pyspark.sql import SparkSession, DataFrame - - -def get_taxis(spark: SparkSession) -> DataFrame: - return spark.read.table("samples.nyctaxi.trips") - - -# Create a new Databricks Connect session. If this fails, -# check that you have configured Databricks Connect correctly. -# See https://docs.databricks.com/dev-tools/databricks-connect.html. 
-def get_spark() -> SparkSession: - try: - from databricks.connect import DatabricksSession - - return DatabricksSession.builder.getOrCreate() - except ImportError: - return SparkSession.builder.getOrCreate() - - -def main(): - get_taxis(get_spark()).show(5) - - -if __name__ == "__main__": - main() diff --git a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/pipeline.ipynb b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/pipeline.ipynb deleted file mode 100644 index 53148beff1..0000000000 --- a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/pipeline.ipynb +++ /dev/null @@ -1,90 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "[UUID]", - "showTitle": false, - "title": "" - } - }, - "source": [ - "# Lakeflow Declarative Pipeline\n", - "\n", - "This Lakeflow Declarative Pipeline definition is executed using a pipeline defined in resources/my_default_python.pipeline.yml." 
- ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "[UUID]", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "# Import DLT and src/my_default_python\n", - "import dlt\n", - "import sys\n", - "\n", - "sys.path.append(spark.conf.get(\"bundle.sourcePath\", \".\"))\n", - "from pyspark.sql.functions import expr\n", - "from my_default_python import main" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "[UUID]", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "@dlt.view\n", - "def taxi_raw():\n", - " return main.get_taxis(spark)\n", - "\n", - "\n", - "@dlt.table\n", - "def filtered_taxis():\n", - " return dlt.read(\"taxi_raw\").filter(expr(\"fare_amount < 30\"))" - ] - } - ], - "metadata": { - "application/vnd.databricks.v1+notebook": { - "dashboards": [], - "language": "python", - "notebookMetadata": { - "pythonIndentUnit": 2 - }, - "notebookName": "pipeline", - "widgets": {} - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.11.4" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/notebook.ipynb b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/sample_notebook.ipynb similarity index 88% rename from acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/notebook.ipynb rename to acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/sample_notebook.ipynb index 472ccb2190..aa609df200 100644 --- 
a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/notebook.ipynb +++ b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/sample_notebook.ipynb @@ -23,6 +23,7 @@ "metadata": {}, "outputs": [], "source": [ + "# Reload wheel file dependencies every time they are updated\n", "%load_ext autoreload\n", "%autoreload 2" ] @@ -44,9 +45,9 @@ }, "outputs": [], "source": [ - "from my_default_python import main\n", + "from shared import taxis\n", "\n", - "main.get_taxis(spark).show(10)" + "taxis.find_all_taxis().show(10)" ] } ], @@ -57,7 +58,7 @@ "notebookMetadata": { "pythonIndentUnit": 2 }, - "notebookName": "notebook", + "notebookName": "sample_notebook", "widgets": {} }, "kernelspec": { diff --git a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/sample_python_file.py b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/sample_python_file.py new file mode 100644 index 0000000000..719a0b71a3 --- /dev/null +++ b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/sample_python_file.py @@ -0,0 +1,19 @@ +import argparse +from datetime import datetime +from shared import taxis + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--catalog", default="hive_metastore") + parser.add_argument("--schema", default="default") + args = parser.parse_args() + + df = taxis.find_all_taxis() + + table_name = f"{args.catalog}.{args.schema}.taxis_jan_01_1034" + df.write.mode("overwrite").saveAsTable(table_name) + + print(f"Wrote {df.count()} taxi records to {table_name}") + +if __name__ == "__main__": + main() diff --git a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/shared/__init__.py b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/shared/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git 
a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/shared/taxis.py b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/shared/taxis.py new file mode 100644 index 0000000000..a7309cd4c5 --- /dev/null +++ b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/shared/taxis.py @@ -0,0 +1,7 @@ +from databricks.sdk.runtime import spark +from pyspark.sql import DataFrame + + +def find_all_taxis() -> DataFrame: + """Find all taxi data.""" + return spark.read.table("samples.nyctaxi.trips") diff --git a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/tests/conftest.py b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/tests/conftest.py new file mode 100644 index 0000000000..8037a4647c --- /dev/null +++ b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/tests/conftest.py @@ -0,0 +1,93 @@ +"""This file configures pytest. + +This file is in the root since it can be used for tests in any place in this +project, including tests under resources/. +""" + +import os, sys, pathlib +from contextlib import contextmanager + + +try: + from databricks.connect import DatabricksSession + from databricks.sdk import WorkspaceClient + from pyspark.sql import SparkSession + import pytest + import json + import csv + import os +except ImportError: + raise ImportError("Test dependencies not found.\n\nRun tests using 'uv run pytest'. See http://docs.astral.sh/uv to learn more about uv.") + + +@pytest.fixture() +def spark() -> SparkSession: + """Provide a SparkSession fixture for tests. + + Minimal example: + def test_uses_spark(spark): + df = spark.createDataFrame([(1,)], ["x"]) + assert df.count() == 1 + """ + return DatabricksSession.builder.getOrCreate() + +@pytest.fixture() +def load_fixture(spark: SparkSession): + """Provide a callable to load JSON or CSV from fixtures/ directory. 
+ + Example usage: + + def test_using_fixture(load_fixture): + data = load_fixture("my_data.json") + assert data.count() >= 1 + """ + def _loader(filename: str): + path = pathlib.Path(__file__).parent.parent / "fixtures" / filename + suffix = path.suffix.lower() + if suffix == ".json": + rows = json.loads(path.read_text()) + return spark.createDataFrame(rows) + if suffix == ".csv": + with path.open(newline="") as f: + rows = list(csv.DictReader(f)) + return spark.createDataFrame(rows) + raise ValueError(f"Unsupported fixture type for: {filename}") + return _loader + + +def _enable_fallback_compute(): + """Enable serverless compute if no compute is specified.""" + conf = WorkspaceClient().config + if conf.serverless_compute_id or conf.cluster_id or os.environ.get("SPARK_REMOTE"): + return + + url = "https://docs.databricks.com/dev-tools/databricks-connect/cluster-config" + print("☁️ no compute specified, falling back to serverless compute", file=sys.stderr) + print(f" see {url} for manual configuration", file=sys.stdout) + + os.environ["DATABRICKS_SERVERLESS_COMPUTE_ID"] = "auto" + + +@contextmanager +def _allow_stderr_output(config: pytest.Config): + """Temporarily disable pytest output capture.""" + capman = config.pluginmanager.get_plugin("capturemanager") + if capman: + with capman.global_and_fixture_disabled(): + yield + else: + yield + + +def pytest_configure(config: pytest.Config): + """Configure pytest session.""" + with _allow_stderr_output(config): + _enable_fallback_compute() + + # Initialize Spark session eagerly, so it is available even when + # SparkSession.builder.getOrCreate() is used. For DB Connect 15+, + # we validate version compatibility with the remote cluster. 
+ if hasattr(DatabricksSession.builder, "validateSession"): + DatabricksSession.builder.validateSession().getOrCreate() + else: + DatabricksSession.builder.getOrCreate() diff --git a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/tests/main_test.py b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/tests/main_test.py deleted file mode 100644 index dc449154a6..0000000000 --- a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/tests/main_test.py +++ /dev/null @@ -1,6 +0,0 @@ -from my_default_python.main import get_taxis, get_spark - - -def test_main(): - taxis = get_taxis(get_spark()) - assert taxis.count() > 5 diff --git a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/tests/sample_taxis_test.py b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/tests/sample_taxis_test.py new file mode 100644 index 0000000000..a782015363 --- /dev/null +++ b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/tests/sample_taxis_test.py @@ -0,0 +1,8 @@ +from databricks.sdk.runtime import spark +from pyspark.sql import DataFrame +from shared import taxis + + +def test_find_all_taxis(): + results = taxis.find_all_taxis() + assert results.count() > 5 diff --git a/acceptance/bundle/templates/experimental-jobs-as-code/output.txt b/acceptance/bundle/templates/experimental-jobs-as-code/output.txt index 706f1f9ff6..7400d572e0 100644 --- a/acceptance/bundle/templates/experimental-jobs-as-code/output.txt +++ b/acceptance/bundle/templates/experimental-jobs-as-code/output.txt @@ -2,121 +2,6 @@ >>> [CLI] bundle init experimental-jobs-as-code --config-file ./input.json --output-dir output Welcome to (EXPERIMENTAL) "Jobs as code" template for Databricks Asset Bundles! -Workspace to use (auto-detected, edit in 'my_jobs_as_code/databricks.yml'): [DATABRICKS_URL] +Error: failed to compute file content for __preamble.tmpl. 
variable "include_pipeline" not defined -✨ Your new project has been created in the 'my_jobs_as_code' directory! - -Please refer to the README.md file for "getting started" instructions. -See also the documentation at https://docs.databricks.com/dev-tools/bundles/index.html. - ->>> [CLI] bundle validate -t dev --output json -Warning: Ignoring Databricks CLI version constraint for development build. Required: >= 0.248.0, current: [DEV_VERSION] - -{ - "jobs": { - "my_jobs_as_code_job": { - "deployment": { - "kind": "BUNDLE", - "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/my_jobs_as_code/dev/state/metadata.json" - }, - "edit_mode": "UI_LOCKED", - "format": "MULTI_TASK", - "job_clusters": [ - { - "job_cluster_key": "job_cluster", - "new_cluster": { - "autoscale": { - "max_workers": 4, - "min_workers": 1 - }, - "data_security_mode": "SINGLE_USER", - "node_type_id": "[NODE_TYPE_ID]", - "num_workers": 0, - "spark_version": "15.4.x-scala2.12" - } - } - ], - "max_concurrent_runs": 4, - "name": "[dev [USERNAME]] my_jobs_as_code_job", - "permissions": [], - "queue": { - "enabled": true - }, - "tags": { - "dev": "[USERNAME]" - }, - "tasks": [ - { - "depends_on": [ - { - "task_key": "notebook_task" - } - ], - "job_cluster_key": "job_cluster", - "libraries": [ - { - "whl": "dist/*.whl" - } - ], - "python_wheel_task": { - "entry_point": "main", - "package_name": "my_jobs_as_code" - }, - "task_key": "main_task" - }, - { - "job_cluster_key": "job_cluster", - "notebook_task": { - "notebook_path": "/Workspace/Users/[USERNAME]/.bundle/my_jobs_as_code/dev/files/src/notebook", - "source": "WORKSPACE" - }, - "task_key": "notebook_task" - } - ], - "trigger": { - "pause_status": "PAUSED", - "periodic": { - "interval": 1, - "unit": "DAYS" - } - } - } - }, - "pipelines": { - "my_jobs_as_code_pipeline": { - "catalog": "catalog_name", - "channel": "CURRENT", - "configuration": { - "bundle.sourcePath": "/Workspace/Users/[USERNAME]/.bundle/my_jobs_as_code/dev/files/src" - }, - 
"deployment": { - "kind": "BUNDLE", - "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/my_jobs_as_code/dev/state/metadata.json" - }, - "development": true, - "edition": "ADVANCED", - "libraries": [ - { - "notebook": { - "path": "/Workspace/Users/[USERNAME]/.bundle/my_jobs_as_code/dev/files/src/dlt_pipeline" - } - } - ], - "name": "[dev [USERNAME]] my_jobs_as_code_pipeline", - "permissions": [], - "tags": { - "dev": "[USERNAME]" - }, - "target": "my_jobs_as_code_dev" - } - } -} - ->>> unzip -Z1 dist/my_jobs_as_code-0.0.1-py3-none-any.whl -my_jobs_as_code/__init__.py -my_jobs_as_code/main.py -my_jobs_as_code-0.0.1.dist-info/METADATA -my_jobs_as_code-0.0.1.dist-info/WHEEL -my_jobs_as_code-0.0.1.dist-info/entry_points.txt -my_jobs_as_code-0.0.1.dist-info/top_level.txt -my_jobs_as_code-0.0.1.dist-info/RECORD +Exit code: 1 diff --git a/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/README.md b/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/README.md deleted file mode 100644 index 8c429c6e53..0000000000 --- a/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/README.md +++ /dev/null @@ -1,58 +0,0 @@ -# my_jobs_as_code - -The 'my_jobs_as_code' project was generated by using the "Jobs as code" template. - -## Prerequisites - -1. Install Databricks CLI 0.238 or later. - See [Install or update the Databricks CLI](https://docs.databricks.com/en/dev-tools/cli/install.html). - -2. Install uv. See [Installing uv](https://docs.astral.sh/uv/getting-started/installation/). - We use uv to create a virtual environment and install the required dependencies. - -3. Authenticate to your Databricks workspace if you have not done so already: - ``` - $ databricks configure - ``` - -4. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from - https://docs.databricks.com/dev-tools/vscode-ext.html. 
Or read the "getting started" documentation for - **Databricks Connect** for instructions on running the included Python code from a different IDE. - -5. For documentation on the Databricks Asset Bundles format used - for this project, and for CI/CD configuration, see - https://docs.databricks.com/dev-tools/bundles/index.html. - -## Deploy and run jobs - -1. Create a new virtual environment and install the required dependencies: - ``` - $ uv sync - ``` - -2. To deploy the bundle to the development target: - ``` - $ databricks bundle deploy --target dev - ``` - - *(Note that "dev" is the default target, so the `--target` parameter is optional here.)* - - This deploys everything that's defined for this project. - For example, the default template would deploy a job called - `[dev yourname] my_jobs_as_code_job` to your workspace. - You can find that job by opening your workspace and clicking on **Workflows**. - -3. Similarly, to deploy a production copy, type: - ``` - $ databricks bundle deploy --target prod - ``` - - Note that the default job from the template has a schedule that runs every day - (defined in resources/my_jobs_as_code_job.py). The schedule - is paused when deploying in development mode (see [Databricks Asset Bundle deployment modes]( - https://docs.databricks.com/dev-tools/bundles/deployment-modes.html)). - -4. To run a job: - ``` - $ databricks bundle run - ``` diff --git a/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/databricks.yml b/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/databricks.yml deleted file mode 100644 index b09d99917e..0000000000 --- a/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/databricks.yml +++ /dev/null @@ -1,50 +0,0 @@ -# This is a Databricks asset bundle definition for my_jobs_as_code. -# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. 
-bundle: - name: my_jobs_as_code - uuid: [UUID] - databricks_cli_version: ">= 0.248.0" - -experimental: - python: - # Activate virtual environment before loading resources defined in Python. - # If disabled, defaults to using the Python interpreter available in the current shell. - venv_path: .venv - # Functions called to load resources defined in Python. See resources/__init__.py - resources: - - "resources:load_resources" - -artifacts: - default: - type: whl - path: . - # We use timestamp as Local version identifier (https://peps.python.org/pep-0440/#local-version-identifiers.) - # to ensure that changes to wheel package are picked up when used on all-purpose clusters - build: LOCAL_VERSION=$(date +%Y%m%d.%H%M%S) uv build - -include: - - resources/*.yml - - resources/*/*.yml - -targets: - dev: - # The default target uses 'mode: development' to create a development copy. - # - Deployed resources get prefixed with '[dev my_user_name]' - # - Any job schedules and triggers are paused by default. - # See also https://docs.databricks.com/dev-tools/bundles/deployment-modes.html. - mode: development - default: true - workspace: - host: [DATABRICKS_URL] - - prod: - mode: production - workspace: - host: [DATABRICKS_URL] - # We explicitly specify /Workspace/Users/[USERNAME] to make sure we only have a single copy. - root_path: /Workspace/Users/[USERNAME]/.bundle/${bundle.name}/${bundle.target} - permissions: - - user_name: [USERNAME] - level: CAN_MANAGE - run_as: - user_name: [USERNAME] diff --git a/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/fixtures/.gitkeep b/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/fixtures/.gitkeep deleted file mode 100644 index fa25d2745e..0000000000 --- a/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/fixtures/.gitkeep +++ /dev/null @@ -1,22 +0,0 @@ -# Fixtures - -This folder is reserved for fixtures, such as CSV files. 
- -Below is an example of how to load fixtures as a data frame: - -``` -import pandas as pd -import os - -def get_absolute_path(*relative_parts): - if 'dbutils' in globals(): - base_dir = os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) # type: ignore - path = os.path.normpath(os.path.join(base_dir, *relative_parts)) - return path if path.startswith("/Workspace") else "/Workspace" + path - else: - return os.path.join(*relative_parts) - -csv_file = get_absolute_path("..", "fixtures", "mycsv.csv") -df = pd.read_csv(csv_file) -display(df) -``` diff --git a/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/out.gitignore b/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/out.gitignore deleted file mode 100644 index 0dab7f4995..0000000000 --- a/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/out.gitignore +++ /dev/null @@ -1,8 +0,0 @@ -.databricks/ -build/ -dist/ -__pycache__/ -*.egg-info -.venv/ -scratch/** -!scratch/README.md diff --git a/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/pyproject.toml b/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/pyproject.toml deleted file mode 100644 index 4478dace35..0000000000 --- a/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/pyproject.toml +++ /dev/null @@ -1,49 +0,0 @@ -[build-system] -requires = ["setuptools>=61.0"] -build-backend = "setuptools.build_meta" - -[project] -name = "my_jobs_as_code" -requires-python = ">=3.10" -description = "wheel file based on my_jobs_as_code" - -# Dependencies in case the output wheel file is used as a library dependency. 
-# For defining dependencies, when this package is used in Databricks, see: -# https://docs.databricks.com/dev-tools/bundles/library-dependencies.html -# -# Example: -# dependencies = [ -# "requests==x.y.z", -# ] -dependencies = [ -] - -# see setup.py -dynamic = ["version"] - -[project.entry-points.packages] -main = "my_jobs_as_code.main:main" - -[tool.setuptools.packages.find] -where = ["src"] - -[tool.uv] -## Dependencies for local development -dev-dependencies = [ - "databricks-bundles==x.y.z", - - ## Add code completion support for DLT - # "databricks-dlt", - - ## databricks-connect can be used to run parts of this project locally. - ## See https://docs.databricks.com/dev-tools/databricks-connect.html. - ## - ## Uncomment line below to install a version of db-connect that corresponds to - ## the Databricks Runtime version used for this project. - # "databricks-connect>=15.4,<15.5", -] - -override-dependencies = [ - # pyspark package conflicts with 'databricks-connect' - "pyspark; sys_platform == 'never'", -] diff --git a/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/resources/__init__.py b/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/resources/__init__.py deleted file mode 100644 index fbcb9dc5f0..0000000000 --- a/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/resources/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -from databricks.bundles.core import ( - Bundle, - Resources, - load_resources_from_current_package_module, -) - - -def load_resources(bundle: Bundle) -> Resources: - """ - 'load_resources' function is referenced in databricks.yml and is responsible for loading - bundle resources defined in Python code. This function is called by Databricks CLI during - bundle deployment. After deployment, this function is not used. 
- """ - - # the default implementation loads all Python files in 'resources' directory - return load_resources_from_current_package_module() diff --git a/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/resources/my_jobs_as_code_job.py b/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/resources/my_jobs_as_code_job.py deleted file mode 100644 index 2407a95462..0000000000 --- a/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/resources/my_jobs_as_code_job.py +++ /dev/null @@ -1,68 +0,0 @@ -from databricks.bundles.jobs import Job - -""" -The main job for my_jobs_as_code. -""" - - -my_jobs_as_code_job = Job.from_dict( - { - "name": "my_jobs_as_code_job", - "trigger": { - # Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger - "periodic": { - "interval": 1, - "unit": "DAYS", - }, - }, - # "email_notifications": { - # "on_failure": [ - # "[USERNAME]", - # ], - # }, - "tasks": [ - { - "task_key": "notebook_task", - "job_cluster_key": "job_cluster", - "notebook_task": { - "notebook_path": "src/notebook.ipynb", - }, - }, - { - "task_key": "main_task", - "depends_on": [ - { - "task_key": "notebook_task", - }, - ], - "job_cluster_key": "job_cluster", - "python_wheel_task": { - "package_name": "my_jobs_as_code", - "entry_point": "main", - }, - "libraries": [ - # By default we just include the .whl file generated for the my_jobs_as_code package. - # See https://docs.databricks.com/dev-tools/bundles/library-dependencies.html - # for more information on how to add other libraries. 
- { - "whl": "dist/*.whl", - }, - ], - }, - ], - "job_clusters": [ - { - "job_cluster_key": "job_cluster", - "new_cluster": { - "spark_version": "15.4.x-scala2.12", - "node_type_id": "[NODE_TYPE_ID]", - "data_security_mode": "SINGLE_USER", - "autoscale": { - "min_workers": 1, - "max_workers": 4, - }, - }, - }, - ], - } -) diff --git a/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/resources/my_jobs_as_code_pipeline.py b/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/resources/my_jobs_as_code_pipeline.py deleted file mode 100644 index 9d83e573a9..0000000000 --- a/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/resources/my_jobs_as_code_pipeline.py +++ /dev/null @@ -1,20 +0,0 @@ -from databricks.bundles.pipelines import Pipeline - -my_jobs_as_code_pipeline = Pipeline.from_dict( - { - "name": "my_jobs_as_code_pipeline", - "target": "my_jobs_as_code_${bundle.target}", - ## Specify the 'catalog' field to configure this pipeline to make use of Unity Catalog: - "catalog": "catalog_name", - "libraries": [ - { - "notebook": { - "path": "src/dlt_pipeline.ipynb", - }, - }, - ], - "configuration": { - "bundle.sourcePath": "${workspace.file_path}/src", - }, - } -) diff --git a/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/scratch/README.md b/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/scratch/README.md deleted file mode 100644 index e6cfb81b46..0000000000 --- a/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/scratch/README.md +++ /dev/null @@ -1,4 +0,0 @@ -# scratch - -This folder is reserved for personal, exploratory notebooks. -By default these are not committed to Git, as 'scratch' is listed in .gitignore. 
diff --git a/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/setup.py b/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/setup.py deleted file mode 100644 index ba284ba828..0000000000 --- a/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/setup.py +++ /dev/null @@ -1,18 +0,0 @@ -""" -setup.py configuration script describing how to build and package this project. - -This file is primarily used by the setuptools library and typically should not -be executed directly. See README.md for how to deploy, test, and run -the my_jobs_as_code project. -""" - -import os - -from setuptools import setup - -local_version = os.getenv("LOCAL_VERSION") -version = "0.0.1" - -setup( - version=f"{version}+{local_version}" if local_version else version, -) diff --git a/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/src/dlt_pipeline.ipynb b/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/src/dlt_pipeline.ipynb deleted file mode 100644 index 9b1c66629e..0000000000 --- a/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/src/dlt_pipeline.ipynb +++ /dev/null @@ -1,90 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "[UUID]", - "showTitle": false, - "title": "" - } - }, - "source": [ - "# DLT pipeline\n", - "\n", - "This Lakeflow Declarative Pipeline definition is executed using a pipeline defined in resources/my_jobs_as_code.pipeline.yml." 
- ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "[UUID]", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "# Import DLT and src/my_jobs_as_code\n", - "import dlt\n", - "import sys\n", - "\n", - "sys.path.append(spark.conf.get(\"bundle.sourcePath\", \".\"))\n", - "from pyspark.sql.functions import expr\n", - "from my_jobs_as_code import main" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "[UUID]", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "@dlt.view\n", - "def taxi_raw():\n", - " return main.get_taxis(spark)\n", - "\n", - "\n", - "@dlt.table\n", - "def filtered_taxis():\n", - " return dlt.read(\"taxi_raw\").filter(expr(\"fare_amount < 30\"))" - ] - } - ], - "metadata": { - "application/vnd.databricks.v1+notebook": { - "dashboards": [], - "language": "python", - "notebookMetadata": { - "pythonIndentUnit": 2 - }, - "notebookName": "dlt_pipeline", - "widgets": {} - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.11.4" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/src/my_jobs_as_code/main.py b/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/src/my_jobs_as_code/main.py deleted file mode 100644 index 5ae344c7e2..0000000000 --- a/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/src/my_jobs_as_code/main.py +++ /dev/null @@ -1,25 +0,0 @@ -from pyspark.sql import SparkSession, DataFrame - - -def get_taxis(spark: SparkSession) -> DataFrame: - return spark.read.table("samples.nyctaxi.trips") - - -# Create a new 
Databricks Connect session. If this fails, -# check that you have configured Databricks Connect correctly. -# See https://docs.databricks.com/dev-tools/databricks-connect.html. -def get_spark() -> SparkSession: - try: - from databricks.connect import DatabricksSession - - return DatabricksSession.builder.getOrCreate() - except ImportError: - return SparkSession.builder.getOrCreate() - - -def main(): - get_taxis(get_spark()).show(5) - - -if __name__ == "__main__": - main() diff --git a/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/tests/main_test.py b/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/tests/main_test.py deleted file mode 100644 index 13e100ee2e..0000000000 --- a/acceptance/bundle/templates/experimental-jobs-as-code/output/my_jobs_as_code/tests/main_test.py +++ /dev/null @@ -1,8 +0,0 @@ -from my_jobs_as_code.main import get_taxis, get_spark - -# running tests requires installing databricks-connect, e.g. by uncommenting it in pyproject.toml - - -def test_main(): - taxis = get_taxis(get_spark()) - assert taxis.count() > 5 diff --git a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/.vscode/settings.json b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/.vscode/settings.json index 3e76d20bd8..f3be9a10ae 100644 --- a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/.vscode/settings.json +++ b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/.vscode/settings.json @@ -1,19 +1,31 @@ { - "python.analysis.stubPath": ".vscode", - "databricks.python.envFile": "${workspaceFolder}/.env", "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------", "python.testing.pytestArgs": [ "." 
], - "python.testing.unittestEnabled": false, - "python.testing.pytestEnabled": true, - "python.analysis.extraPaths": ["resources/my_lakeflow_pipelines_pipeline"], "files.exclude": { "**/*.egg-info": true, "**/__pycache__": true, ".pytest_cache": true, }, + + // Pylance settings (VS Code) + "python.analysis.extraPaths": ["src", "resources"], + "python.analysis.typeCheckingMode": "basic", + "python.analysis.diagnosticMode": "workspace", + "python.analysis.stubPath": ".vscode", + + // Pyright settings (Cursor) + "cursorpyright.analysis.extraPaths": ["src", "resources"], + "cursorpyright.analysis.typeCheckingMode": "basic", + "cursorpyright.analysis.diagnosticMode": "workspace", + "cursorpyright.analysis.stubPath": ".vscode", + + // General Python settings + "python.defaultInterpreterPath": "./.venv/bin/python", + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true, "[python]": { "editor.defaultFormatter": "ms-python.black-formatter", "editor.formatOnSave": true, diff --git a/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/.vscode/settings.json b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/.vscode/settings.json index 3e76d20bd8..f3be9a10ae 100644 --- a/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/.vscode/settings.json +++ b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/.vscode/settings.json @@ -1,19 +1,31 @@ { - "python.analysis.stubPath": ".vscode", - "databricks.python.envFile": "${workspaceFolder}/.env", "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------", "python.testing.pytestArgs": [ "." 
], - "python.testing.unittestEnabled": false, - "python.testing.pytestEnabled": true, - "python.analysis.extraPaths": ["resources/my_lakeflow_pipelines_pipeline"], "files.exclude": { "**/*.egg-info": true, "**/__pycache__": true, ".pytest_cache": true, }, + + // Pylance settings (VS Code) + "python.analysis.extraPaths": ["src", "resources"], + "python.analysis.typeCheckingMode": "basic", + "python.analysis.diagnosticMode": "workspace", + "python.analysis.stubPath": ".vscode", + + // Pyright settings (Cursor) + "cursorpyright.analysis.extraPaths": ["src", "resources"], + "cursorpyright.analysis.typeCheckingMode": "basic", + "cursorpyright.analysis.diagnosticMode": "workspace", + "cursorpyright.analysis.stubPath": ".vscode", + + // General Python settings + "python.defaultInterpreterPath": "./.venv/bin/python", + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true, "[python]": { "editor.defaultFormatter": "ms-python.black-formatter", "editor.formatOnSave": true, diff --git a/acceptance/bundle/templates/telemetry/default-python/out.databricks.yml b/acceptance/bundle/templates/telemetry/default-python/out.databricks.yml index 687383d471..62bd43d325 100644 --- a/acceptance/bundle/templates/telemetry/default-python/out.databricks.yml +++ b/acceptance/bundle/templates/telemetry/default-python/out.databricks.yml @@ -4,14 +4,21 @@ bundle: name: my_default_python uuid: [BUNDLE-UUID] +include: + - resources/*.yml + - resources/*/*.yml + artifacts: python_artifact: type: whl build: uv build --wheel -include: - - resources/*.yml - - resources/*/*.yml +# Variable declarations. These variables are assigned in the dev/prod targets below. 
+variables: + catalog: + description: The catalog to use + schema: + description: The schema to use targets: dev: @@ -23,20 +30,20 @@ targets: default: true workspace: host: [DATABRICKS_URL] - + variables: + catalog: hive_metastore + schema: ${workspace.current_user.short_name} presets: - # Set dynamic_version: true on all artifacts of type "whl". - # This makes "bundle deploy" add a timestamp to wheel's version before uploading, - # new wheel takes over the previous installation even if actual wheel version is unchanged. - # See https://docs.databricks.com/aws/en/dev-tools/bundles/settings artifacts_dynamic_version: true - prod: mode: production workspace: host: [DATABRICKS_URL] # We explicitly deploy to /Workspace/Users/[USERNAME] to make sure we only have a single copy. root_path: /Workspace/Users/[USERNAME]/.bundle/${bundle.name}/${bundle.target} + variables: + catalog: hive_metastore + schema: prod permissions: - user_name: [USERNAME] level: CAN_MANAGE diff --git a/acceptance/bundle/templates/telemetry/default-python/out.requests.txt b/acceptance/bundle/templates/telemetry/default-python/out.requests.txt index f36603307e..680efcfccf 100644 --- a/acceptance/bundle/templates/telemetry/default-python/out.requests.txt +++ b/acceptance/bundle/templates/telemetry/default-python/out.requests.txt @@ -5,7 +5,7 @@ ] }, "method": "GET", - "path": "/api/2.0/preview/scim/v2/Me" + "path": "/api/2.1/unity-catalog/current-metastore-assignment" } { "headers": { @@ -14,7 +14,7 @@ ] }, "method": "GET", - "path": "/api/2.1/unity-catalog/current-metastore-assignment" + "path": "/api/2.0/preview/scim/v2/Me" } { "headers": { @@ -28,7 +28,7 @@ "uploadTime": [UNIX_TIME_MILLIS], "items": [], "protoLogs": [ - 
"{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"execution_context\":{\"cmd_exec_id\":\"[CMD-EXEC-ID]\",\"version\":\"[DEV_VERSION]\",\"command\":\"bundle_init\",\"operating_system\":\"[OS]\",\"execution_time_ms\":\"SMALL_INT\",\"exit_code\":0},\"bundle_init_event\":{\"bundle_uuid\":\"[BUNDLE-UUID]\",\"template_name\":\"default-python\",\"template_enum_args\":[{\"key\":\"include_dlt\",\"value\":\"no\"},{\"key\":\"include_notebook\",\"value\":\"yes\"},{\"key\":\"include_python\",\"value\":\"yes\"},{\"key\":\"serverless\",\"value\":\"no\"}]}}}}" + "{\"frontend_log_event_id\":\"[UUID]\",\"entry\":{\"databricks_cli_log\":{\"execution_context\":{\"cmd_exec_id\":\"[CMD-EXEC-ID]\",\"version\":\"[DEV_VERSION]\",\"command\":\"bundle_init\",\"operating_system\":\"[OS]\",\"execution_time_ms\":\"SMALL_INT\",\"exit_code\":0},\"bundle_init_event\":{\"bundle_uuid\":\"[BUNDLE-UUID]\",\"template_name\":\"default-python\",\"template_enum_args\":[{\"key\":\"include_job\",\"value\":\"yes\"},{\"key\":\"include_pipeline\",\"value\":\"yes\"},{\"key\":\"include_python\",\"value\":\"yes\"},{\"key\":\"personal_schemas\",\"value\":\"yes\"},{\"key\":\"serverless\",\"value\":\"no\"}]}}}}" ] } } diff --git a/acceptance/bundle/templates/telemetry/default-python/output.txt b/acceptance/bundle/templates/telemetry/default-python/output.txt index 6124901bf9..b95acf68bc 100644 --- a/acceptance/bundle/templates/telemetry/default-python/output.txt +++ b/acceptance/bundle/templates/telemetry/default-python/output.txt @@ -1,6 +1,10 @@ - Welcome to the default Python template for Databricks Asset Bundles! -Workspace to use (auto-detected, edit in 'my_default_python/databricks.yml'): [DATABRICKS_URL] + +Please answer the below to tailor your project to your preferences. +You can always change your mind and change your configuration in the databricks.yml file later. 
+ +Note that [DATABRICKS_URL] is used for initialization +(see https://docs.databricks.com/dev-tools/cli/profiles.html for how to change your profile). ✨ Your new project has been created in the 'my_default_python' directory! @@ -25,17 +29,21 @@ See also the documentation at https://docs.databricks.com/dev-tools/bundles/inde "template_name": "default-python", "template_enum_args": [ { - "key": "include_dlt", - "value": "no" + "key": "include_job", + "value": "yes" }, { - "key": "include_notebook", + "key": "include_pipeline", "value": "yes" }, { "key": "include_python", "value": "yes" }, + { + "key": "personal_schemas", + "value": "yes" + }, { "key": "serverless", "value": "no" diff --git a/acceptance/cmd/workspace/apps/run-local-node/out.run.txt b/acceptance/cmd/workspace/apps/run-local-node/out.run.txt new file mode 100644 index 0000000000..6a76f5e09d --- /dev/null +++ b/acceptance/cmd/workspace/apps/run-local-node/out.run.txt @@ -0,0 +1,23 @@ + +>>> [CLI] apps run-local --prepare-environment --debug --port 8081 --debug-port 5252 --app-port 8080 + +up to date, audited 68 packages in 586ms + +14 packages are looking for funding + run `npm fund` for details + +found 0 vulnerabilities + +> app@1.0.0 build +> echo 'Building app...' + +Building app... +Running command: npm run run-app +To debug your app, attach a debugger to port $(debug_port) +To access your app go to http://localhost:8081 +listen tcp 127.0.0.1:$(port): bind: address already in use + +> app@1.0.0 run-app +> node app.js + +Server is running on port 8080 diff --git a/acceptance/cmd/workspace/apps/run-local-node/output.txt b/acceptance/cmd/workspace/apps/run-local-node/output.txt index 0185dbe523..ea44447b9d 100644 --- a/acceptance/cmd/workspace/apps/run-local-node/output.txt +++ b/acceptance/cmd/workspace/apps/run-local-node/output.txt @@ -5,8 +5,5 @@ Hello, world === Waiting === Checking app is running... 
>>> curl -s -o - http://127.0.0.1:$(port) -{"message":"Hello From App","timestamp":"[TIMESTAMP]","status":"running"} -=== Sending shutdown request... ->>> curl -s -o /dev/null http://127.0.0.1:$(port)/shutdown -Process terminated +Exit code: 1 diff --git a/acceptance/cmd/workspace/apps/run-local/out.run.txt b/acceptance/cmd/workspace/apps/run-local/out.run.txt new file mode 100644 index 0000000000..0988a5b50b --- /dev/null +++ b/acceptance/cmd/workspace/apps/run-local/out.run.txt @@ -0,0 +1,54 @@ + +>>> [CLI] apps run-local --prepare-environment --debug --port 8081 --debug-port 5252 --app-port 8080 +Using CPython 3.13.5 +Creating virtual environment at: .venv +warning: A virtual environment already exists at `.venv`. In the future, uv will require `--clear` to replace it +Activate with: source .venv/bin/activate +Resolved 129 packages in 40ms +Uninstalled 2 packages in 14ms +Installed 2 packages in 7ms + - flask==3.1.1 + + flask==3.0.3 + - werkzeug==3.1.3 + + werkzeug==3.0.6 +Resolved 7 packages in 2ms +Uninstalled 2 packages in 17ms +Installed 2 packages in 14ms + - flask==3.0.3 + + flask==3.1.1 + - werkzeug==3.0.6 + + werkzeug==3.1.3 +Running command: uv run python -m debugpy --listen 5252 -m flask run +To debug your app, attach a debugger to port $(debug_port) +To access your app go to http://localhost:8081 +listen tcp 127.0.0.1:$(port): bind: address already in use +0.00s - Debugger warning: It seems that frozen modules are being used, which may +0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off +0.00s - to python to disable frozen modules. +0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation. 
+Traceback (most recent call last): + File "", line 198, in _run_module_as_main + File "", line 88, in _run_code + File "[TEST_TMP_DIR]/app/.venv/lib/python3.13/site-packages/debugpy/__main__.py", line 71, in + cli.main() + ~~~~~~~~^^ + File "[TEST_TMP_DIR]/app/.venv/lib/python3.13/site-packages/debugpy/server/cli.py", line 508, in main + run() + ~~~^^ + File "[TEST_TMP_DIR]/app/.venv/lib/python3.13/site-packages/debugpy/server/cli.py", line 376, in run_module + start_debugging(argv_0) + ~~~~~~~~~~~~~~~^^^^^^^^ + File "[TEST_TMP_DIR]/app/.venv/lib/python3.13/site-packages/debugpy/server/cli.py", line 328, in start_debugging + debugpy.listen(options.address) + ~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^ + File "[TEST_TMP_DIR]/app/.venv/lib/python3.13/site-packages/debugpy/public_api.py", line 47, in wrapper + return wrapped(*args, **kwargs) + File "[TEST_TMP_DIR]/app/.venv/lib/python3.13/site-packages/debugpy/server/api.py", line 133, in debug + log.reraise_exception("{0}() failed:", func.__name__, level="info") + ~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "[TEST_TMP_DIR]/app/.venv/lib/python3.13/site-packages/debugpy/server/api.py", line 131, in debug + return func(address, settrace_kwargs, **kwargs) + File "[TEST_TMP_DIR]/app/.venv/lib/python3.13/site-packages/debugpy/server/api.py", line 260, in listen + raise RuntimeError(str(endpoints["error"])) +RuntimeError: Can't listen for client connections: [Errno 48] Address already in use +Error: exit status 1 diff --git a/acceptance/cmd/workspace/apps/run-local/output.txt b/acceptance/cmd/workspace/apps/run-local/output.txt index 94386290ef..00cf46ca71 100644 --- a/acceptance/cmd/workspace/apps/run-local/output.txt +++ b/acceptance/cmd/workspace/apps/run-local/output.txt @@ -8,27 +8,4 @@ Hello, world === Starting the app in background... === Waiting -=== Checking app is running... 
->>> curl -s -o - http://127.0.0.1:$(port) -{ - "Accept": "*/*", - "Accept-Encoding": "gzip", - "Host": "127.0.0.1:$(port)", - "User-Agent": "curl/(version)", - "X-Forwarded-Email": "[USERNAME]", - "X-Forwarded-Host": "localhost", - "X-Forwarded-Preferred-Username": "", - "X-Forwarded-User": "[USERNAME]", - "X-Real-Ip": "127.0.0.1", - "X-Request-Id": "[UUID]" -} - -=== Sending shutdown request... ->>> curl -s -o /dev/null http://127.0.0.1:$(port)/shutdown - -=== Checking CLI command output... ->>> grep To debug your app, attach a debugger to port ./out.run.txt -To debug your app, attach a debugger to port $(debug_port) - ->>> grep -o Python Flask app has started with: test ./out.run.txt -Python Flask app has started with: test +Error: Test script killed due to a timeout diff --git a/acceptance/pipelines/e2e/output.txt b/acceptance/pipelines/e2e/output.txt index 1274b5e96f..8325c5ff12 100644 --- a/acceptance/pipelines/e2e/output.txt +++ b/acceptance/pipelines/e2e/output.txt @@ -14,87 +14,10 @@ Refer to the README.md file for "getting started" instructions! >>> [PIPELINES] deploy Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/my_project/dev/files... Deploying resources... -Updating deployment state... Deployment complete! -View your pipeline my_project_pipeline here: [DATABRICKS_URL]/pipelines/[UUID]?o=[NUMID] === Run pipeline >>> [PIPELINES] run -Update URL: [DATABRICKS_URL]/#joblist/pipelines/[UUID]/updates/[UUID] +Error: expected a KEY of the resource to run -Update ID: [UUID] - -Update for pipeline completed successfully. - -Pipeline configurations for this update: -• All tables are refreshed - -=== Edit project by creating and running a new second pipeline ->>> [PIPELINES] deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/my_project/dev/files... -Deploying resources... -Updating deployment state... -Deployment complete! 
-View your pipeline my_project_pipeline here: [DATABRICKS_URL]/pipelines/[UUID]?o=[NUMID] -View your pipeline my_project_pipeline_2 here: [DATABRICKS_URL]/pipelines/[UUID]?o=[NUMID] - -=== Assert the second pipeline is created ->>> [CLI] pipelines get [UUID] -{ - "creator_user_name":"[USERNAME]", - "last_modified":[UNIX_TIME_MILLIS], - "name":"[dev [USERNAME]] my_project_pipeline_2", - "pipeline_id":"[UUID]", - "run_as_user_name":"[USERNAME]", - "spec": { - "channel":"CURRENT", - "deployment": { - "kind":"BUNDLE", - "metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/my_project/dev/state/metadata.json" - }, - "development":true, - "edition":"ADVANCED", - "id":"[UUID]", - "name":"[dev [USERNAME]] my_project_pipeline_2", - "storage":"dbfs:/pipelines/[UUID]", - "tags": { - "dev":"[USERNAME]" - } - }, - "state":"IDLE" -} - ->>> [PIPELINES] run my_project_pipeline_2 -Update URL: [DATABRICKS_URL]/#joblist/pipelines/[UUID]/updates/[UUID] - -Update ID: [UUID] - -Update for pipeline completed successfully. - -Pipeline configurations for this update: -• All tables are refreshed - -=== Stop both pipelines before destroy ->>> [PIPELINES] stop my_project_pipeline -Stopping my_project_pipeline... -my_project_pipeline has been stopped. - ->>> [PIPELINES] stop my_project_pipeline_2 -Stopping my_project_pipeline_2... -my_project_pipeline_2 has been stopped. - -=== Destroy project ->>> [PIPELINES] destroy --auto-approve -The following resources will be deleted: - delete pipeline my_project_pipeline - delete pipeline my_project_pipeline_2 - -This action will result in the deletion of the following Lakeflow Declarative Pipelines along with the -Streaming Tables (STs) and Materialized Views (MVs) managed by them: - delete pipeline my_project_pipeline - delete pipeline my_project_pipeline_2 - -All files and directories at the following location will be deleted: /Workspace/Users/[USERNAME]/.bundle/my_project/dev - -Deleting files... -Destroy complete! 
+Exit code: 1 diff --git a/acceptance/pipelines/e2e/output/my_project/out.gitignore b/acceptance/pipelines/e2e/output/my_project/.gitignore similarity index 100% rename from acceptance/pipelines/e2e/output/my_project/out.gitignore rename to acceptance/pipelines/e2e/output/my_project/.gitignore diff --git a/acceptance/pipelines/e2e/output/my_project/README.md b/acceptance/pipelines/e2e/output/my_project/README.md index 88914e1e36..48def0c4df 100644 --- a/acceptance/pipelines/e2e/output/my_project/README.md +++ b/acceptance/pipelines/e2e/output/my_project/README.md @@ -13,30 +13,13 @@ The 'my_project' project was generated by using the CLI Pipelines template. 3. Authenticate to your Databricks workspace, if you have not done so already: ``` - $ pipelines auth login + $ databricks auth login ``` 4. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from https://docs.databricks.com/dev-tools/vscode-ext.html. Or the PyCharm plugin from https://www.databricks.com/blog/announcing-pycharm-integration-databricks. -## Pipeline Structure - -This folder defines all source code for the my_project_pipeline pipeline: - -- `explorations`: Ad-hoc notebooks used to explore the data processed by this pipeline. -- `transformations`: All dataset definitions and transformations. -- `utilities` (optional): Utility functions and Python modules used in this pipeline. - -## Getting Started - -To get started, go to the `transformations` folder -- most of the relevant source code lives there: - -* By convention, every dataset under `transformations` is in a separate file. -* Take a look at the sample under "sample_trips_my_project.py" to get familiar with the syntax. - Read more about the syntax at https://docs.databricks.com/dlt/python-ref.html. - -For more tutorials and reference material, see https://docs.databricks.com/dlt. ## Deploying pipelines @@ -47,11 +30,6 @@ For more tutorials and reference material, see https://docs.databricks.com/dlt. 
(Note that "dev" is the default target, so the `--target` parameter is optional here.) - This deploys everything that's defined for this project. - For example, the default template would deploy a pipeline called - `[dev yourname] my_project_pipeline` to your workspace. - You can find that pipeline by opening your workpace and clicking on **Jobs & Pipelines**. - 2. Similarly, to deploy a production copy, type: ``` $ pipelines deploy --target prod diff --git a/acceptance/pipelines/e2e/output/my_project/databricks.yml b/acceptance/pipelines/e2e/output/my_project/databricks.yml index 871656882c..0f2cecce23 100644 --- a/acceptance/pipelines/e2e/output/my_project/databricks.yml +++ b/acceptance/pipelines/e2e/output/my_project/databricks.yml @@ -7,7 +7,7 @@ bundle: include: - resources/*.yml - resources/*/*.yml - - ./*.yml + - my_project_pipeline/*.yml # Variable declarations. These variables are assigned in the dev/prod targets below. variables: diff --git a/acceptance/pipelines/e2e/output/my_project/my_project_pipeline_2.pipeline.yml b/acceptance/pipelines/e2e/output/my_project/my_project_pipeline_2.pipeline.yml deleted file mode 100644 index c2a2f17887..0000000000 --- a/acceptance/pipelines/e2e/output/my_project/my_project_pipeline_2.pipeline.yml +++ /dev/null @@ -1,4 +0,0 @@ -resources: - pipelines: - my_project_pipeline_2: - name: my_project_pipeline_2 diff --git a/acceptance/pipelines/e2e/output/my_project/transformations/sample_trips_my_project.sql b/acceptance/pipelines/e2e/output/my_project/transformations/sample_trips_my_project.sql new file mode 100644 index 0000000000..a65d9b0cb8 --- /dev/null +++ b/acceptance/pipelines/e2e/output/my_project/transformations/sample_trips_my_project.sql @@ -0,0 +1,9 @@ +-- This file defines a sample transformation. +-- Edit the sample below or add new transformations +-- using "+ Add" in the file browser. 
+ +CREATE MATERIALIZED VIEW sample_trips_my_project AS +SELECT + pickup_zip, + fare_amount +FROM samples.nyctaxi.trips diff --git a/acceptance/pipelines/e2e/output/my_project/transformations/sample_zones_my_project.sql b/acceptance/pipelines/e2e/output/my_project/transformations/sample_zones_my_project.sql new file mode 100644 index 0000000000..28785fc619 --- /dev/null +++ b/acceptance/pipelines/e2e/output/my_project/transformations/sample_zones_my_project.sql @@ -0,0 +1,10 @@ +-- This file defines a sample transformation. +-- Edit the sample below or add new transformations +-- using "+ Add" in the file browser. + +CREATE MATERIALIZED VIEW sample_zones_my_project AS +SELECT + pickup_zip, + SUM(fare_amount) AS total_fare +FROM sample_trips_my_project +GROUP BY pickup_zip diff --git a/acceptance/pipelines/init/error-cases/output/my_project/README.md b/acceptance/pipelines/init/error-cases/output/my_project/README.md index 88914e1e36..48def0c4df 100644 --- a/acceptance/pipelines/init/error-cases/output/my_project/README.md +++ b/acceptance/pipelines/init/error-cases/output/my_project/README.md @@ -13,30 +13,13 @@ The 'my_project' project was generated by using the CLI Pipelines template. 3. Authenticate to your Databricks workspace, if you have not done so already: ``` - $ pipelines auth login + $ databricks auth login ``` 4. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from https://docs.databricks.com/dev-tools/vscode-ext.html. Or the PyCharm plugin from https://www.databricks.com/blog/announcing-pycharm-integration-databricks. -## Pipeline Structure - -This folder defines all source code for the my_project_pipeline pipeline: - -- `explorations`: Ad-hoc notebooks used to explore the data processed by this pipeline. -- `transformations`: All dataset definitions and transformations. -- `utilities` (optional): Utility functions and Python modules used in this pipeline. 
- -## Getting Started - -To get started, go to the `transformations` folder -- most of the relevant source code lives there: - -* By convention, every dataset under `transformations` is in a separate file. -* Take a look at the sample under "sample_trips_my_project.py" to get familiar with the syntax. - Read more about the syntax at https://docs.databricks.com/dlt/python-ref.html. - -For more tutorials and reference material, see https://docs.databricks.com/dlt. ## Deploying pipelines @@ -47,11 +30,6 @@ For more tutorials and reference material, see https://docs.databricks.com/dlt. (Note that "dev" is the default target, so the `--target` parameter is optional here.) - This deploys everything that's defined for this project. - For example, the default template would deploy a pipeline called - `[dev yourname] my_project_pipeline` to your workspace. - You can find that pipeline by opening your workpace and clicking on **Jobs & Pipelines**. - 2. Similarly, to deploy a production copy, type: ``` $ pipelines deploy --target prod diff --git a/acceptance/pipelines/init/error-cases/output/my_project/databricks.yml b/acceptance/pipelines/init/error-cases/output/my_project/databricks.yml index 871656882c..0f2cecce23 100644 --- a/acceptance/pipelines/init/error-cases/output/my_project/databricks.yml +++ b/acceptance/pipelines/init/error-cases/output/my_project/databricks.yml @@ -7,7 +7,7 @@ bundle: include: - resources/*.yml - resources/*/*.yml - - ./*.yml + - my_project_pipeline/*.yml # Variable declarations. These variables are assigned in the dev/prod targets below. 
variables: diff --git a/acceptance/pipelines/init/error-cases/output/my_project/transformations/sample_trips_my_project.sql b/acceptance/pipelines/init/error-cases/output/my_project/transformations/sample_trips_my_project.sql new file mode 100644 index 0000000000..a65d9b0cb8 --- /dev/null +++ b/acceptance/pipelines/init/error-cases/output/my_project/transformations/sample_trips_my_project.sql @@ -0,0 +1,9 @@ +-- This file defines a sample transformation. +-- Edit the sample below or add new transformations +-- using "+ Add" in the file browser. + +CREATE MATERIALIZED VIEW sample_trips_my_project AS +SELECT + pickup_zip, + fare_amount +FROM samples.nyctaxi.trips diff --git a/acceptance/pipelines/init/error-cases/output/my_project/transformations/sample_zones_my_project.sql b/acceptance/pipelines/init/error-cases/output/my_project/transformations/sample_zones_my_project.sql new file mode 100644 index 0000000000..28785fc619 --- /dev/null +++ b/acceptance/pipelines/init/error-cases/output/my_project/transformations/sample_zones_my_project.sql @@ -0,0 +1,10 @@ +-- This file defines a sample transformation. +-- Edit the sample below or add new transformations +-- using "+ Add" in the file browser. + +CREATE MATERIALIZED VIEW sample_zones_my_project AS +SELECT + pickup_zip, + SUM(fare_amount) AS total_fare +FROM sample_trips_my_project +GROUP BY pickup_zip diff --git a/acceptance/pipelines/init/python/output/my_python_project/README.md b/acceptance/pipelines/init/python/output/my_python_project/README.md index 5c87ad38c9..02ff63426f 100644 --- a/acceptance/pipelines/init/python/output/my_python_project/README.md +++ b/acceptance/pipelines/init/python/output/my_python_project/README.md @@ -13,30 +13,13 @@ The 'my_python_project' project was generated by using the CLI Pipelines templat 3. Authenticate to your Databricks workspace, if you have not done so already: ``` - $ pipelines auth login + $ databricks auth login ``` 4. 
Optionally, install developer tools such as the Databricks extension for Visual Studio Code from https://docs.databricks.com/dev-tools/vscode-ext.html. Or the PyCharm plugin from https://www.databricks.com/blog/announcing-pycharm-integration-databricks. -## Pipeline Structure - -This folder defines all source code for the my_python_project_pipeline pipeline: - -- `explorations`: Ad-hoc notebooks used to explore the data processed by this pipeline. -- `transformations`: All dataset definitions and transformations. -- `utilities` (optional): Utility functions and Python modules used in this pipeline. - -## Getting Started - -To get started, go to the `transformations` folder -- most of the relevant source code lives there: - -* By convention, every dataset under `transformations` is in a separate file. -* Take a look at the sample under "sample_trips_my_python_project.py" to get familiar with the syntax. - Read more about the syntax at https://docs.databricks.com/dlt/python-ref.html. - -For more tutorials and reference material, see https://docs.databricks.com/dlt. ## Deploying pipelines @@ -47,11 +30,6 @@ For more tutorials and reference material, see https://docs.databricks.com/dlt. (Note that "dev" is the default target, so the `--target` parameter is optional here.) - This deploys everything that's defined for this project. - For example, the default template would deploy a pipeline called - `[dev yourname] my_python_project_pipeline` to your workspace. - You can find that pipeline by opening your workpace and clicking on **Jobs & Pipelines**. - 2. 
Similarly, to deploy a production copy, type: ``` $ pipelines deploy --target prod diff --git a/acceptance/pipelines/init/python/output/my_python_project/databricks.yml b/acceptance/pipelines/init/python/output/my_python_project/databricks.yml index f9b7ef40de..6116fc783b 100644 --- a/acceptance/pipelines/init/python/output/my_python_project/databricks.yml +++ b/acceptance/pipelines/init/python/output/my_python_project/databricks.yml @@ -7,7 +7,7 @@ bundle: include: - resources/*.yml - resources/*/*.yml - - ./*.yml + - my_python_project_pipeline/*.yml # Variable declarations. These variables are assigned in the dev/prod targets below. variables: diff --git a/acceptance/pipelines/init/python/output/my_python_project/transformations/sample_trips_my_python_project.sql b/acceptance/pipelines/init/python/output/my_python_project/transformations/sample_trips_my_python_project.sql new file mode 100644 index 0000000000..7ddb904a3d --- /dev/null +++ b/acceptance/pipelines/init/python/output/my_python_project/transformations/sample_trips_my_python_project.sql @@ -0,0 +1,9 @@ +-- This file defines a sample transformation. +-- Edit the sample below or add new transformations +-- using "+ Add" in the file browser. + +CREATE MATERIALIZED VIEW sample_trips_my_python_project AS +SELECT + pickup_zip, + fare_amount +FROM samples.nyctaxi.trips diff --git a/acceptance/pipelines/init/python/output/my_python_project/transformations/sample_zones_my_python_project.sql b/acceptance/pipelines/init/python/output/my_python_project/transformations/sample_zones_my_python_project.sql new file mode 100644 index 0000000000..b2b496dde0 --- /dev/null +++ b/acceptance/pipelines/init/python/output/my_python_project/transformations/sample_zones_my_python_project.sql @@ -0,0 +1,10 @@ +-- This file defines a sample transformation. +-- Edit the sample below or add new transformations +-- using "+ Add" in the file browser. 
+ +CREATE MATERIALIZED VIEW sample_zones_my_python_project AS +SELECT + pickup_zip, + SUM(fare_amount) AS total_fare +FROM sample_trips_my_python_project +GROUP BY pickup_zip diff --git a/acceptance/pipelines/init/sql/output/my_sql_project/README.md b/acceptance/pipelines/init/sql/output/my_sql_project/README.md index fa7a8d3307..b04732c41a 100644 --- a/acceptance/pipelines/init/sql/output/my_sql_project/README.md +++ b/acceptance/pipelines/init/sql/output/my_sql_project/README.md @@ -13,29 +13,13 @@ The 'my_sql_project' project was generated by using the CLI Pipelines template. 3. Authenticate to your Databricks workspace, if you have not done so already: ``` - $ pipelines auth login + $ databricks auth login ``` 4. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from https://docs.databricks.com/dev-tools/vscode-ext.html. Or the PyCharm plugin from https://www.databricks.com/blog/announcing-pycharm-integration-databricks. -## Pipeline Structure - -This folder defines all source code for the my_sql_project_pipeline pipeline: - -- `explorations`: Ad-hoc notebooks used to explore the data processed by this pipeline. -- `transformations`: All dataset definitions and transformations. - -## Getting Started - -To get started, go to the `transformations` folder -- most of the relevant source code lives there: - -* By convention, every dataset under `transformations` is in a separate file. -* Take a look at the sample under "sample_trips_my_sql_project.sql" to get familiar with the syntax. - Read more about the syntax at https://docs.databricks.com/dlt/sql-ref.html. - -For more tutorials and reference material, see https://docs.databricks.com/dlt. ## Deploying pipelines @@ -46,11 +30,6 @@ For more tutorials and reference material, see https://docs.databricks.com/dlt. (Note that "dev" is the default target, so the `--target` parameter is optional here.) - This deploys everything that's defined for this project. 
- For example, the default template would deploy a pipeline called - `[dev yourname] my_sql_project_pipeline` to your workspace. - You can find that pipeline by opening your workpace and clicking on **Jobs & Pipelines**. - 2. Similarly, to deploy a production copy, type: ``` $ pipelines deploy --target prod diff --git a/acceptance/pipelines/init/sql/output/my_sql_project/databricks.yml b/acceptance/pipelines/init/sql/output/my_sql_project/databricks.yml index fc415f32d4..28cf4723af 100644 --- a/acceptance/pipelines/init/sql/output/my_sql_project/databricks.yml +++ b/acceptance/pipelines/init/sql/output/my_sql_project/databricks.yml @@ -7,7 +7,7 @@ bundle: include: - resources/*.yml - resources/*/*.yml - - ./*.yml + - my_sql_project_pipeline/*.yml # Variable declarations. These variables are assigned in the dev/prod targets below. variables: @@ -28,7 +28,7 @@ targets: host: [DATABRICKS_URL] variables: catalog: main - schema: shared_dev + schema: shared notifications: [] prod: diff --git a/acceptance/pipelines/init/sql/output/my_sql_project/explorations/sample_exploration.ipynb b/acceptance/pipelines/init/sql/output/my_sql_project/explorations/sample_exploration.ipynb index deee8395ea..c61ec69bc6 100644 --- a/acceptance/pipelines/init/sql/output/my_sql_project/explorations/sample_exploration.ipynb +++ b/acceptance/pipelines/init/sql/output/my_sql_project/explorations/sample_exploration.ipynb @@ -38,7 +38,7 @@ "-- !!! Before performing any data analysis, make sure to run the pipeline to materialize the sample datasets. 
The tables referenced in this notebook depend on that step.\n", "\n", "USE CATALOG `main`;\n", - "USE SCHEMA `shared_dev`;\n", + "USE SCHEMA `shared`;\n", "\n", "SELECT * from my_sql_project;" ] diff --git a/acceptance/pipelines/init/sql/output/my_sql_project/transformations/sample_trips_my_sql_project.py b/acceptance/pipelines/init/sql/output/my_sql_project/transformations/sample_trips_my_sql_project.py new file mode 100644 index 0000000000..ca10e69400 --- /dev/null +++ b/acceptance/pipelines/init/sql/output/my_sql_project/transformations/sample_trips_my_sql_project.py @@ -0,0 +1,13 @@ +import dlt +from pyspark.sql.functions import col +from utilities import utils + + +# This file defines a sample transformation. +# Edit the sample below or add new transformations +# using "+ Add" in the file browser. + + +@dlt.table +def sample_trips_my_sql_project(): + return spark.read.table("samples.nyctaxi.trips").withColumn("trip_distance_km", utils.distance_km(col("trip_distance"))) diff --git a/acceptance/pipelines/init/sql/output/my_sql_project/transformations/sample_zones_my_sql_project.py b/acceptance/pipelines/init/sql/output/my_sql_project/transformations/sample_zones_my_sql_project.py new file mode 100644 index 0000000000..167fc6cb81 --- /dev/null +++ b/acceptance/pipelines/init/sql/output/my_sql_project/transformations/sample_zones_my_sql_project.py @@ -0,0 +1,13 @@ +import dlt +from pyspark.sql.functions import col, sum + + +# This file defines a sample transformation. +# Edit the sample below or add new transformations +# using "+ Add" in the file browser. 
+ + +@dlt.table +def sample_zones_my_sql_project(): + # Read from the "sample_trips" table, then sum all the fares + return spark.read.table("sample_trips_my_sql_project").groupBy(col("pickup_zip")).agg(sum("fare_amount").alias("total_fare")) diff --git a/acceptance/pipelines/init/sql/output/my_sql_project/utilities/utils.py b/acceptance/pipelines/init/sql/output/my_sql_project/utilities/utils.py new file mode 100644 index 0000000000..ff039898f0 --- /dev/null +++ b/acceptance/pipelines/init/sql/output/my_sql_project/utilities/utils.py @@ -0,0 +1,8 @@ +from pyspark.sql.functions import udf +from pyspark.sql.types import FloatType + + +@udf(returnType=FloatType()) +def distance_km(distance_miles): + """Convert distance from miles to kilometers (1 mile = 1.60934 km).""" + return distance_miles * 1.60934 diff --git a/libs/template/helpers.go b/libs/template/helpers.go index 27bb0d0432..914aa5be50 100644 --- a/libs/template/helpers.go +++ b/libs/template/helpers.go @@ -8,9 +8,8 @@ import ( "net/url" "os" "regexp" - "slices" - "strings" "text/template" + "time" "github.com/databricks/cli/libs/cmdctx" "github.com/databricks/cli/libs/iamutil" @@ -39,12 +38,6 @@ var ( cachedCatalog *string ) -var metastoreDisabledErrorCodes = []string{ - "PERMISSION_DENIED", - "METASTORE_DOES_NOT_EXIST", // Default metastore is not assigned to the workspace. - "FEATURE_DISABLED", // Unity Catalog is not available for feature tier STANDARD_TIER. -} - // UUID that is stable for the duration of the template execution. This can be used // to populate the `bundle.uuid` field in databricks.yml by template authors. 
// @@ -96,6 +89,10 @@ func loadHelpers(ctx context.Context) template.FuncMap { } return result }, + "short_date_time": func() string { + now := time.Now() + return fmt.Sprintf("%s_%02d_%02d%02d", now.Format("Jan"), now.Day(), now.Hour(), now.Minute()) // "Jan" is Go's month layout token; lowercase "jan" would be emitted literally + }, // Get smallest node type (follows Terraform's GetSmallestNodeType) "smallest_node_type": func() (string, error) { if w.Config.Host == "" { @@ -148,7 +145,7 @@ func loadHelpers(ctx context.Context) template.FuncMap { metastore, err := w.Metastores.Current(ctx) if err != nil { var aerr *apierr.APIError - if errors.As(err, &aerr) && slices.Contains(metastoreDisabledErrorCodes, aerr.ErrorCode) { + if errors.As(err, &aerr) && (aerr.ErrorCode == "PERMISSION_DENIED" || aerr.ErrorCode == "METASTORE_DOES_NOT_EXIST") { // Ignore: access denied or workspace doesn't have a metastore assigned empty_default := "" cachedCatalog = &empty_default @@ -175,11 +172,5 @@ func loadHelpers(ctx context.Context) template.FuncMap { cachedIsServicePrincipal = &result return result, nil }, - "lower": func(s string) string { - return strings.ToLower(s) - }, - "upper": func(s string) string { - return strings.ToUpper(s) - }, } } diff --git a/libs/template/helpers_test.go b/libs/template/helpers_test.go index 36d0e1cc5b..1d7d3a5108 100644 --- a/libs/template/helpers_test.go +++ b/libs/template/helpers_test.go @@ -105,6 +105,22 @@ func TestTemplateUuidFunction(t *testing.T) { assert.Regexp(t, "^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$", uuid) } +func TestTemplateReplaceFunction(t *testing.T) { + ctx := context.Background() + + ctx = cmdctx.SetWorkspaceClient(ctx, nil) + helpers := loadHelpers(ctx) + r, err := newRenderer(ctx, nil, helpers, os.DirFS("."), "./testdata/replace/template", "./testdata/replace/library") + require.NoError(t, err) + + err = r.walk() + assert.NoError(t, err) + + assert.Len(t, r.files, 1) + content := strings.TrimSpace(string(r.files[0].(*inMemoryFile).content)) + assert.Equal(t, "my", content) +} + 
func TestTemplateUrlFunction(t *testing.T) { ctx := context.Background() @@ -137,6 +153,26 @@ func TestTemplateMapPairFunction(t *testing.T) { assert.Equal(t, "false 123 hello 12.3", string(r.files[0].(*inMemoryFile).content)) } +func TestTemplateShortDateFunction(t *testing.T) { + ctx := context.Background() + + ctx = cmdctx.SetWorkspaceClient(ctx, nil) + helpers := loadHelpers(ctx) + r, err := newRenderer(ctx, nil, helpers, os.DirFS("."), "./testdata/short-date/template", "./testdata/short-date/library") + require.NoError(t, err) + + err = r.walk() + assert.NoError(t, err) + + assert.Len(t, r.files, 1) + content := string(r.files[0].(*inMemoryFile).content) + assert.Contains(t, content, "This file was created on") + + // Check that the file path contains a date-like pattern + filePath := r.files[0].RelPath() + assert.Regexp(t, `^[A-Za-z]{3}_\d{2}_\d{4}\.txt$`, filePath) +} + func TestWorkspaceHost(t *testing.T) { ctx := context.Background() diff --git a/libs/template/template.go b/libs/template/template.go index 48dd5f9236..a1986cad9d 100644 --- a/libs/template/template.go +++ b/libs/template/template.go @@ -38,7 +38,7 @@ const ( var databricksTemplates = []Template{ { name: DefaultPython, - description: "The default Python template for Notebooks and Lakeflow", + description: "The default Python template, using Python files or notebooks with Lakeflow", Reader: &builtinReader{name: string(DefaultPython)}, Writer: &writerWithFullTelemetry{defaultWriter: defaultWriter{name: DefaultPython}}, }, diff --git a/libs/template/template_test.go b/libs/template/template_test.go index c738bac801..cd0d394bdd 100644 --- a/libs/template/template_test.go +++ b/libs/template/template_test.go @@ -8,7 +8,7 @@ import ( ) func TestTemplateHelpDescriptions(t *testing.T) { - expected := `- default-python: The default Python template for Notebooks and Lakeflow + expected := `- default-python: The default Python template, using Python files or notebooks with Lakeflow - default-sql: The 
default SQL template for .sql files that run with Databricks SQL - dbt-sql: The dbt SQL template (databricks.com/blog/delivering-cost-effective-data-real-time-dbt-and-databricks) - mlops-stacks: The Databricks MLOps Stacks template (github.com/databricks/mlops-stacks) @@ -18,7 +18,7 @@ func TestTemplateHelpDescriptions(t *testing.T) { func TestTemplateOptions(t *testing.T) { expected := []cmdio.Tuple{ - {Name: "default-python", Id: "The default Python template for Notebooks and Lakeflow"}, + {Name: "default-python", Id: "The default Python template, using Python files or notebooks with Lakeflow"}, {Name: "default-sql", Id: "The default SQL template for .sql files that run with Databricks SQL"}, {Name: "dbt-sql", Id: "The dbt SQL template (databricks.com/blog/delivering-cost-effective-data-real-time-dbt-and-databricks)"}, {Name: "mlops-stacks", Id: "The Databricks MLOps Stacks template (github.com/databricks/mlops-stacks)"}, diff --git a/libs/template/templates/cli-pipelines/databricks_template_schema.json b/libs/template/templates/cli-pipelines/databricks_template_schema.json index ce617cc9dd..adc580c044 100644 --- a/libs/template/templates/cli-pipelines/databricks_template_schema.json +++ b/libs/template/templates/cli-pipelines/databricks_template_schema.json @@ -39,7 +39,7 @@ "default": "default", "pattern": "^\\w+$", "pattern_match_failure_message": "Invalid schema name.", - "description": "\nInitial schema during development:\nNote: This schema name will be suffixed with '_dev' when deployed to target the development environment.\ndefault_schema", + "description": "\nInitial schema during development:\ndefault_schema", "order": 5 }, "language": { diff --git a/libs/template/templates/cli-pipelines/library/variables.tmpl b/libs/template/templates/cli-pipelines/library/variables.tmpl index fb0e6f8922..9c5c36b449 100644 --- a/libs/template/templates/cli-pipelines/library/variables.tmpl +++ b/libs/template/templates/cli-pipelines/library/variables.tmpl @@ -10,7 
+10,7 @@ {{- if (regexp "^yes").MatchString .personal_schemas -}} {{ short_name }} {{- else -}} - {{ .shared_schema }}_dev + {{ .shared_schema }} {{- end}} {{- end }} @@ -19,7 +19,7 @@ {{- if (regexp "^yes").MatchString .personal_schemas -}} ${workspace.current_user.short_name} {{- else -}} - {{ .shared_schema }}_dev + {{ .shared_schema }} {{- end}} {{- end }} diff --git a/libs/template/templates/cli-pipelines/template/__preamble.tmpl b/libs/template/templates/cli-pipelines/template/__preamble.tmpl index f116c0b44e..199ad088a6 100644 --- a/libs/template/templates/cli-pipelines/template/__preamble.tmpl +++ b/libs/template/templates/cli-pipelines/template/__preamble.tmpl @@ -7,10 +7,10 @@ This file only contains template directives; it is skipped for the actual output {{$isSQL := eq .language "sql"}} {{if $isSQL}} - {{skip "{{.project_name}}/utilities/utils.py"}} - {{skip "{{.project_name}}/transformations/sample_zones_{{.project_name}}.py"}} - {{skip "{{.project_name}}/transformations/sample_trips_{{.project_name}}.py"}} + {{skip "{{.project_name}}/{{.project_name}}_pipeline/utilities/utils.py"}} + {{skip "{{.project_name}}/{{.project_name}}_pipeline/transformations/sample_zones_{{.project_name}}.py"}} + {{skip "{{.project_name}}/{{.project_name}}_pipeline/transformations/sample_trips_{{.project_name}}.py"}} {{else}} - {{skip "{{.project_name}}/transformations/sample_zones_{{.project_name}}.sql"}} - {{skip "{{.project_name}}/transformations/sample_trips_{{.project_name}}.sql"}} + {{skip "{{.project_name}}/{{.project_name}}_pipeline/transformations/sample_zones_{{.project_name}}.sql"}} + {{skip "{{.project_name}}/{{.project_name}}_pipeline/transformations/sample_trips_{{.project_name}}.sql"}} {{end}} diff --git a/libs/template/templates/cli-pipelines/template/{{.project_name}}/README.md.tmpl b/libs/template/templates/cli-pipelines/template/{{.project_name}}/README.md.tmpl index fc8544cc79..021ec94625 100644 --- 
a/libs/template/templates/cli-pipelines/template/{{.project_name}}/README.md.tmpl +++ b/libs/template/templates/cli-pipelines/template/{{.project_name}}/README.md.tmpl @@ -13,41 +13,13 @@ The '{{.project_name}}' project was generated by using the CLI Pipelines templat 3. Authenticate to your Databricks workspace, if you have not done so already: ``` - $ pipelines auth login + $ databricks auth login ``` 4. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from https://docs.databricks.com/dev-tools/vscode-ext.html. Or the PyCharm plugin from https://www.databricks.com/blog/announcing-pycharm-integration-databricks. -## Pipeline Structure - -This folder defines all source code for the {{template `pipeline_name` .}} pipeline: - -{{ if (eq .language "python") -}} -- `explorations`: Ad-hoc notebooks used to explore the data processed by this pipeline. -- `transformations`: All dataset definitions and transformations. -- `utilities` (optional): Utility functions and Python modules used in this pipeline. -{{- else -}} -- `explorations`: Ad-hoc notebooks used to explore the data processed by this pipeline. -- `transformations`: All dataset definitions and transformations. -{{- end }} - -## Getting Started - -To get started, go to the `transformations` folder -- most of the relevant source code lives there: - -{{ if (eq .language "python") -}} -* By convention, every dataset under `transformations` is in a separate file. -* Take a look at the sample under "sample_trips_{{ .project_name }}.py" to get familiar with the syntax. - Read more about the syntax at https://docs.databricks.com/dlt/python-ref.html. -{{- else -}} -* By convention, every dataset under `transformations` is in a separate file. -* Take a look at the sample under "sample_trips_{{ .project_name }}.sql" to get familiar with the syntax. - Read more about the syntax at https://docs.databricks.com/dlt/sql-ref.html. 
-{{- end }} - -For more tutorials and reference material, see https://docs.databricks.com/dlt. ## Deploying pipelines @@ -58,11 +30,6 @@ For more tutorials and reference material, see https://docs.databricks.com/dlt. (Note that "dev" is the default target, so the `--target` parameter is optional here.) - This deploys everything that's defined for this project. - For example, the default template would deploy a pipeline called - `[dev yourname] {{.project_name}}_pipeline` to your workspace. - You can find that pipeline by opening your workpace and clicking on **Jobs & Pipelines**. - 2. Similarly, to deploy a production copy, type: ``` $ pipelines deploy --target prod diff --git a/libs/template/templates/cli-pipelines/template/{{.project_name}}/databricks.yml.tmpl b/libs/template/templates/cli-pipelines/template/{{.project_name}}/databricks.yml.tmpl index ffcc6ba7b1..ad227f4867 100644 --- a/libs/template/templates/cli-pipelines/template/{{.project_name}}/databricks.yml.tmpl +++ b/libs/template/templates/cli-pipelines/template/{{.project_name}}/databricks.yml.tmpl @@ -7,7 +7,7 @@ bundle: include: - resources/*.yml - resources/*/*.yml - - ./*.yml + - {{.project_name}}_pipeline/*.yml # Variable declarations. These variables are assigned in the dev/prod targets below. 
variables: diff --git a/libs/template/templates/default-python/databricks_template_schema.json b/libs/template/templates/default-python/databricks_template_schema.json index c4207a3b35..8be3139976 100644 --- a/libs/template/templates/default-python/databricks_template_schema.json +++ b/libs/template/templates/default-python/databricks_template_schema.json @@ -1,42 +1,68 @@ { - "welcome_message": "\nWelcome to the default Python template for Databricks Asset Bundles!", + "welcome_message": "Welcome to the default Python template for Databricks Asset Bundles!\n\nPlease answer the below to tailor your project to your preferences.\nYou can always change your mind and change your configuration in the databricks.yml file later.\n\nNote that {{workspace_host}} is used for initialization\n(see https://docs.databricks.com/dev-tools/cli/profiles.html for how to change your profile).", "properties": { "project_name": { "type": "string", "default": "my_project", - "description": "Please provide the following details to tailor the template to your preferences.\n\nUnique name for this project", + "description": "\nUnique name for this project", "order": 1, "pattern": "^[A-Za-z0-9_]+$", "pattern_match_failure_message": "Name must consist of letters, numbers, and underscores." }, - "include_notebook": { + "project_name_short": { + "//": "This is a phony property that is derived from project_name (it replaces my_project with sample and strips _project|_app|_service)", + "skip_prompt_if": {}, "type": "string", - "default": "yes", - "enum": ["yes", "no"], - "description": "Include a stub (sample) notebook in '{{.project_name}}{{path_separator}}src'", + "default": "{{if eq .project_name \"my_project\"}}sample{{else}}{{with (regexp \"^(my_)?(.+?)(_project|_app|_service)?$\").FindStringSubmatch .project_name}}{{index . 
2}}{{else}}{{.project_name}}{{end}}{{end}}", + "description": "Short name for the project", "order": 2 }, - "include_dlt": { + "include_job": { "type": "string", "default": "yes", "enum": ["yes", "no"], - "description": "Include a stub (sample) Lakeflow Declarative Pipeline in '{{.project_name}}{{path_separator}}src'", + "description": "Include a Lakeflow job that runs a notebook", "order": 3 }, - "include_python": { + "include_pipeline": { "type": "string", "default": "yes", "enum": ["yes", "no"], - "description": "Include a stub (sample) Python package in '{{.project_name}}{{path_separator}}src'", + "description": "Include a Lakeflow ETL pipeline", "order": 4 }, - "serverless": { + "include_python": { "type": "string", "default": "yes", "enum": ["yes", "no"], - "description": "Use serverless compute", + "description": "Include a sample Python package that is built to a wheel file", "order": 5 + }, + "serverless": { + "type": "string", + "default": "yes", + "enum": ["yes", "no"], + "description": "Use serverless compute?", + "order": 6 + }, + "default_catalog": { + "type": "string", + "default": "{{default_catalog}}", + "pattern": "^\\w*$", + "pattern_match_failure_message": "Invalid catalog name.", + "description": "Default catalog for any tables created by this project{{if eq (default_catalog) \"\"}} (leave blank when not using Unity Catalog){{end}}", + "order": 7 + }, + "personal_schemas": { + "type": "string", + "description": "Use a personal schema for each user working on this project\n(this is recommended, your personal schema will be '{{.default_catalog}}.{{short_name}}')", + "default": "yes", + "enum": [ + "yes", + "no (advanced: I will customize the schema configuration later in databricks.yml)" + ], + "order": 8 } }, - "success_message": "Workspace to use (auto-detected, edit in '{{.project_name}}/databricks.yml'): {{workspace_host}}\n\n✨ Your new project has been created in the '{{.project_name}}' directory!\n\nPlease refer to the README.md file for 
\"getting started\" instructions.\nSee also the documentation at https://docs.databricks.com/dev-tools/bundles/index.html." + "success_message": "\n✨ Your new project has been created in the '{{.project_name}}' directory!\n\nPlease refer to the README.md file for \"getting started\" instructions.\nSee also the documentation at https://docs.databricks.com/dev-tools/bundles/index.html." } diff --git a/libs/template/templates/default-python/library/versions.tmpl b/libs/template/templates/default-python/library/versions.tmpl index 7d0c88e7df..912dc9c9d7 100644 --- a/libs/template/templates/default-python/library/versions.tmpl +++ b/libs/template/templates/default-python/library/versions.tmpl @@ -1,7 +1,17 @@ +{{/* The latest LTS DBR version; this should be updated a few months after each LTS. + */}} {{define "latest_lts_dbr_version" -}} - 15.4.x-scala2.12 + 16.4.x-scala2.12 {{- end}} -{{define "latest_lts_db_connect_version_spec" -}} +{{/* A safe version of DB Connect that is compatible with at least half the + * clusters running in production. + * + * We need to be very conservative in updating this, since a newer version can + * only connect to compute of that same version and higher. If the version is + * deemed too old, customers can update the version themselves after initializing + * the template. + */}} +{{define "conservative_db_connect_version_spec" -}} >=15.4,<15.5 {{- end}} diff --git a/libs/template/templates/default-python/template/__preamble.tmpl b/libs/template/templates/default-python/template/__preamble.tmpl index e579c34fb0..56d6965a3e 100644 --- a/libs/template/templates/default-python/template/__preamble.tmpl +++ b/libs/template/templates/default-python/template/__preamble.tmpl @@ -4,29 +4,38 @@ This file only template directives; it is skipped for the actual output. 
{{skip "__preamble"}} -{{$notDLT := not (eq .include_dlt "yes")}} -{{$notNotebook := not (eq .include_notebook "yes")}} -{{$notPython := not (eq .include_python "yes")}} - -{{if $notPython}} - {{skip "{{.project_name}}/src/{{.project_name}}"}} - {{skip "{{.project_name}}/tests/main_test.py"}} - {{skip "{{.project_name}}/setup.py"}} - {{skip "{{.project_name}}/pytest.ini"}} - {{skip "{{.project_name}}/requirements-dev.txt"}} +{{$pipeline := eq .include_pipeline "yes"}} +{{$job := eq .include_job "yes"}} +{{$python_package := eq .include_python "yes"}} + +{{if not $python_package}} + {{skip "{{.project_name}}/pyproject.toml"}} +{{end}} + +{{if not $pipeline}} + {{skip "{{.project_name}}/resources/{{.project_name_short}}_etl.pipeline.yml"}} + {{skip "{{.project_name}}/src/{{.project_name_short}}_etl"}} +{{end}} + +{{if not $job}} + {{skip "{{.project_name}}/src/sample_notebook.ipynb"}} + {{skip "{{.project_name}}/src/sample_python_file.py"}} + {{if not $pipeline}} + {{skip "{{.project_name}}/resources/{{.project_name_short}}_job.job.yml"}} + {{end}} {{end}} -{{if $notDLT}} - {{skip "{{.project_name}}/src/pipeline.ipynb"}} - {{skip "{{.project_name}}/resources/{{.project_name}}.pipeline.yml"}} +{{if and (not $pipeline) (not $job) (not $python_package)}} + {{skip "{{.project_name}}/src/shared"}} {{end}} -{{if $notNotebook}} - {{skip "{{.project_name}}/src/notebook.ipynb"}} +# Remove tests for the empty project or if we only have Pipeline +# (which is usually not testable) +{{if and (not $python_package) (not $job)}} + {{skip "{{.project_name}}/tests"}} {{end}} -{{if (and $notDLT $notNotebook $notPython)}} - {{skip "{{.project_name}}/resources/{{.project_name}}.job.yml"}} -{{else}} - {{skip "{{.project_name}}/resources/.gitkeep"}} +# Remove .gitkeep files for a non-empty project +{{if or $python_package $job $pipeline}} + {{skip "{{.project_name}}/src/.gitkeep"}} {{end}} diff --git 
a/libs/template/templates/default-python/template/{{.project_name}}/.vscode/extensions.json b/libs/template/templates/default-python/template/{{.project_name}}/.vscode/extensions.json index 5d15eba363..1f39c33087 100644 --- a/libs/template/templates/default-python/template/{{.project_name}}/.vscode/extensions.json +++ b/libs/template/templates/default-python/template/{{.project_name}}/.vscode/extensions.json @@ -1,7 +1,6 @@ { "recommendations": [ "databricks.databricks", - "ms-python.vscode-pylance", "redhat.vscode-yaml" ] } diff --git a/libs/template/templates/default-python/template/{{.project_name}}/.vscode/settings.json b/libs/template/templates/default-python/template/{{.project_name}}/.vscode/settings.json index 8ee87c30d4..d8468d7b60 100644 --- a/libs/template/templates/default-python/template/{{.project_name}}/.vscode/settings.json +++ b/libs/template/templates/default-python/template/{{.project_name}}/.vscode/settings.json @@ -1,16 +1,39 @@ { - "python.analysis.stubPath": ".vscode", "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------", "python.testing.pytestArgs": [ "." ], - "python.testing.unittestEnabled": false, - "python.testing.pytestEnabled": true, - "python.analysis.extraPaths": ["src"], "files.exclude": { "**/*.egg-info": true, "**/__pycache__": true, ".pytest_cache": true, + "dist": true, }, + "files.associations": { + "**/.gitkeep": "markdown" + }, + + // Pylance settings (VS Code) + // Set typeCheckingMode to "basic" to enable type checking! + "python.analysis.typeCheckingMode": "off", + "python.analysis.extraPaths": ["src", "lib", "resources"], + "python.analysis.diagnosticMode": "workspace", + "python.analysis.stubPath": ".vscode", + + // Pyright settings (Cursor) + // Set typeCheckingMode to "basic" to enable type checking! 
+ "cursorpyright.analysis.typeCheckingMode": "off", + "cursorpyright.analysis.extraPaths": ["src", "lib", "resources"], + "cursorpyright.analysis.diagnosticMode": "workspace", + "cursorpyright.analysis.stubPath": ".vscode", + + // General Python settings + "python.defaultInterpreterPath": "./.venv/bin/python", + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true, + "[python]": { + "editor.defaultFormatter": "ms-python.python", + "editor.formatOnSave": true, }, } diff --git a/libs/template/templates/default-python/template/{{.project_name}}/README.md.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/README.md.tmpl index bc4fe07b54..c486c29ed9 100644 --- a/libs/template/templates/default-python/template/{{.project_name}}/README.md.tmpl +++ b/libs/template/templates/default-python/template/{{.project_name}}/README.md.tmpl @@ -1,19 +1,48 @@ # {{.project_name}} The '{{.project_name}}' project was generated by using the default-python template. +For documentation on the Databricks Asset Bundles format used for this project, +and for CI/CD configuration, see https://docs.databricks.com/aws/en/dev-tools/bundles. + +* `src/`: Python source code for this project. +* `src/shared`: Shared source code across all jobs/pipelines/etc. +* `src/{{.project_name_short}}_etl`: Python source code for the {{.project_name_short}}_etl pipeline. +* `resources/`: Resource configurations (jobs, pipelines, etc.) +* `tests/`: Unit tests. +* `fixtures/`: Fixtures for data sets (primarily used for testing). ## Getting started -0. Install UV: https://docs.astral.sh/uv/getting-started/installation/ +Choose how you want to work on this project: + +(a) Directly in your Databricks workspace, see + https://docs.databricks.com/dev-tools/bundles/workspace. + +(b) Locally with an IDE like Cursor or VS Code, see + https://docs.databricks.com/vscode-ext. 
+ +(c) With command line tools, see https://docs.databricks.com/dev-tools/cli/databricks-cli.html + +{{- if or (eq .include_python "yes") (eq .include_job "yes")}} -1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html +Dependencies for this project should be installed using uv: -2. Authenticate to your Databricks workspace, if you have not done so already: +* Make sure you have the UV package manager installed. + It's an alternative to tools like pip: https://docs.astral.sh/uv/getting-started/installation/. +* Run `uv sync --dev` to install the project's dependencies. +{{end}} + +# Using this project using the CLI + +The Databricks workspace and IDE extensions provide a graphical interface for working +with this project. It's also possible to interact with it directly using the CLI: + +1. Authenticate to your Databricks workspace, if you have not done so already: ``` $ databricks configure ``` -3. To deploy a development copy of this project, type: +2. To deploy a development copy of this project, type: ``` $ databricks bundle deploy --target dev ``` @@ -23,9 +52,9 @@ The '{{.project_name}}' project was generated by using the default-python templa This deploys everything that's defined for this project. For example, the default template would deploy a job called `[dev yourname] {{.project_name}}_job` to your workspace. - You can find that job by opening your workpace and clicking on **Workflows**. + You can find that job by opening your workspace and clicking on **Jobs & Pipelines**. -4. Similarly, to deploy a production copy, type: +3. Similarly, to deploy a production copy, type: ``` $ databricks bundle deploy --target prod ``` @@ -35,23 +64,14 @@ The '{{.project_name}}' project was generated by using the default-python templa is paused when deploying in development mode (see https://docs.databricks.com/dev-tools/bundles/deployment-modes.html). -5. To run a job or pipeline, use the "run" command: +4. 
To run a job or pipeline, use the "run" command: ``` $ databricks bundle run ``` -{{- if (eq .include_python "no") }} -6. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from - https://docs.databricks.com/dev-tools/vscode-ext.html. -{{- else }} -6. Optionally, install the Databricks extension for Visual Studio code for local development from - https://docs.databricks.com/dev-tools/vscode-ext.html. It can configure your - virtual environment and setup Databricks Connect for running unit tests locally. - When not using these tools, consult your development environment's documentation - and/or the documentation for Databricks Connect for manually setting up your environment - (https://docs.databricks.com/en/dev-tools/databricks-connect/python/index.html). -{{- end}} - -7. For documentation on the Databricks asset bundles format used - for this project, and for CI/CD configuration, see - https://docs.databricks.com/dev-tools/bundles/index.html. +{{- if or (eq .include_python "yes") (eq .include_job "yes")}} +5. Finally, to run tests locally, use `pytest`: + ``` + $ uv run pytest + ``` +{{end}} diff --git a/libs/template/templates/default-python/template/{{.project_name}}/databricks.yml.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/databricks.yml.tmpl index 99e9d3b7c2..d321d52a11 100644 --- a/libs/template/templates/default-python/template/{{.project_name}}/databricks.yml.tmpl +++ b/libs/template/templates/default-python/template/{{.project_name}}/databricks.yml.tmpl @@ -5,15 +5,31 @@ bundle: name: {{.project_name}} uuid: {{bundle_uuid}} -{{ if $with_python }} + +include: + - resources/*.yml + - resources/*/*.yml + +{{- if $with_python}} + artifacts: python_artifact: type: whl build: uv build --wheel -{{ end }} -include: - - resources/*.yml - - resources/*/*.yml +{{- end}} + +# Variable declarations. These variables are assigned in the dev/prod targets below. 
+variables: + catalog: + description: The catalog to use + schema: + description: The schema to use + +{{- $dev_schema := "dev" }} +{{- $prod_schema := "prod" }} +{{- if (regexp "^yes").MatchString .personal_schemas}} + {{- $dev_schema = "${workspace.current_user.short_name}"}} +{{- end}} targets: dev: @@ -25,20 +41,22 @@ targets: default: true workspace: host: {{workspace_host}} -{{ if ($with_classic) }} + variables: + catalog: {{.default_catalog}} + schema: {{$dev_schema}} + {{- if $with_classic}} presets: - # Set dynamic_version: true on all artifacts of type "whl". - # This makes "bundle deploy" add a timestamp to wheel's version before uploading, - # new wheel takes over the previous installation even if actual wheel version is unchanged. - # See https://docs.databricks.com/aws/en/dev-tools/bundles/settings artifacts_dynamic_version: true -{{ end }} + {{- end}} prod: mode: production workspace: host: {{workspace_host}} # We explicitly deploy to /Workspace/Users/{{user_name}} to make sure we only have a single copy. root_path: /Workspace/Users/{{user_name}}/.bundle/${bundle.name}/${bundle.target} + variables: + catalog: {{.default_catalog}} + schema: {{$prod_schema}} permissions: - {{if is_service_principal}}service_principal{{else}}user{{end}}_name: {{user_name}} level: CAN_MANAGE diff --git a/libs/template/templates/default-python/template/{{.project_name}}/fixtures/.gitkeep.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/fixtures/.gitkeep.tmpl index ee95703028..a84a182f1f 100644 --- a/libs/template/templates/default-python/template/{{.project_name}}/fixtures/.gitkeep.tmpl +++ b/libs/template/templates/default-python/template/{{.project_name}}/fixtures/.gitkeep.tmpl @@ -1,27 +1,15 @@ -# Fixtures +# Test fixtures directory + {{- /* We don't want to have too many README.md files, since they stand out so much. But we do need to have a file here to make sure the folder is added to Git. 
*/}} -This folder is reserved for fixtures, such as CSV files. - -Below is an example of how to load fixtures as a data frame: +Add JSON or CSV files here. In tests, use them with `load_fixture()`: ``` -import pandas as pd -import os - -def get_absolute_path(*relative_parts): - if 'dbutils' in globals(): - base_dir = os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) # type: ignore - path = os.path.normpath(os.path.join(base_dir, *relative_parts)) - return path if path.startswith("/Workspace") else "/Workspace" + path - else: - return os.path.join(*relative_parts) - -csv_file = get_absolute_path("..", "fixtures", "mycsv.csv") -df = pd.read_csv(csv_file) -display(df) +def test_using_fixture(load_fixture): + data = load_fixture("my_data.json") + assert len(data) >= 1 ``` diff --git a/libs/template/templates/default-python/template/{{.project_name}}/pyproject.toml.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/pyproject.toml.tmpl index 7dd4694ea9..3b2db1d466 100644 --- a/libs/template/templates/default-python/template/{{.project_name}}/pyproject.toml.tmpl +++ b/libs/template/templates/default-python/template/{{.project_name}}/pyproject.toml.tmpl @@ -4,30 +4,23 @@ version = "0.0.1" authors = [{ name = "{{user_name}}" }] requires-python = ">= 3.11" -[project.optional-dependencies] +[dependency-groups] dev = [ "pytest", - - # Code completion support for Lakeflow Declarative Pipelines, also install databricks-connect "databricks-dlt", # databricks-connect can be used to run parts of this project locally. - # See https://docs.databricks.com/dev-tools/databricks-connect.html. - # - # Note, databricks-connect is automatically installed if you're using Databricks - # extension for Visual Studio Code - # (https://docs.databricks.com/dev-tools/vscode-ext/dev-tasks/databricks-connect.html). 
- # - # To manually install databricks-connect, uncomment the line below to install a version - # of db-connect that corresponds to the Databricks Runtime version used for this project. - # See https://docs.databricks.com/dev-tools/databricks-connect.html - # "databricks-connect{{template "latest_lts_db_connect_version_spec"}}", + # Note that for local development, you should use a version that is not newer + # than the remote cluster or serverless compute you connect to. + # See also https://docs.databricks.com/dev-tools/databricks-connect.html. + "databricks-connect{{template "conservative_db_connect_version_spec"}}", ] [tool.pytest.ini_options] pythonpath = "src" testpaths = [ "tests", + "resources", ] [build-system] @@ -35,7 +28,8 @@ requires = ["hatchling"] build-backend = "hatchling.build" [tool.hatch.build.targets.wheel] -packages = ["src/{{.project_name}}"] +packages = ["src"] +sources = ["src"] [project.scripts] -main = "{{.project_name}}.main:main" +main = "sample_python_file:main" diff --git a/libs/template/templates/default-python/template/{{.project_name}}/resources/sample_job.job.yml.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/resources/sample_job.job.yml.tmpl new file mode 100644 index 0000000000..bbc5a952a1 --- /dev/null +++ b/libs/template/templates/default-python/template/{{.project_name}}/resources/sample_job.job.yml.tmpl @@ -0,0 +1,98 @@ +# A sample job for {{.project_name}}. 
+ +{{- $serverless := (eq .serverless "yes")}} +{{- $python_package := (eq .include_python "yes")}} +{{- $notebook := (eq .include_job "yes")}} +{{- $pipeline := (eq .include_pipeline "yes")}} + +resources: + jobs: + sample_job: + name: sample_job + + trigger: + # Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger + periodic: + interval: 1 + unit: DAYS + + #email_notifications: + # on_failure: + # - your_email@example.com + + parameters: + - name: catalog + default: ${var.catalog} + - name: schema + default: ${var.schema} + + tasks: + +{{- if $notebook}} + - task_key: notebook_task + notebook_task: + notebook_path: ../src/sample_notebook.ipynb + {{- if $serverless}} + environment_key: default + {{- else}} + job_cluster_key: job_cluster + {{- if $python_package}} + {{- /* Paths are relative to this file in resources/, so the wheel is referenced as ../dist, matching the serverless environment below. */}} + libraries: + # By default we just include the .whl file generated for the {{.project_name_short}} package in src/. + # See https://docs.databricks.com/dev-tools/bundles/library-dependencies.html + # for more information on how to add other libraries. + - whl: ../dist/*.whl + {{- end}} + {{- end}} +{{- end}} + +{{- if $python_package}} + - task_key: python_file_task + {{- /* Only depend on notebook_task when that task is actually emitted above. */}} + {{- if $notebook}} + depends_on: + - task_key: notebook_task + {{- end}} + spark_python_task: + python_file: ../src/sample_python_file.py + {{- if $serverless}} + environment_key: default + {{- else}} + job_cluster_key: job_cluster + libraries: + - whl: ../dist/*.whl + {{- end}} +{{- end}} + +{{- if $pipeline}} + - task_key: refresh_pipeline + {{- /* Only depend on notebook_task when that task is actually emitted above. */}} + {{- if $notebook}} + depends_on: + - task_key: notebook_task + {{- end}} + pipeline_task: + pipeline_id: ${resources.pipelines.{{.project_name_short}}_etl.id} +{{- end}} + +{{- if $serverless}} + + environments: + - environment_key: default + spec: + client: "2" + {{- if $python_package}} + dependencies: + # By default we just include the .whl file generated for the {{.project_name_short}} package in src/. 
+ # See https://docs.databricks.com/dev-tools/bundles/library-dependencies.html + # for more information on how to add other libraries. + - ../dist/*.whl + {{- end}} +{{- else}} + + job_clusters: + - job_cluster_key: job_cluster + new_cluster: + spark_version: {{template "latest_lts_dbr_version"}} + node_type_id: {{smallest_node_type}} + data_security_mode: SINGLE_USER + autoscale: + min_workers: 1 + max_workers: 4 +{{- end}} diff --git a/libs/template/templates/default-python/template/{{.project_name}}/resources/{{.project_name}}.pipeline.yml.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/resources/{{.project_name_short}}_etl.pipeline.yml.tmpl similarity index 50% rename from libs/template/templates/default-python/template/{{.project_name}}/resources/{{.project_name}}.pipeline.yml.tmpl rename to libs/template/templates/default-python/template/{{.project_name}}/resources/{{.project_name_short}}_etl.pipeline.yml.tmpl index 093b087a01..251e839751 100644 --- a/libs/template/templates/default-python/template/{{.project_name}}/resources/{{.project_name}}.pipeline.yml.tmpl +++ b/libs/template/templates/default-python/template/{{.project_name}}/resources/{{.project_name_short}}_etl.pipeline.yml.tmpl @@ -1,25 +1,29 @@ -{{$with_serverless := (eq .serverless "yes") -}} +{{- $with_serverless := (eq .serverless "yes")}} # The main pipeline for {{.project_name}} resources: pipelines: - {{.project_name}}_pipeline: - name: {{.project_name}}_pipeline + {{.project_name_short}}_etl: + {{- /* Note that pipeline names must be unique in a workspace, + * so we use the project name as part of the name. 
+ */}} + name: {{.project_name_short}}_etl {{- if or (eq default_catalog "") (eq default_catalog "hive_metastore")}} {{- if $with_serverless }} ## Catalog is required for serverless compute - catalog: main{{else}} + catalog: main + {{- else}} ## Specify the 'catalog' field to configure this pipeline to make use of Unity Catalog: - # catalog: catalog_name{{end}} + # catalog: catalog_name + {{- end}} {{- else}} catalog: {{default_catalog}} {{- end}} schema: {{.project_name}}_${bundle.target} - {{- if $with_serverless }} + {{- if $with_serverless}} serverless: true {{- end}} - libraries: - - notebook: - path: ../src/pipeline.ipynb + root_path: ../src - configuration: - bundle.sourcePath: ${workspace.file_path}/src + libraries: + - glob: + include: ../src/{{.project_name_short}}_etl/transformations/** diff --git a/libs/template/templates/default-python/template/{{.project_name}}/resources/{{.project_name}}.job.yml.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/resources/{{.project_name}}.job.yml.tmpl deleted file mode 100644 index 419c4cd2e9..0000000000 --- a/libs/template/templates/default-python/template/{{.project_name}}/resources/{{.project_name}}.job.yml.tmpl +++ /dev/null @@ -1,90 +0,0 @@ -# The main job for {{.project_name}}. - -{{- /* Clarify what this job is for Lakeflow Declarative Pipelines only users. */}} -{{if and (eq .include_dlt "yes") (and (eq .include_notebook "no") (eq .include_python "no")) -}} -# This job runs {{.project_name}}_pipeline on a schedule. 
-{{end -}} -{{$with_serverless := (eq .serverless "yes") -}} - -resources: - jobs: - {{.project_name}}_job: - name: {{.project_name}}_job - - trigger: - # Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger - periodic: - interval: 1 - unit: DAYS - - #email_notifications: - # on_failure: - # - your_email@example.com - - tasks: - {{- if eq .include_notebook "yes" }} - - task_key: notebook_task - {{- if not $with_serverless}} - job_cluster_key: job_cluster{{end}} - notebook_task: - notebook_path: ../src/notebook.ipynb - {{- end -}} - {{- if (eq .include_dlt "yes") }} - - - task_key: refresh_pipeline - {{- if (eq .include_notebook "yes" )}} - depends_on: - - task_key: notebook_task - {{- end}} - pipeline_task: - {{- /* TODO: we should find a way that doesn't use magics for the below, like ./{{project_name}}.pipeline.yml */}} - pipeline_id: ${resources.pipelines.{{.project_name}}_pipeline.id} - {{- end -}} - {{- if (eq .include_python "yes") }} - - - task_key: main_task - {{- if (eq .include_dlt "yes") }} - depends_on: - - task_key: refresh_pipeline - {{- else if (eq .include_notebook "yes" )}} - depends_on: - - task_key: notebook_task -{{end}} - {{- if $with_serverless }} - environment_key: default - {{- else }} - job_cluster_key: job_cluster{{end}} - python_wheel_task: - package_name: {{.project_name}} - entry_point: main - {{- if not $with_serverless }} - libraries: - # By default we just include the .whl file generated for the {{.project_name}} package. - # See https://docs.databricks.com/dev-tools/bundles/library-dependencies.html - # for more information on how to add other libraries. - - whl: ../dist/*.whl -{{- end -}} -{{else}} -{{- end}} -{{if $with_serverless}}{{if (eq .include_python "yes")}} - # A list of task execution environment specifications that can be referenced by tasks of this job. 
- environments: - - environment_key: default - - # Full documentation of this spec can be found at: - # https://docs.databricks.com/api/workspace/jobs/create#environments-spec - spec: - client: "2" - dependencies: - - ../dist/*.whl -{{end}}{{ else }} - job_clusters: - - job_cluster_key: job_cluster - new_cluster: - spark_version: {{template "latest_lts_dbr_version"}} - node_type_id: {{smallest_node_type}} - data_security_mode: SINGLE_USER - autoscale: - min_workers: 1 - max_workers: 4 -{{end -}} diff --git a/libs/template/templates/default-python/template/{{.project_name}}/scratch/README.md b/libs/template/templates/default-python/template/{{.project_name}}/scratch/README.md deleted file mode 100644 index e6cfb81b46..0000000000 --- a/libs/template/templates/default-python/template/{{.project_name}}/scratch/README.md +++ /dev/null @@ -1,4 +0,0 @@ -# scratch - -This folder is reserved for personal, exploratory notebooks. -By default these are not committed to Git, as 'scratch' is listed in .gitignore. 
diff --git a/libs/template/templates/default-python/template/{{.project_name}}/scratch/exploration.ipynb.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/scratch/exploration.ipynb.tmpl deleted file mode 100644 index d3e9beef37..0000000000 --- a/libs/template/templates/default-python/template/{{.project_name}}/scratch/exploration.ipynb.tmpl +++ /dev/null @@ -1,65 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "6bca260b-13d1-448f-8082-30b60a85c9ae", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - {{- if (eq .include_python "yes") }} - "import sys\n", - "\n", - "sys.path.append(\"../src\")\n", - "from {{.project_name}} import main\n", - "\n", - "main.get_taxis(spark).show(10)" - {{else}} - "spark.range(10)" - {{end -}} - ] - } - ], - "metadata": { - "application/vnd.databricks.v1+notebook": { - "dashboards": [], - "language": "python", - "notebookMetadata": { - "pythonIndentUnit": 2 - }, - "notebookName": "ipynb-notebook", - "widgets": {} - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.11.4" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/libs/template/templates/default-python/template/{{.project_name}}/src/.gitkeep b/libs/template/templates/default-python/template/{{.project_name}}/src/.gitkeep new file mode 100644 index 0000000000..0e0ed1e00b --- /dev/null +++ b/libs/template/templates/default-python/template/{{.project_name}}/src/.gitkeep @@ -0,0 +1 @@ +This folder is reserved for Databricks Asset Bundles source files. 
diff --git a/libs/template/templates/default-python/template/{{.project_name}}/src/pipeline.ipynb.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/src/pipeline.ipynb.tmpl deleted file mode 100644 index a8e35c53be..0000000000 --- a/libs/template/templates/default-python/template/{{.project_name}}/src/pipeline.ipynb.tmpl +++ /dev/null @@ -1,104 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "9a626959-61c8-4bba-84d2-2a4ecab1f7ec", - "showTitle": false, - "title": "" - } - }, - "source": [ - "# Lakeflow Declarative Pipeline\n", - "\n", - "This Lakeflow Declarative Pipeline definition is executed using a pipeline defined in resources/{{.project_name}}.pipeline.yml." - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "9198e987-5606-403d-9f6d-8f14e6a4017f", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - {{- if (eq .include_python "yes") }} - "# Import DLT and src/{{.project_name}}\n", - "import dlt\n", - "import sys\n", - "\n", - "sys.path.append(spark.conf.get(\"bundle.sourcePath\", \".\"))\n", - "from pyspark.sql.functions import expr\n", - "from {{.project_name}} import main" - {{else}} - "import dlt\n", - "from pyspark.sql.functions import expr\n", - "from pyspark.sql import SparkSession\n", - "spark = SparkSession.builder.getOrCreate()" - {{end -}} - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "3fc19dba-61fd-4a89-8f8c-24fee63bfb14", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - {{- if (eq .include_python "yes") }} - "@dlt.view\n", - "def taxi_raw():\n", - " return main.get_taxis(spark)\n", - {{else}} - "\n", - 
"@dlt.view\n", - "def taxi_raw():\n", - " return spark.read.format(\"json\").load(\"/databricks-datasets/nyctaxi/sample/json/\")\n", - {{end -}} - "\n", - "\n", - "@dlt.table\n", - "def filtered_taxis():\n", - " return dlt.read(\"taxi_raw\").filter(expr(\"fare_amount < 30\"))" - ] - } - ], - "metadata": { - "application/vnd.databricks.v1+notebook": { - "dashboards": [], - "language": "python", - "notebookMetadata": { - "pythonIndentUnit": 2 - }, - "notebookName": "pipeline", - "widgets": {} - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.11.4" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/libs/template/templates/default-python/template/{{.project_name}}/src/notebook.ipynb.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/src/sample_notebook.ipynb.tmpl similarity index 87% rename from libs/template/templates/default-python/template/{{.project_name}}/src/notebook.ipynb.tmpl rename to libs/template/templates/default-python/template/{{.project_name}}/src/sample_notebook.ipynb.tmpl index 6782a053ba..f5262cb703 100644 --- a/libs/template/templates/default-python/template/{{.project_name}}/src/notebook.ipynb.tmpl +++ b/libs/template/templates/default-python/template/{{.project_name}}/src/sample_notebook.ipynb.tmpl @@ -23,6 +23,7 @@ "metadata": {}, "outputs": [], "source": [ + "# Reload wheel file dependencies every time they are updated\n", "%load_ext autoreload\n", "%autoreload 2" ] @@ -45,12 +46,12 @@ "outputs": [], "source": [ {{- if (eq .include_python "yes") }} - "from {{.project_name}} import main\n", + "from shared import taxis\n", "\n", - "main.get_taxis(spark).show(10)" - {{else}} + "taxis.find_all_taxis().show(10)" + {{- else}} "spark.range(10)" - {{end -}} + {{- end}} ] } ], @@ -61,7 +62,7 @@ "notebookMetadata": { "pythonIndentUnit": 2 }, - "notebookName": "notebook", + "notebookName": "sample_notebook", 
"widgets": {} }, "kernelspec": { diff --git a/libs/template/templates/default-python/template/{{.project_name}}/src/sample_python_file.py.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/src/sample_python_file.py.tmpl new file mode 100644 index 0000000000..bb63dd9cd7 --- /dev/null +++ b/libs/template/templates/default-python/template/{{.project_name}}/src/sample_python_file.py.tmpl @@ -0,0 +1,19 @@ +import argparse +from datetime import datetime +from shared import taxis + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--catalog", default="{{.default_catalog}}") + parser.add_argument("--schema", default="default") + args = parser.parse_args() + + df = taxis.find_all_taxis() + + table_name = f"{args.catalog}.{args.schema}.taxis_{{short_date_time}}" + df.write.mode("overwrite").saveAsTable(table_name) + + print(f"Wrote {df.count()} taxi records to {table_name}") + +if __name__ == "__main__": + main() diff --git a/libs/template/templates/default-python/template/{{.project_name}}/src/shared/__init__.py b/libs/template/templates/default-python/template/{{.project_name}}/src/shared/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/libs/template/templates/default-python/template/{{.project_name}}/src/shared/taxis.py.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/src/shared/taxis.py.tmpl new file mode 100644 index 0000000000..a7309cd4c5 --- /dev/null +++ b/libs/template/templates/default-python/template/{{.project_name}}/src/shared/taxis.py.tmpl @@ -0,0 +1,7 @@ +from databricks.sdk.runtime import spark +from pyspark.sql import DataFrame + + +def find_all_taxis() -> DataFrame: + """Find all taxi data.""" + return spark.read.table("samples.nyctaxi.trips") diff --git a/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name_short}}_etl/README.md.tmpl 
b/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name_short}}_etl/README.md.tmpl new file mode 100644 index 0000000000..d425d343c1 --- /dev/null +++ b/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name_short}}_etl/README.md.tmpl @@ -0,0 +1,22 @@ +# {{.project_name}} + +This folder defines all source code for the {{.project_name}} pipeline: + +- `explorations/`: Ad-hoc notebooks used to explore the data processed by this pipeline. +- `transformations/`: All dataset definitions and transformations. +- `utilities/` (optional): Utility functions and Python modules used in this pipeline. +- `data_sources/` (optional): View definitions describing the source data for this pipeline. + +## Getting Started + +To get started, go to the `transformations` folder -- most of the relevant source code lives there: + +* By convention, every dataset under `transformations` is in a separate file. +* Take a look at the sample under "sample_trips_{{short_date_time}}.py" to get familiar with the syntax. + Read more about the syntax at https://docs.databricks.com/dlt/python-ref.html. +* Use `Run file` to run and preview a single transformation. +* Use `Run pipeline` to run _all_ transformations in the entire pipeline. +* Use `+ Add` in the file browser to add a new data set definition. +* Use `Schedule` to run the pipeline on a schedule! + +For more tutorials and reference material, see https://docs.databricks.com/dlt. 
diff --git a/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name_short}}_etl/__init__.py b/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name_short}}_etl/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name_short}}_etl/explorations/__init__.py.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name_short}}_etl/explorations/__init__.py.tmpl new file mode 100644 index 0000000000..e69de29bb2 diff --git a/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name_short}}_etl/explorations/sample_exploration.ipynb.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name_short}}_etl/explorations/sample_exploration.ipynb.tmpl new file mode 100644 index 0000000000..33c5e1f896 --- /dev/null +++ b/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name_short}}_etl/explorations/sample_exploration.ipynb.tmpl @@ -0,0 +1,63 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "19a992e9-55e0-49e4-abc7-8c92c420dd5b", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### Example Exploratory Notebook\n", + "\n", + "Use this notebook to explore the data generated by the pipeline in your preferred programming language.\n", + "\n", + "**Note**: This notebook is not executed as part of the pipeline." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "1b0a82fa-3c6a-4f29-bb43-ded1c4fd77c6", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "# !!! 
Before performing any data analysis, make sure to run the pipeline to materialize the sample datasets. The tables referenced in this notebook depend on that step.\n", + "\n", + "display(spark.sql(\"SELECT * FROM {{.default_catalog}}.{{short_name}}.sample_trips_{{short_date_time}}\"))" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "computePreferences": null, + "dashboards": [], + "environmentMetadata": null, + "inputWidgetPreferences": null, + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, + "notebookName": "sample_exploration", + "widgets": {} + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name_short}}_etl/transformations/__init__.py.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name_short}}_etl/transformations/__init__.py.tmpl new file mode 100644 index 0000000000..e69de29bb2 diff --git a/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name_short}}_etl/transformations/sample_trips_{{short_date_time}}.py.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name_short}}_etl/transformations/sample_trips_{{short_date_time}}.py.tmpl new file mode 100644 index 0000000000..9f6b449f7d --- /dev/null +++ b/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name_short}}_etl/transformations/sample_trips_{{short_date_time}}.py.tmpl @@ -0,0 +1,15 @@ +import dlt +from pyspark.sql.functions import col +from {{.project_name_short}}_etl.utilities import utils + + +# This file defines a sample transformation. +# Edit the sample below or add new transformations +# using "+ Add" in the file browser. 
+ + +@dlt.table +def sample_trips_{{short_date_time}}(): + return spark.read.table("samples.nyctaxi.trips").withColumn( + "trip_distance_km", utils.distance_km(col("trip_distance")) + ) diff --git a/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name_short}}_etl/transformations/sample_zones_{{short_date_time}}.py.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name_short}}_etl/transformations/sample_zones_{{short_date_time}}.py.tmpl new file mode 100644 index 0000000000..fa2eda046b --- /dev/null +++ b/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name_short}}_etl/transformations/sample_zones_{{short_date_time}}.py.tmpl @@ -0,0 +1,17 @@ +import dlt +from pyspark.sql.functions import col, sum + + +# This file defines a sample transformation. +# Edit the sample below or add new transformations +# using "+ Add" in the file browser. + + +@dlt.table +def sample_zones_{{short_date_time}}(): + # Read from the "sample_trips" table, then sum all the fares + return ( + spark.read.table(f"sample_trips_{{short_date_time}}") + .groupBy(col("pickup_zip")) + .agg(sum("fare_amount").alias("total_fare")) + ) diff --git a/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name_short}}_etl/utilities/__init__.py.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name_short}}_etl/utilities/__init__.py.tmpl new file mode 100644 index 0000000000..e69de29bb2 diff --git a/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name_short}}_etl/utilities/utils.py.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name_short}}_etl/utilities/utils.py.tmpl new file mode 100644 index 0000000000..f0f4e940f7 --- /dev/null +++ b/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name_short}}_etl/utilities/utils.py.tmpl @@ -0,0 +1,12 @@ 
+from pyspark.sql.functions import col, when + + +def distance_km(distance_col): + """Convert distance from miles to kilometers.""" + return distance_col * 1.60934 + + +def format_currency(amount_col): + """Format amount as currency.""" + return when(col(amount_col).isNotNull(), + col(amount_col).cast("decimal(10,2)")) diff --git a/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name}}/main.py.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name}}/main.py.tmpl deleted file mode 100644 index 5ae344c7e2..0000000000 --- a/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name}}/main.py.tmpl +++ /dev/null @@ -1,25 +0,0 @@ -from pyspark.sql import SparkSession, DataFrame - - -def get_taxis(spark: SparkSession) -> DataFrame: - return spark.read.table("samples.nyctaxi.trips") - - -# Create a new Databricks Connect session. If this fails, -# check that you have configured Databricks Connect correctly. -# See https://docs.databricks.com/dev-tools/databricks-connect.html. -def get_spark() -> SparkSession: - try: - from databricks.connect import DatabricksSession - - return DatabricksSession.builder.getOrCreate() - except ImportError: - return SparkSession.builder.getOrCreate() - - -def main(): - get_taxis(get_spark()).show(5) - - -if __name__ == "__main__": - main() diff --git a/libs/template/templates/default-python/template/{{.project_name}}/tests/conftest.py b/libs/template/templates/default-python/template/{{.project_name}}/tests/conftest.py new file mode 100644 index 0000000000..8037a4647c --- /dev/null +++ b/libs/template/templates/default-python/template/{{.project_name}}/tests/conftest.py @@ -0,0 +1,93 @@ +"""This file configures pytest. + +This file is in the root since it can be used for tests in any place in this +project, including tests under resources/. 
+""" + +import os, sys, pathlib +from contextlib import contextmanager + + +try: + from databricks.connect import DatabricksSession + from databricks.sdk import WorkspaceClient + from pyspark.sql import SparkSession + import pytest + import json + import csv +except ImportError: + raise ImportError("Test dependencies not found.\n\nRun tests using 'uv run pytest'. See http://docs.astral.sh/uv to learn more about uv.") + + +@pytest.fixture() +def spark() -> SparkSession: + """Provide a SparkSession fixture for tests. + + Minimal example: + def test_uses_spark(spark): + df = spark.createDataFrame([(1,)], ["x"]) + assert df.count() == 1 + """ + return DatabricksSession.builder.getOrCreate() + +@pytest.fixture() +def load_fixture(spark: SparkSession): + """Provide a callable to load JSON or CSV from fixtures/ directory. + + Example usage: + + def test_using_fixture(load_fixture): + data = load_fixture("my_data.json") + assert data.count() >= 1 + """ + def _loader(filename: str): + path = pathlib.Path(__file__).parent.parent / "fixtures" / filename + suffix = path.suffix.lower() + if suffix == ".json": + rows = json.loads(path.read_text()) + return spark.createDataFrame(rows) + if suffix == ".csv": + with path.open(newline="") as f: + rows = list(csv.DictReader(f)) + return spark.createDataFrame(rows) + raise ValueError(f"Unsupported fixture type for: {filename}") + return _loader + + +def _enable_fallback_compute(): + """Enable serverless compute if no compute is specified.""" + conf = WorkspaceClient().config + if conf.serverless_compute_id or conf.cluster_id or os.environ.get("SPARK_REMOTE"): + return + + url = "https://docs.databricks.com/dev-tools/databricks-connect/cluster-config" + # Both lines of the notice go to stderr so they stay together and stdout is left untouched. + print("☁️ no compute specified, falling back to serverless compute", file=sys.stderr) + print(f" see {url} for manual configuration", file=sys.stderr) + + os.environ["DATABRICKS_SERVERLESS_COMPUTE_ID"] = "auto" + + +@contextmanager +def _allow_stderr_output(config: 
pytest.Config): + """Temporarily disable pytest output capture.""" + capman = config.pluginmanager.get_plugin("capturemanager") + if capman: + with capman.global_and_fixture_disabled(): + yield + else: + yield + + +def pytest_configure(config: pytest.Config): + """Configure pytest session.""" + with _allow_stderr_output(config): + _enable_fallback_compute() + + # Initialize Spark session eagerly, so it is available even when + # SparkSession.builder.getOrCreate() is used. For DB Connect 15+, + # we validate version compatibility with the remote cluster. + if hasattr(DatabricksSession.builder, "validateSession"): + DatabricksSession.builder.validateSession().getOrCreate() + else: + DatabricksSession.builder.getOrCreate() diff --git a/libs/template/templates/default-python/template/{{.project_name}}/tests/main_test.py.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/tests/main_test.py.tmpl deleted file mode 100644 index fea2f3f665..0000000000 --- a/libs/template/templates/default-python/template/{{.project_name}}/tests/main_test.py.tmpl +++ /dev/null @@ -1,6 +0,0 @@ -from {{.project_name}}.main import get_taxis, get_spark - - -def test_main(): - taxis = get_taxis(get_spark()) - assert taxis.count() > 5 diff --git a/libs/template/templates/default-python/template/{{.project_name}}/tests/sample_taxis_test.py.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/tests/sample_taxis_test.py.tmpl new file mode 100644 index 0000000000..a782015363 --- /dev/null +++ b/libs/template/templates/default-python/template/{{.project_name}}/tests/sample_taxis_test.py.tmpl @@ -0,0 +1,8 @@ +from databricks.sdk.runtime import spark +from pyspark.sql import DataFrame +from shared import taxis + + +def test_find_all_taxis(): + results = taxis.find_all_taxis() + assert results.count() > 5 diff --git a/libs/template/templates/experimental-jobs-as-code/library/versions.tmpl b/libs/template/templates/experimental-jobs-as-code/library/versions.tmpl 
index 72715f283e..aefecfa93d 100644 --- a/libs/template/templates/experimental-jobs-as-code/library/versions.tmpl +++ b/libs/template/templates/experimental-jobs-as-code/library/versions.tmpl @@ -6,4 +6,4 @@ >=15.4,<15.5 {{- end}} -{{define "latest_databricks_bundles_version" -}}0.266.0{{- end}} +{{define "latest_databricks_bundles_version" -}}0.259.0{{- end}} diff --git a/libs/template/templates/experimental-jobs-as-code/template/__preamble.tmpl b/libs/template/templates/experimental-jobs-as-code/template/__preamble.tmpl index bd284b0252..23effcc3e4 100644 --- a/libs/template/templates/experimental-jobs-as-code/template/__preamble.tmpl +++ b/libs/template/templates/experimental-jobs-as-code/template/__preamble.tmpl @@ -4,7 +4,7 @@ This file only contains template directives; it is skipped for the actual output {{skip "__preamble"}} -{{$notDLT := not (eq .include_dlt "yes")}} +{{$notDLT := not (eq .include_pipeline "yes")}} {{$notNotebook := not (eq .include_notebook "yes")}} {{$notPython := not (eq .include_python "yes")}} diff --git a/libs/template/templates/experimental-jobs-as-code/template/{{.project_name}}/src/dlt_pipeline.ipynb.tmpl b/libs/template/templates/experimental-jobs-as-code/template/{{.project_name}}/src/dlt_pipeline.ipynb.tmpl index 6c64b4378a..629106dbf3 100644 --- a/libs/template/templates/experimental-jobs-as-code/template/{{.project_name}}/src/dlt_pipeline.ipynb.tmpl +++ b/libs/template/templates/experimental-jobs-as-code/template/{{.project_name}}/src/dlt_pipeline.ipynb.tmpl @@ -14,7 +14,7 @@ "source": [ "# DLT pipeline\n", "\n", - "This Lakeflow Declarative Pipeline definition is executed using a pipeline defined in resources/{{.project_name}}.pipeline.yml." + "This Delta Live Tables (DLT) definition is executed using a pipeline defined in resources/{{.project_name}}.pipeline.yml." 
] }, { diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/.vscode/settings.json.tmpl b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/.vscode/settings.json.tmpl index 6a87715ae2..f3be9a10ae 100644 --- a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/.vscode/settings.json.tmpl +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/.vscode/settings.json.tmpl @@ -1,20 +1,31 @@ { - "python.analysis.stubPath": ".vscode", - "databricks.python.envFile": "${workspaceFolder}/.env", "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------", "python.testing.pytestArgs": [ "." ], - "python.testing.unittestEnabled": false, - "python.testing.pytestEnabled": true, - {{- /* Unfortunately extraPaths doesn't support globs!! See: https://github.com/microsoft/pylance-release/issues/973 */}} - "python.analysis.extraPaths": ["resources/{{.project_name}}_pipeline"], "files.exclude": { "**/*.egg-info": true, "**/__pycache__": true, ".pytest_cache": true, }, + + // Pylance settings (VS Code) + "python.analysis.extraPaths": ["src", "resources"], + "python.analysis.typeCheckingMode": "basic", + "python.analysis.diagnosticMode": "workspace", + "python.analysis.stubPath": ".vscode", + + // Pyright settings (Cursor) + "cursorpyright.analysis.extraPaths": ["src", "resources"], + "cursorpyright.analysis.typeCheckingMode": "basic", + "cursorpyright.analysis.diagnosticMode": "workspace", + "cursorpyright.analysis.stubPath": ".vscode", + + // General Python settings + "python.defaultInterpreterPath": "./.venv/bin/python", + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true, "[python]": { "editor.defaultFormatter": "ms-python.black-formatter", "editor.formatOnSave": true,