diff --git a/NEXT_CHANGELOG.md b/NEXT_CHANGELOG.md index c621b0d3fd..a23db094e9 100644 --- a/NEXT_CHANGELOG.md +++ b/NEXT_CHANGELOG.md @@ -12,6 +12,7 @@ * Upgrade TF provider to 1.88.0 ([#3529](https://github.com/databricks/cli/pull/3529)) ### Bundles +* Update default-python template to make DB Connect work out of the box for unit tests, using uv to install dependencies ([#3254](https://github.com/databricks/cli/pull/3254)) * Add support for `TaskRetryMode` for continuous jobs ([#3529](https://github.com/databricks/cli/pull/3529)) * Add support for specifying database instance as an application resource ([#3529](https://github.com/databricks/cli/pull/3529)) diff --git a/acceptance/bundle/templates/default-python/classic/output/my_default_python/README.md b/acceptance/bundle/templates/default-python/classic/output/my_default_python/README.md index f3b5d153b2..e01be4259d 100644 --- a/acceptance/bundle/templates/default-python/classic/output/my_default_python/README.md +++ b/acceptance/bundle/templates/default-python/classic/output/my_default_python/README.md @@ -2,18 +2,39 @@ The 'my_default_python' project was generated by using the default-python template. +For documentation on the Databricks Asset Bundles format use for this project, +and for CI/CD configuration, see https://docs.databricks.com/aws/en/dev-tools/bundles. + ## Getting started -0. Install UV: https://docs.astral.sh/uv/getting-started/installation/ +Choose how you want to work on this project: + +(a) Directly in your Databricks workspace, see + https://docs.databricks.com/dev-tools/bundles/workspace. + +(b) Locally with an IDE like Cursor or VS Code, see + https://docs.databricks.com/vscode-ext. + +(c) With command line tools, see https://docs.databricks.com/dev-tools/cli/databricks-cli.html + + +Dependencies for this project should be installed using uv: -1. 
Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html +* Make sure you have the UV package manager installed. + It's an alternative to tools like pip: https://docs.astral.sh/uv/getting-started/installation/. +* Run `uv sync --dev` to install the project's dependencies. -2. Authenticate to your Databricks workspace, if you have not done so already: +# Using this project using the CLI + +The Databricks workspace and IDE extensions provide a graphical interface for working +with this project. It's also possible to interact with it directly using the CLI: + +1. Authenticate to your Databricks workspace, if you have not done so already: ``` $ databricks configure ``` -3. To deploy a development copy of this project, type: +2. To deploy a development copy of this project, type: ``` $ databricks bundle deploy --target dev ``` @@ -23,9 +44,9 @@ The 'my_default_python' project was generated by using the default-python templa This deploys everything that's defined for this project. For example, the default template would deploy a job called `[dev yourname] my_default_python_job` to your workspace. - You can find that job by opening your workpace and clicking on **Workflows**. + You can find that job by opening your workpace and clicking on **Jobs & Pipelines**. -4. Similarly, to deploy a production copy, type: +3. Similarly, to deploy a production copy, type: ``` $ databricks bundle deploy --target prod ``` @@ -35,17 +56,12 @@ The 'my_default_python' project was generated by using the default-python templa is paused when deploying in development mode (see https://docs.databricks.com/dev-tools/bundles/deployment-modes.html). -5. To run a job or pipeline, use the "run" command: +4. To run a job or pipeline, use the "run" command: ``` $ databricks bundle run ``` -6. Optionally, install the Databricks extension for Visual Studio code for local development from - https://docs.databricks.com/dev-tools/vscode-ext.html. 
It can configure your - virtual environment and setup Databricks Connect for running unit tests locally. - When not using these tools, consult your development environment's documentation - and/or the documentation for Databricks Connect for manually setting up your environment - (https://docs.databricks.com/en/dev-tools/databricks-connect/python/index.html). - -7. For documentation on the Databricks asset bundles format used - for this project, and for CI/CD configuration, see - https://docs.databricks.com/dev-tools/bundles/index.html. + +5. Finally, to run tests locally, use `pytest`: + ``` + $ uv run pytest + ``` diff --git a/acceptance/bundle/templates/default-python/classic/output/my_default_python/pyproject.toml b/acceptance/bundle/templates/default-python/classic/output/my_default_python/pyproject.toml index 5049f8a3ea..ef43b9429f 100644 --- a/acceptance/bundle/templates/default-python/classic/output/my_default_python/pyproject.toml +++ b/acceptance/bundle/templates/default-python/classic/output/my_default_python/pyproject.toml @@ -2,9 +2,9 @@ name = "my_default_python" version = "0.0.1" authors = [{ name = "[USERNAME]" }] -requires-python = ">= 3.11" +requires-python = ">=3.10,<=3.13" -[project.optional-dependencies] +[dependency-groups] dev = [ "pytest", @@ -12,16 +12,10 @@ dev = [ "databricks-dlt", # databricks-connect can be used to run parts of this project locally. - # See https://docs.databricks.com/dev-tools/databricks-connect.html. - # - # Note, databricks-connect is automatically installed if you're using Databricks - # extension for Visual Studio Code - # (https://docs.databricks.com/dev-tools/vscode-ext/dev-tasks/databricks-connect.html). - # - # To manually install databricks-connect, uncomment the line below to install a version - # of db-connect that corresponds to the Databricks Runtime version used for this project. 
- # See https://docs.databricks.com/dev-tools/databricks-connect.html - # "databricks-connect>=15.4,<15.5", + # Note that for local development, you should use a version that is not newer + # than the remote cluster or serverless compute you connect to. + # See also https://docs.databricks.com/dev-tools/databricks-connect.html. + "databricks-connect>=15.4,<15.5", ] [tool.pytest.ini_options] diff --git a/acceptance/bundle/templates/default-python/classic/output/my_default_python/scratch/exploration.ipynb b/acceptance/bundle/templates/default-python/classic/output/my_default_python/scratch/exploration.ipynb index a12773d4e8..3f589fed74 100644 --- a/acceptance/bundle/templates/default-python/classic/output/my_default_python/scratch/exploration.ipynb +++ b/acceptance/bundle/templates/default-python/classic/output/my_default_python/scratch/exploration.ipynb @@ -32,7 +32,7 @@ "sys.path.append(\"../src\")\n", "from my_default_python import main\n", "\n", - "main.get_taxis(spark).show(10)" + "main.get_taxis().show(10)" ] } ], diff --git a/acceptance/bundle/templates/default-python/classic/output/my_default_python/src/my_default_python/main.py b/acceptance/bundle/templates/default-python/classic/output/my_default_python/src/my_default_python/main.py index 5ae344c7e2..04e8be4de0 100644 --- a/acceptance/bundle/templates/default-python/classic/output/my_default_python/src/my_default_python/main.py +++ b/acceptance/bundle/templates/default-python/classic/output/my_default_python/src/my_default_python/main.py @@ -1,24 +1,13 @@ -from pyspark.sql import SparkSession, DataFrame +from databricks.sdk.runtime import spark +from pyspark.sql import DataFrame -def get_taxis(spark: SparkSession) -> DataFrame: +def find_all_taxis() -> DataFrame: return spark.read.table("samples.nyctaxi.trips") -# Create a new Databricks Connect session. If this fails, -# check that you have configured Databricks Connect correctly. -# See https://docs.databricks.com/dev-tools/databricks-connect.html. 
-def get_spark() -> SparkSession: - try: - from databricks.connect import DatabricksSession - - return DatabricksSession.builder.getOrCreate() - except ImportError: - return SparkSession.builder.getOrCreate() - - def main(): - get_taxis(get_spark()).show(5) + find_all_taxis().show(5) if __name__ == "__main__": diff --git a/acceptance/bundle/templates/default-python/classic/output/my_default_python/src/notebook.ipynb b/acceptance/bundle/templates/default-python/classic/output/my_default_python/src/notebook.ipynb index 472ccb2190..27c3f19e34 100644 --- a/acceptance/bundle/templates/default-python/classic/output/my_default_python/src/notebook.ipynb +++ b/acceptance/bundle/templates/default-python/classic/output/my_default_python/src/notebook.ipynb @@ -46,7 +46,7 @@ "source": [ "from my_default_python import main\n", "\n", - "main.get_taxis(spark).show(10)" + "main.find_all_taxis().show(10)" ] } ], diff --git a/acceptance/bundle/templates/default-python/classic/output/my_default_python/src/pipeline.ipynb b/acceptance/bundle/templates/default-python/classic/output/my_default_python/src/pipeline.ipynb index 53148beff1..21e8560105 100644 --- a/acceptance/bundle/templates/default-python/classic/output/my_default_python/src/pipeline.ipynb +++ b/acceptance/bundle/templates/default-python/classic/output/my_default_python/src/pipeline.ipynb @@ -56,7 +56,7 @@ "source": [ "@dlt.view\n", "def taxi_raw():\n", - " return main.get_taxis(spark)\n", + " return main.find_all_taxis()\n", "\n", "\n", "@dlt.table\n", diff --git a/acceptance/bundle/templates/default-python/classic/output/my_default_python/tests/conftest.py b/acceptance/bundle/templates/default-python/classic/output/my_default_python/tests/conftest.py new file mode 100644 index 0000000000..579e398618 --- /dev/null +++ b/acceptance/bundle/templates/default-python/classic/output/my_default_python/tests/conftest.py @@ -0,0 +1,57 @@ +"""This file configures pytest.""" + +import os, sys, pathlib +from contextlib import 
contextmanager + + +try: + from databricks.connect import DatabricksSession + from databricks.sdk import WorkspaceClient + from pyspark.sql import SparkSession + import pytest +except ImportError: + raise ImportError("Test dependencies not found.\n\nRun tests using 'uv run pytest'. See http://docs.astral.sh/uv to learn more about uv.") + + +def enable_fallback_compute(): + """Enable serverless compute if no compute is specified.""" + conf = WorkspaceClient().config + if conf.serverless_compute_id or conf.cluster_id or os.environ.get("SPARK_REMOTE"): + return + + url = "https://docs.databricks.com/dev-tools/databricks-connect/cluster-config" + print("☁️ no compute specified, falling back to serverless compute", file=sys.stderr) + print(f" see {url} for manual configuration", file=sys.stderr) + + os.environ["DATABRICKS_SERVERLESS_COMPUTE_ID"] = "auto" + + +@contextmanager +def allow_stderr_output(config: pytest.Config): + """Temporarily disable pytest output capture.""" + capman = config.pluginmanager.get_plugin("capturemanager") + if capman: + with capman.global_and_fixture_disabled(): + yield + else: + yield + + +def pytest_configure(config: pytest.Config): + """Configure pytest session.""" + with allow_stderr_output(config): + enable_fallback_compute() + + # Initialize Spark session eagerly, so it is available even when + # SparkSession.builder.getOrCreate() is used. For DB Connect 15+, + # we validate version compatibility with the remote cluster. 
+ if hasattr(DatabricksSession.builder, "validateSession"): + DatabricksSession.builder.validateSession().getOrCreate() + else: + DatabricksSession.builder.getOrCreate() + + +@pytest.fixture(scope="session") +def spark() -> SparkSession: + """Provide a SparkSession fixture for tests.""" + return DatabricksSession.builder.getOrCreate() diff --git a/acceptance/bundle/templates/default-python/classic/output/my_default_python/tests/main_test.py b/acceptance/bundle/templates/default-python/classic/output/my_default_python/tests/main_test.py index dc449154a6..4bfd5e1550 100644 --- a/acceptance/bundle/templates/default-python/classic/output/my_default_python/tests/main_test.py +++ b/acceptance/bundle/templates/default-python/classic/output/my_default_python/tests/main_test.py @@ -1,6 +1,6 @@ -from my_default_python.main import get_taxis, get_spark +from my_default_python import main -def test_main(): - taxis = get_taxis(get_spark()) +def test_find_all_taxis(): + taxis = main.find_all_taxis() assert taxis.count() > 5 diff --git a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/README.md b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/README.md index f3b5d153b2..e01be4259d 100644 --- a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/README.md +++ b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/README.md @@ -2,18 +2,39 @@ The 'my_default_python' project was generated by using the default-python template. +For documentation on the Databricks Asset Bundles format use for this project, +and for CI/CD configuration, see https://docs.databricks.com/aws/en/dev-tools/bundles. + ## Getting started -0. Install UV: https://docs.astral.sh/uv/getting-started/installation/ +Choose how you want to work on this project: + +(a) Directly in your Databricks workspace, see + https://docs.databricks.com/dev-tools/bundles/workspace. 
+ +(b) Locally with an IDE like Cursor or VS Code, see + https://docs.databricks.com/vscode-ext. + +(c) With command line tools, see https://docs.databricks.com/dev-tools/cli/databricks-cli.html + + +Dependencies for this project should be installed using uv: -1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html +* Make sure you have the UV package manager installed. + It's an alternative to tools like pip: https://docs.astral.sh/uv/getting-started/installation/. +* Run `uv sync --dev` to install the project's dependencies. -2. Authenticate to your Databricks workspace, if you have not done so already: +# Using this project using the CLI + +The Databricks workspace and IDE extensions provide a graphical interface for working +with this project. It's also possible to interact with it directly using the CLI: + +1. Authenticate to your Databricks workspace, if you have not done so already: ``` $ databricks configure ``` -3. To deploy a development copy of this project, type: +2. To deploy a development copy of this project, type: ``` $ databricks bundle deploy --target dev ``` @@ -23,9 +44,9 @@ The 'my_default_python' project was generated by using the default-python templa This deploys everything that's defined for this project. For example, the default template would deploy a job called `[dev yourname] my_default_python_job` to your workspace. - You can find that job by opening your workpace and clicking on **Workflows**. + You can find that job by opening your workpace and clicking on **Jobs & Pipelines**. -4. Similarly, to deploy a production copy, type: +3. Similarly, to deploy a production copy, type: ``` $ databricks bundle deploy --target prod ``` @@ -35,17 +56,12 @@ The 'my_default_python' project was generated by using the default-python templa is paused when deploying in development mode (see https://docs.databricks.com/dev-tools/bundles/deployment-modes.html). -5. 
To run a job or pipeline, use the "run" command: +4. To run a job or pipeline, use the "run" command: ``` $ databricks bundle run ``` -6. Optionally, install the Databricks extension for Visual Studio code for local development from - https://docs.databricks.com/dev-tools/vscode-ext.html. It can configure your - virtual environment and setup Databricks Connect for running unit tests locally. - When not using these tools, consult your development environment's documentation - and/or the documentation for Databricks Connect for manually setting up your environment - (https://docs.databricks.com/en/dev-tools/databricks-connect/python/index.html). - -7. For documentation on the Databricks asset bundles format used - for this project, and for CI/CD configuration, see - https://docs.databricks.com/dev-tools/bundles/index.html. + +5. Finally, to run tests locally, use `pytest`: + ``` + $ uv run pytest + ``` diff --git a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/pyproject.toml b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/pyproject.toml index 5049f8a3ea..ef43b9429f 100644 --- a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/pyproject.toml +++ b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/pyproject.toml @@ -2,9 +2,9 @@ name = "my_default_python" version = "0.0.1" authors = [{ name = "[USERNAME]" }] -requires-python = ">= 3.11" +requires-python = ">=3.10,<=3.13" -[project.optional-dependencies] +[dependency-groups] dev = [ "pytest", @@ -12,16 +12,10 @@ dev = [ "databricks-dlt", # databricks-connect can be used to run parts of this project locally. - # See https://docs.databricks.com/dev-tools/databricks-connect.html. - # - # Note, databricks-connect is automatically installed if you're using Databricks - # extension for Visual Studio Code - # (https://docs.databricks.com/dev-tools/vscode-ext/dev-tasks/databricks-connect.html). 
- # - # To manually install databricks-connect, uncomment the line below to install a version - # of db-connect that corresponds to the Databricks Runtime version used for this project. - # See https://docs.databricks.com/dev-tools/databricks-connect.html - # "databricks-connect>=15.4,<15.5", + # Note that for local development, you should use a version that is not newer + # than the remote cluster or serverless compute you connect to. + # See also https://docs.databricks.com/dev-tools/databricks-connect.html. + "databricks-connect>=15.4,<15.5", ] [tool.pytest.ini_options] diff --git a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/scratch/exploration.ipynb b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/scratch/exploration.ipynb index a12773d4e8..3f589fed74 100644 --- a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/scratch/exploration.ipynb +++ b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/scratch/exploration.ipynb @@ -32,7 +32,7 @@ "sys.path.append(\"../src\")\n", "from my_default_python import main\n", "\n", - "main.get_taxis(spark).show(10)" + "main.get_taxis().show(10)" ] } ], diff --git a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/my_default_python/main.py b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/my_default_python/main.py index 5ae344c7e2..04e8be4de0 100644 --- a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/my_default_python/main.py +++ b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/my_default_python/main.py @@ -1,24 +1,13 @@ -from pyspark.sql import SparkSession, DataFrame +from databricks.sdk.runtime import spark +from pyspark.sql import DataFrame -def get_taxis(spark: SparkSession) -> DataFrame: +def find_all_taxis() -> DataFrame: return 
spark.read.table("samples.nyctaxi.trips") -# Create a new Databricks Connect session. If this fails, -# check that you have configured Databricks Connect correctly. -# See https://docs.databricks.com/dev-tools/databricks-connect.html. -def get_spark() -> SparkSession: - try: - from databricks.connect import DatabricksSession - - return DatabricksSession.builder.getOrCreate() - except ImportError: - return SparkSession.builder.getOrCreate() - - def main(): - get_taxis(get_spark()).show(5) + find_all_taxis().show(5) if __name__ == "__main__": diff --git a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/notebook.ipynb b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/notebook.ipynb index 472ccb2190..27c3f19e34 100644 --- a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/notebook.ipynb +++ b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/notebook.ipynb @@ -46,7 +46,7 @@ "source": [ "from my_default_python import main\n", "\n", - "main.get_taxis(spark).show(10)" + "main.find_all_taxis().show(10)" ] } ], diff --git a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/pipeline.ipynb b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/pipeline.ipynb index 53148beff1..21e8560105 100644 --- a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/pipeline.ipynb +++ b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/src/pipeline.ipynb @@ -56,7 +56,7 @@ "source": [ "@dlt.view\n", "def taxi_raw():\n", - " return main.get_taxis(spark)\n", + " return main.find_all_taxis()\n", "\n", "\n", "@dlt.table\n", diff --git a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/tests/conftest.py b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/tests/conftest.py new file 
mode 100644 index 0000000000..579e398618 --- /dev/null +++ b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/tests/conftest.py @@ -0,0 +1,57 @@ +"""This file configures pytest.""" + +import os, sys, pathlib +from contextlib import contextmanager + + +try: + from databricks.connect import DatabricksSession + from databricks.sdk import WorkspaceClient + from pyspark.sql import SparkSession + import pytest +except ImportError: + raise ImportError("Test dependencies not found.\n\nRun tests using 'uv run pytest'. See http://docs.astral.sh/uv to learn more about uv.") + + +def enable_fallback_compute(): + """Enable serverless compute if no compute is specified.""" + conf = WorkspaceClient().config + if conf.serverless_compute_id or conf.cluster_id or os.environ.get("SPARK_REMOTE"): + return + + url = "https://docs.databricks.com/dev-tools/databricks-connect/cluster-config" + print("☁️ no compute specified, falling back to serverless compute", file=sys.stderr) + print(f" see {url} for manual configuration", file=sys.stderr) + + os.environ["DATABRICKS_SERVERLESS_COMPUTE_ID"] = "auto" + + +@contextmanager +def allow_stderr_output(config: pytest.Config): + """Temporarily disable pytest output capture.""" + capman = config.pluginmanager.get_plugin("capturemanager") + if capman: + with capman.global_and_fixture_disabled(): + yield + else: + yield + + +def pytest_configure(config: pytest.Config): + """Configure pytest session.""" + with allow_stderr_output(config): + enable_fallback_compute() + + # Initialize Spark session eagerly, so it is available even when + # SparkSession.builder.getOrCreate() is used. For DB Connect 15+, + # we validate version compatibility with the remote cluster. 
+ if hasattr(DatabricksSession.builder, "validateSession"): + DatabricksSession.builder.validateSession().getOrCreate() + else: + DatabricksSession.builder.getOrCreate() + + +@pytest.fixture(scope="session") +def spark() -> SparkSession: + """Provide a SparkSession fixture for tests.""" + return DatabricksSession.builder.getOrCreate() diff --git a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/tests/main_test.py b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/tests/main_test.py index dc449154a6..4bfd5e1550 100644 --- a/acceptance/bundle/templates/default-python/serverless/output/my_default_python/tests/main_test.py +++ b/acceptance/bundle/templates/default-python/serverless/output/my_default_python/tests/main_test.py @@ -1,6 +1,6 @@ -from my_default_python.main import get_taxis, get_spark +from my_default_python import main -def test_main(): - taxis = get_taxis(get_spark()) +def test_find_all_taxis(): + taxis = main.find_all_taxis() assert taxis.count() > 5 diff --git a/libs/template/templates/default-python/library/versions.tmpl b/libs/template/templates/default-python/library/versions.tmpl index 7d0c88e7df..79c3955c99 100644 --- a/libs/template/templates/default-python/library/versions.tmpl +++ b/libs/template/templates/default-python/library/versions.tmpl @@ -1,7 +1,24 @@ +{{/* The latest LTS DBR version; this should be updated a few months after each LTS. + */}} {{define "latest_lts_dbr_version" -}} 15.4.x-scala2.12 {{- end}} -{{define "latest_lts_db_connect_version_spec" -}} +{{/* A safe version of DB Connect that is compatible with at least half the + * clusters running in production. + * + * We need to be very conservative in updating this, since a newer version can + * only connect to compute of that same version and higher. If the version is + * deemed too old, customers can update the version themselves after initializing + * the template. 
+ */}} +{{define "conservative_db_connect_version_spec" -}} >=15.4,<15.5 {{- end}} + +{{/* DB Connect 15 only supports versions up to 3.13 because of a limitation in + * pyarrow: https://arrow.apache.org/docs/python/install.html#python-compatibility + */}} +{{define "conservative_db_connect_python_version_spec" -}} + >=3.10,<=3.13 +{{- end}} diff --git a/libs/template/templates/default-python/template/{{.project_name}}/README.md.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/README.md.tmpl index bc4fe07b54..02da531477 100644 --- a/libs/template/templates/default-python/template/{{.project_name}}/README.md.tmpl +++ b/libs/template/templates/default-python/template/{{.project_name}}/README.md.tmpl @@ -2,18 +2,40 @@ The '{{.project_name}}' project was generated by using the default-python template. +For documentation on the Databricks Asset Bundles format use for this project, +and for CI/CD configuration, see https://docs.databricks.com/aws/en/dev-tools/bundles. + ## Getting started -0. Install UV: https://docs.astral.sh/uv/getting-started/installation/ +Choose how you want to work on this project: + +(a) Directly in your Databricks workspace, see + https://docs.databricks.com/dev-tools/bundles/workspace. + +(b) Locally with an IDE like Cursor or VS Code, see + https://docs.databricks.com/vscode-ext. + +(c) With command line tools, see https://docs.databricks.com/dev-tools/cli/databricks-cli.html + +{{if (eq .include_python "yes") }} +Dependencies for this project should be installed using uv: + +* Make sure you have the UV package manager installed. + It's an alternative to tools like pip: https://docs.astral.sh/uv/getting-started/installation/. +* Run `uv sync --dev` to install the project's dependencies. +{{- end}} + +# Using this project using the CLI -1. 
Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html +The Databricks workspace and IDE extensions provide a graphical interface for working +with this project. It's also possible to interact with it directly using the CLI: -2. Authenticate to your Databricks workspace, if you have not done so already: +1. Authenticate to your Databricks workspace, if you have not done so already: ``` $ databricks configure ``` -3. To deploy a development copy of this project, type: +2. To deploy a development copy of this project, type: ``` $ databricks bundle deploy --target dev ``` @@ -23,9 +45,9 @@ The '{{.project_name}}' project was generated by using the default-python templa This deploys everything that's defined for this project. For example, the default template would deploy a job called `[dev yourname] {{.project_name}}_job` to your workspace. - You can find that job by opening your workpace and clicking on **Workflows**. + You can find that job by opening your workpace and clicking on **Jobs & Pipelines**. -4. Similarly, to deploy a production copy, type: +3. Similarly, to deploy a production copy, type: ``` $ databricks bundle deploy --target prod ``` @@ -35,23 +57,12 @@ The '{{.project_name}}' project was generated by using the default-python templa is paused when deploying in development mode (see https://docs.databricks.com/dev-tools/bundles/deployment-modes.html). -5. To run a job or pipeline, use the "run" command: +4. To run a job or pipeline, use the "run" command: ``` $ databricks bundle run ``` -{{- if (eq .include_python "no") }} -6. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from - https://docs.databricks.com/dev-tools/vscode-ext.html. -{{- else }} -6. Optionally, install the Databricks extension for Visual Studio code for local development from - https://docs.databricks.com/dev-tools/vscode-ext.html. 
It can configure your - virtual environment and setup Databricks Connect for running unit tests locally. - When not using these tools, consult your development environment's documentation - and/or the documentation for Databricks Connect for manually setting up your environment - (https://docs.databricks.com/en/dev-tools/databricks-connect/python/index.html). -{{- end}} - -7. For documentation on the Databricks asset bundles format used - for this project, and for CI/CD configuration, see - https://docs.databricks.com/dev-tools/bundles/index.html. +5. Finally, to run tests locally, use `pytest`: + ``` + $ uv run pytest + ``` diff --git a/libs/template/templates/default-python/template/{{.project_name}}/pyproject.toml.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/pyproject.toml.tmpl index 7dd4694ea9..3d83de0077 100644 --- a/libs/template/templates/default-python/template/{{.project_name}}/pyproject.toml.tmpl +++ b/libs/template/templates/default-python/template/{{.project_name}}/pyproject.toml.tmpl @@ -2,9 +2,9 @@ name = "{{.project_name}}" version = "0.0.1" authors = [{ name = "{{user_name}}" }] -requires-python = ">= 3.11" +requires-python = "{{template "conservative_db_connect_python_version_spec"}}" -[project.optional-dependencies] +[dependency-groups] dev = [ "pytest", @@ -12,16 +12,10 @@ dev = [ "databricks-dlt", # databricks-connect can be used to run parts of this project locally. - # See https://docs.databricks.com/dev-tools/databricks-connect.html. - # - # Note, databricks-connect is automatically installed if you're using Databricks - # extension for Visual Studio Code - # (https://docs.databricks.com/dev-tools/vscode-ext/dev-tasks/databricks-connect.html). - # - # To manually install databricks-connect, uncomment the line below to install a version - # of db-connect that corresponds to the Databricks Runtime version used for this project. 
- # See https://docs.databricks.com/dev-tools/databricks-connect.html - # "databricks-connect{{template "latest_lts_db_connect_version_spec"}}", + # Note that for local development, you should use a version that is not newer + # than the remote cluster or serverless compute you connect to. + # See also https://docs.databricks.com/dev-tools/databricks-connect.html. + "databricks-connect{{template "conservative_db_connect_version_spec"}}", ] [tool.pytest.ini_options] diff --git a/libs/template/templates/default-python/template/{{.project_name}}/scratch/exploration.ipynb.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/scratch/exploration.ipynb.tmpl index d3e9beef37..d5c05798ac 100644 --- a/libs/template/templates/default-python/template/{{.project_name}}/scratch/exploration.ipynb.tmpl +++ b/libs/template/templates/default-python/template/{{.project_name}}/scratch/exploration.ipynb.tmpl @@ -33,9 +33,9 @@ "sys.path.append(\"../src\")\n", "from {{.project_name}} import main\n", "\n", - "main.get_taxis(spark).show(10)" + "main.get_taxis().show(10)" {{else}} - "spark.range(10)" + "spark.read.table(\"samples.nyctaxi.trips\")" {{end -}} ] } diff --git a/libs/template/templates/default-python/template/{{.project_name}}/src/notebook.ipynb.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/src/notebook.ipynb.tmpl index 6782a053ba..53cb3040c6 100644 --- a/libs/template/templates/default-python/template/{{.project_name}}/src/notebook.ipynb.tmpl +++ b/libs/template/templates/default-python/template/{{.project_name}}/src/notebook.ipynb.tmpl @@ -47,9 +47,9 @@ {{- if (eq .include_python "yes") }} "from {{.project_name}} import main\n", "\n", - "main.get_taxis(spark).show(10)" + "main.find_all_taxis().show(10)" {{else}} - "spark.range(10)" + "display(spark.read.table(\"samples.nyctaxi.trips\"))" {{end -}} ] } diff --git a/libs/template/templates/default-python/template/{{.project_name}}/src/pipeline.ipynb.tmpl 
b/libs/template/templates/default-python/template/{{.project_name}}/src/pipeline.ipynb.tmpl index a8e35c53be..342fafcf6f 100644 --- a/libs/template/templates/default-python/template/{{.project_name}}/src/pipeline.ipynb.tmpl +++ b/libs/template/templates/default-python/template/{{.project_name}}/src/pipeline.ipynb.tmpl @@ -64,7 +64,7 @@ {{- if (eq .include_python "yes") }} "@dlt.view\n", "def taxi_raw():\n", - " return main.get_taxis(spark)\n", + " return main.find_all_taxis()\n", {{else}} "\n", "@dlt.view\n", diff --git a/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name}}/main.py.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name}}/main.py.tmpl index 5ae344c7e2..04e8be4de0 100644 --- a/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name}}/main.py.tmpl +++ b/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name}}/main.py.tmpl @@ -1,24 +1,13 @@ -from pyspark.sql import SparkSession, DataFrame +from databricks.sdk.runtime import spark +from pyspark.sql import DataFrame -def get_taxis(spark: SparkSession) -> DataFrame: +def find_all_taxis() -> DataFrame: return spark.read.table("samples.nyctaxi.trips") -# Create a new Databricks Connect session. If this fails, -# check that you have configured Databricks Connect correctly. -# See https://docs.databricks.com/dev-tools/databricks-connect.html. 
-def get_spark() -> SparkSession: - try: - from databricks.connect import DatabricksSession - - return DatabricksSession.builder.getOrCreate() - except ImportError: - return SparkSession.builder.getOrCreate() - - def main(): - get_taxis(get_spark()).show(5) + find_all_taxis().show(5) if __name__ == "__main__": diff --git a/libs/template/templates/default-python/template/{{.project_name}}/tests/conftest.py.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/tests/conftest.py.tmpl new file mode 100644 index 0000000000..579e398618 --- /dev/null +++ b/libs/template/templates/default-python/template/{{.project_name}}/tests/conftest.py.tmpl @@ -0,0 +1,57 @@ +"""This file configures pytest.""" + +import os, sys, pathlib +from contextlib import contextmanager + + +try: + from databricks.connect import DatabricksSession + from databricks.sdk import WorkspaceClient + from pyspark.sql import SparkSession + import pytest +except ImportError: + raise ImportError("Test dependencies not found.\n\nRun tests using 'uv run pytest'. 
See https://docs.astral.sh/uv to learn more about uv.")
+
+
+def enable_fallback_compute():
+    """Enable serverless compute if no compute is specified."""
+    conf = WorkspaceClient().config
+    if conf.serverless_compute_id or conf.cluster_id or os.environ.get("SPARK_REMOTE"):
+        return
+
+    url = "https://docs.databricks.com/dev-tools/databricks-connect/cluster-config"
+    print("☁️ no compute specified, falling back to serverless compute", file=sys.stderr)
+    print(f"   see {url} for manual configuration", file=sys.stderr)
+
+    os.environ["DATABRICKS_SERVERLESS_COMPUTE_ID"] = "auto"
+
+
+@contextmanager
+def allow_stderr_output(config: pytest.Config):
+    """Temporarily disable pytest output capture."""
+    capman = config.pluginmanager.get_plugin("capturemanager")
+    if capman:
+        with capman.global_and_fixture_disabled():
+            yield
+    else:
+        yield
+
+
+def pytest_configure(config: pytest.Config):
+    """Configure pytest session."""
+    with allow_stderr_output(config):
+        enable_fallback_compute()
+
+        # Initialize Spark session eagerly, so it is available even when
+        # SparkSession.builder.getOrCreate() is used. For DB Connect 15+,
+        # we validate version compatibility with the remote cluster.
+ if hasattr(DatabricksSession.builder, "validateSession"): + DatabricksSession.builder.validateSession().getOrCreate() + else: + DatabricksSession.builder.getOrCreate() + + +@pytest.fixture(scope="session") +def spark() -> SparkSession: + """Provide a SparkSession fixture for tests.""" + return DatabricksSession.builder.getOrCreate() diff --git a/libs/template/templates/default-python/template/{{.project_name}}/tests/main_test.py.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/tests/main_test.py.tmpl index fea2f3f665..084454eb3e 100644 --- a/libs/template/templates/default-python/template/{{.project_name}}/tests/main_test.py.tmpl +++ b/libs/template/templates/default-python/template/{{.project_name}}/tests/main_test.py.tmpl @@ -1,6 +1,6 @@ -from {{.project_name}}.main import get_taxis, get_spark +from {{.project_name}} import main -def test_main(): - taxis = get_taxis(get_spark()) +def test_find_all_taxis(): + taxis = main.find_all_taxis() assert taxis.count() > 5