From 748ba5e7ca66382447315ef23963bc1e45b559e0 Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Mon, 4 Sep 2023 16:00:29 +0200 Subject: [PATCH 1/4] Add default_python --- LICENSE | 51 +++++++++++++++ default_python/.gitignore | 9 +++ default_python/.vscode/__builtins__.pyi | 3 + default_python/.vscode/extensions.json | 7 ++ default_python/.vscode/settings.json | 14 ++++ default_python/README.md | 36 ++++++++++ default_python/databricks.yml | 43 ++++++++++++ default_python/fixtures/.gitkeep | 22 +++++++ default_python/pytest.ini | 3 + .../resources/default_python_job.yml | 37 +++++++++++ default_python/scratch/README.md | 4 ++ default_python/scratch/exploration.ipynb | 50 ++++++++++++++ default_python/setup.py | 24 +++++++ default_python/src/default_python/__init__.py | 1 + default_python/src/default_python/main.py | 11 ++++ default_python/src/notebook.ipynb | 65 +++++++++++++++++++ default_python/tests/main_test.py | 5 ++ 17 files changed, 385 insertions(+) create mode 100644 LICENSE create mode 100644 default_python/.gitignore create mode 100644 default_python/.vscode/__builtins__.pyi create mode 100644 default_python/.vscode/extensions.json create mode 100644 default_python/.vscode/settings.json create mode 100644 default_python/README.md create mode 100644 default_python/databricks.yml create mode 100644 default_python/fixtures/.gitkeep create mode 100644 default_python/pytest.ini create mode 100644 default_python/resources/default_python_job.yml create mode 100644 default_python/scratch/README.md create mode 100644 default_python/scratch/exploration.ipynb create mode 100644 default_python/setup.py create mode 100644 default_python/src/default_python/__init__.py create mode 100644 default_python/src/default_python/main.py create mode 100644 default_python/src/notebook.ipynb create mode 100644 default_python/tests/main_test.py diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..42755f45 --- /dev/null +++ b/LICENSE @@ -0,0 +1,51 @@ +DB license + 
+Copyright (2022) Databricks, Inc. + +Definitions. + +Agreement: The agreement between Databricks, Inc., and you governing the use of the Databricks Services, which shall +be, with respect to Databricks, the Databricks Terms of Service located at www.databricks.com/termsofservice, and with +respect to Databricks Community Edition, the Community Edition Terms of Service located at +www.databricks.com/ce-termsofuse, in each case unless you have entered into a separate written agreement with +Databricks governing the use of the applicable Databricks Services. + +Software: The source code and object code to which this license applies. + +Scope of Use. You may not use this Software except in connection with your use of the Databricks Services pursuant to +the Agreement. Your use of the Software must comply at all times with any restrictions applicable to the Databricks +Services, generally, and must be used in accordance with any applicable documentation. You may view, use, copy, +modify, publish, and/or distribute the Software solely for the purposes of using the code within or connecting to the +Databricks Services. If you do not agree to these terms, you may not view, use, copy, modify, publish, and/or +distribute the Software. + +Redistribution. You may redistribute and sublicense the Software so long as all use is in compliance with these terms. 
+In addition: + +You must give any other recipients a copy of this License; +You must cause any modified files to carry prominent notices stating that you changed the files; +You must retain, in the source code form of any derivative works that you distribute, all copyright, patent, +trademark, and attribution notices from the source code form, excluding those notices that do not pertain to any part +of the derivative works; and +If the source code form includes a "NOTICE" text file as part of its distribution, then any derivative works that you +distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those +notices that do not pertain to any part of the derivative works. +You may add your own copyright statement to your modifications and may provide additional license terms and conditions +for use, reproduction, or distribution of your modifications, or for any such derivative works as a whole, provided +your use, reproduction, and distribution of the Software otherwise complies with the conditions stated in this +License. + +Termination. This license terminates automatically upon your breach of these terms or upon the termination of your +Agreement. Additionally, Databricks may terminate this license at any time on notice. Upon termination, you must +permanently delete the Software and all copies thereof. + +DISCLAIMER; LIMITATION OF LIABILITY. + +THE SOFTWARE IS PROVIDED “AS-IS” AND WITH ALL FAULTS. DATABRICKS, ON BEHALF OF ITSELF AND ITS LICENSORS, SPECIFICALLY +DISCLAIMS ALL WARRANTIES RELATING TO THE SOURCE CODE, EXPRESS AND IMPLIED, INCLUDING, WITHOUT LIMITATION, IMPLIED +WARRANTIES, CONDITIONS AND OTHER TERMS OF MERCHANTABILITY, SATISFACTORY QUALITY OR FITNESS FOR A PARTICULAR PURPOSE, +AND NON-INFRINGEMENT. DATABRICKS AND ITS LICENSORS TOTAL AGGREGATE LIABILITY RELATING TO OR ARISING OUT OF YOUR USE OF +OR DATABRICKS’ PROVISIONING OF THE SOURCE CODE SHALL BE LIMITED TO ONE THOUSAND ($1,000) DOLLARS. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF +CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/default_python/.gitignore b/default_python/.gitignore new file mode 100644 index 00000000..aa87f019 --- /dev/null +++ b/default_python/.gitignore @@ -0,0 +1,9 @@ + +.databricks/ +build/ +dist/ +__pycache__/ +*.egg-info +.venv/ +scratch/** +!scratch/README.md diff --git a/default_python/.vscode/__builtins__.pyi b/default_python/.vscode/__builtins__.pyi new file mode 100644 index 00000000..0edd5181 --- /dev/null +++ b/default_python/.vscode/__builtins__.pyi @@ -0,0 +1,3 @@ +# Typings for Pylance in Visual Studio Code +# see https://github.com/microsoft/pyright/blob/main/docs/builtins.md +from databricks.sdk.runtime import * diff --git a/default_python/.vscode/extensions.json b/default_python/.vscode/extensions.json new file mode 100644 index 00000000..5d15eba3 --- /dev/null +++ b/default_python/.vscode/extensions.json @@ -0,0 +1,7 @@ +{ + "recommendations": [ + "databricks.databricks", + "ms-python.vscode-pylance", + "redhat.vscode-yaml" + ] +} diff --git a/default_python/.vscode/settings.json b/default_python/.vscode/settings.json new file mode 100644 index 00000000..16cb2c96 --- /dev/null +++ b/default_python/.vscode/settings.json @@ -0,0 +1,14 @@ +{ + "python.analysis.stubPath": ".vscode", + "databricks.python.envFile": "${workspaceFolder}/.env", + "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", + "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------", + "python.testing.pytestArgs": [ + "." 
+ ], + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true, + "files.exclude": { + "**/*.egg-info": true + }, +} diff --git a/default_python/README.md b/default_python/README.md new file mode 100644 index 00000000..7784f87d --- /dev/null +++ b/default_python/README.md @@ -0,0 +1,36 @@ +# default_python + +The 'default_python' project was generated by using the default-python template. + +## Getting started + +1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html + +2. Authenticate to your Databricks workspace: + ``` + $ databricks configure + ``` + +3. To deploy a development copy of this project, type: + ``` + $ databricks bundle deploy --target dev + ``` + (Note that "dev" is the default target, so the `--target` parameter + is optional here.) + + This deploys everything that's defined for this project. + For example, the default template would deploy a job called + `[dev yourname] default_python-job` to your workspace. + You can find that job by opening your workspace and clicking on **Workflows**. + +4. Similarly, to deploy a production copy, type: + ``` + $ databricks bundle deploy --target prod + ``` + +5. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from + https://docs.databricks.com/dev-tools/vscode-ext.html. + +6. For documentation on the Databricks asset bundles format used + for this project, and for CI/CD configuration, see + https://docs.databricks.com/dev-tools/bundles/index.html. diff --git a/default_python/databricks.yml b/default_python/databricks.yml new file mode 100644 index 00000000..ca87d9af --- /dev/null +++ b/default_python/databricks.yml @@ -0,0 +1,43 @@ +# This is a Databricks asset bundle definition for default_python. +# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. +bundle: + name: default_python + +include: + - resources/*.yml + +targets: + # The 'dev' target, used for development purposes. 
+ # Whenever a developer deploys using 'dev', they get their own copy. + dev: + # We use 'mode: development' to make sure everything deployed to this target gets a prefix + # like '[dev my_user_name]'. Setting this mode also disables any schedules and + # automatic triggers for jobs and enables the 'development' mode for Delta Live Tables pipelines. + mode: development + default: true + workspace: + host: https://e2-dogfood.staging.cloud.databricks.com + + # Optionally, there could be a 'staging' target here. + # (See Databricks docs on CI/CD at https://docs.databricks.com/dev-tools/bundles/index.html.) + # + # staging: + # workspace: + # host: https://e2-dogfood.staging.cloud.databricks.com + + # The 'prod' target, used for production deployment. + prod: + # For production deployments, we only have a single copy, so we override the + # workspace.root_path default of + # /Users/${workspace.current_user.userName}/.bundle/${bundle.target}/${bundle.name} + # to a path that is not specific to the current user. + mode: production + workspace: + host: https://e2-dogfood.staging.cloud.databricks.com + root_path: /Shared/.bundle/prod/${bundle.name} + run_as: + # This runs as lennart.kats@databricks.com in production. Alternatively, + # a service principal could be used here using service_principal_name + # (see Databricks documentation). + user_name: lennart.kats@databricks.com + \ No newline at end of file diff --git a/default_python/fixtures/.gitkeep b/default_python/fixtures/.gitkeep new file mode 100644 index 00000000..09166865 --- /dev/null +++ b/default_python/fixtures/.gitkeep @@ -0,0 +1,22 @@ +# Fixtures + +This folder is reserved for fixtures, such as CSV files. 
+ +Below is an example of how to load fixtures as a data frame: + +``` +import pandas as pd +import os + +def get_absolute_path(*relative_parts): + if 'dbutils' in globals(): + base_dir = os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) # type: ignore + path = os.path.normpath(os.path.join(base_dir, *relative_parts)) + return path if path.startswith("/Workspace") else os.path.join("/Workspace", path) + else: + return os.path.join(*relative_parts) + +csv_file = get_absolute_path("..", "fixtures", "mycsv.csv") +df = pd.read_csv(csv_file) +display(df) +``` diff --git a/default_python/pytest.ini b/default_python/pytest.ini new file mode 100644 index 00000000..80432c22 --- /dev/null +++ b/default_python/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +testpaths = tests +pythonpath = src diff --git a/default_python/resources/default_python_job.yml b/default_python/resources/default_python_job.yml new file mode 100644 index 00000000..5f728629 --- /dev/null +++ b/default_python/resources/default_python_job.yml @@ -0,0 +1,37 @@ +# The main job for default_python +resources: + + jobs: + default_python_job: + name: default_python_job + + schedule: + quartz_cron_expression: '44 37 8 * * ?' 
+ timezone_id: Europe/Amsterdam + email_notifications: + on_failure: + - lennart.kats@databricks.com + tasks: + - task_key: notebook_task + job_cluster_key: job_cluster + notebook_task: + notebook_path: ../src/notebook.ipynb + + - task_key: python_wheel_task + depends_on: + - task_key: notebook_task + job_cluster_key: job_cluster + python_wheel_task: + package_name: default_python + entry_point: main + libraries: + - whl: ../dist/*.whl + + job_clusters: + - job_cluster_key: job_cluster + new_cluster: + spark_version: 13.3.x-scala2.12 + node_type_id: i3.xlarge + autoscale: + min_workers: 1 + max_workers: 4 diff --git a/default_python/scratch/README.md b/default_python/scratch/README.md new file mode 100644 index 00000000..e6cfb81b --- /dev/null +++ b/default_python/scratch/README.md @@ -0,0 +1,4 @@ +# scratch + +This folder is reserved for personal, exploratory notebooks. +By default these are not committed to Git, as 'scratch' is listed in .gitignore. diff --git a/default_python/scratch/exploration.ipynb b/default_python/scratch/exploration.ipynb new file mode 100644 index 00000000..2ee36c3c --- /dev/null +++ b/default_python/scratch/exploration.ipynb @@ -0,0 +1,50 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6bca260b-13d1-448f-8082-30b60a85c9ae", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append('../src')\n", + "from project import main\n", + "\n", + "main.taxis.show(10)" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, + "notebookName": "ipynb-notebook", + "widgets": {} + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + 
"name": "python", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/default_python/setup.py b/default_python/setup.py new file mode 100644 index 00000000..be5d946e --- /dev/null +++ b/default_python/setup.py @@ -0,0 +1,24 @@ +""" +Setup script for default_python. + +This script packages and distributes the associated wheel file(s). +Source code is in ./src/. Run 'python setup.py sdist bdist_wheel' to build. +""" +from setuptools import setup, find_packages + +import sys +sys.path.append('./src') + +import default_python + +setup( + name="default_python", + version=default_python.__version__, + url="https://databricks.com", + author="", + description="my test wheel", + packages=find_packages(where='./src'), + package_dir={'': 'src'}, + entry_points={"entry_points": "main=default_python.main:main"}, + install_requires=["setuptools"], +) diff --git a/default_python/src/default_python/__init__.py b/default_python/src/default_python/__init__.py new file mode 100644 index 00000000..f102a9ca --- /dev/null +++ b/default_python/src/default_python/__init__.py @@ -0,0 +1 @@ +__version__ = "0.0.1" diff --git a/default_python/src/default_python/main.py b/default_python/src/default_python/main.py new file mode 100644 index 00000000..48a80b07 --- /dev/null +++ b/default_python/src/default_python/main.py @@ -0,0 +1,11 @@ +from pyspark.sql import SparkSession + +def get_taxis(): + spark = SparkSession.builder.getOrCreate() + return spark.read.table("samples.nyctaxi.trips") + +def main(): + get_taxis().show(5) + +if __name__ == '__main__': + main() diff --git a/default_python/src/notebook.ipynb b/default_python/src/notebook.ipynb new file mode 100644 index 00000000..a6f026eb --- /dev/null +++ b/default_python/src/notebook.ipynb @@ -0,0 +1,65 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "ee353e42-ff58-4955-9608-12865bd0950e", + 
"showTitle": false, + "title": "" + } + }, + "source": [ + "# Default notebook\n", + "\n", + "This default notebook is executed using Databricks Workflows as defined in resources/_job.yml." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6bca260b-13d1-448f-8082-30b60a85c9ae", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "from default_python import main\n", + "\n", + "main.get_taxis().show(10)\n" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, + "notebookName": "notebook", + "widgets": {} + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/default_python/tests/main_test.py b/default_python/tests/main_test.py new file mode 100644 index 00000000..a05e73c2 --- /dev/null +++ b/default_python/tests/main_test.py @@ -0,0 +1,5 @@ +from default_python import main + +def test_main(): + taxis = main.get_taxis() + assert taxis.count() == 5 From f36e170f15e94257ed2de7d716d3882b35f83c2e Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Mon, 4 Sep 2023 16:03:33 +0200 Subject: [PATCH 2/4] Remove original example --- job_with_notebook_tasks/.gitignore | 1 - job_with_notebook_tasks/databricks.yml | 30 -------------------------- job_with_notebook_tasks/notebook.py | 4 ---- 3 files changed, 35 deletions(-) delete mode 100644 job_with_notebook_tasks/.gitignore delete mode 100644 job_with_notebook_tasks/databricks.yml delete mode 100644 job_with_notebook_tasks/notebook.py diff --git a/job_with_notebook_tasks/.gitignore b/job_with_notebook_tasks/.gitignore deleted file mode 100644 index 
15bcc6dd..00000000 --- a/job_with_notebook_tasks/.gitignore +++ /dev/null @@ -1 +0,0 @@ -.databricks diff --git a/job_with_notebook_tasks/databricks.yml b/job_with_notebook_tasks/databricks.yml deleted file mode 100644 index b89c9c7f..00000000 --- a/job_with_notebook_tasks/databricks.yml +++ /dev/null @@ -1,30 +0,0 @@ -bundle: - name: job_with_notebook_tasks - -workspace: - host: https://e2-dogfood.staging.cloud.databricks.com - -resources: - jobs: - job_with_notebook_tasks: - name: "Demonstrate programmatically retrieving notebook output" - job_clusters: - - job_cluster_key: cluster - new_cluster: - spark_version: 13.3.x-scala2.12 - node_type_id: i3.xlarge - num_workers: 1 - tasks: - - task_key: task1 - job_cluster_key: cluster - notebook_task: - notebook_path: ./notebook.py - base_parameters: - hello: world - - - task_key: task2 - job_cluster_key: cluster - notebook_task: - notebook_path: ./notebook.py - base_parameters: - hello: universe diff --git a/job_with_notebook_tasks/notebook.py b/job_with_notebook_tasks/notebook.py deleted file mode 100644 index 82b99b66..00000000 --- a/job_with_notebook_tasks/notebook.py +++ /dev/null @@ -1,4 +0,0 @@ -# Databricks notebook source - -hello = dbutils.widgets.get("hello") -dbutils.notebook.exit("Widget parameter 'hello' contains: " + hello) From 698446a339537bc982400953997e99dd2ea12cbd Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Mon, 4 Sep 2023 16:06:14 +0200 Subject: [PATCH 3/4] Remove specifics --- default_python/databricks.yml | 8 ++++---- default_python/resources/default_python_job.yml | 5 ++++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/default_python/databricks.yml b/default_python/databricks.yml index ca87d9af..c664cb4b 100644 --- a/default_python/databricks.yml +++ b/default_python/databricks.yml @@ -16,14 +16,14 @@ targets: mode: development default: true workspace: - host: https://e2-dogfood.staging.cloud.databricks.com + host: https://myworkspace.databricks.com # Optionally, there could 
be a 'staging' target here. # (See Databricks docs on CI/CD at https://docs.databricks.com/dev-tools/bundles/index.html.) # # staging: # workspace: - # host: https://e2-dogfood.staging.cloud.databricks.com + # host: https://myworkspace.databricks.com # The 'prod' target, used for production deployment. prod: @@ -33,10 +33,10 @@ targets: # to a path that is not specific to the current user. mode: production workspace: - host: https://e2-dogfood.staging.cloud.databricks.com + host: https://myworkspace.databricks.com root_path: /Shared/.bundle/prod/${bundle.name} run_as: - # This runs as lennart.kats@databricks.com in production. Alternatively, + # This runs as username@company.com in production. Alternatively, # a service principal could be used here using service_principal_name # (see Databricks documentation). user_name: lennart.kats@databricks.com diff --git a/default_python/resources/default_python_job.yml b/default_python/resources/default_python_job.yml index 5f728629..85ba3856 100644 --- a/default_python/resources/default_python_job.yml +++ b/default_python/resources/default_python_job.yml @@ -31,7 +31,10 @@ resources: - job_cluster_key: job_cluster new_cluster: spark_version: 13.3.x-scala2.12 - node_type_id: i3.xlarge + # note_type_id is the cluster node type to use. + # Typical node types on AWS include i3.xlarge; + # Standard_D3_v2 on Azure; + # n1-standard-4 on Google Cloud. 
autoscale: min_workers: 1 max_workers: 4 From 91c0a38f4d7f01a7a36ea6abc9f5220b59fb04a1 Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Tue, 5 Sep 2023 16:11:45 +0200 Subject: [PATCH 4/4] Update to latest template --- default_python/README.md | 3 +- .../resources/default_python_job.yml | 16 +++- .../resources/default_python_pipeline.yml | 12 +++ default_python/scratch/exploration.ipynb | 4 +- default_python/src/dlt_pipeline.ipynb | 96 +++++++++++++++++++ default_python/src/notebook.ipynb | 2 +- default_python/tests/main_test.py | 2 +- 7 files changed, 126 insertions(+), 9 deletions(-) create mode 100644 default_python/resources/default_python_pipeline.yml create mode 100644 default_python/src/dlt_pipeline.ipynb diff --git a/default_python/README.md b/default_python/README.md index 7784f87d..3784730f 100644 --- a/default_python/README.md +++ b/default_python/README.md @@ -29,7 +29,8 @@ The 'default_python' project was generated by using the default-python template. ``` 5. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from - https://docs.databricks.com/dev-tools/vscode-ext.html. + https://docs.databricks.com/dev-tools/vscode-ext.html. Or read the "getting started" documentation for + **Databricks Connect** for instructions on running the included Python code from a different IDE. 6. For documentation on the Databricks asset bundles format used for this project, and for CI/CD configuration, see https://docs.databricks.com/dev-tools/bundles/index.html. diff --git a/default_python/resources/default_python_job.yml b/default_python/resources/default_python_job.yml index 85ba3856..830111fe 100644 --- a/default_python/resources/default_python_job.yml +++ b/default_python/resources/default_python_job.yml @@ -1,6 +1,5 @@ # The main job for default_python resources: - jobs: default_python_job: name: default_python_job @@ -8,18 +7,26 @@ resources: schedule: quartz_cron_expression: '44 37 8 * * ?' 
timezone_id: Europe/Amsterdam + email_notifications: on_failure: - lennart.kats@databricks.com + tasks: - task_key: notebook_task job_cluster_key: job_cluster notebook_task: notebook_path: ../src/notebook.ipynb - - - task_key: python_wheel_task + + - task_key: refresh_pipeline depends_on: - task_key: notebook_task + pipeline_task: + pipeline_id: ${resources.pipelines.default_python_pipeline.id} + + - task_key: main_task + depends_on: + - task_key: refresh_pipeline job_cluster_key: job_cluster python_wheel_task: package_name: default_python @@ -31,10 +38,11 @@ resources: - job_cluster_key: job_cluster new_cluster: spark_version: 13.3.x-scala2.12 - # note_type_id is the cluster node type to use. + # node_type_id is the cluster node type to use. # Typical node types on AWS include i3.xlarge; # Standard_D3_v2 on Azure; # n1-standard-4 on Google Cloud. + node_type_id: i3.xlarge autoscale: min_workers: 1 max_workers: 4 diff --git a/default_python/resources/default_python_pipeline.yml b/default_python/resources/default_python_pipeline.yml new file mode 100644 index 00000000..8617d75d --- /dev/null +++ b/default_python/resources/default_python_pipeline.yml @@ -0,0 +1,12 @@ +# The main pipeline for default_python +resources: + pipelines: + default_python_pipeline: + name: "default_python_pipeline" + target: "default_python_${bundle.environment}" + libraries: + - notebook: + path: ../src/dlt_pipeline.ipynb + + configuration: + "bundle.sourcePath": "/Workspace/${workspace.file_path}/src" diff --git a/default_python/scratch/exploration.ipynb b/default_python/scratch/exploration.ipynb index 2ee36c3c..85c9640e 100644 --- a/default_python/scratch/exploration.ipynb +++ b/default_python/scratch/exploration.ipynb @@ -19,9 +19,9 @@ "source": [ "import sys\n", "sys.path.append('../src')\n", - "from project import main\n", + "from default_python import main\n", "\n", - "main.taxis.show(10)" + "main.get_taxis().show(10)" ] } ], diff --git a/default_python/src/dlt_pipeline.ipynb 
b/default_python/src/dlt_pipeline.ipynb new file mode 100644 index 00000000..2e324d7d --- /dev/null +++ b/default_python/src/dlt_pipeline.ipynb @@ -0,0 +1,96 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "9a626959-61c8-4bba-84d2-2a4ecab1f7ec", + "showTitle": false, + "title": "" + } + }, + "source": [ + "# DLT pipeline\n", + "\n", + "This Delta Live Tables (DLT) definition is executed using a pipeline defined in resources/_pipeline.yml." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "9198e987-5606-403d-9f6d-8f14e6a4017f", + "showTitle": false, + "title": "" + }, + "jupyter": { + "source_hidden": true + } + }, + "outputs": [], + "source": [ + "# Import DLT and make sure 'my_project' is on the Python path\n", + "import dlt\n", + "from pyspark.sql.functions import expr\n", + "from pyspark.sql import SparkSession\n", + "spark = SparkSession.builder.getOrCreate()\n", + "import sys\n", + "try:\n", + " sys.path.append(spark.conf.get(\"bundle.sourcePath\"))\n", + "except:\n", + " pass\n", + "from my_project import main" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "3fc19dba-61fd-4a89-8f8c-24fee63bfb14", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "@dlt.view\n", + "def taxi_raw():\n", + " return main.get_taxis()\n", + "\n", + "@dlt.table\n", + "def filtered_taxis():\n", + " return dlt.read(\"taxi_raw\").filter(expr(\"fare_amount < 30\"))" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, + "notebookName": "dlt_pipeline", + "widgets": {} + }, + "kernelspec": { 
+ "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/default_python/src/notebook.ipynb b/default_python/src/notebook.ipynb index a6f026eb..ffeabad3 100644 --- a/default_python/src/notebook.ipynb +++ b/default_python/src/notebook.ipynb @@ -36,7 +36,7 @@ "source": [ "from default_python import main\n", "\n", - "main.get_taxis().show(10)\n" + "main.get_taxis().show(10)" ] } ], diff --git a/default_python/tests/main_test.py b/default_python/tests/main_test.py index a05e73c2..72c6bf10 100644 --- a/default_python/tests/main_test.py +++ b/default_python/tests/main_test.py @@ -2,4 +2,4 @@ def test_main(): taxis = main.get_taxis() - assert taxis.count() == 5 + assert taxis.count() > 5