diff --git a/acceptance/pipelines/e2e/output.txt b/acceptance/pipelines/e2e/output.txt index 792e046f42..c5b7a8a46a 100644 --- a/acceptance/pipelines/e2e/output.txt +++ b/acceptance/pipelines/e2e/output.txt @@ -2,21 +2,26 @@ === E2E Test: Complete pipeline lifecycle (init, deploy, run, stop, destroy) === Initialize pipeline project >>> [PIPELINES] init --output-dir output +Welcome to the template for Lakeflow Declarative Pipelines! -Welcome to the template for pipelines! +Please answer the below to tailor your project to your preferences. +You can always change your mind and change your configuration in the databricks.yml file later. +Note that [DATABRICKS_URL] is used for initialization +(see https://docs.databricks.com/dev-tools/cli/profiles.html for how to change your profile). -Your new project has been created in the 'my_project' directory! +✨ Your new project has been created in the 'lakeflow_project' directory! -Refer to the README.md file for "getting started" instructions! +Please refer to the README.md file for "getting started" instructions. === Deploy pipeline >>> [PIPELINES] deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/my_project/dev/files... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/lakeflow_project/dev/files... Deploying resources... Updating deployment state... Deployment complete! -View your pipeline my_project_pipeline here: [DATABRICKS_URL]/pipelines/[UUID]?o=[NUMID] +View your job sample_job here: [DATABRICKS_URL]/jobs/[NUMID]?o=[NUMID] +View your pipeline lakeflow_project_etl here: [DATABRICKS_URL]/pipelines/[UUID]?o=[NUMID] === Run pipeline >>> [PIPELINES] run @@ -31,31 +36,32 @@ Pipeline configurations for this update: === Edit project by creating and running a new second pipeline >>> [PIPELINES] deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/my_project/dev/files... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/lakeflow_project/dev/files... Deploying resources... Updating deployment state... Deployment complete! 
-View your pipeline my_project_pipeline here: [DATABRICKS_URL]/pipelines/[UUID]?o=[NUMID] -View your pipeline my_project_pipeline_2 here: [DATABRICKS_URL]/pipelines/[UUID]?o=[NUMID] +View your job sample_job here: [DATABRICKS_URL]/jobs/[NUMID]?o=[NUMID] +View your pipeline lakeflow_project_etl here: [DATABRICKS_URL]/pipelines/[UUID]?o=[NUMID] +View your pipeline lakeflow_project_etl_2 here: [DATABRICKS_URL]/pipelines/[UUID]?o=[NUMID] === Assert the second pipeline is created >>> [CLI] pipelines get [UUID] { "creator_user_name":"[USERNAME]", "last_modified":[UNIX_TIME_MILLIS], - "name":"[dev [USERNAME]] my_project_pipeline_2", + "name":"[dev [USERNAME]] lakeflow_project_etl_2", "pipeline_id":"[UUID]", "run_as_user_name":"[USERNAME]", "spec": { "channel":"CURRENT", "deployment": { "kind":"BUNDLE", - "metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/my_project/dev/state/metadata.json" + "metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/lakeflow_project/dev/state/metadata.json" }, "development":true, "edition":"ADVANCED", "id":"[UUID]", - "name":"[dev [USERNAME]] my_project_pipeline_2", + "name":"[dev [USERNAME]] lakeflow_project_etl_2", "storage":"dbfs:/pipelines/[UUID]", "tags": { "dev":"[USERNAME]" @@ -64,7 +70,7 @@ View your pipeline my_project_pipeline_2 here: [DATABRICKS_URL]/pipelines/[UUID] "state":"IDLE" } ->>> [PIPELINES] run my_project_pipeline_2 +>>> [PIPELINES] run lakeflow_project_etl_2 Update URL: [DATABRICKS_URL]/#joblist/pipelines/[UUID]/updates/[UUID] Update ID: [UUID] @@ -75,26 +81,27 @@ Pipeline configurations for this update: • All tables are refreshed === Stop both pipelines before destroy ->>> [PIPELINES] stop my_project_pipeline -Stopping my_project_pipeline... -my_project_pipeline has been stopped. +>>> [PIPELINES] stop lakeflow_project_etl +Stopping lakeflow_project_etl... +lakeflow_project_etl has been stopped. ->>> [PIPELINES] stop my_project_pipeline_2 -Stopping my_project_pipeline_2... -my_project_pipeline_2 has been stopped. +>>> [PIPELINES] stop lakeflow_project_etl_2 +Stopping lakeflow_project_etl_2... +lakeflow_project_etl_2 has been stopped. === Destroy project >>> [PIPELINES] destroy --auto-approve The following resources will be deleted: - delete resources.pipelines.my_project_pipeline - delete resources.pipelines.my_project_pipeline_2 + delete resources.jobs.sample_job + delete resources.pipelines.lakeflow_project_etl + delete resources.pipelines.lakeflow_project_etl_2 This action will result in the deletion of the following Lakeflow Declarative Pipelines along with the Streaming Tables (STs) and Materialized Views (MVs) managed by them: - delete resources.pipelines.my_project_pipeline - delete resources.pipelines.my_project_pipeline_2 + delete resources.pipelines.lakeflow_project_etl + delete resources.pipelines.lakeflow_project_etl_2 -All files and directories at the following location will be deleted: /Workspace/Users/[USERNAME]/.bundle/my_project/dev +All files and directories at the following location will be deleted: /Workspace/Users/[USERNAME]/.bundle/lakeflow_project/dev Deleting files... Destroy complete! 
diff --git a/acceptance/pipelines/e2e/output/my_project/.vscode/__builtins__.pyi b/acceptance/pipelines/e2e/output/lakeflow_project/.vscode/__builtins__.pyi similarity index 100% rename from acceptance/pipelines/e2e/output/my_project/.vscode/__builtins__.pyi rename to acceptance/pipelines/e2e/output/lakeflow_project/.vscode/__builtins__.pyi diff --git a/acceptance/pipelines/e2e/output/my_project/.vscode/extensions.json b/acceptance/pipelines/e2e/output/lakeflow_project/.vscode/extensions.json similarity index 51% rename from acceptance/pipelines/e2e/output/my_project/.vscode/extensions.json rename to acceptance/pipelines/e2e/output/lakeflow_project/.vscode/extensions.json index 5d15eba363..5ba48e79c9 100644 --- a/acceptance/pipelines/e2e/output/my_project/.vscode/extensions.json +++ b/acceptance/pipelines/e2e/output/lakeflow_project/.vscode/extensions.json @@ -1,7 +1,7 @@ { "recommendations": [ "databricks.databricks", - "ms-python.vscode-pylance", - "redhat.vscode-yaml" + "redhat.vscode-yaml", + "charliermarsh.ruff" ] } diff --git a/acceptance/pipelines/e2e/output/lakeflow_project/.vscode/settings.json b/acceptance/pipelines/e2e/output/lakeflow_project/.vscode/settings.json new file mode 100644 index 0000000000..d73c73b570 --- /dev/null +++ b/acceptance/pipelines/e2e/output/lakeflow_project/.vscode/settings.json @@ -0,0 +1,39 @@ +{ + "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", + "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------", + "python.testing.pytestArgs": [ + "." + ], + "files.exclude": { + "**/*.egg-info": true, + "**/__pycache__": true, + ".pytest_cache": true, + "dist": true, + }, + "files.associations": { + "**/.gitkeep": "markdown" + }, + + // Pylance settings (VS Code) + // Set typeCheckingMode to "basic" to enable type checking! + "python.analysis.typeCheckingMode": "off", + "python.analysis.extraPaths": ["src", "lib", "resources"], + "python.analysis.diagnosticMode": "workspace", + "python.analysis.stubPath": ".vscode", + + // Pyright settings (Cursor) + // Set typeCheckingMode to "basic" to enable type checking! + "cursorpyright.analysis.typeCheckingMode": "off", + "cursorpyright.analysis.extraPaths": ["src", "lib", "resources"], + "cursorpyright.analysis.diagnosticMode": "workspace", + "cursorpyright.analysis.stubPath": ".vscode", + + // General Python settings + "python.defaultInterpreterPath": "./.venv/bin/python", + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true, + "[python]": { + "editor.defaultFormatter": "charliermarsh.ruff", + "editor.formatOnSave": true, + }, +} diff --git a/acceptance/pipelines/e2e/output/lakeflow_project/README.md b/acceptance/pipelines/e2e/output/lakeflow_project/README.md new file mode 100644 index 0000000000..2a59c9a590 --- /dev/null +++ b/acceptance/pipelines/e2e/output/lakeflow_project/README.md @@ -0,0 +1,54 @@ +# lakeflow_project + +The 'lakeflow_project' project was generated by using the lakeflow-pipelines template. + +* `src/`: Python source code for this project. +* `resources/`: Resource configurations (jobs, pipelines, etc.) + +## Getting started + +Choose how you want to work on this project: + +(a) Directly in your Databricks workspace, see + https://docs.databricks.com/dev-tools/bundles/workspace. + +(b) Locally with an IDE like Cursor or VS Code, see + https://docs.databricks.com/dev-tools/vscode-ext.html. 
+ +(c) With command line tools, see https://docs.databricks.com/dev-tools/cli/databricks-cli.html + +# Using this project using the CLI + +The Databricks workspace and IDE extensions provide a graphical interface for working +with this project. It's also possible to interact with it directly using the CLI: + +1. Authenticate to your Databricks workspace, if you have not done so already: + ``` + $ databricks configure + ``` + +2. To deploy a development copy of this project, type: + ``` + $ databricks bundle deploy --target dev + ``` + (Note that "dev" is the default target, so the `--target` parameter + is optional here.) + + This deploys everything that's defined for this project. + For example, the default template would deploy a pipeline called + `[dev yourname] lakeflow_project_etl` to your workspace. + You can find that resource by opening your workspace and clicking on **Jobs & Pipelines**. + +3. Similarly, to deploy a production copy, type: + ``` + $ databricks bundle deploy --target prod + ``` + Note the default template includes a job that runs the pipeline every day + (defined in resources/sample_job.job.yml). The schedule + is paused when deploying in development mode (see + https://docs.databricks.com/dev-tools/bundles/deployment-modes.html). + +4. To run a job or pipeline, use the "run" command: + ``` + $ databricks bundle run + ``` diff --git a/acceptance/pipelines/init/error-cases/output/my_project/databricks.yml b/acceptance/pipelines/e2e/output/lakeflow_project/databricks.yml similarity index 75% rename from acceptance/pipelines/init/error-cases/output/my_project/databricks.yml rename to acceptance/pipelines/e2e/output/lakeflow_project/databricks.yml index 871656882c..e0ca05c469 100644 --- a/acceptance/pipelines/init/error-cases/output/my_project/databricks.yml +++ b/acceptance/pipelines/e2e/output/lakeflow_project/databricks.yml @@ -1,13 +1,12 @@ -# This is a Databricks pipelines definition for my_project. +# This is a Databricks asset bundle definition for lakeflow_project. # See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. bundle: - name: my_project + name: lakeflow_project uuid: [UUID] include: - resources/*.yml - resources/*/*.yml - - ./*.yml # Variable declarations. These variables are assigned in the dev/prod targets below. variables: catalog: description: The catalog to use schema: description: The schema to use - notifications: - description: The email addresses to use for failure notifications targets: dev: # The default target uses 'mode: development' to create a development copy. - # - Deployed pipelines get prefixed with '[dev my_user_name]' + # - Deployed resources get prefixed with '[dev my_user_name]' + # - Any job schedules and triggers are paused by default. + # See also https://docs.databricks.com/dev-tools/bundles/deployment-modes.html. mode: development default: true workspace: @@ -29,18 +28,15 @@ targets: variables: catalog: hive_metastore schema: ${workspace.current_user.short_name} - notifications: [] - prod: mode: production workspace: host: [DATABRICKS_URL] # We explicitly deploy to /Workspace/Users/[USERNAME] to make sure we only have a single copy.
root_path: /Workspace/Users/[USERNAME]/.bundle/${bundle.name}/${bundle.target} + variables: + catalog: hive_metastore + schema: prod permissions: - user_name: [USERNAME] level: CAN_MANAGE - variables: - catalog: hive_metastore - schema: default - notifications: [[USERNAME]] diff --git a/acceptance/pipelines/e2e/output/my_project/out.gitignore b/acceptance/pipelines/e2e/output/lakeflow_project/out.gitignore similarity index 77% rename from acceptance/pipelines/e2e/output/my_project/out.gitignore rename to acceptance/pipelines/e2e/output/lakeflow_project/out.gitignore index f6a3b5ff93..e566c51f74 100644 --- a/acceptance/pipelines/e2e/output/my_project/out.gitignore +++ b/acceptance/pipelines/e2e/output/lakeflow_project/out.gitignore @@ -4,5 +4,7 @@ dist/ __pycache__/ *.egg-info .venv/ +scratch/** +!scratch/README.md **/explorations/** **/!explorations/README.md diff --git a/acceptance/pipelines/e2e/output/lakeflow_project/pyproject.toml b/acceptance/pipelines/e2e/output/lakeflow_project/pyproject.toml new file mode 100644 index 0000000000..39353902f6 --- /dev/null +++ b/acceptance/pipelines/e2e/output/lakeflow_project/pyproject.toml @@ -0,0 +1,34 @@ +[project] +name = "lakeflow_project" +version = "0.0.1" +authors = [{ name = "[USERNAME]" }] +requires-python = ">=3.10,<3.13" +dependencies = [ + # Any dependencies for jobs and pipelines in this project can be added here + # See also https://docs.databricks.com/dev-tools/bundles/library-dependencies + # + # LIMITATION: for pipelines, dependencies are cached during development; + # add dependencies to the 'environment' section of your pipeline.yml file instead +] + +[dependency-groups] +dev = [ + "pytest", + "ruff", + "databricks-dlt", + "databricks-connect>=15.4,<15.5", + "ipykernel", +] + +[project.scripts] +main = "lakeflow_project.main:main" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["src"] + +[tool.ruff] +line-length = 120 diff --git a/acceptance/pipelines/e2e/output/lakeflow_project/resources/lakeflow_project_etl.pipeline.yml b/acceptance/pipelines/e2e/output/lakeflow_project/resources/lakeflow_project_etl.pipeline.yml new file mode 100644 index 0000000000..eac2eda22a --- /dev/null +++ b/acceptance/pipelines/e2e/output/lakeflow_project/resources/lakeflow_project_etl.pipeline.yml @@ -0,0 +1,21 @@ +# The main pipeline for lakeflow_project + +resources: + pipelines: + lakeflow_project_etl: + name: lakeflow_project_etl + # Catalog is required for serverless compute + catalog: main + schema: ${var.schema} + serverless: true + root_path: "../src/lakeflow_project_etl" + + libraries: + - glob: + include: ../src/lakeflow_project_etl/transformations/** + + environment: + dependencies: + # We include every dependency defined by pyproject.toml by defining an editable environment + # that points to the folder where pyproject.toml is deployed. 
+ - --editable ${workspace.file_path} diff --git a/acceptance/pipelines/e2e/output/lakeflow_project/resources/lakeflow_project_etl_2.pipeline.yml b/acceptance/pipelines/e2e/output/lakeflow_project/resources/lakeflow_project_etl_2.pipeline.yml new file mode 100644 index 0000000000..11b17e0047 --- /dev/null +++ b/acceptance/pipelines/e2e/output/lakeflow_project/resources/lakeflow_project_etl_2.pipeline.yml @@ -0,0 +1,4 @@ +resources: + pipelines: + lakeflow_project_etl_2: + name: lakeflow_project_etl_2 diff --git a/acceptance/pipelines/e2e/output/lakeflow_project/resources/sample_job.job.yml b/acceptance/pipelines/e2e/output/lakeflow_project/resources/sample_job.job.yml new file mode 100644 index 0000000000..2ba45ba9ff --- /dev/null +++ b/acceptance/pipelines/e2e/output/lakeflow_project/resources/sample_job.job.yml @@ -0,0 +1,32 @@ +# A sample job for lakeflow_project. + +resources: + jobs: + sample_job: + name: sample_job + + trigger: + # Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger + periodic: + interval: 1 + unit: DAYS + + #email_notifications: + # on_failure: + # - your_email@example.com + + parameters: + - name: catalog + default: ${var.catalog} + - name: schema + default: ${var.schema} + + tasks: + - task_key: refresh_pipeline + pipeline_task: + pipeline_id: ${resources.pipelines.lakeflow_project_etl.id} + + environments: + - environment_key: default + spec: + environment_version: "4" diff --git a/acceptance/pipelines/e2e/output/lakeflow_project/src/lakeflow_project_etl/README.md b/acceptance/pipelines/e2e/output/lakeflow_project/src/lakeflow_project_etl/README.md new file mode 100644 index 0000000000..1fd2a85b68 --- /dev/null +++ b/acceptance/pipelines/e2e/output/lakeflow_project/src/lakeflow_project_etl/README.md @@ -0,0 +1,20 @@ +# lakeflow_project + +This folder defines all source code for the lakeflow_project pipeline: + +- `explorations/`: Ad-hoc notebooks used to explore the data processed by this pipeline. +- `transformations/`: All dataset definitions and transformations. +- `utilities/` (optional): Utility functions and Python modules used in this pipeline. +- `data_sources/` (optional): View definitions describing the source data for this pipeline. + +## Getting Started + +To get started, go to the `transformations` folder -- most of the relevant source code lives there: + +* By convention, every dataset under `transformations` is in a separate file. +* Take a look at the sample called "sample_trips_lakeflow_project.py" to get familiar with the syntax. + Read more about the syntax at https://docs.databricks.com/dlt/python-ref.html. +* If you're using the workspace UI, use `Run file` to run and preview a single transformation. +* If you're using the CLI, use `databricks bundle run lakeflow_project_etl --select sample_trips_lakeflow_project` to run a single transformation. + +For more tutorials and reference material, see https://docs.databricks.com/dlt. 
diff --git a/acceptance/pipelines/e2e/output/my_project/explorations/sample_exploration.ipynb b/acceptance/pipelines/e2e/output/lakeflow_project/src/lakeflow_project_etl/explorations/sample_exploration.ipynb similarity index 97% rename from acceptance/pipelines/e2e/output/my_project/explorations/sample_exploration.ipynb rename to acceptance/pipelines/e2e/output/lakeflow_project/src/lakeflow_project_etl/explorations/sample_exploration.ipynb index 7368a6c4c0..b83ee2a9ab 100644 --- a/acceptance/pipelines/e2e/output/my_project/explorations/sample_exploration.ipynb +++ b/acceptance/pipelines/e2e/output/lakeflow_project/src/lakeflow_project_etl/explorations/sample_exploration.ipynb @@ -37,7 +37,7 @@ "source": [ "# !!! Before performing any data analysis, make sure to run the pipeline to materialize the sample datasets. The tables referenced in this notebook depend on that step.\n", "\n", - "display(spark.sql(\"SELECT * FROM hive_metastore.[USERNAME].my_project\"))" + "display(spark.sql(\"SELECT * FROM hive_metastore.[USERNAME].sample_trips_lakeflow_project\"))" ] } ], diff --git a/acceptance/pipelines/e2e/output/lakeflow_project/src/lakeflow_project_etl/transformations/sample_trips_lakeflow_project.py b/acceptance/pipelines/e2e/output/lakeflow_project/src/lakeflow_project_etl/transformations/sample_trips_lakeflow_project.py new file mode 100644 index 0000000000..abdae73026 --- /dev/null +++ b/acceptance/pipelines/e2e/output/lakeflow_project/src/lakeflow_project_etl/transformations/sample_trips_lakeflow_project.py @@ -0,0 +1,12 @@ +from pyspark import pipelines as dp +from pyspark.sql.functions import col + + +# This file defines a sample transformation. +# Edit the sample below or add new transformations +# using "+ Add" in the file browser. + + +@dp.table +def sample_trips_lakeflow_project(): + return spark.read.table("samples.nyctaxi.trips") diff --git a/libs/template/templates/cli-pipelines/template/{{.project_name}}/transformations/sample_zones_{{.project_name}}.py.tmpl b/acceptance/pipelines/e2e/output/lakeflow_project/src/lakeflow_project_etl/transformations/sample_zones_lakeflow_project.py similarity index 71% rename from libs/template/templates/cli-pipelines/template/{{.project_name}}/transformations/sample_zones_{{.project_name}}.py.tmpl rename to acceptance/pipelines/e2e/output/lakeflow_project/src/lakeflow_project_etl/transformations/sample_zones_lakeflow_project.py index df63cecd44..91d186e389 100644 --- a/libs/template/templates/cli-pipelines/template/{{.project_name}}/transformations/sample_zones_{{.project_name}}.py.tmpl +++ b/acceptance/pipelines/e2e/output/lakeflow_project/src/lakeflow_project_etl/transformations/sample_zones_lakeflow_project.py @@ -1,4 +1,4 @@ -import dlt +from pyspark import pipelines as dp from pyspark.sql.functions import col, sum @@ -7,11 +7,11 @@ # using "+ Add" in the file browser. 
-@dlt.table -def sample_zones_{{ .project_name }}(): +@dp.table +def sample_zones_lakeflow_project(): # Read from the "sample_trips" table, then sum all the fares return ( - spark.read.table(f"sample_trips_{{.project_name}}") + spark.read.table(f"sample_trips_lakeflow_project") .groupBy(col("pickup_zip")) .agg(sum("fare_amount").alias("total_fare")) ) diff --git a/acceptance/pipelines/e2e/output/my_project/.vscode/settings.json b/acceptance/pipelines/e2e/output/my_project/.vscode/settings.json deleted file mode 100644 index 09a01b181c..0000000000 --- a/acceptance/pipelines/e2e/output/my_project/.vscode/settings.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "python.analysis.stubPath": ".vscode", - "databricks.python.envFile": "${workspaceFolder}/.env", - "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", - "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------", - "python.testing.pytestArgs": [ - "." - ], - "python.testing.unittestEnabled": false, - "python.testing.pytestEnabled": true, - "python.analysis.extraPaths": ["resources/my_project_pipeline"], - "files.exclude": { - "**/*.egg-info": true, - "**/__pycache__": true, - ".pytest_cache": true, - }, - "[python]": { - "editor.defaultFormatter": "ms-python.black-formatter", - "editor.formatOnSave": true, - }, -} diff --git a/acceptance/pipelines/e2e/output/my_project/README.md b/acceptance/pipelines/e2e/output/my_project/README.md deleted file mode 100644 index 88914e1e36..0000000000 --- a/acceptance/pipelines/e2e/output/my_project/README.md +++ /dev/null @@ -1,63 +0,0 @@ -# my_project - -The 'my_project' project was generated by using the CLI Pipelines template. - -## Setup - -1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html - -2. Install the Pipelines CLI: - ``` - $ databricks install-pipelines-cli - ``` - -3. Authenticate to your Databricks workspace, if you have not done so already: - ``` - $ pipelines auth login - ``` - -4. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from - https://docs.databricks.com/dev-tools/vscode-ext.html. Or the PyCharm plugin from - https://www.databricks.com/blog/announcing-pycharm-integration-databricks. - -## Pipeline Structure - -This folder defines all source code for the my_project_pipeline pipeline: - -- `explorations`: Ad-hoc notebooks used to explore the data processed by this pipeline. -- `transformations`: All dataset definitions and transformations. -- `utilities` (optional): Utility functions and Python modules used in this pipeline. - -## Getting Started - -To get started, go to the `transformations` folder -- most of the relevant source code lives there: - -* By convention, every dataset under `transformations` is in a separate file. -* Take a look at the sample under "sample_trips_my_project.py" to get familiar with the syntax. - Read more about the syntax at https://docs.databricks.com/dlt/python-ref.html. - -For more tutorials and reference material, see https://docs.databricks.com/dlt. - -## Deploying pipelines - -1. To deploy a development copy of this project, type: - ``` - $ pipelines deploy --target dev - ``` - (Note that "dev" is the default target, so the `--target` parameter - is optional here.) - - This deploys everything that's defined for this project. 
- For example, the default template would deploy a pipeline called - `[dev yourname] my_project_pipeline` to your workspace. - You can find that pipeline by opening your workpace and clicking on **Jobs & Pipelines**. - -2. Similarly, to deploy a production copy, type: - ``` - $ pipelines deploy --target prod - ``` - -3. To run a pipeline, use the "run" command: - ``` - $ pipelines run - ``` diff --git a/acceptance/pipelines/e2e/output/my_project/databricks.yml b/acceptance/pipelines/e2e/output/my_project/databricks.yml deleted file mode 100644 index 871656882c..0000000000 --- a/acceptance/pipelines/e2e/output/my_project/databricks.yml +++ /dev/null @@ -1,46 +0,0 @@ -# This is a Databricks pipelines definition for my_project. -# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. -bundle: - name: my_project - uuid: [UUID] - -include: - - resources/*.yml - - resources/*/*.yml - - ./*.yml - -# Variable declarations. These variables are assigned in the dev/prod targets below. -variables: - catalog: - description: The catalog to use - schema: - description: The schema to use - notifications: - description: The email addresses to use for failure notifications - -targets: - dev: - # The default target uses 'mode: development' to create a development copy. - # - Deployed pipelines get prefixed with '[dev my_user_name]' - mode: development - default: true - workspace: - host: [DATABRICKS_URL] - variables: - catalog: hive_metastore - schema: ${workspace.current_user.short_name} - notifications: [] - - prod: - mode: production - workspace: - host: [DATABRICKS_URL] - # We explicitly deploy to /Workspace/Users/[USERNAME] to make sure we only have a single copy. - root_path: /Workspace/Users/[USERNAME]/.bundle/${bundle.name}/${bundle.target} - permissions: - - user_name: [USERNAME] - level: CAN_MANAGE - variables: - catalog: hive_metastore - schema: default - notifications: [[USERNAME]] diff --git a/acceptance/pipelines/e2e/output/my_project/my_project.pipeline.yml b/acceptance/pipelines/e2e/output/my_project/my_project.pipeline.yml deleted file mode 100644 index 94b4556177..0000000000 --- a/acceptance/pipelines/e2e/output/my_project/my_project.pipeline.yml +++ /dev/null @@ -1,11 +0,0 @@ -resources: - pipelines: - my_project_pipeline: - name: my_project_pipeline - serverless: true - catalog: ${var.catalog} - schema: ${var.schema} - root_path: "." - libraries: - - glob: - include: transformations/** diff --git a/acceptance/pipelines/e2e/output/my_project/my_project_pipeline_2.pipeline.yml b/acceptance/pipelines/e2e/output/my_project/my_project_pipeline_2.pipeline.yml deleted file mode 100644 index c2a2f17887..0000000000 --- a/acceptance/pipelines/e2e/output/my_project/my_project_pipeline_2.pipeline.yml +++ /dev/null @@ -1,4 +0,0 @@ -resources: - pipelines: - my_project_pipeline_2: - name: my_project_pipeline_2 diff --git a/acceptance/pipelines/e2e/output/my_project/transformations/sample_trips_my_project.py b/acceptance/pipelines/e2e/output/my_project/transformations/sample_trips_my_project.py deleted file mode 100644 index 1895070194..0000000000 --- a/acceptance/pipelines/e2e/output/my_project/transformations/sample_trips_my_project.py +++ /dev/null @@ -1,15 +0,0 @@ -import dlt -from pyspark.sql.functions import col -from utilities import utils - - -# This file defines a sample transformation. -# Edit the sample below or add new transformations -# using "+ Add" in the file browser. 
- - -@dlt.table -def sample_trips_my_project(): - return spark.read.table("samples.nyctaxi.trips").withColumn( - "trip_distance_km", utils.distance_km(col("trip_distance")) - ) diff --git a/acceptance/pipelines/e2e/output/my_project/transformations/sample_zones_my_project.py b/acceptance/pipelines/e2e/output/my_project/transformations/sample_zones_my_project.py deleted file mode 100644 index 280b6dab89..0000000000 --- a/acceptance/pipelines/e2e/output/my_project/transformations/sample_zones_my_project.py +++ /dev/null @@ -1,17 +0,0 @@ -import dlt -from pyspark.sql.functions import col, sum - - -# This file defines a sample transformation. -# Edit the sample below or add new transformations -# using "+ Add" in the file browser. - - -@dlt.table -def sample_zones_my_project(): - # Read from the "sample_trips" table, then sum all the fares - return ( - spark.read.table(f"sample_trips_my_project") - .groupBy(col("pickup_zip")) - .agg(sum("fare_amount").alias("total_fare")) - ) diff --git a/acceptance/pipelines/e2e/output/my_project/utilities/utils.py b/acceptance/pipelines/e2e/output/my_project/utilities/utils.py deleted file mode 100644 index ff039898f0..0000000000 --- a/acceptance/pipelines/e2e/output/my_project/utilities/utils.py +++ /dev/null @@ -1,8 +0,0 @@ -from pyspark.sql.functions import udf -from pyspark.sql.types import FloatType - - -@udf(returnType=FloatType()) -def distance_km(distance_miles): - """Convert distance from miles to kilometers (1 mile = 1.60934 km).""" - return distance_miles * 1.60934 diff --git a/acceptance/pipelines/e2e/script b/acceptance/pipelines/e2e/script index e9015647ee..62de74e436 100644 --- a/acceptance/pipelines/e2e/script +++ b/acceptance/pipelines/e2e/script @@ -4,30 +4,30 @@ title "Initialize pipeline project" trace $PIPELINES init --output-dir output title "Deploy pipeline" -cd output/my_project +cd output/lakeflow_project trace $PIPELINES deploy title "Run pipeline" trace $PIPELINES run title "Edit project by creating and running a new second pipeline" -cat < my_project_pipeline_2.pipeline.yml +cat < resources/lakeflow_project_etl_2.pipeline.yml resources: pipelines: - my_project_pipeline_2: - name: my_project_pipeline_2 + lakeflow_project_etl_2: + name: lakeflow_project_etl_2 EOF trace $PIPELINES deploy title "Assert the second pipeline is created" -PIPELINE_ID=$($CLI bundle summary -o json | jq -r '.resources.pipelines.my_project_pipeline_2.id') +PIPELINE_ID=$($CLI bundle summary -o json | jq -r '.resources.pipelines.lakeflow_project_etl_2.id') trace $CLI pipelines get "${PIPELINE_ID}" -trace $PIPELINES run my_project_pipeline_2 +trace $PIPELINES run lakeflow_project_etl_2 title "Stop both pipelines before destroy" -trace $PIPELINES stop my_project_pipeline -trace $PIPELINES stop my_project_pipeline_2 +trace $PIPELINES stop lakeflow_project_etl +trace $PIPELINES stop lakeflow_project_etl_2 title "Destroy project" trace $PIPELINES destroy --auto-approve diff --git a/acceptance/pipelines/init/error-cases/invalid_input.json b/acceptance/pipelines/init/error-cases/invalid_input.json new file mode 100644 index 0000000000..f577c1be4d --- /dev/null +++ b/acceptance/pipelines/init/error-cases/invalid_input.json @@ -0,0 +1,3 @@ +{ + "project_name": "InvalidProjectName" +} diff --git a/acceptance/pipelines/init/error-cases/output.txt b/acceptance/pipelines/init/error-cases/output.txt index cebdc30be2..3209536177 100644 --- a/acceptance/pipelines/init/error-cases/output.txt +++ b/acceptance/pipelines/init/error-cases/output.txt @@ -1,14 +1,4 @@ -=== Test with 
missing config file ->>> errcode [PIPELINES] init --output-dir output - -Welcome to the template for pipelines! - - -Your new project has been created in the 'my_project' directory! - -Refer to the README.md file for "getting started" instructions! - === Test with invalid project name (contains uppercase letters) >>> errcode [PIPELINES] init --config-file ./invalid_input.json --output-dir invalid-output Error: failed to load config from file ./invalid_input.json: invalid value for project_name: "InvalidProjectName". Name must consist of lower case letters, numbers, and underscores. diff --git a/acceptance/pipelines/init/error-cases/output/my_project/.vscode/__builtins__.pyi b/acceptance/pipelines/init/error-cases/output/my_project/.vscode/__builtins__.pyi deleted file mode 100644 index 0edd5181bc..0000000000 --- a/acceptance/pipelines/init/error-cases/output/my_project/.vscode/__builtins__.pyi +++ /dev/null @@ -1,3 +0,0 @@ -# Typings for Pylance in Visual Studio Code -# see https://github.com/microsoft/pyright/blob/main/docs/builtins.md -from databricks.sdk.runtime import * diff --git a/acceptance/pipelines/init/error-cases/output/my_project/.vscode/extensions.json b/acceptance/pipelines/init/error-cases/output/my_project/.vscode/extensions.json deleted file mode 100644 index 5d15eba363..0000000000 --- a/acceptance/pipelines/init/error-cases/output/my_project/.vscode/extensions.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "recommendations": [ - "databricks.databricks", - "ms-python.vscode-pylance", - "redhat.vscode-yaml" - ] -} diff --git a/acceptance/pipelines/init/error-cases/output/my_project/.vscode/settings.json b/acceptance/pipelines/init/error-cases/output/my_project/.vscode/settings.json deleted file mode 100644 index 09a01b181c..0000000000 --- a/acceptance/pipelines/init/error-cases/output/my_project/.vscode/settings.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "python.analysis.stubPath": ".vscode", - "databricks.python.envFile": "${workspaceFolder}/.env", - "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", - "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------", - "python.testing.pytestArgs": [ - "." - ], - "python.testing.unittestEnabled": false, - "python.testing.pytestEnabled": true, - "python.analysis.extraPaths": ["resources/my_project_pipeline"], - "files.exclude": { - "**/*.egg-info": true, - "**/__pycache__": true, - ".pytest_cache": true, - }, - "[python]": { - "editor.defaultFormatter": "ms-python.black-formatter", - "editor.formatOnSave": true, - }, -} diff --git a/acceptance/pipelines/init/error-cases/output/my_project/README.md b/acceptance/pipelines/init/error-cases/output/my_project/README.md deleted file mode 100644 index 88914e1e36..0000000000 --- a/acceptance/pipelines/init/error-cases/output/my_project/README.md +++ /dev/null @@ -1,63 +0,0 @@ -# my_project - -The 'my_project' project was generated by using the CLI Pipelines template. - -## Setup - -1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html - -2. Install the Pipelines CLI: - ``` - $ databricks install-pipelines-cli - ``` - -3. Authenticate to your Databricks workspace, if you have not done so already: - ``` - $ pipelines auth login - ``` - -4. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from - https://docs.databricks.com/dev-tools/vscode-ext.html. 
Or the PyCharm plugin from - https://www.databricks.com/blog/announcing-pycharm-integration-databricks. - -## Pipeline Structure - -This folder defines all source code for the my_project_pipeline pipeline: - -- `explorations`: Ad-hoc notebooks used to explore the data processed by this pipeline. -- `transformations`: All dataset definitions and transformations. -- `utilities` (optional): Utility functions and Python modules used in this pipeline. - -## Getting Started - -To get started, go to the `transformations` folder -- most of the relevant source code lives there: - -* By convention, every dataset under `transformations` is in a separate file. -* Take a look at the sample under "sample_trips_my_project.py" to get familiar with the syntax. - Read more about the syntax at https://docs.databricks.com/dlt/python-ref.html. - -For more tutorials and reference material, see https://docs.databricks.com/dlt. - -## Deploying pipelines - -1. To deploy a development copy of this project, type: - ``` - $ pipelines deploy --target dev - ``` - (Note that "dev" is the default target, so the `--target` parameter - is optional here.) - - This deploys everything that's defined for this project. - For example, the default template would deploy a pipeline called - `[dev yourname] my_project_pipeline` to your workspace. - You can find that pipeline by opening your workpace and clicking on **Jobs & Pipelines**. - -2. Similarly, to deploy a production copy, type: - ``` - $ pipelines deploy --target prod - ``` - -3. To run a pipeline, use the "run" command: - ``` - $ pipelines run - ``` diff --git a/acceptance/pipelines/init/error-cases/output/my_project/my_project.pipeline.yml b/acceptance/pipelines/init/error-cases/output/my_project/my_project.pipeline.yml deleted file mode 100644 index 94b4556177..0000000000 --- a/acceptance/pipelines/init/error-cases/output/my_project/my_project.pipeline.yml +++ /dev/null @@ -1,11 +0,0 @@ -resources: - pipelines: - my_project_pipeline: - name: my_project_pipeline - serverless: true - catalog: ${var.catalog} - schema: ${var.schema} - root_path: "." - libraries: - - glob: - include: transformations/** diff --git a/acceptance/pipelines/init/error-cases/output/my_project/out.gitignore b/acceptance/pipelines/init/error-cases/output/my_project/out.gitignore deleted file mode 100644 index f6a3b5ff93..0000000000 --- a/acceptance/pipelines/init/error-cases/output/my_project/out.gitignore +++ /dev/null @@ -1,8 +0,0 @@ -.databricks/ -build/ -dist/ -__pycache__/ -*.egg-info -.venv/ -**/explorations/** -**/!explorations/README.md diff --git a/acceptance/pipelines/init/error-cases/output/my_project/transformations/sample_trips_my_project.py b/acceptance/pipelines/init/error-cases/output/my_project/transformations/sample_trips_my_project.py deleted file mode 100644 index 1895070194..0000000000 --- a/acceptance/pipelines/init/error-cases/output/my_project/transformations/sample_trips_my_project.py +++ /dev/null @@ -1,15 +0,0 @@ -import dlt -from pyspark.sql.functions import col -from utilities import utils - - -# This file defines a sample transformation. -# Edit the sample below or add new transformations -# using "+ Add" in the file browser. 
- - -@dlt.table -def sample_trips_my_project(): - return spark.read.table("samples.nyctaxi.trips").withColumn( - "trip_distance_km", utils.distance_km(col("trip_distance")) - ) diff --git a/acceptance/pipelines/init/error-cases/output/my_project/transformations/sample_zones_my_project.py b/acceptance/pipelines/init/error-cases/output/my_project/transformations/sample_zones_my_project.py deleted file mode 100644 index 280b6dab89..0000000000 --- a/acceptance/pipelines/init/error-cases/output/my_project/transformations/sample_zones_my_project.py +++ /dev/null @@ -1,17 +0,0 @@ -import dlt -from pyspark.sql.functions import col, sum - - -# This file defines a sample transformation. -# Edit the sample below or add new transformations -# using "+ Add" in the file browser. - - -@dlt.table -def sample_zones_my_project(): - # Read from the "sample_trips" table, then sum all the fares - return ( - spark.read.table(f"sample_trips_my_project") - .groupBy(col("pickup_zip")) - .agg(sum("fare_amount").alias("total_fare")) - ) diff --git a/acceptance/pipelines/init/error-cases/output/my_project/utilities/utils.py b/acceptance/pipelines/init/error-cases/output/my_project/utilities/utils.py deleted file mode 100644 index ff039898f0..0000000000 --- a/acceptance/pipelines/init/error-cases/output/my_project/utilities/utils.py +++ /dev/null @@ -1,8 +0,0 @@ -from pyspark.sql.functions import udf -from pyspark.sql.types import FloatType - - -@udf(returnType=FloatType()) -def distance_km(distance_miles): - """Convert distance from miles to kilometers (1 mile = 1.60934 km).""" - return distance_miles * 1.60934 diff --git a/acceptance/pipelines/init/error-cases/script b/acceptance/pipelines/init/error-cases/script index 976ae722bf..c1bb876058 100644 --- a/acceptance/pipelines/init/error-cases/script +++ b/acceptance/pipelines/init/error-cases/script @@ -1,15 +1,5 @@ -title "Test with missing config file" -trace errcode $PIPELINES init --output-dir output - title "Test with invalid project name (contains uppercase letters)" -echo '{"project_name": "InvalidProjectName"}' > invalid_input.json trace errcode $PIPELINES init --config-file ./invalid_input.json --output-dir invalid-output title "Test with non-existent config file" trace errcode $PIPELINES init --config-file ./nonexistent.json --output-dir invalid-output-2 - -# Do not affect this repository's git behaviour -mv output/my_project/.gitignore output/my_project/out.gitignore - -# Clean up -rm -f invalid_input.json diff --git a/acceptance/pipelines/init/python/input.json b/acceptance/pipelines/init/python/input.json index 5a1211b99e..9ee95b139a 100644 --- a/acceptance/pipelines/init/python/input.json +++ b/acceptance/pipelines/init/python/input.json @@ -2,6 +2,5 @@ "project_name": "my_python_project", "default_catalog": "main", "personal_schemas": "yes", - "shared_schema": "default", "language": "python" } diff --git a/acceptance/pipelines/init/python/output.txt b/acceptance/pipelines/init/python/output.txt index d1f6567934..a3ad024bc5 100644 --- a/acceptance/pipelines/init/python/output.txt +++ b/acceptance/pipelines/init/python/output.txt @@ -1,10 +1,14 @@ === Test basic pipelines init with configuration file >>> [PIPELINES] init --config-file ./input.json --output-dir output +Welcome to the template for Lakeflow Declarative Pipelines! -Welcome to the template for pipelines! +Please answer the below to tailor your project to your preferences. +You can always change your mind and change your configuration in the databricks.yml file later. 
+Note that [DATABRICKS_URL] is used for initialization +(see https://docs.databricks.com/dev-tools/cli/profiles.html for how to change your profile). -Your new project has been created in the 'my_python_project' directory! +✨ Your new project has been created in the 'my_python_project' directory! -Refer to the README.md file for "getting started" instructions! +Please refer to the README.md file for "getting started" instructions. diff --git a/acceptance/pipelines/init/python/output/my_python_project/.vscode/extensions.json b/acceptance/pipelines/init/python/output/my_python_project/.vscode/extensions.json index 5d15eba363..5ba48e79c9 100644 --- a/acceptance/pipelines/init/python/output/my_python_project/.vscode/extensions.json +++ b/acceptance/pipelines/init/python/output/my_python_project/.vscode/extensions.json @@ -1,7 +1,7 @@ { "recommendations": [ "databricks.databricks", - "ms-python.vscode-pylance", - "redhat.vscode-yaml" + "redhat.vscode-yaml", + "charliermarsh.ruff" ] } diff --git a/acceptance/pipelines/init/python/output/my_python_project/.vscode/settings.json b/acceptance/pipelines/init/python/output/my_python_project/.vscode/settings.json index 674e2be3f3..d73c73b570 100644 --- a/acceptance/pipelines/init/python/output/my_python_project/.vscode/settings.json +++ b/acceptance/pipelines/init/python/output/my_python_project/.vscode/settings.json @@ -1,21 +1,39 @@ { - "python.analysis.stubPath": ".vscode", - "databricks.python.envFile": "${workspaceFolder}/.env", "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------", "python.testing.pytestArgs": [ "." ], - "python.testing.unittestEnabled": false, - "python.testing.pytestEnabled": true, - "python.analysis.extraPaths": ["resources/my_python_project_pipeline"], "files.exclude": { "**/*.egg-info": true, "**/__pycache__": true, ".pytest_cache": true, + "dist": true, + }, + "files.associations": { + "**/.gitkeep": "markdown" }, + + // Pylance settings (VS Code) + // Set typeCheckingMode to "basic" to enable type checking! + "python.analysis.typeCheckingMode": "off", + "python.analysis.extraPaths": ["src", "lib", "resources"], + "python.analysis.diagnosticMode": "workspace", + "python.analysis.stubPath": ".vscode", + + // Pyright settings (Cursor) + // Set typeCheckingMode to "basic" to enable type checking! + "cursorpyright.analysis.typeCheckingMode": "off", + "cursorpyright.analysis.extraPaths": ["src", "lib", "resources"], + "cursorpyright.analysis.diagnosticMode": "workspace", + "cursorpyright.analysis.stubPath": ".vscode", + + // General Python settings + "python.defaultInterpreterPath": "./.venv/bin/python", + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true, "[python]": { - "editor.defaultFormatter": "ms-python.black-formatter", + "editor.defaultFormatter": "charliermarsh.ruff", "editor.formatOnSave": true, }, } diff --git a/acceptance/pipelines/init/python/output/my_python_project/README.md b/acceptance/pipelines/init/python/output/my_python_project/README.md index 5c87ad38c9..9991128170 100644 --- a/acceptance/pipelines/init/python/output/my_python_project/README.md +++ b/acceptance/pipelines/init/python/output/my_python_project/README.md @@ -1,63 +1,54 @@ # my_python_project -The 'my_python_project' project was generated by using the CLI Pipelines template. 
+The 'my_python_project' project was generated by using the lakeflow-pipelines template. -## Setup +* `src/`: Python source code for this project. +* `resources/`: Resource configurations (jobs, pipelines, etc.) -1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html +## Getting started -2. Install the Pipelines CLI: - ``` - $ databricks install-pipelines-cli - ``` - -3. Authenticate to your Databricks workspace, if you have not done so already: - ``` - $ pipelines auth login - ``` +Choose how you want to work on this project: -4. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from - https://docs.databricks.com/dev-tools/vscode-ext.html. Or the PyCharm plugin from - https://www.databricks.com/blog/announcing-pycharm-integration-databricks. +(a) Directly in your Databricks workspace, see + https://docs.databricks.com/dev-tools/bundles/workspace. -## Pipeline Structure +(b) Locally with an IDE like Cursor or VS Code, see + https://docs.databricks.com/dev-tools/vscode-ext.html. -This folder defines all source code for the my_python_project_pipeline pipeline: +(c) With command line tools, see https://docs.databricks.com/dev-tools/cli/databricks-cli.html -- `explorations`: Ad-hoc notebooks used to explore the data processed by this pipeline. -- `transformations`: All dataset definitions and transformations. -- `utilities` (optional): Utility functions and Python modules used in this pipeline. +# Using this project using the CLI -## Getting Started +The Databricks workspace and IDE extensions provide a graphical interface for working +with this project. It's also possible to interact with it directly using the CLI: -To get started, go to the `transformations` folder -- most of the relevant source code lives there: - -* By convention, every dataset under `transformations` is in a separate file. -* Take a look at the sample under "sample_trips_my_python_project.py" to get familiar with the syntax. - Read more about the syntax at https://docs.databricks.com/dlt/python-ref.html. - -For more tutorials and reference material, see https://docs.databricks.com/dlt. - -## Deploying pipelines +1. Authenticate to your Databricks workspace, if you have not done so already: + ``` + $ databricks configure + ``` -1. To deploy a development copy of this project, type: +2. To deploy a development copy of this project, type: ``` - $ pipelines deploy --target dev + $ databricks bundle deploy --target dev ``` (Note that "dev" is the default target, so the `--target` parameter is optional here.) This deploys everything that's defined for this project. For example, the default template would deploy a pipeline called - `[dev yourname] my_python_project_pipeline` to your workspace. - You can find that pipeline by opening your workpace and clicking on **Jobs & Pipelines**. + `[dev yourname] my_python_project_etl` to your workspace. + You can find that resource by opening your workspace and clicking on **Jobs & Pipelines**. -2. Similarly, to deploy a production copy, type: +3. Similarly, to deploy a production copy, type: ``` - $ pipelines deploy --target prod + $ databricks bundle deploy --target prod ``` + Note the default template includes a job that runs the pipeline every day + (defined in resources/sample_job.job.yml). The schedule + is paused when deploying in development mode (see + https://docs.databricks.com/dev-tools/bundles/deployment-modes.html). -3. To run a pipeline, use the "run" command: +4. 
To run a job or pipeline, use the "run" command: ``` - $ pipelines run + $ databricks bundle run ``` diff --git a/acceptance/pipelines/init/python/output/my_python_project/databricks.yml b/acceptance/pipelines/init/python/output/my_python_project/databricks.yml index f9b7ef40de..4efc47cbe3 100644 --- a/acceptance/pipelines/init/python/output/my_python_project/databricks.yml +++ b/acceptance/pipelines/init/python/output/my_python_project/databricks.yml @@ -1,4 +1,4 @@ -# This is a Databricks pipelines definition for my_python_project. +# This is a Databricks asset bundle definition for my_python_project. # See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. bundle: name: my_python_project @@ -7,7 +7,6 @@ bundle: include: - resources/*.yml - resources/*/*.yml - - ./*.yml # Variable declarations. These variables are assigned in the dev/prod targets below. variables: @@ -15,13 +14,13 @@ variables: description: The catalog to use schema: description: The schema to use - notifications: - description: The email addresses to use for failure notifications targets: dev: # The default target uses 'mode: development' to create a development copy. - # - Deployed pipelines get prefixed with '[dev my_user_name]' + # - Deployed resources get prefixed with '[dev my_user_name]' + # - Any job schedules and triggers are paused by default. + # See also https://docs.databricks.com/dev-tools/bundles/deployment-modes.html. mode: development default: true workspace: @@ -29,18 +28,15 @@ targets: variables: catalog: main schema: ${workspace.current_user.short_name} - notifications: [] - prod: mode: production workspace: host: [DATABRICKS_URL] # We explicitly deploy to /Workspace/Users/[USERNAME] to make sure we only have a single copy. root_path: /Workspace/Users/[USERNAME]/.bundle/${bundle.name}/${bundle.target} + variables: + catalog: main + schema: prod permissions: - user_name: [USERNAME] level: CAN_MANAGE - variables: - catalog: main - schema: default - notifications: [[USERNAME]] diff --git a/acceptance/pipelines/init/python/output/my_python_project/my_python_project.pipeline.yml b/acceptance/pipelines/init/python/output/my_python_project/my_python_project.pipeline.yml deleted file mode 100644 index 00a1f5e3c1..0000000000 --- a/acceptance/pipelines/init/python/output/my_python_project/my_python_project.pipeline.yml +++ /dev/null @@ -1,11 +0,0 @@ -resources: - pipelines: - my_python_project_pipeline: - name: my_python_project_pipeline - serverless: true - catalog: ${var.catalog} - schema: ${var.schema} - root_path: "." 
- libraries: - - glob: - include: transformations/** diff --git a/acceptance/pipelines/init/python/output/my_python_project/out.gitignore b/acceptance/pipelines/init/python/output/my_python_project/out.gitignore index f6a3b5ff93..e566c51f74 100644 --- a/acceptance/pipelines/init/python/output/my_python_project/out.gitignore +++ b/acceptance/pipelines/init/python/output/my_python_project/out.gitignore @@ -4,5 +4,7 @@ dist/ __pycache__/ *.egg-info .venv/ +scratch/** +!scratch/README.md **/explorations/** **/!explorations/README.md diff --git a/acceptance/pipelines/init/python/output/my_python_project/pyproject.toml b/acceptance/pipelines/init/python/output/my_python_project/pyproject.toml new file mode 100644 index 0000000000..d75a0a4caa --- /dev/null +++ b/acceptance/pipelines/init/python/output/my_python_project/pyproject.toml @@ -0,0 +1,34 @@ +[project] +name = "my_python_project" +version = "0.0.1" +authors = [{ name = "[USERNAME]" }] +requires-python = ">=3.10,<3.13" +dependencies = [ + # Any dependencies for jobs and pipelines in this project can be added here + # See also https://docs.databricks.com/dev-tools/bundles/library-dependencies + # + # LIMITATION: for pipelines, dependencies are cached during development; + # add dependencies to the 'environment' section of your pipeline.yml file instead +] + +[dependency-groups] +dev = [ + "pytest", + "ruff", + "databricks-dlt", + "databricks-connect>=15.4,<15.5", + "ipykernel", +] + +[project.scripts] +main = "my_python_project.main:main" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["src"] + +[tool.ruff] +line-length = 120 diff --git a/acceptance/pipelines/init/python/output/my_python_project/resources/my_python_project_etl.pipeline.yml b/acceptance/pipelines/init/python/output/my_python_project/resources/my_python_project_etl.pipeline.yml new file mode 100644 index 0000000000..89470a1c5d --- /dev/null +++ b/acceptance/pipelines/init/python/output/my_python_project/resources/my_python_project_etl.pipeline.yml @@ -0,0 +1,20 @@ +# The main pipeline for my_python_project + +resources: + pipelines: + my_python_project_etl: + name: my_python_project_etl + catalog: ${var.catalog} + schema: ${var.schema} + serverless: true + root_path: "../src/my_python_project_etl" + + libraries: + - glob: + include: ../src/my_python_project_etl/transformations/** + + environment: + dependencies: + # We include every dependency defined by pyproject.toml by defining an editable environment + # that points to the folder where pyproject.toml is deployed. + - --editable ${workspace.file_path} diff --git a/acceptance/pipelines/init/python/output/my_python_project/resources/sample_job.job.yml b/acceptance/pipelines/init/python/output/my_python_project/resources/sample_job.job.yml new file mode 100644 index 0000000000..788216eaad --- /dev/null +++ b/acceptance/pipelines/init/python/output/my_python_project/resources/sample_job.job.yml @@ -0,0 +1,32 @@ +# A sample job for my_python_project. 
+ +resources: + jobs: + sample_job: + name: sample_job + + trigger: + # Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger + periodic: + interval: 1 + unit: DAYS + + #email_notifications: + # on_failure: + # - your_email@example.com + + parameters: + - name: catalog + default: ${var.catalog} + - name: schema + default: ${var.schema} + + tasks: + - task_key: refresh_pipeline + pipeline_task: + pipeline_id: ${resources.pipelines.my_python_project_etl.id} + + environments: + - environment_key: default + spec: + environment_version: "4" diff --git a/acceptance/pipelines/init/python/output/my_python_project/src/my_python_project_etl/README.md b/acceptance/pipelines/init/python/output/my_python_project/src/my_python_project_etl/README.md new file mode 100644 index 0000000000..7e6ec71d7d --- /dev/null +++ b/acceptance/pipelines/init/python/output/my_python_project/src/my_python_project_etl/README.md @@ -0,0 +1,20 @@ +# my_python_project + +This folder defines all source code for the my_python_project pipeline: + +- `explorations/`: Ad-hoc notebooks used to explore the data processed by this pipeline. +- `transformations/`: All dataset definitions and transformations. +- `utilities/` (optional): Utility functions and Python modules used in this pipeline. +- `data_sources/` (optional): View definitions describing the source data for this pipeline. + +## Getting Started + +To get started, go to the `transformations` folder -- most of the relevant source code lives there: + +* By convention, every dataset under `transformations` is in a separate file. +* Take a look at the sample called "sample_trips_my_python_project.py" to get familiar with the syntax. + Read more about the syntax at https://docs.databricks.com/dlt/python-ref.html. +* If you're using the workspace UI, use `Run file` to run and preview a single transformation. +* If you're using the CLI, use `databricks bundle run my_python_project_etl --select sample_trips_my_python_project` to run a single transformation. + +For more tutorials and reference material, see https://docs.databricks.com/dlt. diff --git a/acceptance/pipelines/init/error-cases/output/my_project/explorations/sample_exploration.ipynb b/acceptance/pipelines/init/python/output/my_python_project/src/my_python_project_etl/explorations/sample_exploration.ipynb similarity index 94% rename from acceptance/pipelines/init/error-cases/output/my_project/explorations/sample_exploration.ipynb rename to acceptance/pipelines/init/python/output/my_python_project/src/my_python_project_etl/explorations/sample_exploration.ipynb index 7368a6c4c0..4135310a57 100644 --- a/acceptance/pipelines/init/error-cases/output/my_project/explorations/sample_exploration.ipynb +++ b/acceptance/pipelines/init/python/output/my_python_project/src/my_python_project_etl/explorations/sample_exploration.ipynb @@ -37,7 +37,7 @@ "source": [ "# !!! Before performing any data analysis, make sure to run the pipeline to materialize the sample datasets. 
The tables referenced in this notebook depend on that step.\n", "\n", - "display(spark.sql(\"SELECT * FROM hive_metastore.[USERNAME].my_project\"))" + "display(spark.sql(\"SELECT * FROM main.[USERNAME].sample_trips_my_python_project\"))" ] } ], diff --git a/acceptance/pipelines/init/python/output/my_python_project/transformations/sample_trips_my_python_project.py b/acceptance/pipelines/init/python/output/my_python_project/src/my_python_project_etl/transformations/sample_trips_my_python_project.py similarity index 52% rename from acceptance/pipelines/init/python/output/my_python_project/transformations/sample_trips_my_python_project.py rename to acceptance/pipelines/init/python/output/my_python_project/src/my_python_project_etl/transformations/sample_trips_my_python_project.py index 10ef03301a..76945b8659 100644 --- a/acceptance/pipelines/init/python/output/my_python_project/transformations/sample_trips_my_python_project.py +++ b/acceptance/pipelines/init/python/output/my_python_project/src/my_python_project_etl/transformations/sample_trips_my_python_project.py @@ -1,6 +1,5 @@ -import dlt +from pyspark import pipelines as dp from pyspark.sql.functions import col -from utilities import utils # This file defines a sample transformation. @@ -8,8 +7,6 @@ # using "+ Add" in the file browser. -@dlt.table +@dp.table def sample_trips_my_python_project(): - return spark.read.table("samples.nyctaxi.trips").withColumn( - "trip_distance_km", utils.distance_km(col("trip_distance")) - ) + return spark.read.table("samples.nyctaxi.trips") diff --git a/acceptance/pipelines/init/python/output/my_python_project/transformations/sample_zones_my_python_project.py b/acceptance/pipelines/init/python/output/my_python_project/src/my_python_project_etl/transformations/sample_zones_my_python_project.py similarity index 90% rename from acceptance/pipelines/init/python/output/my_python_project/transformations/sample_zones_my_python_project.py rename to acceptance/pipelines/init/python/output/my_python_project/src/my_python_project_etl/transformations/sample_zones_my_python_project.py index 20fcd9645e..5ccd8e45ac 100644 --- a/acceptance/pipelines/init/python/output/my_python_project/transformations/sample_zones_my_python_project.py +++ b/acceptance/pipelines/init/python/output/my_python_project/src/my_python_project_etl/transformations/sample_zones_my_python_project.py @@ -1,4 +1,4 @@ -import dlt +from pyspark import pipelines as dp from pyspark.sql.functions import col, sum @@ -7,7 +7,7 @@ # using "+ Add" in the file browser. 
-@dlt.table +@dp.table def sample_zones_my_python_project(): # Read from the "sample_trips" table, then sum all the fares return ( diff --git a/acceptance/pipelines/init/python/output/my_python_project/utilities/utils.py b/acceptance/pipelines/init/python/output/my_python_project/utilities/utils.py deleted file mode 100644 index ff039898f0..0000000000 --- a/acceptance/pipelines/init/python/output/my_python_project/utilities/utils.py +++ /dev/null @@ -1,8 +0,0 @@ -from pyspark.sql.functions import udf -from pyspark.sql.types import FloatType - - -@udf(returnType=FloatType()) -def distance_km(distance_miles): - """Convert distance from miles to kilometers (1 mile = 1.60934 km).""" - return distance_miles * 1.60934 diff --git a/acceptance/pipelines/init/sql/input.json b/acceptance/pipelines/init/sql/input.json index 184f6147ea..a44b082d85 100644 --- a/acceptance/pipelines/init/sql/input.json +++ b/acceptance/pipelines/init/sql/input.json @@ -1,7 +1,6 @@ { "project_name": "my_sql_project", "default_catalog": "main", - "personal_schemas": "no", - "shared_schema": "shared", + "personal_schemas": "yes", "language": "sql" } diff --git a/acceptance/pipelines/init/sql/output.txt b/acceptance/pipelines/init/sql/output.txt index 4dc00c937b..19fd12bb24 100644 --- a/acceptance/pipelines/init/sql/output.txt +++ b/acceptance/pipelines/init/sql/output.txt @@ -1,10 +1,14 @@ === Test pipelines init with SQL configuration >>> [PIPELINES] init --config-file ./input.json --output-dir output +Welcome to the template for Lakeflow Declarative Pipelines! -Welcome to the template for pipelines! +Please answer the below to tailor your project to your preferences. +You can always change your mind and change your configuration in the databricks.yml file later. +Note that [DATABRICKS_URL] is used for initialization +(see https://docs.databricks.com/dev-tools/cli/profiles.html for how to change your profile). -Your new project has been created in the 'my_sql_project' directory! +✨ Your new project has been created in the 'my_sql_project' directory! -Refer to the README.md file for "getting started" instructions! +Please refer to the README.md file for "getting started" instructions. diff --git a/acceptance/pipelines/init/sql/output/my_sql_project/.vscode/extensions.json b/acceptance/pipelines/init/sql/output/my_sql_project/.vscode/extensions.json index 5d15eba363..5ba48e79c9 100644 --- a/acceptance/pipelines/init/sql/output/my_sql_project/.vscode/extensions.json +++ b/acceptance/pipelines/init/sql/output/my_sql_project/.vscode/extensions.json @@ -1,7 +1,7 @@ { "recommendations": [ "databricks.databricks", - "ms-python.vscode-pylance", - "redhat.vscode-yaml" + "redhat.vscode-yaml", + "charliermarsh.ruff" ] } diff --git a/acceptance/pipelines/init/sql/output/my_sql_project/.vscode/settings.json b/acceptance/pipelines/init/sql/output/my_sql_project/.vscode/settings.json index f38f52b03f..d73c73b570 100644 --- a/acceptance/pipelines/init/sql/output/my_sql_project/.vscode/settings.json +++ b/acceptance/pipelines/init/sql/output/my_sql_project/.vscode/settings.json @@ -1,21 +1,39 @@ { - "python.analysis.stubPath": ".vscode", - "databricks.python.envFile": "${workspaceFolder}/.env", "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------", "python.testing.pytestArgs": [ "." 
], - "python.testing.unittestEnabled": false, - "python.testing.pytestEnabled": true, - "python.analysis.extraPaths": ["resources/my_sql_project_pipeline"], "files.exclude": { "**/*.egg-info": true, "**/__pycache__": true, ".pytest_cache": true, + "dist": true, + }, + "files.associations": { + "**/.gitkeep": "markdown" }, + + // Pylance settings (VS Code) + // Set typeCheckingMode to "basic" to enable type checking! + "python.analysis.typeCheckingMode": "off", + "python.analysis.extraPaths": ["src", "lib", "resources"], + "python.analysis.diagnosticMode": "workspace", + "python.analysis.stubPath": ".vscode", + + // Pyright settings (Cursor) + // Set typeCheckingMode to "basic" to enable type checking! + "cursorpyright.analysis.typeCheckingMode": "off", + "cursorpyright.analysis.extraPaths": ["src", "lib", "resources"], + "cursorpyright.analysis.diagnosticMode": "workspace", + "cursorpyright.analysis.stubPath": ".vscode", + + // General Python settings + "python.defaultInterpreterPath": "./.venv/bin/python", + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true, "[python]": { - "editor.defaultFormatter": "ms-python.black-formatter", + "editor.defaultFormatter": "charliermarsh.ruff", "editor.formatOnSave": true, }, } diff --git a/acceptance/pipelines/init/sql/output/my_sql_project/README.md b/acceptance/pipelines/init/sql/output/my_sql_project/README.md index fa7a8d3307..962dacff9d 100644 --- a/acceptance/pipelines/init/sql/output/my_sql_project/README.md +++ b/acceptance/pipelines/init/sql/output/my_sql_project/README.md @@ -1,62 +1,54 @@ # my_sql_project -The 'my_sql_project' project was generated by using the CLI Pipelines template. +The 'my_sql_project' project was generated by using the lakeflow-pipelines template. -## Setup +* `src/`: SQL source code for this project. +* `resources/`: Resource configurations (jobs, pipelines, etc.) -1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html +## Getting started -2. Install the Pipelines CLI: - ``` - $ databricks install-pipelines-cli - ``` - -3. Authenticate to your Databricks workspace, if you have not done so already: - ``` - $ pipelines auth login - ``` +Choose how you want to work on this project: -4. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from - https://docs.databricks.com/dev-tools/vscode-ext.html. Or the PyCharm plugin from - https://www.databricks.com/blog/announcing-pycharm-integration-databricks. +(a) Directly in your Databricks workspace, see + https://docs.databricks.com/dev-tools/bundles/workspace. -## Pipeline Structure +(b) Locally with an IDE like Cursor or VS Code, see + https://docs.databricks.com/dev-tools/vscode-ext.html. -This folder defines all source code for the my_sql_project_pipeline pipeline: +(c) With command line tools, see https://docs.databricks.com/dev-tools/cli/databricks-cli.html -- `explorations`: Ad-hoc notebooks used to explore the data processed by this pipeline. -- `transformations`: All dataset definitions and transformations. +# Using this project using the CLI -## Getting Started +The Databricks workspace and IDE extensions provide a graphical interface for working +with this project. It's also possible to interact with it directly using the CLI: -To get started, go to the `transformations` folder -- most of the relevant source code lives there: - -* By convention, every dataset under `transformations` is in a separate file. 
-* Take a look at the sample under "sample_trips_my_sql_project.sql" to get familiar with the syntax. - Read more about the syntax at https://docs.databricks.com/dlt/sql-ref.html. - -For more tutorials and reference material, see https://docs.databricks.com/dlt. - -## Deploying pipelines +1. Authenticate to your Databricks workspace, if you have not done so already: + ``` + $ databricks configure + ``` -1. To deploy a development copy of this project, type: +2. To deploy a development copy of this project, type: ``` - $ pipelines deploy --target dev + $ databricks bundle deploy --target dev ``` (Note that "dev" is the default target, so the `--target` parameter is optional here.) This deploys everything that's defined for this project. For example, the default template would deploy a pipeline called - `[dev yourname] my_sql_project_pipeline` to your workspace. - You can find that pipeline by opening your workpace and clicking on **Jobs & Pipelines**. + `[dev yourname] my_sql_project_etl` to your workspace. + You can find that resource by opening your workspace and clicking on **Jobs & Pipelines**. -2. Similarly, to deploy a production copy, type: +3. Similarly, to deploy a production copy, type: ``` - $ pipelines deploy --target prod + $ databricks bundle deploy --target prod ``` + Note the default template includes a job that runs the pipeline every day + (defined in resources/sample_job.job.yml). The schedule + is paused when deploying in development mode (see + https://docs.databricks.com/dev-tools/bundles/deployment-modes.html). -3. To run a pipeline, use the "run" command: +4. To run a job or pipeline, use the "run" command: ``` - $ pipelines run + $ databricks bundle run ``` diff --git a/acceptance/pipelines/init/sql/output/my_sql_project/databricks.yml b/acceptance/pipelines/init/sql/output/my_sql_project/databricks.yml index fc415f32d4..f986d618b8 100644 --- a/acceptance/pipelines/init/sql/output/my_sql_project/databricks.yml +++ b/acceptance/pipelines/init/sql/output/my_sql_project/databricks.yml @@ -1,4 +1,4 @@ -# This is a Databricks pipelines definition for my_sql_project. +# This is a Databricks asset bundle definition for my_sql_project. # See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. bundle: name: my_sql_project @@ -7,7 +7,6 @@ bundle: include: - resources/*.yml - resources/*/*.yml - - ./*.yml # Variable declarations. These variables are assigned in the dev/prod targets below. variables: @@ -15,32 +14,29 @@ variables: description: The catalog to use schema: description: The schema to use - notifications: - description: The email addresses to use for failure notifications targets: dev: # The default target uses 'mode: development' to create a development copy. - # - Deployed pipelines get prefixed with '[dev my_user_name]' + # - Deployed resources get prefixed with '[dev my_user_name]' + # - Any job schedules and triggers are paused by default. + # See also https://docs.databricks.com/dev-tools/bundles/deployment-modes.html. mode: development default: true workspace: host: [DATABRICKS_URL] variables: catalog: main - schema: shared_dev - notifications: [] - + schema: ${workspace.current_user.short_name} prod: mode: production workspace: host: [DATABRICKS_URL] # We explicitly deploy to /Workspace/Users/[USERNAME] to make sure we only have a single copy.
root_path: /Workspace/Users/[USERNAME]/.bundle/${bundle.name}/${bundle.target} + variables: + catalog: main + schema: prod permissions: - user_name: [USERNAME] level: CAN_MANAGE - variables: - catalog: main - schema: shared - notifications: [[USERNAME]] diff --git a/acceptance/pipelines/init/sql/output/my_sql_project/explorations/sample_exploration.ipynb b/acceptance/pipelines/init/sql/output/my_sql_project/explorations/sample_exploration.ipynb deleted file mode 100644 index deee8395ea..0000000000 --- a/acceptance/pipelines/init/sql/output/my_sql_project/explorations/sample_exploration.ipynb +++ /dev/null @@ -1,64 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "[UUID]", - "showTitle": false, - "tableResultSettingsMap": {}, - "title": "" - } - }, - "source": [ - "### Example Exploratory Notebook\n", - "\n", - "Use this notebook to explore the data generated by the pipeline in your preferred programming language.\n", - "\n", - "**Note**: This notebook is not executed as part of the pipeline." - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "[UUID]", - "showTitle": false, - "tableResultSettingsMap": {}, - "title": "" - } - }, - "outputs": [], - "source": [ - "-- !!! Before performing any data analysis, make sure to run the pipeline to materialize the sample datasets. The tables referenced in this notebook depend on that step.\n", - "\n", - "USE CATALOG `main`;\n", - "USE SCHEMA `shared_dev`;\n", - "\n", - "SELECT * from my_sql_project;" - ] - } - ], - "metadata": { - "application/vnd.databricks.v1+notebook": { - "computePreferences": null, - "dashboards": [], - "environmentMetadata": null, - "inputWidgetPreferences": null, - "language": "sql", - "notebookMetadata": {}, - "notebookName": "sample_exploration", - "widgets": {} - }, - "language_info": { - "name": "sql" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/acceptance/pipelines/init/sql/output/my_sql_project/my_sql_project.pipeline.yml b/acceptance/pipelines/init/sql/output/my_sql_project/my_sql_project.pipeline.yml deleted file mode 100644 index b4ccf7914d..0000000000 --- a/acceptance/pipelines/init/sql/output/my_sql_project/my_sql_project.pipeline.yml +++ /dev/null @@ -1,11 +0,0 @@ -resources: - pipelines: - my_sql_project_pipeline: - name: my_sql_project_pipeline - serverless: true - catalog: ${var.catalog} - schema: ${var.schema} - root_path: "." 
- libraries: - - glob: - include: transformations/** diff --git a/acceptance/pipelines/init/sql/output/my_sql_project/out.gitignore b/acceptance/pipelines/init/sql/output/my_sql_project/out.gitignore index f6a3b5ff93..e566c51f74 100644 --- a/acceptance/pipelines/init/sql/output/my_sql_project/out.gitignore +++ b/acceptance/pipelines/init/sql/output/my_sql_project/out.gitignore @@ -4,5 +4,7 @@ dist/ __pycache__/ *.egg-info .venv/ +scratch/** +!scratch/README.md **/explorations/** **/!explorations/README.md diff --git a/acceptance/pipelines/init/sql/output/my_sql_project/resources/my_sql_project_etl.pipeline.yml b/acceptance/pipelines/init/sql/output/my_sql_project/resources/my_sql_project_etl.pipeline.yml new file mode 100644 index 0000000000..e892df8da7 --- /dev/null +++ b/acceptance/pipelines/init/sql/output/my_sql_project/resources/my_sql_project_etl.pipeline.yml @@ -0,0 +1,20 @@ +# The main pipeline for my_sql_project + +resources: + pipelines: + my_sql_project_etl: + name: my_sql_project_etl + catalog: ${var.catalog} + schema: ${var.schema} + serverless: true + root_path: "../src/my_sql_project_etl" + + libraries: + - glob: + include: ../src/my_sql_project_etl/transformations/** + + environment: + dependencies: + # We include every dependency defined by pyproject.toml by defining an editable environment + # that points to the folder where pyproject.toml is deployed. + - --editable ${workspace.file_path} diff --git a/acceptance/pipelines/init/sql/output/my_sql_project/resources/sample_job.job.yml b/acceptance/pipelines/init/sql/output/my_sql_project/resources/sample_job.job.yml new file mode 100644 index 0000000000..1e1767f2f3 --- /dev/null +++ b/acceptance/pipelines/init/sql/output/my_sql_project/resources/sample_job.job.yml @@ -0,0 +1,32 @@ +# A sample job for my_sql_project. + +resources: + jobs: + sample_job: + name: sample_job + + trigger: + # Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger + periodic: + interval: 1 + unit: DAYS + + #email_notifications: + # on_failure: + # - your_email@example.com + + parameters: + - name: catalog + default: ${var.catalog} + - name: schema + default: ${var.schema} + + tasks: + - task_key: refresh_pipeline + pipeline_task: + pipeline_id: ${resources.pipelines.my_sql_project_etl.id} + + environments: + - environment_key: default + spec: + environment_version: "4" diff --git a/acceptance/pipelines/init/sql/output/my_sql_project/src/my_sql_project_etl/README.md b/acceptance/pipelines/init/sql/output/my_sql_project/src/my_sql_project_etl/README.md new file mode 100644 index 0000000000..6df3989072 --- /dev/null +++ b/acceptance/pipelines/init/sql/output/my_sql_project/src/my_sql_project_etl/README.md @@ -0,0 +1,20 @@ +# my_sql_project + +This folder defines all source code for the my_sql_project pipeline: + +- `explorations/`: Ad-hoc notebooks used to explore the data processed by this pipeline. +- `transformations/`: All dataset definitions and transformations. +- `utilities/` (optional): Utility functions and Python modules used in this pipeline. +- `data_sources/` (optional): View definitions describing the source data for this pipeline. + +## Getting Started + +To get started, go to the `transformations` folder -- most of the relevant source code lives there: + +* By convention, every dataset under `transformations` is in a separate file. +* Take a look at the sample called "sample_trips_my_sql_project.py" to get familiar with the syntax. 
+ Read more about the syntax at https://docs.databricks.com/dlt/python-ref.html. +* If you're using the workspace UI, use `Run file` to run and preview a single transformation. +* If you're using the CLI, use `databricks bundle run my_sql_project_etl --select sample_trips_my_sql_project` to run a single transformation. + +For more tutorials and reference material, see https://docs.databricks.com/dlt. diff --git a/acceptance/pipelines/init/python/output/my_python_project/explorations/sample_exploration.ipynb b/acceptance/pipelines/init/sql/output/my_sql_project/src/my_sql_project_etl/explorations/sample_exploration.ipynb similarity index 94% rename from acceptance/pipelines/init/python/output/my_python_project/explorations/sample_exploration.ipynb rename to acceptance/pipelines/init/sql/output/my_sql_project/src/my_sql_project_etl/explorations/sample_exploration.ipynb index dd456692cc..6c0c2f1f4e 100644 --- a/acceptance/pipelines/init/python/output/my_python_project/explorations/sample_exploration.ipynb +++ b/acceptance/pipelines/init/sql/output/my_sql_project/src/my_sql_project_etl/explorations/sample_exploration.ipynb @@ -37,7 +37,7 @@ "source": [ "# !!! Before performing any data analysis, make sure to run the pipeline to materialize the sample datasets. The tables referenced in this notebook depend on that step.\n", "\n", - "display(spark.sql(\"SELECT * FROM main.[USERNAME].my_python_project\"))" + "display(spark.sql(\"SELECT * FROM main.[USERNAME].sample_trips_my_sql_project\"))" ] } ], diff --git a/acceptance/pipelines/init/sql/output/my_sql_project/transformations/sample_trips_my_sql_project.sql b/acceptance/pipelines/init/sql/output/my_sql_project/src/my_sql_project_etl/transformations/sample_trips_my_sql_project.sql similarity index 87% rename from acceptance/pipelines/init/sql/output/my_sql_project/transformations/sample_trips_my_sql_project.sql rename to acceptance/pipelines/init/sql/output/my_sql_project/src/my_sql_project_etl/transformations/sample_trips_my_sql_project.sql index 41971fa014..083cf14242 100644 --- a/acceptance/pipelines/init/sql/output/my_sql_project/transformations/sample_trips_my_sql_project.sql +++ b/acceptance/pipelines/init/sql/output/my_sql_project/src/my_sql_project_etl/transformations/sample_trips_my_sql_project.sql @@ -5,5 +5,6 @@ CREATE MATERIALIZED VIEW sample_trips_my_sql_project AS SELECT pickup_zip, - fare_amount + fare_amount, + trip_distance FROM samples.nyctaxi.trips diff --git a/acceptance/pipelines/init/sql/output/my_sql_project/transformations/sample_zones_my_sql_project.sql b/acceptance/pipelines/init/sql/output/my_sql_project/src/my_sql_project_etl/transformations/sample_zones_my_sql_project.sql similarity index 100% rename from acceptance/pipelines/init/sql/output/my_sql_project/transformations/sample_zones_my_sql_project.sql rename to acceptance/pipelines/init/sql/output/my_sql_project/src/my_sql_project_etl/transformations/sample_zones_my_sql_project.sql diff --git a/cmd/pipelines/init.go b/cmd/pipelines/init.go index e847b3fc22..0f91e665b0 100644 --- a/cmd/pipelines/init.go +++ b/cmd/pipelines/init.go @@ -22,7 +22,7 @@ func initCommand() *cobra.Command { ctx := cmd.Context() r := template.Resolver{ - TemplatePathOrUrl: "cli-pipelines", + TemplatePathOrUrl: string(template.LakeflowPipelines), ConfigFile: configFile, OutputDir: outputDir, } diff --git a/libs/template/template.go b/libs/template/template.go index 83a86e9cbb..aaea2b2b9b 100644 --- a/libs/template/template.go +++ b/libs/template/template.go @@ -30,13 +30,14 @@ const ( 
ExperimentalDefaultPython TemplateName = "experimental-default-python-vnext" DefaultSql TemplateName = "default-sql" LakeflowPipelines TemplateName = "lakeflow-pipelines" - CLIPipelines TemplateName = "cli-pipelines" - DbtSql TemplateName = "dbt-sql" - MlopsStacks TemplateName = "mlops-stacks" - Pydabs TemplateName = "pydabs" - Custom TemplateName = "custom" - ExperimentalJobsAsCode TemplateName = "experimental-jobs-as-code" - Default TemplateName = "default" + // CLIPipelines is deprecated. Use LakeflowPipelines instead + CLIPipelines TemplateName = "cli-pipelines" + DbtSql TemplateName = "dbt-sql" + MlopsStacks TemplateName = "mlops-stacks" + Pydabs TemplateName = "pydabs" + Custom TemplateName = "custom" + ExperimentalJobsAsCode TemplateName = "experimental-jobs-as-code" + Default TemplateName = "default" ) var databricksTemplates = []Template{ @@ -75,7 +76,7 @@ var databricksTemplates = []Template{ name: CLIPipelines, hidden: true, description: "The default template for CLI pipelines", - Reader: &builtinReader{name: string(CLIPipelines)}, + Reader: &builtinReader{name: string(LakeflowPipelines)}, Writer: &writerWithFullTelemetry{defaultWriter: defaultWriter{name: CLIPipelines}}, }, { diff --git a/libs/template/templates/cli-pipelines/README.md b/libs/template/templates/cli-pipelines/README.md deleted file mode 100644 index 12dc8a7a42..0000000000 --- a/libs/template/templates/cli-pipelines/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# CLI Pipelines - -Default template for CLI Pipelines diff --git a/libs/template/templates/cli-pipelines/databricks_template_schema.json b/libs/template/templates/cli-pipelines/databricks_template_schema.json deleted file mode 100644 index ce617cc9dd..0000000000 --- a/libs/template/templates/cli-pipelines/databricks_template_schema.json +++ /dev/null @@ -1,57 +0,0 @@ -{ - "welcome_message": "\nWelcome to the template for pipelines!", - "properties": { - "project_name": { - "type": "string", - "default": "my_project", - "description": "Please provide the following details to tailor the template to your preferences.\n\nUnique name for this project\nproject_name", - "order": 1, - "pattern": "^[a-z0-9_]+$", - "pattern_match_failure_message": "Name must consist of lower case letters, numbers, and underscores." - }, - "default_catalog": { - "type": "string", - "default": "{{default_catalog}}", - "pattern": "^\\w*$", - "pattern_match_failure_message": "Invalid catalog name.", - "description": "\nInitial catalog:\ndefault_catalog", - "order": 3 - }, - "personal_schemas": { - "type": "string", - "description": "\nUse a personal schema for each user working on this project? (e.g., 'catalog.{{short_name}}')\npersonal_schemas", - "default": "yes", - "enum": [ - "yes", - "no" - ], - "order": 4 - }, - "shared_schema": { - "skip_prompt_if": { - "properties": { - "personal_schemas": { - "const": "yes" - } - } - }, - "type": "string", - "default": "default", - "pattern": "^\\w+$", - "pattern_match_failure_message": "Invalid schema name.", - "description": "\nInitial schema during development:\nNote: This schema name will be suffixed with '_dev' when deployed to target the development environment.\ndefault_schema", - "order": 5 - }, - "language": { - "type": "string", - "default": "python", - "description": "\nInitial language for this project:\nlanguage", - "enum": [ - "python", - "sql" - ], - "order": 6 - } - }, - "success_message": "\n\nYour new project has been created in the '{{.project_name}}' directory!\n\nRefer to the README.md file for \"getting started\" instructions!" 
-} diff --git a/libs/template/templates/cli-pipelines/library/variables.tmpl b/libs/template/templates/cli-pipelines/library/variables.tmpl deleted file mode 100644 index fb0e6f8922..0000000000 --- a/libs/template/templates/cli-pipelines/library/variables.tmpl +++ /dev/null @@ -1,33 +0,0 @@ -{{- define `pipeline_name` -}} - {{ .project_name }}_pipeline -{{- end }} - -{{- define `job_name` -}} - {{ .project_name }}_job -{{- end }} - -{{- define `static_dev_schema` -}} - {{- if (regexp "^yes").MatchString .personal_schemas -}} - {{ short_name }} - {{- else -}} - {{ .shared_schema }}_dev - {{- end}} -{{- end }} - - -{{- define `dev_schema` -}} - {{- if (regexp "^yes").MatchString .personal_schemas -}} - ${workspace.current_user.short_name} - {{- else -}} - {{ .shared_schema }}_dev - {{- end}} -{{- end }} - - -{{- define `prod_schema` -}} - {{- if (regexp "^yes").MatchString .personal_schemas -}} - default - {{- else -}} - {{ .shared_schema }} - {{- end}} -{{- end }} diff --git a/libs/template/templates/cli-pipelines/template/__preamble.tmpl b/libs/template/templates/cli-pipelines/template/__preamble.tmpl deleted file mode 100644 index f116c0b44e..0000000000 --- a/libs/template/templates/cli-pipelines/template/__preamble.tmpl +++ /dev/null @@ -1,16 +0,0 @@ -# Preamble - -This file only contains template directives; it is skipped for the actual output. - -{{skip "__preamble"}} - -{{$isSQL := eq .language "sql"}} - -{{if $isSQL}} - {{skip "{{.project_name}}/utilities/utils.py"}} - {{skip "{{.project_name}}/transformations/sample_zones_{{.project_name}}.py"}} - {{skip "{{.project_name}}/transformations/sample_trips_{{.project_name}}.py"}} -{{else}} - {{skip "{{.project_name}}/transformations/sample_zones_{{.project_name}}.sql"}} - {{skip "{{.project_name}}/transformations/sample_trips_{{.project_name}}.sql"}} -{{end}} diff --git a/libs/template/templates/cli-pipelines/template/{{.project_name}}/.gitignore.tmpl b/libs/template/templates/cli-pipelines/template/{{.project_name}}/.gitignore.tmpl deleted file mode 100644 index f6a3b5ff93..0000000000 --- a/libs/template/templates/cli-pipelines/template/{{.project_name}}/.gitignore.tmpl +++ /dev/null @@ -1,8 +0,0 @@ -.databricks/ -build/ -dist/ -__pycache__/ -*.egg-info -.venv/ -**/explorations/** -**/!explorations/README.md diff --git a/libs/template/templates/cli-pipelines/template/{{.project_name}}/.vscode/__builtins__.pyi b/libs/template/templates/cli-pipelines/template/{{.project_name}}/.vscode/__builtins__.pyi deleted file mode 100644 index 0edd5181bc..0000000000 --- a/libs/template/templates/cli-pipelines/template/{{.project_name}}/.vscode/__builtins__.pyi +++ /dev/null @@ -1,3 +0,0 @@ -# Typings for Pylance in Visual Studio Code -# see https://github.com/microsoft/pyright/blob/main/docs/builtins.md -from databricks.sdk.runtime import * diff --git a/libs/template/templates/cli-pipelines/template/{{.project_name}}/.vscode/extensions.json b/libs/template/templates/cli-pipelines/template/{{.project_name}}/.vscode/extensions.json deleted file mode 100644 index 5d15eba363..0000000000 --- a/libs/template/templates/cli-pipelines/template/{{.project_name}}/.vscode/extensions.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "recommendations": [ - "databricks.databricks", - "ms-python.vscode-pylance", - "redhat.vscode-yaml" - ] -} diff --git a/libs/template/templates/cli-pipelines/template/{{.project_name}}/.vscode/settings.json.tmpl b/libs/template/templates/cli-pipelines/template/{{.project_name}}/.vscode/settings.json.tmpl deleted file mode 100644 index 
6a87715ae2..0000000000 --- a/libs/template/templates/cli-pipelines/template/{{.project_name}}/.vscode/settings.json.tmpl +++ /dev/null @@ -1,22 +0,0 @@ -{ - "python.analysis.stubPath": ".vscode", - "databricks.python.envFile": "${workspaceFolder}/.env", - "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", - "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------", - "python.testing.pytestArgs": [ - "." - ], - "python.testing.unittestEnabled": false, - "python.testing.pytestEnabled": true, - {{- /* Unfortunately extraPaths doesn't support globs!! See: https://github.com/microsoft/pylance-release/issues/973 */}} - "python.analysis.extraPaths": ["resources/{{.project_name}}_pipeline"], - "files.exclude": { - "**/*.egg-info": true, - "**/__pycache__": true, - ".pytest_cache": true, - }, - "[python]": { - "editor.defaultFormatter": "ms-python.black-formatter", - "editor.formatOnSave": true, - }, -} diff --git a/libs/template/templates/cli-pipelines/template/{{.project_name}}/README.md.tmpl b/libs/template/templates/cli-pipelines/template/{{.project_name}}/README.md.tmpl deleted file mode 100644 index fc8544cc79..0000000000 --- a/libs/template/templates/cli-pipelines/template/{{.project_name}}/README.md.tmpl +++ /dev/null @@ -1,74 +0,0 @@ -# {{.project_name}} - -The '{{.project_name}}' project was generated by using the CLI Pipelines template. - -## Setup - -1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html - -2. Install the Pipelines CLI: - ``` - $ databricks install-pipelines-cli - ``` - -3. Authenticate to your Databricks workspace, if you have not done so already: - ``` - $ pipelines auth login - ``` - -4. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from - https://docs.databricks.com/dev-tools/vscode-ext.html. Or the PyCharm plugin from - https://www.databricks.com/blog/announcing-pycharm-integration-databricks. - -## Pipeline Structure - -This folder defines all source code for the {{template `pipeline_name` .}} pipeline: - -{{ if (eq .language "python") -}} -- `explorations`: Ad-hoc notebooks used to explore the data processed by this pipeline. -- `transformations`: All dataset definitions and transformations. -- `utilities` (optional): Utility functions and Python modules used in this pipeline. -{{- else -}} -- `explorations`: Ad-hoc notebooks used to explore the data processed by this pipeline. -- `transformations`: All dataset definitions and transformations. -{{- end }} - -## Getting Started - -To get started, go to the `transformations` folder -- most of the relevant source code lives there: - -{{ if (eq .language "python") -}} -* By convention, every dataset under `transformations` is in a separate file. -* Take a look at the sample under "sample_trips_{{ .project_name }}.py" to get familiar with the syntax. - Read more about the syntax at https://docs.databricks.com/dlt/python-ref.html. -{{- else -}} -* By convention, every dataset under `transformations` is in a separate file. -* Take a look at the sample under "sample_trips_{{ .project_name }}.sql" to get familiar with the syntax. - Read more about the syntax at https://docs.databricks.com/dlt/sql-ref.html. -{{- end }} - -For more tutorials and reference material, see https://docs.databricks.com/dlt. - -## Deploying pipelines - -1. 
To deploy a development copy of this project, type: - ``` - $ pipelines deploy --target dev - ``` - (Note that "dev" is the default target, so the `--target` parameter - is optional here.) - - This deploys everything that's defined for this project. - For example, the default template would deploy a pipeline called - `[dev yourname] {{.project_name}}_pipeline` to your workspace. - You can find that pipeline by opening your workpace and clicking on **Jobs & Pipelines**. - -2. Similarly, to deploy a production copy, type: - ``` - $ pipelines deploy --target prod - ``` - -3. To run a pipeline, use the "run" command: - ``` - $ pipelines run - ``` diff --git a/libs/template/templates/cli-pipelines/template/{{.project_name}}/databricks.yml.tmpl b/libs/template/templates/cli-pipelines/template/{{.project_name}}/databricks.yml.tmpl deleted file mode 100644 index ffcc6ba7b1..0000000000 --- a/libs/template/templates/cli-pipelines/template/{{.project_name}}/databricks.yml.tmpl +++ /dev/null @@ -1,46 +0,0 @@ -# This is a Databricks pipelines definition for {{.project_name}}. -# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. -bundle: - name: {{.project_name}} - uuid: {{bundle_uuid}} - -include: - - resources/*.yml - - resources/*/*.yml - - ./*.yml - -# Variable declarations. These variables are assigned in the dev/prod targets below. -variables: - catalog: - description: The catalog to use - schema: - description: The schema to use - notifications: - description: The email addresses to use for failure notifications - -targets: - dev: - # The default target uses 'mode: development' to create a development copy. - # - Deployed pipelines get prefixed with '[dev my_user_name]' - mode: development - default: true - workspace: - host: {{workspace_host}} - variables: - catalog: {{.default_catalog}} - schema: {{template `dev_schema` .}} - notifications: [] - - prod: - mode: production - workspace: - host: {{workspace_host}} - # We explicitly deploy to /Workspace/Users/{{user_name}} to make sure we only have a single copy. - root_path: /Workspace/Users/{{user_name}}/.bundle/${bundle.name}/${bundle.target} - permissions: - - {{if is_service_principal}}service_principal{{else}}user{{end}}_name: {{user_name}} - level: CAN_MANAGE - variables: - catalog: {{.default_catalog}} - schema: {{template `prod_schema` .}} - notifications: [{{user_name}}] diff --git a/libs/template/templates/cli-pipelines/template/{{.project_name}}/explorations/sample_exploration.ipynb.tmpl b/libs/template/templates/cli-pipelines/template/{{.project_name}}/explorations/sample_exploration.ipynb.tmpl deleted file mode 100644 index 967e663fae..0000000000 --- a/libs/template/templates/cli-pipelines/template/{{.project_name}}/explorations/sample_exploration.ipynb.tmpl +++ /dev/null @@ -1,130 +0,0 @@ -{{- if (eq .language "python") -}} -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "19a992e9-55e0-49e4-abc7-8c92c420dd5b", - "showTitle": false, - "tableResultSettingsMap": {}, - "title": "" - } - }, - "source": [ - "### Example Exploratory Notebook\n", - "\n", - "Use this notebook to explore the data generated by the pipeline in your preferred programming language.\n", - "\n", - "**Note**: This notebook is not executed as part of the pipeline." 
- ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "1b0a82fa-3c6a-4f29-bb43-ded1c4fd77c6", - "showTitle": false, - "tableResultSettingsMap": {}, - "title": "" - } - }, - "outputs": [], - "source": [ - "# !!! Before performing any data analysis, make sure to run the pipeline to materialize the sample datasets. The tables referenced in this notebook depend on that step.\n", - "\n", - "display(spark.sql(\"SELECT * FROM {{ .default_catalog}}.{{template `static_dev_schema` .}}.{{ .project_name }}\"))" - ] - } - ], - "metadata": { - "application/vnd.databricks.v1+notebook": { - "computePreferences": null, - "dashboards": [], - "environmentMetadata": null, - "inputWidgetPreferences": null, - "language": "python", - "notebookMetadata": { - "pythonIndentUnit": 2 - }, - "notebookName": "sample_exploration", - "widgets": {} - }, - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} -{{ else -}} -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "3bd3cbb1-1518-4d0a-a8d1-f08da3f8840b", - "showTitle": false, - "tableResultSettingsMap": {}, - "title": "" - } - }, - "source": [ - "### Example Exploratory Notebook\n", - "\n", - "Use this notebook to explore the data generated by the pipeline in your preferred programming language.\n", - "\n", - "**Note**: This notebook is not executed as part of the pipeline." - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "d30a8e05-bf7a-47e1-982e-b37e64cd6d43", - "showTitle": false, - "tableResultSettingsMap": {}, - "title": "" - } - }, - "outputs": [], - "source": [ - "-- !!! Before performing any data analysis, make sure to run the pipeline to materialize the sample datasets. The tables referenced in this notebook depend on that step.\n", - "\n", - "USE CATALOG `{{.default_catalog}}`;\n", - "USE SCHEMA `{{template `static_dev_schema` .}}`;\n", - "\n", - "SELECT * from {{ .project_name }};" - ] - } - ], - "metadata": { - "application/vnd.databricks.v1+notebook": { - "computePreferences": null, - "dashboards": [], - "environmentMetadata": null, - "inputWidgetPreferences": null, - "language": "sql", - "notebookMetadata": {}, - "notebookName": "sample_exploration", - "widgets": {} - }, - "language_info": { - "name": "sql" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} -{{ end -}} diff --git a/libs/template/templates/cli-pipelines/template/{{.project_name}}/transformations/sample_trips_{{.project_name}}.py.tmpl b/libs/template/templates/cli-pipelines/template/{{.project_name}}/transformations/sample_trips_{{.project_name}}.py.tmpl deleted file mode 100644 index f975c6f357..0000000000 --- a/libs/template/templates/cli-pipelines/template/{{.project_name}}/transformations/sample_trips_{{.project_name}}.py.tmpl +++ /dev/null @@ -1,15 +0,0 @@ -import dlt -from pyspark.sql.functions import col -from utilities import utils - - -# This file defines a sample transformation. -# Edit the sample below or add new transformations -# using "+ Add" in the file browser. 
- - -@dlt.table -def sample_trips_{{ .project_name }}(): - return spark.read.table("samples.nyctaxi.trips").withColumn( - "trip_distance_km", utils.distance_km(col("trip_distance")) - ) diff --git a/libs/template/templates/cli-pipelines/template/{{.project_name}}/transformations/sample_trips_{{.project_name}}.sql.tmpl b/libs/template/templates/cli-pipelines/template/{{.project_name}}/transformations/sample_trips_{{.project_name}}.sql.tmpl deleted file mode 100644 index b95a95da4d..0000000000 --- a/libs/template/templates/cli-pipelines/template/{{.project_name}}/transformations/sample_trips_{{.project_name}}.sql.tmpl +++ /dev/null @@ -1,9 +0,0 @@ --- This file defines a sample transformation. --- Edit the sample below or add new transformations --- using "+ Add" in the file browser. - -CREATE MATERIALIZED VIEW sample_trips_{{ .project_name }} AS -SELECT - pickup_zip, - fare_amount -FROM samples.nyctaxi.trips diff --git a/libs/template/templates/cli-pipelines/template/{{.project_name}}/transformations/sample_zones_{{.project_name}}.sql.tmpl b/libs/template/templates/cli-pipelines/template/{{.project_name}}/transformations/sample_zones_{{.project_name}}.sql.tmpl deleted file mode 100644 index ab84f4066a..0000000000 --- a/libs/template/templates/cli-pipelines/template/{{.project_name}}/transformations/sample_zones_{{.project_name}}.sql.tmpl +++ /dev/null @@ -1,10 +0,0 @@ --- This file defines a sample transformation. --- Edit the sample below or add new transformations --- using "+ Add" in the file browser. - -CREATE MATERIALIZED VIEW sample_zones_{{ .project_name }} AS -SELECT - pickup_zip, - SUM(fare_amount) AS total_fare -FROM sample_trips_{{ .project_name }} -GROUP BY pickup_zip diff --git a/libs/template/templates/cli-pipelines/template/{{.project_name}}/utilities/utils.py b/libs/template/templates/cli-pipelines/template/{{.project_name}}/utilities/utils.py deleted file mode 100644 index ff039898f0..0000000000 --- a/libs/template/templates/cli-pipelines/template/{{.project_name}}/utilities/utils.py +++ /dev/null @@ -1,8 +0,0 @@ -from pyspark.sql.functions import udf -from pyspark.sql.types import FloatType - - -@udf(returnType=FloatType()) -def distance_km(distance_miles): - """Convert distance from miles to kilometers (1 mile = 1.60934 km).""" - return distance_miles * 1.60934 diff --git a/libs/template/templates/cli-pipelines/template/{{.project_name}}/{{.project_name}}.pipeline.yml.tmpl b/libs/template/templates/cli-pipelines/template/{{.project_name}}/{{.project_name}}.pipeline.yml.tmpl deleted file mode 100644 index aee44900c5..0000000000 --- a/libs/template/templates/cli-pipelines/template/{{.project_name}}/{{.project_name}}.pipeline.yml.tmpl +++ /dev/null @@ -1,11 +0,0 @@ -resources: - pipelines: - {{template `pipeline_name` .}}: - name: {{template `pipeline_name` .}} - serverless: true - catalog: ${var.catalog} - schema: ${var.schema} - root_path: "." - libraries: - - glob: - include: transformations/**