From 3950e307bc6692f882e348492d1d12c1dc4d769e Mon Sep 17 00:00:00 2001 From: Tobias Raabe Date: Sun, 9 Jun 2024 09:26:20 +0200 Subject: [PATCH 1/5] Redesign the scaling tasks guide. --- docs/source/how_to_guides/bp_scaling_tasks.md | 29 ++++++++++++++----- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/docs/source/how_to_guides/bp_scaling_tasks.md b/docs/source/how_to_guides/bp_scaling_tasks.md index fa7cb5e9b..0c13bf759 100644 --- a/docs/source/how_to_guides/bp_scaling_tasks.md +++ b/docs/source/how_to_guides/bp_scaling_tasks.md @@ -1,14 +1,27 @@ # Scaling tasks -In any bigger project you quickly come to the point where you stack multiple repetitions -of tasks on top of each other. +In many projects, tasks are repeated across multiple dimensions that are stacked on top +of each other. -For example, you have one dataset, four different ways to prepare it, and three -statistical models to analyze the data. The cartesian product of all steps combined -comprises twelve differently fitted models. +For example, take a project that there are four ways to simulate data and there are +three different models that should be fitted on each dataset. -Here you find some tips on how to set up your tasks such that you can easily modify the -cartesian product of steps. +Assuming there is a high-level interface to simulate data, we can loop over the task for +simulating data four times with different arguments. + +Assuming there is a high-level interface to fit models to data, + +Assuming that you can easily switch the model the model fitting can be done in a taskThe +cartesian product of all steps combined comprises twelve differently fitted models. + +This guide shows an approach to organizing your tasks that can be best described as +flattening the loops. + +## The data catalog + +First of all, we need to create a data catalog in a `config.py` in your project. 
+ +The data catalog plays a key role in managing lots of repetitions of tasks because it ## Scalability @@ -18,7 +31,7 @@ different models to each specification. This is the structure of the project. -``` +```text my_project ├───pyproject.toml │ From e143a4d03dc218c1ecf81b63c79f8d2d45f9a8bf Mon Sep 17 00:00:00 2001 From: Tobias Raabe Date: Sun, 9 Jun 2024 09:28:16 +0200 Subject: [PATCH 2/5] Fix. --- docs/source/changes.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/source/changes.md b/docs/source/changes.md index 866452ea1..2a521c5a5 100644 --- a/docs/source/changes.md +++ b/docs/source/changes.md @@ -5,6 +5,10 @@ chronological order. Releases follow [semantic versioning](https://semver.org/) releases are available on [PyPI](https://pypi.org/project/pytask) and [Anaconda.org](https://anaconda.org/conda-forge/pytask). +## 0.5.1 - 2024-xx-xx + +- {pull}`616` redesigns the guide on "Scaling Tasks". + ## 0.5.0 - 2024-05-26 - {pull}`548` fixes the type hints for {meth}`~pytask.Task.execute` and From d34550f89b9b51e549af5ccd8dd7919010ec89b5 Mon Sep 17 00:00:00 2001 From: Tobias Raabe Date: Mon, 24 Jun 2024 15:27:57 +0200 Subject: [PATCH 3/5] Fix. --- docs/source/how_to_guides/bp_scaling_tasks.md | 93 +++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/docs/source/how_to_guides/bp_scaling_tasks.md b/docs/source/how_to_guides/bp_scaling_tasks.md index 0c13bf759..e1a3d2772 100644 --- a/docs/source/how_to_guides/bp_scaling_tasks.md +++ b/docs/source/how_to_guides/bp_scaling_tasks.md @@ -1,5 +1,98 @@ # Scaling tasks +- \[ \] Write about adding another dimension. +- \[ \] Write about adding another level. +- \[ \] Write about executing subsets of tasks. +- \[ \] Write about grouping by one dimension´or aggregating. + +In projects where task inputs and outputs are sufficiently standardized, it is possible +to make extensive use of task repetition. 
+ +A common pattern is to write multiple loops around a task function where each loop +stands for a different dimension. A dimension, for example, represents different +datasets or model specifications to analyze the datasets. + +There is nothing wrong with using nested loops for simpler projects that are clearly +defined in scope. But, often they are just the start of looking at a problem from +different angles and soon you want to add more dimensions. + +Adding another loop in a lot of places in your project is cumbersome and the increased +indentation is visually displeasing. + +It is not the most serious problem, though. More importantly, it becomes cumbersome to +reference dependencies of products and to set unique identifiers for tasks. The latter +is important to execute only subsets of the project. + +How do we solve these problems? Here is a brief explanation of the solution. + +1. Create objects to define every dimension in the project. A dimension can be + characterized by a single value like a {class}`~pathlib.Path`, an + {class}`~enum.Enum`, or a {class}`~typing.NamedTuple` or + {func}`~dataclasses.dataclass` if more fields are needed. + +1. Create an object like a {class}`~typing.NamedTuple` or a + {func}`~dataclasses.dataclass` that has one attribute for each dimension. For lack of + a better name, we will call this unit an experiment. + + The experiment combines the information provided by each dimension to create a unique + identifier for each experiment and the names or paths of dependencies and products + for each task. + +To make the idea more tangible, let us focus on an example. + +## Example + +Let us assume we have a project with multiple datasets and model specifications that +should be fitted to the data. + +The datasets are created by the task from the +{doc}`tutorials <../tutorials/defining_dependencies_products>` parametrized with +different coefficients. 
+ +Below that is the task that fits different models to the datasets using a double loop. + +```python +from pathlib import Path +from pytask import task, Product + + +SRC = Path(__file__).parent +BLD = SRC / "bld" + + + + + +for dat + + + +for data_name in ("a", "b", "c"): + for model_name in ("ols", "logit", "linear_prob"): + + @task + def task_fit_model(path_to_data: Path = SRC / f"{data_name}.pkl") + +``` + +1. The level of indentation is not visually pleasing and does not allow us to + sufficiently use every line in the file. + +1. Whenever we add another dimension to our problem, we need to extend every occurrence + of the nested loops. + +But, these problems are more annoying than truly + +The first and most important problem is that + +The first problem is t + +There are couple of problems that arise in these projects. + +The main problem is that with + +In projects where task inputs and outputs can be standardized and general interface + In many projects, tasks are repeated across multiple dimensions that are stacked on top of each other. From 7175e83e9192bd172907c2c5eca425eda46fdecf Mon Sep 17 00:00:00 2001 From: Tobias Raabe Date: Fri, 12 Jul 2024 16:08:10 +0200 Subject: [PATCH 4/5] Add progress. 
--- ...asks.md => bp_complex_task_repetitions.md} | 28 +++++++++++-------- .../bp_structure_of_task_files.md | 2 +- docs/source/how_to_guides/index.md | 2 +- .../repeating_tasks_with_different_inputs.md | 3 +- .../bp_complex_task_repetitions/example.py | 19 +++++++++++++ 5 files changed, 40 insertions(+), 14 deletions(-) rename docs/source/how_to_guides/{bp_scaling_tasks.md => bp_complex_task_repetitions.md} (88%) create mode 100644 docs_src/how_to_guides/bp_complex_task_repetitions/example.py diff --git a/docs/source/how_to_guides/bp_scaling_tasks.md b/docs/source/how_to_guides/bp_complex_task_repetitions.md similarity index 88% rename from docs/source/how_to_guides/bp_scaling_tasks.md rename to docs/source/how_to_guides/bp_complex_task_repetitions.md index e1a3d2772..6e7e16fff 100644 --- a/docs/source/how_to_guides/bp_scaling_tasks.md +++ b/docs/source/how_to_guides/bp_complex_task_repetitions.md @@ -1,29 +1,35 @@ -# Scaling tasks +# Complex task repetitions - \[ \] Write about adding another dimension. - \[ \] Write about adding another level. - \[ \] Write about executing subsets of tasks. -- \[ \] Write about grouping by one dimension´or aggregating. +- \[ \] Write about grouping by one dimension or aggregating. In projects where task inputs and outputs are sufficiently standardized, it is possible to make extensive use of task repetition. A common pattern is to write multiple loops around a task function where each loop -stands for a different dimension. A dimension, for example, represents different -datasets or model specifications to analyze the datasets. +stands for a different dimension. A dimension might represent different datasets or +model specifications to analyze the datasets like in the following example. + +```{literalinclude} ../../../docs_src/how_to_guides/bp_complex_task_repetitions/example.py +``` There is nothing wrong with using nested loops for simpler projects that are clearly defined in scope. 
But, often they are just the start of looking at a problem from -different angles and soon you want to add more dimensions. +different angles. + +For more complex projects, you are quickly running into a couple of problems. -Adding another loop in a lot of places in your project is cumbersome and the increased -indentation is visually displeasing. +- You need to add the nested loops in a lot of places. +- Every dimension adds another level of indentation which is not aesthetically pleasing. +- Adding another dimension leads to a lot of changes in many places. +- It becomes cumbersome to manage the unique ids of the repeated tasks. -It is not the most serious problem, though. More importantly, it becomes cumbersome to -reference dependencies of products and to set unique identifiers for tasks. The latter -is important to execute only subsets of the project. +The rest of the guide lays out a pattern that -How do we solve these problems? Here is a brief explanation of the solution. +To solve these problems, the pattern laid out in the rest of the article proved to be +helpful. 1. Create objects to define every dimension in the project. A dimension can be characterized by a single value like a {class}`~pathlib.Path`, an diff --git a/docs/source/how_to_guides/bp_structure_of_task_files.md b/docs/source/how_to_guides/bp_structure_of_task_files.md index 857f64792..84e16789f 100644 --- a/docs/source/how_to_guides/bp_structure_of_task_files.md +++ b/docs/source/how_to_guides/bp_structure_of_task_files.md @@ -14,7 +14,7 @@ are looking for orientation or inspiration, here are some tips. module is for. ```{seealso} - The only exception might be for {doc}`repetitions `. + The only exception might be for {doc}`repetitions `. 
``` - The purpose of the task function is to handle IO operations like loading and saving diff --git a/docs/source/how_to_guides/index.md b/docs/source/how_to_guides/index.md index 8f0e9f47e..53068ee0f 100644 --- a/docs/source/how_to_guides/index.md +++ b/docs/source/how_to_guides/index.md @@ -42,5 +42,5 @@ maxdepth: 1 bp_structure_of_a_research_project bp_structure_of_task_files bp_templates_and_projects -bp_scaling_tasks +bp_complex_task_repetitions ``` diff --git a/docs/source/tutorials/repeating_tasks_with_different_inputs.md b/docs/source/tutorials/repeating_tasks_with_different_inputs.md index 750435d65..136152ed0 100644 --- a/docs/source/tutorials/repeating_tasks_with_different_inputs.md +++ b/docs/source/tutorials/repeating_tasks_with_different_inputs.md @@ -291,7 +291,8 @@ for id_, kwargs in ID_TO_KWARGS.items(): def task_create_random_data(i, produces): ... ``` -The {doc}`best-practices guide on parametrizations <../how_to_guides/bp_scaling_tasks>` +The +{doc}`best-practices guide on parametrizations <../how_to_guides/bp_complex_task_repetitions>` goes into even more detail on how to scale parametrizations. ## A warning on globals diff --git a/docs_src/how_to_guides/bp_complex_task_repetitions/example.py b/docs_src/how_to_guides/bp_complex_task_repetitions/example.py new file mode 100644 index 000000000..d0893d7a1 --- /dev/null +++ b/docs_src/how_to_guides/bp_complex_task_repetitions/example.py @@ -0,0 +1,19 @@ +from pathlib import Path +from typing import Annotated + +from pytask import Product +from pytask import task + +SRC = Path(__file__).parent +BLD = SRC / "bld" + + +for data_name in ("a", "b", "c"): + for model_name in ("ols", "logit", "linear_prob"): + + @task + def task_fit_model( + path_to_data: Path = SRC / f"{data_name}.pkl", + path_to_model: Annotated[Path, Product] = BLD + / f"{data_name}-{model_name}.pkl", + ) -> None: ... 
From a828c217c5c50fe2ac2e959f5e386896a5ea4ce0 Mon Sep 17 00:00:00 2001 From: Tobias Raabe Date: Sun, 14 Jul 2024 16:35:22 +0200 Subject: [PATCH 5/5] FIx. --- .../bp_complex_task_repetitions.md | 226 ++++-------------- .../bp_complex_task_repetitions/example.py | 2 +- .../example_improved.py | 14 ++ .../bp_complex_task_repetitions/experiment.py | 37 +++ docs_src/how_to_guides/bp_scaling_tasks_1.py | 20 -- docs_src/how_to_guides/bp_scaling_tasks_2.py | 39 --- docs_src/how_to_guides/bp_scaling_tasks_3.py | 18 -- docs_src/how_to_guides/bp_scaling_tasks_4.py | 36 --- 8 files changed, 100 insertions(+), 292 deletions(-) create mode 100644 docs_src/how_to_guides/bp_complex_task_repetitions/example_improved.py create mode 100644 docs_src/how_to_guides/bp_complex_task_repetitions/experiment.py delete mode 100644 docs_src/how_to_guides/bp_scaling_tasks_1.py delete mode 100644 docs_src/how_to_guides/bp_scaling_tasks_2.py delete mode 100644 docs_src/how_to_guides/bp_scaling_tasks_3.py delete mode 100644 docs_src/how_to_guides/bp_scaling_tasks_4.py diff --git a/docs/source/how_to_guides/bp_complex_task_repetitions.md b/docs/source/how_to_guides/bp_complex_task_repetitions.md index 6e7e16fff..68e44569d 100644 --- a/docs/source/how_to_guides/bp_complex_task_repetitions.md +++ b/docs/source/how_to_guides/bp_complex_task_repetitions.md @@ -1,209 +1,79 @@ # Complex task repetitions -- \[ \] Write about adding another dimension. -- \[ \] Write about adding another level. -- \[ \] Write about executing subsets of tasks. -- \[ \] Write about grouping by one dimension or aggregating. +{doc}`Task repetitions <../tutorials/repeating_tasks_with_different_inputs>` are amazing +if you want to execute lots of tasks while not repeating yourself in code. -In projects where task inputs and outputs are sufficiently standardized, it is possible -to make extensive use of task repetition. 
+But, in any bigger project, repetitions can become hard to maintain because there are +multiple layers or dimensions of repetition. -A common pattern is to write multiple loops around a task function where each loop -stands for a different dimension. A dimension might represent different datasets or -model specifications to analyze the datasets like in the following example. - -```{literalinclude} ../../../docs_src/how_to_guides/bp_complex_task_repetitions/example.py -``` - -There is nothing wrong with using nested loops for simpler projects that are clearly -defined in scope. But, often they are just the start of looking at a problem from -different angles. - -For more complex projects, you are quickly running into a couple of problems. - -- You need to add the nested loops in a lot of places. -- Every dimension adds another level of indentation which is not aesthetically pleasing. -- Adding another dimension leads to a lot of changes in many places. -- It becomes cumbersome to manage the unique ids of the repeated tasks. - -The rest of the guide lays out a pattern that - -To solve these problems, the pattern laid out in the rest of the article proved to be -helpful. - -1. Create objects to define every dimension in the project. A dimension can be - characterized by a single value like a {class}`~pathlib.Path`, an - {class}`~enum.Enum`, or a {class}`~typing.NamedTuple` or - {func}`~dataclasses.dataclass` if more fields are needed. - -1. Create an object like a {class}`~typing.NamedTuple` or a - {func}`~dataclasses.dataclass` that has one attribute for each dimension. For lack of - a better name, we will call this unit an experiment. - - The experiment combines the information provided by each dimension to create a unique - identifier for each experiment and the names or paths of dependencies and products - for each task. - -To make the idea more tangible, let us focus on an example. 
+Here you find some tips on how to set up your project such that adding dimensions and +extending existing dimensions becomes much easier. ## Example -Let us assume we have a project with multiple datasets and model specifications that -should be fitted to the data. - -The datasets are created by the task from the -{doc}`tutorials <../tutorials/defining_dependencies_products>` parametrized with -different coefficients. - -Below that is the task that fits different models to the datasets using a double loop. - -```python -from pathlib import Path -from pytask import task, Product - - -SRC = Path(__file__).parent -BLD = SRC / "bld" - - - - - -for dat - - - -for data_name in ("a", "b", "c"): - for model_name in ("ols", "logit", "linear_prob"): - - @task - def task_fit_model(path_to_data: Path = SRC / f"{data_name}.pkl") +You can write multiple loops around a task function where each loop stands for a +different dimension. A dimension might represent different datasets or model +specifications to analyze the datasets like in the following example. The task arguments +are derived from the dimensions. +```{literalinclude} ../../../docs_src/how_to_guides/bp_complex_task_repetitions/example.py +--- +caption: task_example.py +--- ``` -1. The level of indentation is not visually pleasing and does not allow us to - sufficiently use every line in the file. - -1. Whenever we add another dimension to our problem, we need to extend every occurrence - of the nested loops. - -But, these problems are more annoying than truly - -The first and most important problem is that - -The first problem is t - -There are couple of problems that arise in these projects. - -The main problem is that with - -In projects where task inputs and outputs can be standardized and general interface - -In many projects, tasks are repeated across multiple dimensions that are stacked on top -of each other. 
- -For example, take a project that there are four ways to simulate data and there are -three different models that should be fitted on each dataset. - -Assuming there is a high-level interface to simulate data, we can loop over the task for -simulating data four times with different arguments. - -Assuming there is a high-level interface to fit models to data, - -Assuming that you can easily switch the model the model fitting can be done in a taskThe -cartesian product of all steps combined comprises twelve differently fitted models. +There is nothing wrong with using nested loops for simpler projects. But projects often +grow over time, and then you run into these problems. -This guide shows an approach to organizing your tasks that can be best described as -flattening the loops. +- When you add a new task, you need to duplicate the nested loops in another module. +- When you add a dimension, you need to touch multiple files in your project and add + another loop and level of indentation. -## The data catalog +## Solution -First of all, we need to create a data catalog in a `config.py` in your project. +The main idea for the solution is quickly explained. We will, first, formalize +dimensions into objects and, secondly, combine them in one object such that we only have +to iterate over instances of this object in a single loop. -The data catalog plays a key role in managing lots of repetitions of tasks because it +We will start by defining the dimensions using {class}`~typing.NamedTuple` or +{func}`~dataclasses.dataclass`. -## Scalability +Then, we will define the object that holds both pieces of information together and for +the lack of a better name, we will call it an experiment. -Let us dive right into the aforementioned example. We start with one dataset `data.csv`. -Then, we will create four different specifications of the data and, finally, fit three -different models to each specification. - -This is the structure of the project. 
- -```text -my_project -├───pyproject.toml -│ -├───src -│ └───my_project -│ ├────config.py -│ │ -│ ├───data -│ │ └────data.csv -│ │ -│ ├───data_preparation -│ │ ├────__init__.py -│ │ ├────config.py -│ │ └────task_prepare_data.py -│ │ -│ └───estimation -│ ├────__init__.py -│ ├────config.py -│ └────task_estimate_models.py -│ -├───.pytask -│ └────... -│ -└───bld +```{literalinclude} ../../../docs_src/how_to_guides/bp_complex_task_repetitions/experiment.py +--- +caption: config.py +--- ``` -The folder structure, the main `config.py` which holds `SRC` and `BLD`, and the tasks -follow the same structure advocated throughout the tutorials. +There are some things to be said. -New are the local configuration files in each subfolder of `my_project`, which contain -objects shared across tasks. For example, `config.py` holds the paths to the processed -data and the names of the data sets. - -```{literalinclude} ../../../docs_src/how_to_guides/bp_scaling_tasks_1.py -``` +- The names within each dimension need to be unique so that combining them for + the name of the experiment yields a unique and descriptive id. +- Dimensions might need more attributes than just a name, like paths, or other arguments + for the task. Add them. -The task file `task_prepare_data.py` uses these objects to build the repetitions. +Next, we will use these newly defined data structures and see how our tasks change when +we use them. -```{literalinclude} ../../../docs_src/how_to_guides/bp_scaling_tasks_2.py +```{literalinclude} ../../../docs_src/how_to_guides/bp_complex_task_repetitions/example_improved.py +--- +caption: task_example.py +--- ``` -All arguments for the loop and the {func}`@task ` decorator are built -within a function to keep the logic in one place and the module's namespace clean. +As you see, we replaced the nested loops with a single loop over the experiments. -Ids are used to make the task {ref}`ids ` more descriptive and to simplify their -selection with {ref}`expressions `. Here is an example of the task ids with -an explicit id. 
+## Using the `DataCatalog` -``` -# With id -.../my_project/data_preparation/task_prepare_data.py::task_prepare_data[data_0] -``` +## Adding another dimension -Next, we move to the estimation to see how we can build another repetition on top. +## Adding another level -```{literalinclude} ../../../docs_src/how_to_guides/bp_scaling_tasks_3.py -``` - -In the local configuration, we define `ESTIMATIONS` which combines the information on -data and model. The dictionary's key can be used as a task id whenever the estimation is -involved. It allows triggering all tasks related to one estimation - estimation, -figures, tables - with one command. - -```console -pytask -k linear_probability_data_0 -``` - -And here is the task file. - -```{literalinclude} ../../../docs_src/how_to_guides/bp_scaling_tasks_4.py -``` +## Executing a subset -Replicating this pattern across a project allows a clean way to define repetitions. +## Grouping and aggregating ## Extending repetitions diff --git a/docs_src/how_to_guides/bp_complex_task_repetitions/example.py b/docs_src/how_to_guides/bp_complex_task_repetitions/example.py index d0893d7a1..3e3bf14ef 100644 --- a/docs_src/how_to_guides/bp_complex_task_repetitions/example.py +++ b/docs_src/how_to_guides/bp_complex_task_repetitions/example.py @@ -11,7 +11,7 @@ for data_name in ("a", "b", "c"): for model_name in ("ols", "logit", "linear_prob"): - @task + @task(id=f"{model_name}-{data_name}") def task_fit_model( path_to_data: Path = SRC / f"{data_name}.pkl", path_to_model: Annotated[Path, Product] = BLD diff --git a/docs_src/how_to_guides/bp_complex_task_repetitions/example_improved.py b/docs_src/how_to_guides/bp_complex_task_repetitions/example_improved.py new file mode 100644 index 000000000..741d2c19c --- /dev/null +++ b/docs_src/how_to_guides/bp_complex_task_repetitions/example_improved.py @@ -0,0 +1,14 @@ +from pathlib import Path +from typing import Annotated + +from myproject.config import EXPERIMENTS +from pytask import Product +from 
pytask import task + +for experiment in EXPERIMENTS: + + @task(id=experiment.name) + def task_fit_model( + path_to_data: experiment.dataset.path, + path_to_model: Annotated[Path, Product] = experiment.path, + ) -> None: ... diff --git a/docs_src/how_to_guides/bp_complex_task_repetitions/experiment.py b/docs_src/how_to_guides/bp_complex_task_repetitions/experiment.py new file mode 100644 index 000000000..002c669e5 --- /dev/null +++ b/docs_src/how_to_guides/bp_complex_task_repetitions/experiment.py @@ -0,0 +1,37 @@ +from pathlib import Path +from typing import NamedTuple + +SRC = Path(__file__).parent +BLD = SRC / "bld" + + +class Dataset(NamedTuple): + name: str + + @property + def path(self) -> Path: + return SRC / f"{self.name}.pkl" + + +class Model(NamedTuple): + name: str + + +DATASETS = [Dataset("a"), Dataset("b"), Dataset("c")] +MODELS = [Model("ols"), Model("logit"), Model("linear_prob")] + + +class Experiment(NamedTuple): + dataset: Dataset + model: Model + + @property + def name(self) -> str: + return f"{self.model.name}-{self.dataset.name}" + + @property + def path(self) -> Path: + return BLD / f"{self.name}.pkl" + + +EXPERIMENTS = [Experiment(dataset, model) for dataset in DATASETS for model in MODELS] diff --git a/docs_src/how_to_guides/bp_scaling_tasks_1.py b/docs_src/how_to_guides/bp_scaling_tasks_1.py deleted file mode 100644 index 52d6ea61a..000000000 --- a/docs_src/how_to_guides/bp_scaling_tasks_1.py +++ /dev/null @@ -1,20 +0,0 @@ -# Content of config.py -from pathlib import Path - -from my_project.config import BLD -from my_project.config import SRC - -DATA = { - "data_0": {"subset": "subset_1"}, - "data_1": {"subset": "subset_2"}, - "data_2": {"subset": "subset_3"}, - "data_3": {"subset": "subset_4"}, -} - - -def path_to_input_data(name: str) -> Path: - return SRC / "data" / "data.csv" - - -def path_to_processed_data(name: str) -> Path: - return BLD / "data" / f"processed_{name}.pkl" diff --git a/docs_src/how_to_guides/bp_scaling_tasks_2.py 
b/docs_src/how_to_guides/bp_scaling_tasks_2.py deleted file mode 100644 index f31cfc644..000000000 --- a/docs_src/how_to_guides/bp_scaling_tasks_2.py +++ /dev/null @@ -1,39 +0,0 @@ -# Content of task_prepare_data.py -from pathlib import Path - -from my_project.data_preparation.config import DATA -from my_project.data_preparation.config import path_to_input_data -from my_project.data_preparation.config import path_to_processed_data -from pandas import pd -from pytask import Product -from pytask import task -from typing_extensions import Annotated - - -def _create_parametrization(data: list[str]) -> dict[str, Path]: - id_to_kwargs = {} - for data_name, kwargs in data.items(): - id_to_kwargs[data_name] = { - "path_to_input_data": path_to_input_data(data_name), - "path_to_processed_data": path_to_processed_data(data_name), - **kwargs, - } - - return id_to_kwargs - - -_ID_TO_KWARGS = _create_parametrization(DATA) - - -for id_, kwargs in _ID_TO_KWARGS.items(): - - @task(id=id_, kwargs=kwargs) - def task_prepare_data( - path_to_input_data: Path, - subset: str, - path_to_processed_data: Annotated[Path, Product], - ) -> None: - df = pd.read_csv(path_to_input_data) - # ... transform the data. 
- subset = df.loc[df["subset"].eq(subset)] - subset.to_pickle(path_to_processed_data) diff --git a/docs_src/how_to_guides/bp_scaling_tasks_3.py b/docs_src/how_to_guides/bp_scaling_tasks_3.py deleted file mode 100644 index 1e2103d45..000000000 --- a/docs_src/how_to_guides/bp_scaling_tasks_3.py +++ /dev/null @@ -1,18 +0,0 @@ -# Content of config.py -from pathlib import Path - -from my_project.config import BLD -from my_project.data_preparation.config import DATA - -_MODELS = ["linear_probability", "logistic_model", "decision_tree"] - - -ESTIMATIONS = { - f"{data_name}_{model_name}": {"model": model_name, "data": data_name} - for model_name in _MODELS - for data_name in DATA -} - - -def path_to_estimation_result(name: str) -> Path: - return BLD / "estimation" / f"estimation_{name}.pkl" diff --git a/docs_src/how_to_guides/bp_scaling_tasks_4.py b/docs_src/how_to_guides/bp_scaling_tasks_4.py deleted file mode 100644 index a6c665394..000000000 --- a/docs_src/how_to_guides/bp_scaling_tasks_4.py +++ /dev/null @@ -1,36 +0,0 @@ -# Content of task_estimate_models.py -from pathlib import Path - -from my_project.data_preparation.config import path_to_processed_data -from my_project.estimations.config import ESTIMATIONS -from my_project.estimations.config import path_to_estimation_result -from pytask import Product -from pytask import task -from typing_extensions import Annotated - - -def _create_parametrization( - estimations: dict[str, dict[str, str]], -) -> dict[str, str | Path]: - id_to_kwargs = {} - for name, config in estimations.items(): - id_to_kwargs[name] = { - "path_to_data": path_to_processed_data(config["data"]), - "model": config["model"], - "path_to_estimation": path_to_estimation_result(name), - } - - return id_to_kwargs - - -_ID_TO_KWARGS = _create_parametrization(ESTIMATIONS) - - -for id_, kwargs in _ID_TO_KWARGS.items(): - - @task(id=id_, kwargs=kwargs) - def task_estmate_models( - path_to_data: Path, model: str, path_to_estimation: Annotated[Path, Product] - ) 
-> None: - if model == "linear_probability": - ...