From a684ac4f7005d2d62bf121680ef00478937fc2d3 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 8 Feb 2021 14:22:26 +0100 Subject: [PATCH 01/68] work in progress --- .../network/network_embedding/__init__.py | 0 .../base_network_embedding.py | 16 +++++ .../network_embedding/modules/__init__.py | 0 .../modules/learned_entity_embedding.py | 69 +++++++++++++++++++ .../network_embedding/modules/no_embedding.py | 12 ++++ 5 files changed, 97 insertions(+) create mode 100644 autoPyTorch/pipeline/components/setup/network/network_embedding/__init__.py create mode 100644 autoPyTorch/pipeline/components/setup/network/network_embedding/base_network_embedding.py create mode 100644 autoPyTorch/pipeline/components/setup/network/network_embedding/modules/__init__.py create mode 100644 autoPyTorch/pipeline/components/setup/network/network_embedding/modules/learned_entity_embedding.py create mode 100644 autoPyTorch/pipeline/components/setup/network/network_embedding/modules/no_embedding.py diff --git a/autoPyTorch/pipeline/components/setup/network/network_embedding/__init__.py b/autoPyTorch/pipeline/components/setup/network/network_embedding/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/autoPyTorch/pipeline/components/setup/network/network_embedding/base_network_embedding.py b/autoPyTorch/pipeline/components/setup/network/network_embedding/base_network_embedding.py new file mode 100644 index 000000000..5615062d4 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network/network_embedding/base_network_embedding.py @@ -0,0 +1,16 @@ +from typing import Optional, Any + +from sklearn.base import BaseEstimator +from torch import nn + +from autoPyTorch.pipeline.components.setup.base_setup import autoPyTorchSetupComponent + + +class NetworkEmbeddingComponent(autoPyTorchSetupComponent): + def __init__(self, + **kwargs): + super().__init__() + self.config = kwargs + self.embedding: Optional[nn.Module] = None + + # def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/setup/network/network_embedding/modules/__init__.py b/autoPyTorch/pipeline/components/setup/network/network_embedding/modules/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/autoPyTorch/pipeline/components/setup/network/network_embedding/modules/learned_entity_embedding.py b/autoPyTorch/pipeline/components/setup/network/network_embedding/modules/learned_entity_embedding.py new file mode 100644 index 000000000..e6eb88f19 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network/network_embedding/modules/learned_entity_embedding.py @@ -0,0 +1,69 @@ +""" +Class to learn an embedding for categorical hyperparameters. +""" + +import torch +import torch.nn as nn +import numpy as np + + +class LearnedEntityEmbedding(nn.Module): + """ Parent class for MlpNet, ResNet, ... Can use entity embedding for categorical features""" + + def __init__(self, config, in_features, num_numerical_features): + """ + Initialize the BaseFeatureNet. + Arguments: + config: The configuration sampled by the hyperparameter optimizer + in_features: the number of features of the dataset + one_hot_encoder: OneHot encoder, that is used to encode X + """ + super(LearnedEntityEmbedding, self).__init__() + self.config = config + + # self.num_numerical = len([f for f in one_hot_encoder.categorical_features if not f]) + # self.num_input_features = [len(c) for c in one_hot_encoder.categories_] + self.num_numerical = num_numerical_features + self.embed_features = [num_in >= config["min_unique_values_for_embedding"] for num_in in + self.num_input_features] + self.num_output_dimensions = [config["dimension_reduction_" + str(i)] * num_in for i, num_in in + enumerate(self.num_input_features)] + self.num_output_dimensions = [int(np.clip(num_out, 1, num_in - 1)) for num_out, num_in in + zip(self.num_output_dimensions, self.num_input_features)] + self.num_output_dimensions = [num_out if embed else num_in for num_out, embed, num_in in + zip(self.num_output_dimensions, self.embed_features, self.num_input_features)] + self.num_out_feats = self.num_numerical + sum(self.num_output_dimensions) + + self.ee_layers = self._create_ee_layers(in_features) + + def forward(self, x): + # pass the columns of each categorical feature through entity embedding layer + # before passing it through the model + concat_seq = [] + last_concat = 0 + x_pointer = 0 + layer_pointer = 0 + for num_in, embed in zip(self.num_input_features, self.embed_features): + if not embed: + x_pointer += 1 + continue + if x_pointer > last_concat: + concat_seq.append(x[:, last_concat: x_pointer]) + categorical_feature_slice = x[:, x_pointer: x_pointer + num_in] + concat_seq.append(self.ee_layers[layer_pointer](categorical_feature_slice)) + layer_pointer += 1 + x_pointer += num_in + last_concat = x_pointer + + concat_seq.append(x[:, last_concat:]) + return torch.cat(concat_seq, dim=1) + + def _create_ee_layers(self, in_features): + # entity embeding layers are Linear Layers + layers = nn.ModuleList() + for i, (num_in, embed, num_out) in enumerate( + zip(self.num_input_features, self.embed_features, self.num_output_dimensions)): + if not embed: + continue + layers.append(nn.Linear(num_in, num_out)) + return layers \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/setup/network/network_embedding/modules/no_embedding.py b/autoPyTorch/pipeline/components/setup/network/network_embedding/modules/no_embedding.py new file mode 100644 index 000000000..cab1e9b7f --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network/network_embedding/modules/no_embedding.py @@ -0,0 +1,12 @@ +from torch import nn + + +class NoEmbedding(nn.Module): + def __init__(self, config, in_features, num_numerical_features): + super(NoEmbedding, self).__init__() + self.config = config + self.n_feats = in_features + self.num_numerical = num_numerical_features + + def forward(self, x): + return x \ No newline at end of file From a6a8471deb2aae4a84df2704937965c0317b6fed Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 8 Feb 2021 14:44:57 +0100 Subject: [PATCH 02/68] in progress --- .../base_network_embedding.py | 16 ------------ .../network_embedding/__init__.py | 0 .../base_network_embedding.py | 25 +++++++++++++++++++ .../network_embedding/modules/__init__.py | 0 .../modules/learned_entity_embedding.py | 7 +++--- .../network_embedding/modules/no_embedding.py | 0 6 files changed, 29 insertions(+), 19 deletions(-) delete mode 100644 autoPyTorch/pipeline/components/setup/network/network_embedding/base_network_embedding.py rename autoPyTorch/pipeline/components/setup/{network => }/network_embedding/__init__.py (100%) create mode 100644 autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py rename autoPyTorch/pipeline/components/setup/{network => }/network_embedding/modules/__init__.py (100%) rename autoPyTorch/pipeline/components/setup/{network => }/network_embedding/modules/learned_entity_embedding.py (93%) rename autoPyTorch/pipeline/components/setup/{network => }/network_embedding/modules/no_embedding.py (100%) diff --git a/autoPyTorch/pipeline/components/setup/network/network_embedding/base_network_embedding.py b/autoPyTorch/pipeline/components/setup/network/network_embedding/base_network_embedding.py deleted file mode 100644 index 5615062d4..000000000 --- a/autoPyTorch/pipeline/components/setup/network/network_embedding/base_network_embedding.py +++ /dev/null @@ -1,16 +0,0 @@ -from typing import Optional, Any - -from sklearn.base import BaseEstimator -from torch import nn - -from autoPyTorch.pipeline.components.setup.base_setup import autoPyTorchSetupComponent - - -class NetworkEmbeddingComponent(autoPyTorchSetupComponent): - def __init__(self, - **kwargs): - super().__init__() - self.config = kwargs - self.embedding: Optional[nn.Module] = None - - # def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/setup/network/network_embedding/__init__.py b/autoPyTorch/pipeline/components/setup/network_embedding/__init__.py similarity index 100% rename from autoPyTorch/pipeline/components/setup/network/network_embedding/__init__.py rename to autoPyTorch/pipeline/components/setup/network_embedding/__init__.py diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py new file mode 100644 index 000000000..4c087a89d --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py @@ -0,0 +1,25 @@ +from typing import Any, Dict, Optional + +from sklearn.base import BaseEstimator +from torch import nn + +from autoPyTorch.pipeline.components.setup.base_setup import autoPyTorchSetupComponent + + +class NetworkEmbeddingComponent(autoPyTorchSetupComponent): + def __init__(self, + **kwargs): + super().__init__() + self.config = kwargs + self.embedding: Optional[nn.Module] = None + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + in_features = X['X_train'].shape[1:] + + self.embedding = self.build_embedding( + in_features=in_features, + num_numerical_features=len(X['numerical_features'])) + return self + + def build_embedding(self, in_features, num_numerical_features) -> nn.Module: + raise NotImplementedError \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/setup/network/network_embedding/modules/__init__.py b/autoPyTorch/pipeline/components/setup/network_embedding/modules/__init__.py similarity index 100% rename from autoPyTorch/pipeline/components/setup/network/network_embedding/modules/__init__.py rename to autoPyTorch/pipeline/components/setup/network_embedding/modules/__init__.py diff --git a/autoPyTorch/pipeline/components/setup/network/network_embedding/modules/learned_entity_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/modules/learned_entity_embedding.py similarity index 93% rename from autoPyTorch/pipeline/components/setup/network/network_embedding/modules/learned_entity_embedding.py rename to autoPyTorch/pipeline/components/setup/network_embedding/modules/learned_entity_embedding.py index e6eb88f19..d7d294661 100644 --- a/autoPyTorch/pipeline/components/setup/network/network_embedding/modules/learned_entity_embedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/modules/learned_entity_embedding.py @@ -10,7 +10,7 @@ class LearnedEntityEmbedding(nn.Module): """ Parent class for MlpNet, ResNet, ... Can use entity embedding for categorical features""" - def __init__(self, config, in_features, num_numerical_features): + def __init__(self, config, num_input_features, num_numerical_features): """ Initialize the BaseFeatureNet. Arguments: @@ -24,6 +24,7 @@ def __init__(self, config, in_features, num_numerical_features): # self.num_numerical = len([f for f in one_hot_encoder.categorical_features if not f]) # self.num_input_features = [len(c) for c in one_hot_encoder.categories_] self.num_numerical = num_numerical_features + self.num_input_features = num_input_features self.embed_features = [num_in >= config["min_unique_values_for_embedding"] for num_in in self.num_input_features] self.num_output_dimensions = [config["dimension_reduction_" + str(i)] * num_in for i, num_in in @@ -34,7 +35,7 @@ def __init__(self, config, in_features, num_numerical_features): zip(self.num_output_dimensions, self.embed_features, self.num_input_features)] self.num_out_feats = self.num_numerical + sum(self.num_output_dimensions) - self.ee_layers = self._create_ee_layers(in_features) + self.ee_layers = self._create_ee_layers() def forward(self, x): # pass the columns of each categorical feature through entity embedding layer @@ -58,7 +59,7 @@ def forward(self, x): concat_seq.append(x[:, last_concat:]) return torch.cat(concat_seq, dim=1) - def _create_ee_layers(self, in_features): + def _create_ee_layers(self): # entity embeding layers are Linear Layers layers = nn.ModuleList() for i, (num_in, embed, num_out) in enumerate( diff --git a/autoPyTorch/pipeline/components/setup/network/network_embedding/modules/no_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/modules/no_embedding.py similarity index 100% rename from autoPyTorch/pipeline/components/setup/network/network_embedding/modules/no_embedding.py rename to autoPyTorch/pipeline/components/setup/network_embedding/modules/no_embedding.py From 2b0c0e0ca36156ed4fb6f8ebcf69b28299141eca Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 15 Feb 2021 19:57:00 +0100 Subject: [PATCH 03/68] Working network embedding --- .../TabularColumnTransformer.py | 8 +- .../components/setup/network/base_network.py | 3 +- .../network_backbone/base_network_backbone.py | 6 +- .../LearnedEntityEmbedding.py | 133 +++++++++++++ .../setup/network_embedding/NoEmbedding.py | 49 +++++ .../base_network_embedding.py | 45 ++++- .../base_network_embedding_choice.py | 188 ++++++++++++++++++ .../modules/learned_entity_embedding.py | 70 ------- .../network_embedding/modules/no_embedding.py | 12 -- .../pipeline/tabular_classification.py | 30 +++ autoPyTorch/pipeline/tabular_regression.py | 27 +++ .../components/preprocessing}/__init__.py | 0 .../components/preprocessing/base.py | 36 ++++ .../test_encoder_choice.py | 0 .../{ => preprocessing}/test_encoders.py | 0 .../test_feature_preprocessor.py | 0 .../test_feature_preprocessor_choice.py | 0 .../{ => preprocessing}/test_imputers.py | 0 .../test_normalizer_choice.py | 0 .../{ => preprocessing}/test_normalizers.py | 0 .../{ => preprocessing}/test_scaler_choice.py | 0 .../{ => preprocessing}/test_scalers.py | 0 .../test_tabular_column_transformer.py | 2 +- .../components/setup/__init__.py | 0 .../components/{ => setup}/test_setup.py | 0 .../{ => setup}/test_setup_image_augmenter.py | 0 .../{ => setup}/test_setup_networks.py | 0 .../test_setup_preprocessing_node.py | 0 .../test_setup_traditional_classification.py | 0 .../components/training/__init__.py | 0 .../components/{ => training}/base.py | 35 ---- .../test_feature_data_loader.py | 0 .../{ => training}/test_image_data_loader.py | 0 .../{ => training}/test_training.py | 0 .../test_tabular_classification.py | 3 +- 35 files changed, 511 insertions(+), 136 deletions(-) create mode 100644 autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py create mode 100644 autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py create mode 100644 autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding_choice.py delete mode 100644 autoPyTorch/pipeline/components/setup/network_embedding/modules/learned_entity_embedding.py delete mode 100644 autoPyTorch/pipeline/components/setup/network_embedding/modules/no_embedding.py rename {autoPyTorch/pipeline/components/setup/network_embedding/modules => test/test_pipeline/components/preprocessing}/__init__.py (100%) create mode 100644 test/test_pipeline/components/preprocessing/base.py rename test/test_pipeline/components/{ => preprocessing}/test_encoder_choice.py (100%) rename test/test_pipeline/components/{ => preprocessing}/test_encoders.py (100%) rename test/test_pipeline/components/{ => preprocessing}/test_feature_preprocessor.py (100%) rename test/test_pipeline/components/{ => preprocessing}/test_feature_preprocessor_choice.py (100%) rename test/test_pipeline/components/{ => preprocessing}/test_imputers.py (100%) rename test/test_pipeline/components/{ => preprocessing}/test_normalizer_choice.py (100%) rename test/test_pipeline/components/{ => preprocessing}/test_normalizers.py (100%) rename test/test_pipeline/components/{ => preprocessing}/test_scaler_choice.py (100%) rename test/test_pipeline/components/{ => preprocessing}/test_scalers.py (100%) rename test/test_pipeline/components/{ => preprocessing}/test_tabular_column_transformer.py (97%) create mode 100644 test/test_pipeline/components/setup/__init__.py rename test/test_pipeline/components/{ => setup}/test_setup.py (100%) rename test/test_pipeline/components/{ => setup}/test_setup_image_augmenter.py (100%) rename test/test_pipeline/components/{ => setup}/test_setup_networks.py (100%) rename test/test_pipeline/components/{ => setup}/test_setup_preprocessing_node.py (100%) rename test/test_pipeline/components/{ => setup}/test_setup_traditional_classification.py (100%) create mode 100644 test/test_pipeline/components/training/__init__.py rename test/test_pipeline/components/{ => training}/base.py (59%) rename test/test_pipeline/components/{ => training}/test_feature_data_loader.py (100%) rename test/test_pipeline/components/{ => training}/test_image_data_loader.py (100%) rename test/test_pipeline/components/{ => training}/test_training.py (100%) diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py index 24491af44..2cf6e7fdd 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py @@ -2,7 +2,7 @@ import numpy as np -from sklearn.compose import ColumnTransformer, make_column_transformer +from sklearn.compose import ColumnTransformer from sklearn.pipeline import make_pipeline import torch @@ -57,9 +57,9 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer": if len(X['dataset_properties']['categorical_columns']): categorical_pipeline = make_pipeline(*preprocessors['categorical']) - self.preprocessor = make_column_transformer( - (numerical_pipeline, X['dataset_properties']['numerical_columns']), - (categorical_pipeline, X['dataset_properties']['categorical_columns']), + self.preprocessor = ColumnTransformer([ + ('numerical_pipeline', numerical_pipeline, X['dataset_properties']['numerical_columns']), + ('categorical_pipeline', categorical_pipeline, X['dataset_properties']['categorical_columns'])], remainder='passthrough' ) diff --git a/autoPyTorch/pipeline/components/setup/network/base_network.py b/autoPyTorch/pipeline/components/setup/network/base_network.py index b40c7e774..fbb7fd336 100644 --- a/autoPyTorch/pipeline/components/setup/network/base_network.py +++ b/autoPyTorch/pipeline/components/setup/network/base_network.py @@ -32,6 +32,7 @@ def __init__( self.add_fit_requirements([ FitRequirement("network_head", (torch.nn.Module,), user_defined=False, dataset_property=False), FitRequirement("network_backbone", (torch.nn.Module,), user_defined=False, dataset_property=False), + FitRequirement("network_embedding", (torch.nn.Module,), user_defined=False, dataset_property=False), ]) self.final_activation = None @@ -50,7 +51,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchTrainingComponent: # information to fit this stage self.check_requirements(X, y) - self.network = torch.nn.Sequential(X['network_backbone'], X['network_head']) + self.network = torch.nn.Sequential(X['network_embedding'], X['network_backbone'], X['network_head']) # Properly set the network training device self.to(self.device) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py index 2557e92b8..241fcb51b 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py @@ -14,6 +14,7 @@ from autoPyTorch.pipeline.components.base_component import ( autoPyTorchComponent, ) +from autoPyTorch.pipeline.components.setup.network_backbone.utils import get_output_shape from autoPyTorch.utils.common import FitRequirement @@ -31,7 +32,9 @@ def __init__(self, FitRequirement('X_train', (np.ndarray, pd.DataFrame, csr_matrix), user_defined=True, dataset_property=False), FitRequirement('input_shape', (Iterable,), user_defined=True, dataset_property=True), - FitRequirement('tabular_transformer', (BaseEstimator,), user_defined=False, dataset_property=False)]) + FitRequirement('tabular_transformer', (BaseEstimator,), user_defined=False, dataset_property=False), + FitRequirement('network_embedding', (nn.Module,), user_defined=False, dataset_property=False) + ]) self.backbone: nn.Module = None self.config = kwargs self.input_shape: Optional[Iterable] = None @@ -56,6 +59,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: column_transformer = X['tabular_transformer'].preprocessor input_shape = column_transformer.transform(X_train[:1]).shape[1:] + input_shape = get_output_shape(X['network_embedding'], input_shape=input_shape) self.input_shape = input_shape self.backbone = self.build_backbone( diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py new file mode 100644 index 000000000..41d2da581 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py @@ -0,0 +1,133 @@ +from typing import Any, Dict, Optional, Union + +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + UniformFloatHyperparameter, + UniformIntegerHyperparameter +) + +import numpy as np + +import torch +from torch import nn + +from autoPyTorch.pipeline.components.setup.network_embedding.base_network_embedding import NetworkEmbeddingComponent + + +class _LearnedEntityEmbedding(nn.Module): + """ Parent class for MlpNet, ResNet, ... Can use entity embedding for categorical features""" + + def __init__(self, config, num_input_features, num_numerical_features): + """ + Initialize the BaseFeatureNet. + Arguments: + config: The configuration sampled by the hyperparameter optimizer + # TODO: fix this + num_input_features: the number of features of the dataset + num_numerical_features: OneHot encoder, that is used to encode X + """ + super().__init__() + self.config = config + + self.num_numerical = num_numerical_features + # list of number of categories of categorical data + # or 0 for numerical data + self.num_input_features = num_input_features + categorical_features = self.num_input_features > 0 + + self.num_categorical_features = self.num_input_features[categorical_features] + + self.embed_features = [num_in >= config["min_unique_values_for_embedding"] for num_in in + self.num_input_features] + self.num_output_dimensions = [0] * num_numerical_features + self.num_output_dimensions.extend([config["dimension_reduction_" + str(i)] * num_in for i, num_in in + enumerate(self.num_categorical_features)]) + self.num_output_dimensions = [int(np.clip(num_out, 1, num_in - 1)) for num_out, num_in in + zip(self.num_output_dimensions, self.num_input_features)] + self.num_output_dimensions = [num_out if embed else num_in for num_out, embed, num_in in + zip(self.num_output_dimensions, self.embed_features, + self.num_input_features)] + self.num_out_feats = self.num_numerical + sum(self.num_output_dimensions) + + self.ee_layers = self._create_ee_layers() + + def forward(self, x): + # pass the columns of each categorical feature through entity embedding layer + # before passing it through the model + concat_seq = [] + last_concat = 0 + x_pointer = 0 + layer_pointer = 0 + for num_in, embed in zip(self.num_input_features, self.embed_features): + if not embed: + x_pointer += 1 + continue + if x_pointer > last_concat: + concat_seq.append(x[:, last_concat: x_pointer]) + categorical_feature_slice = x[:, x_pointer: x_pointer + num_in] + concat_seq.append(self.ee_layers[layer_pointer](categorical_feature_slice)) + layer_pointer += 1 + x_pointer += num_in + last_concat = x_pointer + + concat_seq.append(x[:, last_concat:]) + return torch.cat(concat_seq, dim=1) + + def _create_ee_layers(self): + # entity embeding layers are Linear Layers + layers = nn.ModuleList() + for i, (num_in, embed, num_out) in enumerate( + zip(self.num_input_features, self.embed_features, self.num_output_dimensions)): + if not embed: + continue + layers.append(nn.Linear(num_in, num_out)) + return layers + + +class LearnedEntityEmbedding(NetworkEmbeddingComponent): + """ + Class to learn an embedding for categorical hyperparameters. + """ + + def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = None, **kwargs: Any): + super().__init__(random_state=random_state) + self.config = kwargs + + def build_embedding(self, num_input_features, num_numerical_features) -> nn.Module: + return _LearnedEntityEmbedding(config=self.config, + num_input_features=num_input_features, + num_numerical_features=num_numerical_features) + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, str]] = None, + min_unique_values_for_embedding=((3, 7), 5, True), + dimension_reduction=((0, 1), 0.5), + ) -> ConfigurationSpace: + cs = ConfigurationSpace() + min_hp = UniformIntegerHyperparameter("min_unique_values_for_embedding", + lower=min_unique_values_for_embedding[0][0], + upper=min_unique_values_for_embedding[0][1], + default_value=min_unique_values_for_embedding[1], + log=min_unique_values_for_embedding[2] + ) + cs.add_hyperparameter(min_hp) + if dataset_properties is not None: + for i in range(len(dataset_properties['categorical_columns'])): + ee_dimensions_hp = UniformFloatHyperparameter("dimension_reduction_" + str(i), + lower=dimension_reduction[0][0], + upper=dimension_reduction[0][1], + default_value=dimension_reduction[1] + ) + cs.add_hyperparameter(ee_dimensions_hp) + return cs + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'embedding', + 'name': 'LearnedEntityEmbedding', + 'handles_tabular': True, + 'handles_image': False, + 'handles_time_series': False, + } diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py new file mode 100644 index 000000000..0f18b5ed6 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py @@ -0,0 +1,49 @@ +from typing import Any, Dict, Optional, Union + +from ConfigSpace.configuration_space import ConfigurationSpace + +import numpy as np + +from torch import nn + +from autoPyTorch.pipeline.components.setup.network_embedding.base_network_embedding import NetworkEmbeddingComponent + + +class _NoEmbedding(nn.Module): + def __init__(self, num_input_features, num_numerical_features): + super().__init__() + self.n_feats = num_input_features + self.num_numerical = num_numerical_features + + def forward(self, x): + return x + + +class NoEmbedding(NetworkEmbeddingComponent): + """ + Class to learn an embedding for categorical hyperparameters. + """ + + def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = None): + super().__init__(random_state=random_state) + + def build_embedding(self, num_input_features, num_numerical_features) -> nn.Module: + return _NoEmbedding(num_input_features=num_input_features, + num_numerical_features=num_numerical_features) + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, str]] = None, + ) -> ConfigurationSpace: + cs = ConfigurationSpace() + return cs + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'no embedding', + 'name': 'NoEmbedding', + 'handles_tabular': True, + 'handles_image': False, + 'handles_time_series': False, + } \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py index 4c087a89d..e27cac3c0 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py @@ -1,25 +1,50 @@ -from typing import Any, Dict, Optional +import copy +from typing import Any, Dict, Optional, Union + +import numpy as np from sklearn.base import BaseEstimator + from torch import nn from autoPyTorch.pipeline.components.setup.base_setup import autoPyTorchSetupComponent - +from autoPyTorch.utils.common import subsampler class NetworkEmbeddingComponent(autoPyTorchSetupComponent): - def __init__(self, - **kwargs): + def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = None): super().__init__() - self.config = kwargs self.embedding: Optional[nn.Module] = None def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: - in_features = X['X_train'].shape[1:] + + num_numerical_columns, num_input_features = self._get_args(X) self.embedding = self.build_embedding( - in_features=in_features, - num_numerical_features=len(X['numerical_features'])) + num_input_features=num_input_features, + num_numerical_features=num_numerical_columns) return self - def build_embedding(self, in_features, num_numerical_features) -> nn.Module: - raise NotImplementedError \ No newline at end of file + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + X.update({'network_embedding': self.embedding}) + return X + + def build_embedding(self, num_input_features, num_numerical_features) -> nn.Module: + raise NotImplementedError + + def _get_args(self, X: Dict[str, Any]) -> Union[int, np.ndarray]: + # Feature preprocessors can alter numerical columns + if len(X['dataset_properties']['numerical_columns']) == 0: + num_numerical_columns = 0 + else: + X_train = copy.deepcopy(X['backend'].load_datamanager().train_tensors[0][:2]) + # as numerical pipeline will always be the first pipeline + numerical_column_transformer = X['tabular_transformer'].preprocessor.named_transformers_['numerical_pipeline'] + num_numerical_columns = numerical_column_transformer.transform( + X_train.iloc[:, X['dataset_properties']['numerical_columns']]).shape[1] + num_input_features = np.zeros((num_numerical_columns + + len(X['dataset_properties']['categorical_columns'])), dtype=int) + categories = X['dataset_properties']['categories'] + + for i, category in enumerate(categories): + num_input_features[num_numerical_columns + i, ] = len(category) + return num_numerical_columns, num_input_features diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding_choice.py b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding_choice.py new file mode 100644 index 000000000..c08b156ce --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding_choice.py @@ -0,0 +1,188 @@ +import os +from collections import OrderedDict +from typing import Dict, List, Optional + +import ConfigSpace.hyperparameters as CSH +from ConfigSpace.configuration_space import ConfigurationSpace + +import numpy as np + +from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice +from autoPyTorch.pipeline.components.base_component import ( + ThirdPartyComponents, + autoPyTorchComponent, + find_components, +) +from autoPyTorch.pipeline.components.setup.network_embedding.base_network_embedding import ( + NetworkEmbeddingComponent, +) + +directory = os.path.split(__file__)[0] +_embeddings = find_components(__package__, + directory, + NetworkEmbeddingComponent) +_addons = ThirdPartyComponents(NetworkEmbeddingComponent) + + +def add_embedding(embedding: NetworkEmbeddingComponent) -> None: + _addons.add_component(embedding) + + +class NetworkEmbeddingChoice(autoPyTorchChoice): + + def get_components(self) -> Dict[str, autoPyTorchComponent]: + """Returns the available embedding components + + Args: + None + + Returns: + Dict[str, autoPyTorchComponent]: all baseembedding components available + as choices for learning rate scheduling + """ + components = OrderedDict() + components.update(_embeddings) + components.update(_addons.components) + return components + + def get_available_components( + self, + dataset_properties: Optional[Dict[str, str]] = None, + include: List[str] = None, + exclude: List[str] = None, + ) -> Dict[str, autoPyTorchComponent]: + """Filters out components based on user provided + include/exclude directives, as well as the dataset properties + + Args: + include (Optional[Dict[str, Any]]): what hyper-parameter configurations + to honor when creating the configuration space + exclude (Optional[Dict[str, Any]]): what hyper-parameter configurations + to remove from the configuration space + dataset_properties (Optional[Dict[str, Union[str, int]]]): Caracteristics + of the dataset to guide the pipeline choices of components + + Returns: + Dict[str, autoPyTorchComponent]: A filtered dict of learning + rate embeddings + + """ + if dataset_properties is None: + dataset_properties = {} + + if include is not None and exclude is not None: + raise ValueError( + "The argument include and exclude cannot be used together.") + + available_comp = self.get_components() + + if include is not None: + for incl in include: + if incl not in available_comp: + raise ValueError("Trying to include unknown component: " + "%s" % incl) + + components_dict = OrderedDict() + for name in available_comp: + if include is not None and name not in include: + continue + elif exclude is not None and name in exclude: + continue + + entry = available_comp[name] + + # Exclude itself to avoid infinite loop + if entry == NetworkEmbeddingChoice or hasattr(entry, 'get_components'): + continue + + task_type = dataset_properties['task_type'] + properties = entry.get_properties() + if 'tabular' in task_type and not properties['handles_tabular']: + continue + elif 'image' in task_type and not properties['handles_image']: + continue + elif 'time_series' in task_type and not properties['handles_time_series']: + continue + + components_dict[name] = entry + + return components_dict + + def get_hyperparameter_search_space( + self, + dataset_properties: Optional[Dict[str, str]] = None, + default: Optional[str] = None, + include: Optional[List[str]] = None, + exclude: Optional[List[str]] = None, + ) -> ConfigurationSpace: + """Returns the configuration space of the current chosen components + + Args: + dataset_properties (Optional[Dict[str, str]]): Describes the dataset to work on + default (Optional[str]): Default embedding to use + include: Optional[Dict[str, Any]]: what components to include. It is an exhaustive + list, and will exclusively use this components. + exclude: Optional[Dict[str, Any]]: which components to skip + + Returns: + ConfigurationSpace: the configuration space of the hyper-parameters of the + chosen component + """ + cs = ConfigurationSpace() + + if dataset_properties is None: + dataset_properties = {} + + # Compile a list of legal preprocessors for this problem + available_embedding = self.get_available_components( + dataset_properties=dataset_properties, + include=include, exclude=exclude) + + if len(available_embedding) == 0 and 'tabular' in dataset_properties['task_type']: + raise ValueError("No embedding found") + + if available_embedding == 0: + return cs + + if default is None: + defaults = [ + 'LearnedEntityEmbedding', + 'NoEmbedding' + ] + for default_ in defaults: + if default_ in available_embedding: + default = default_ + break + + if len(dataset_properties['categorical_columns']) == 0: + default = 'NoEmbedding' + if include is not None and default not in include: + raise ValueError("Provided {} in include, however, the dataset " + "is incompatible with it".format(include)) + embedding = CSH.CategoricalHyperparameter('__choice__', + ['NoEmbedding'], + default_value=default) + else: + embedding = CSH.CategoricalHyperparameter('__choice__', + list(available_embedding.keys()), + default_value=default) + + cs.add_hyperparameter(embedding) + for name in embedding.choices: + updates = self._get_search_space_updates(prefix=name) + config_space = available_embedding[name].get_hyperparameter_search_space(dataset_properties, # type: ignore + **updates) + parent_hyperparameter = {'parent': embedding, 'value': name} + cs.add_configuration_space( + name, + config_space, + parent_hyperparameter=parent_hyperparameter + ) + + self.configuration_space_ = cs + self.dataset_properties_ = dataset_properties + return cs + + def transform(self, X: np.ndarray) -> np.ndarray: + assert self.choice is not None, "Cannot call transform before the object is initialized" + return self.choice.transform(X) diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/modules/learned_entity_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/modules/learned_entity_embedding.py deleted file mode 100644 index d7d294661..000000000 --- a/autoPyTorch/pipeline/components/setup/network_embedding/modules/learned_entity_embedding.py +++ /dev/null @@ -1,70 +0,0 @@ -""" -Class to learn an embedding for categorical hyperparameters. -""" - -import torch -import torch.nn as nn -import numpy as np - - -class LearnedEntityEmbedding(nn.Module): - """ Parent class for MlpNet, ResNet, ... Can use entity embedding for categorical features""" - - def __init__(self, config, num_input_features, num_numerical_features): - """ - Initialize the BaseFeatureNet. - Arguments: - config: The configuration sampled by the hyperparameter optimizer - in_features: the number of features of the dataset - one_hot_encoder: OneHot encoder, that is used to encode X - """ - super(LearnedEntityEmbedding, self).__init__() - self.config = config - - # self.num_numerical = len([f for f in one_hot_encoder.categorical_features if not f]) - # self.num_input_features = [len(c) for c in one_hot_encoder.categories_] - self.num_numerical = num_numerical_features - self.num_input_features = num_input_features - self.embed_features = [num_in >= config["min_unique_values_for_embedding"] for num_in in - self.num_input_features] - self.num_output_dimensions = [config["dimension_reduction_" + str(i)] * num_in for i, num_in in - enumerate(self.num_input_features)] - self.num_output_dimensions = [int(np.clip(num_out, 1, num_in - 1)) for num_out, num_in in - zip(self.num_output_dimensions, self.num_input_features)] - self.num_output_dimensions = [num_out if embed else num_in for num_out, embed, num_in in - zip(self.num_output_dimensions, self.embed_features, self.num_input_features)] - self.num_out_feats = self.num_numerical + sum(self.num_output_dimensions) - - self.ee_layers = self._create_ee_layers() - - def forward(self, x): - # pass the columns of each categorical feature through entity embedding layer - # before passing it through the model - concat_seq = [] - last_concat = 0 - x_pointer = 0 - layer_pointer = 0 - for num_in, embed in zip(self.num_input_features, self.embed_features): - if not embed: - x_pointer += 1 - continue - if x_pointer > last_concat: - concat_seq.append(x[:, last_concat: x_pointer]) - categorical_feature_slice = x[:, x_pointer: x_pointer + num_in] - concat_seq.append(self.ee_layers[layer_pointer](categorical_feature_slice)) - layer_pointer += 1 - x_pointer += num_in - last_concat = x_pointer - - concat_seq.append(x[:, last_concat:]) - return torch.cat(concat_seq, dim=1) - - def _create_ee_layers(self): - # entity embeding layers are Linear Layers - layers = nn.ModuleList() - for i, (num_in, embed, num_out) in enumerate( - zip(self.num_input_features, self.embed_features, self.num_output_dimensions)): - if not embed: - continue - layers.append(nn.Linear(num_in, num_out)) - return layers \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/modules/no_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/modules/no_embedding.py deleted file mode 100644 index cab1e9b7f..000000000 --- a/autoPyTorch/pipeline/components/setup/network_embedding/modules/no_embedding.py +++ /dev/null @@ -1,12 +0,0 @@ -from torch import nn - - -class NoEmbedding(nn.Module): - def __init__(self, config, in_features, num_numerical_features): - super(NoEmbedding, self).__init__() - self.config = config - self.n_feats = in_features - self.num_numerical = num_numerical_features - - def forward(self, x): - return x \ No newline at end of file diff --git a/autoPyTorch/pipeline/tabular_classification.py b/autoPyTorch/pipeline/tabular_classification.py index ec80b4a5c..298d0f1b3 100644 --- a/autoPyTorch/pipeline/tabular_classification.py +++ b/autoPyTorch/pipeline/tabular_classification.py @@ -1,7 +1,9 @@ +import copy import warnings from typing import Any, Dict, List, Optional, Tuple from ConfigSpace.configuration_space import Configuration, ConfigurationSpace +from ConfigSpace.forbidden import ForbiddenEqualsClause, ForbiddenAndConjunction import numpy as np @@ -24,6 +26,7 @@ from autoPyTorch.pipeline.components.setup.lr_scheduler.base_scheduler_choice import SchedulerChoice from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent from autoPyTorch.pipeline.components.setup.network_backbone.base_network_backbone_choice import NetworkBackboneChoice +from autoPyTorch.pipeline.components.setup.network_embedding.base_network_embedding_choice import NetworkEmbeddingChoice from autoPyTorch.pipeline.components.setup.network_head.base_network_head_choice import NetworkHeadChoice from autoPyTorch.pipeline.components.setup.network_initializer.base_network_init_choice import ( NetworkInitializerChoice @@ -188,6 +191,32 @@ def _get_hyperparameter_search_space( # Here we add custom code, like this with this # is not a valid configuration + # Learned Entity Embedding is only valid when encoder is one hot encoder + embeddings = cs.get_hyperparameter('network_embedding:__choice__').choices + encoders = cs.get_hyperparameter('encoder:__choice__').choices + default = cs.get_hyperparameter('network_embedding:__choice__').default_value + possible_default_embeddings = copy.copy(list(embeddings)) + del possible_default_embeddings[possible_default_embeddings.index(default)] + if 'network_embedding' in self.named_steps.keys() and 'encoder' in self.named_steps.keys(): + for encoder in encoders: + if encoder == 'OneHotEncoder': + continue + while True: + try: + cs.add_forbidden_clause(ForbiddenAndConjunction( + ForbiddenEqualsClause(cs.get_hyperparameter( + 'network_embedding:__choice__'), 'LearnedEntityEmbedding'), + ForbiddenEqualsClause(cs.get_hyperparameter('encoder:__choice__') + , encoder) + )) + break + except ValueError: + # change the default and try again + try: + default = possible_default_embeddings.pop() + except IndexError: + raise ValueError("Cannot find a legal default configuration") + cs.get_hyperparameter('network_embedding:__choice__').default_value = default self.configuration_space = cs self.dataset_properties = dataset_properties @@ -216,6 +245,7 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]], ("feature_preprocessor", FeatureProprocessorChoice(default_dataset_properties)), ("tabular_transformer", TabularColumnTransformer()), ("preprocessing", EarlyPreprocessing()), + ("network_embedding", NetworkEmbeddingChoice(default_dataset_properties)), ("network_backbone", NetworkBackboneChoice(default_dataset_properties)), ("network_head", NetworkHeadChoice(default_dataset_properties)), ("network", NetworkComponent(default_dataset_properties)), diff --git a/autoPyTorch/pipeline/tabular_regression.py b/autoPyTorch/pipeline/tabular_regression.py index 40645223f..52a65272f 100644 --- a/autoPyTorch/pipeline/tabular_regression.py +++ b/autoPyTorch/pipeline/tabular_regression.py @@ -2,6 +2,7 @@ from typing import Any, Dict, List, Optional, Tuple from ConfigSpace.configuration_space import Configuration, ConfigurationSpace +from ConfigSpace.forbidden import ForbiddenAndConjunction, ForbiddenEqualsClause import numpy as np @@ -138,6 +139,32 @@ def _get_hyperparameter_search_space( # Here we add custom code, like this with this # is not a valid configuration + # Learned Entity Embedding is only valid when encoder is one hot encoder + embeddings = cs.get_hyperparameter('network_embedding:__choice__').choices + encoders = cs.get_hyperparameter('encoder:__choice__').choices + default = cs.get_hyperparameter('network_embedding:__choice__').default_value + possible_default_embeddings = copy.copy(list(embeddings)) + del possible_default_embeddings[possible_default_embeddings.index(default)] + if 'network_embedding' in self.named_steps.keys() and 'encoder' in self.named_steps.keys(): + for encoder in encoders: + if encoder == 'OneHotEncoder': + continue + while True: + try: + cs.add_forbidden_clause(ForbiddenAndConjunction( + ForbiddenEqualsClause(cs.get_hyperparameter( + 'network_embedding:__choice__'), 'LearnedEntityEmbedding'), + ForbiddenEqualsClause(cs.get_hyperparameter('encoder:__choice__') + , encoder) + )) + break + except ValueError: + # change the default and try again + try: + default = possible_default_embeddings.pop() + except IndexError: + raise ValueError("Cannot find a legal default configuration") + cs.get_hyperparameter('network_embedding:__choice__').default_value = default self.configuration_space = cs self.dataset_properties = dataset_properties diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/modules/__init__.py b/test/test_pipeline/components/preprocessing/__init__.py similarity index 100% rename from autoPyTorch/pipeline/components/setup/network_embedding/modules/__init__.py rename to test/test_pipeline/components/preprocessing/__init__.py diff --git a/test/test_pipeline/components/preprocessing/base.py b/test/test_pipeline/components/preprocessing/base.py new file mode 100644 index 000000000..7bb4fee70 --- /dev/null +++ b/test/test_pipeline/components/preprocessing/base.py @@ -0,0 +1,36 @@ +from typing import Any, Dict, List, Optional, Tuple + +from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.TabularColumnTransformer import \ + TabularColumnTransformer +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding.base_encoder_choice import \ + EncoderChoice +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler_choice import ScalerChoice +from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline + + +class TabularPipeline(TabularClassificationPipeline): + def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]], + ) -> List[Tuple[str, autoPyTorchChoice]]: + """ + Defines what steps a pipeline should follow. + The step itself has choices given via autoPyTorchChoice. + + Returns: + List[Tuple[str, autoPyTorchChoice]]: list of steps sequentially exercised + by the pipeline. + """ + steps = [] # type: List[Tuple[str, autoPyTorchChoice]] + + default_dataset_properties = {'target_type': 'tabular_classification'} + if dataset_properties is not None: + default_dataset_properties.update(dataset_properties) + + steps.extend([ + ("imputer", SimpleImputer()), + ("encoder", EncoderChoice(default_dataset_properties)), + ("scaler", ScalerChoice(default_dataset_properties)), + ("tabular_transformer", TabularColumnTransformer()), + ]) + return steps \ No newline at end of file diff --git a/test/test_pipeline/components/test_encoder_choice.py b/test/test_pipeline/components/preprocessing/test_encoder_choice.py similarity index 100% rename from test/test_pipeline/components/test_encoder_choice.py rename to test/test_pipeline/components/preprocessing/test_encoder_choice.py diff --git a/test/test_pipeline/components/test_encoders.py b/test/test_pipeline/components/preprocessing/test_encoders.py similarity index 100% rename from test/test_pipeline/components/test_encoders.py rename to test/test_pipeline/components/preprocessing/test_encoders.py diff --git a/test/test_pipeline/components/test_feature_preprocessor.py b/test/test_pipeline/components/preprocessing/test_feature_preprocessor.py similarity index 100% rename from test/test_pipeline/components/test_feature_preprocessor.py rename to test/test_pipeline/components/preprocessing/test_feature_preprocessor.py diff --git a/test/test_pipeline/components/test_feature_preprocessor_choice.py b/test/test_pipeline/components/preprocessing/test_feature_preprocessor_choice.py similarity index 100% rename from test/test_pipeline/components/test_feature_preprocessor_choice.py rename to test/test_pipeline/components/preprocessing/test_feature_preprocessor_choice.py diff --git a/test/test_pipeline/components/test_imputers.py b/test/test_pipeline/components/preprocessing/test_imputers.py similarity index 100% rename from test/test_pipeline/components/test_imputers.py rename to test/test_pipeline/components/preprocessing/test_imputers.py diff --git a/test/test_pipeline/components/test_normalizer_choice.py b/test/test_pipeline/components/preprocessing/test_normalizer_choice.py similarity index 100% rename from test/test_pipeline/components/test_normalizer_choice.py rename to test/test_pipeline/components/preprocessing/test_normalizer_choice.py diff --git a/test/test_pipeline/components/test_normalizers.py b/test/test_pipeline/components/preprocessing/test_normalizers.py similarity index 100% rename from test/test_pipeline/components/test_normalizers.py rename to test/test_pipeline/components/preprocessing/test_normalizers.py diff --git a/test/test_pipeline/components/test_scaler_choice.py b/test/test_pipeline/components/preprocessing/test_scaler_choice.py similarity index 100% rename from test/test_pipeline/components/test_scaler_choice.py rename to test/test_pipeline/components/preprocessing/test_scaler_choice.py diff --git a/test/test_pipeline/components/test_scalers.py b/test/test_pipeline/components/preprocessing/test_scalers.py similarity index 100% rename from test/test_pipeline/components/test_scalers.py rename to test/test_pipeline/components/preprocessing/test_scalers.py diff --git a/test/test_pipeline/components/test_tabular_column_transformer.py b/test/test_pipeline/components/preprocessing/test_tabular_column_transformer.py similarity index 97% rename from test/test_pipeline/components/test_tabular_column_transformer.py rename to test/test_pipeline/components/preprocessing/test_tabular_column_transformer.py index 5eae26f69..c0b4d94d3 100644 --- a/test/test_pipeline/components/test_tabular_column_transformer.py +++ b/test/test_pipeline/components/preprocessing/test_tabular_column_transformer.py @@ -1,4 +1,4 @@ -from test.test_pipeline.components.base import TabularPipeline +from test.test_pipeline.components.preprocessing.base import TabularPipeline import numpy as np diff --git a/test/test_pipeline/components/setup/__init__.py b/test/test_pipeline/components/setup/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/test_pipeline/components/test_setup.py b/test/test_pipeline/components/setup/test_setup.py similarity index 100% rename from test/test_pipeline/components/test_setup.py rename to test/test_pipeline/components/setup/test_setup.py diff --git a/test/test_pipeline/components/test_setup_image_augmenter.py b/test/test_pipeline/components/setup/test_setup_image_augmenter.py similarity index 100% rename from test/test_pipeline/components/test_setup_image_augmenter.py rename to test/test_pipeline/components/setup/test_setup_image_augmenter.py diff --git a/test/test_pipeline/components/test_setup_networks.py b/test/test_pipeline/components/setup/test_setup_networks.py similarity index 100% rename from test/test_pipeline/components/test_setup_networks.py rename to test/test_pipeline/components/setup/test_setup_networks.py diff --git a/test/test_pipeline/components/test_setup_preprocessing_node.py b/test/test_pipeline/components/setup/test_setup_preprocessing_node.py similarity index 100% rename from test/test_pipeline/components/test_setup_preprocessing_node.py rename to test/test_pipeline/components/setup/test_setup_preprocessing_node.py diff --git a/test/test_pipeline/components/test_setup_traditional_classification.py b/test/test_pipeline/components/setup/test_setup_traditional_classification.py similarity index 100% rename from test/test_pipeline/components/test_setup_traditional_classification.py rename to test/test_pipeline/components/setup/test_setup_traditional_classification.py diff --git a/test/test_pipeline/components/training/__init__.py b/test/test_pipeline/components/training/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/test_pipeline/components/base.py b/test/test_pipeline/components/training/base.py similarity index 59% rename from test/test_pipeline/components/base.py rename to test/test_pipeline/components/training/base.py index 6ad3ad824..f4d35b808 100644 --- a/test/test_pipeline/components/base.py +++ b/test/test_pipeline/components/training/base.py @@ -1,22 +1,13 @@ import logging import unittest -from typing import Any, Dict, List, Optional, Tuple from sklearn.datasets import make_classification import torch from autoPyTorch.constants import STRING_TO_OUTPUT_TYPES, STRING_TO_TASK_TYPES -from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.TabularColumnTransformer import \ - TabularColumnTransformer -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding.base_encoder_choice import \ - EncoderChoice -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler_choice import ScalerChoice from autoPyTorch.pipeline.components.training.metrics.utils import get_metrics from autoPyTorch.pipeline.components.training.trainer.base_trainer import BudgetTracker -from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline class BaseTraining(unittest.TestCase): @@ -80,29 +71,3 @@ def _overfit_model(self): # Backward pass loss.backward() self.optimizer.step() - - -class TabularPipeline(TabularClassificationPipeline): - def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]], - ) -> List[Tuple[str, autoPyTorchChoice]]: - """ - Defines what steps a pipeline should follow. - The step itself has choices given via autoPyTorchChoice. - - Returns: - List[Tuple[str, autoPyTorchChoice]]: list of steps sequentially exercised - by the pipeline. - """ - steps = [] # type: List[Tuple[str, autoPyTorchChoice]] - - default_dataset_properties = {'target_type': 'tabular_classification'} - if dataset_properties is not None: - default_dataset_properties.update(dataset_properties) - - steps.extend([ - ("imputer", SimpleImputer()), - ("encoder", EncoderChoice(default_dataset_properties)), - ("scaler", ScalerChoice(default_dataset_properties)), - ("tabular_transformer", TabularColumnTransformer()), - ]) - return steps diff --git a/test/test_pipeline/components/test_feature_data_loader.py b/test/test_pipeline/components/training/test_feature_data_loader.py similarity index 100% rename from test/test_pipeline/components/test_feature_data_loader.py rename to test/test_pipeline/components/training/test_feature_data_loader.py diff --git a/test/test_pipeline/components/test_image_data_loader.py b/test/test_pipeline/components/training/test_image_data_loader.py similarity index 100% rename from test/test_pipeline/components/test_image_data_loader.py rename to test/test_pipeline/components/training/test_image_data_loader.py diff --git a/test/test_pipeline/components/test_training.py b/test/test_pipeline/components/training/test_training.py similarity index 100% rename from test/test_pipeline/components/test_training.py rename to test/test_pipeline/components/training/test_training.py diff --git a/test/test_pipeline/test_tabular_classification.py b/test/test_pipeline/test_tabular_classification.py index 8f87d62ca..0da963c75 100644 --- a/test/test_pipeline/test_tabular_classification.py +++ b/test/test_pipeline/test_tabular_classification.py @@ -20,8 +20,7 @@ parse_hyperparameter_search_space_updates -@pytest.mark.parametrize("fit_dictionary", ['fit_dictionary_numerical_only', - 'fit_dictionary_categorical_only', +@pytest.mark.parametrize("fit_dictionary", ['fit_dictionary_categorical_only', 'fit_dictionary_num_and_categorical'], indirect=True) class TestTabularClassification: def _assert_pipeline_search_space(self, pipeline, search_space_updates): From 9be86a5bc9dcb4932454ee29303b086c9c4515f9 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 15 Feb 2021 22:24:05 +0100 Subject: [PATCH 04/68] ADD tests for network embedding --- .../encoding/OrdinalEncoder.py | 33 -------- .../LearnedEntityEmbedding.py | 29 ++++--- .../setup/network_embedding/NoEmbedding.py | 15 ++-- .../base_network_embedding.py | 20 ++--- .../pipeline/tabular_classification.py | 75 ++++++++++--------- autoPyTorch/pipeline/tabular_regression.py | 56 +++++++------- .../components/setup/test_setup_networks.py | 18 ++++- .../test_tabular_classification.py | 12 ++- 8 files changed, 122 insertions(+), 136 deletions(-) delete mode 100644 autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/OrdinalEncoder.py diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/OrdinalEncoder.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/OrdinalEncoder.py deleted file mode 100644 index c65726327..000000000 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/OrdinalEncoder.py +++ /dev/null @@ -1,33 +0,0 @@ -from typing import Any, Dict, Optional, Union - -import numpy as np - -from sklearn.preprocessing import OrdinalEncoder as OE - -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding.base_encoder import BaseEncoder - - -class OrdinalEncoder(BaseEncoder): - """ - Encode categorical features as a one-hot numerical array - """ - def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = None): - super().__init__() - self.random_state = random_state - - def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEncoder: - - self.check_requirements(X, y) - - self.preprocessor['categorical'] = OE(handle_unknown='use_encoded_value', - unknown_value=-1, - ) - return self - - @staticmethod - def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]: - return { - 'shortname': 'OrdinalEncoder', - 'name': 'Ordinal Encoder', - 'handles_sparse': False - } diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py index 41d2da581..3910afc37 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, Optional, Union +from typing import Any, Dict, Optional, Tuple, Union from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import ( @@ -15,16 +15,15 @@ class _LearnedEntityEmbedding(nn.Module): - """ Parent class for MlpNet, ResNet, ... Can use entity embedding for categorical features""" + """ Learned entity embedding module for categorical features""" - def __init__(self, config, num_input_features, num_numerical_features): + def __init__(self, config: Dict[str, Any], num_input_features: np.ndarray, num_numerical_features: int): """ - Initialize the BaseFeatureNet. Arguments: - config: The configuration sampled by the hyperparameter optimizer - # TODO: fix this - num_input_features: the number of features of the dataset - num_numerical_features: OneHot encoder, that is used to encode X + config (Dict[str, Any]): The configuration sampled by the hyperparameter optimizer + num_input_features (np.ndarray): column wise information of number of output columns after transformation + for each categorical column and 0 for numerical columns + num_numerical_features (int): number of numerical features in X """ super().__init__() self.config = config @@ -51,7 +50,7 @@ def __init__(self, config, num_input_features, num_numerical_features): self.ee_layers = self._create_ee_layers() - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: # pass the columns of each categorical feature through entity embedding layer # before passing it through the model concat_seq = [] @@ -73,11 +72,11 @@ def forward(self, x): concat_seq.append(x[:, last_concat:]) return torch.cat(concat_seq, dim=1) - def _create_ee_layers(self): + def _create_ee_layers(self) -> nn.ModuleList: # entity embeding layers are Linear Layers layers = nn.ModuleList() - for i, (num_in, embed, num_out) in enumerate( - zip(self.num_input_features, self.embed_features, self.num_output_dimensions)): + for i, (num_in, embed, num_out) in enumerate(zip(self.num_input_features, self.embed_features, + self.num_output_dimensions)): if not embed: continue layers.append(nn.Linear(num_in, num_out)) @@ -93,7 +92,7 @@ def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = N super().__init__(random_state=random_state) self.config = kwargs - def build_embedding(self, num_input_features, num_numerical_features) -> nn.Module: + def build_embedding(self, num_input_features: np.ndarray, num_numerical_features: int) -> nn.Module: return _LearnedEntityEmbedding(config=self.config, num_input_features=num_input_features, num_numerical_features=num_numerical_features) @@ -101,8 +100,8 @@ def build_embedding(self, num_input_features, num_numerical_features) -> nn.Modu @staticmethod def get_hyperparameter_search_space( dataset_properties: Optional[Dict[str, str]] = None, - min_unique_values_for_embedding=((3, 7), 5, True), - dimension_reduction=((0, 1), 0.5), + min_unique_values_for_embedding: Tuple[Tuple, int, bool] = ((3, 7), 5, True), + dimension_reduction: Tuple[Tuple, float] = ((0, 1), 0.5), ) -> ConfigurationSpace: cs = ConfigurationSpace() min_hp = UniformIntegerHyperparameter("min_unique_values_for_embedding", diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py index 0f18b5ed6..a8b81af2f 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py @@ -4,18 +4,14 @@ import numpy as np +import torch from torch import nn from autoPyTorch.pipeline.components.setup.network_embedding.base_network_embedding import NetworkEmbeddingComponent class _NoEmbedding(nn.Module): - def __init__(self, num_input_features, num_numerical_features): - super().__init__() - self.n_feats = num_input_features - self.num_numerical = num_numerical_features - - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: return x @@ -27,9 +23,8 @@ class NoEmbedding(NetworkEmbeddingComponent): def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = None): super().__init__(random_state=random_state) - def build_embedding(self, num_input_features, num_numerical_features) -> nn.Module: - return _NoEmbedding(num_input_features=num_input_features, - num_numerical_features=num_numerical_features) + def build_embedding(self, num_input_features: np.ndarray, num_numerical_features: int) -> nn.Module: + return _NoEmbedding() @staticmethod def get_hyperparameter_search_space( @@ -46,4 +41,4 @@ def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[ 'handles_tabular': True, 'handles_image': False, 'handles_time_series': False, - } \ No newline at end of file + } diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py index e27cac3c0..8652c347c 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py @@ -1,5 +1,5 @@ import copy -from typing import Any, Dict, Optional, Union +from typing import Any, Dict, Optional, Tuple, Union import numpy as np @@ -8,12 +8,13 @@ from torch import nn from autoPyTorch.pipeline.components.setup.base_setup import autoPyTorchSetupComponent -from autoPyTorch.utils.common import subsampler + class NetworkEmbeddingComponent(autoPyTorchSetupComponent): def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = None): super().__init__() self.embedding: Optional[nn.Module] = None + self.random_state = random_state def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: @@ -28,21 +29,22 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: X.update({'network_embedding': self.embedding}) return X - def build_embedding(self, num_input_features, num_numerical_features) -> nn.Module: + def build_embedding(self, num_input_features: np.ndarray, num_numerical_features: int) -> nn.Module: raise NotImplementedError - def _get_args(self, X: Dict[str, Any]) -> Union[int, np.ndarray]: + def _get_args(self, X: Dict[str, Any]) -> Tuple[int, np.ndarray]: # Feature preprocessors can alter numerical columns if len(X['dataset_properties']['numerical_columns']) == 0: num_numerical_columns = 0 else: X_train = copy.deepcopy(X['backend'].load_datamanager().train_tensors[0][:2]) - # as numerical pipeline will always be the first pipeline - numerical_column_transformer = X['tabular_transformer'].preprocessor.named_transformers_['numerical_pipeline'] + + numerical_column_transformer = X['tabular_transformer'].preprocessor. \ + named_transformers_['numerical_pipeline'] num_numerical_columns = numerical_column_transformer.transform( - X_train.iloc[:, X['dataset_properties']['numerical_columns']]).shape[1] - num_input_features = np.zeros((num_numerical_columns + - len(X['dataset_properties']['categorical_columns'])), dtype=int) + X_train[:, X['dataset_properties']['numerical_columns']]).shape[1] + num_input_features = np.zeros((num_numerical_columns + len(X['dataset_properties']['categorical_columns'])), + dtype=int) categories = X['dataset_properties']['categories'] for i, category in enumerate(categories): diff --git a/autoPyTorch/pipeline/tabular_classification.py b/autoPyTorch/pipeline/tabular_classification.py index 298d0f1b3..166a4085a 100644 --- a/autoPyTorch/pipeline/tabular_classification.py +++ b/autoPyTorch/pipeline/tabular_classification.py @@ -3,7 +3,7 @@ from typing import Any, Dict, List, Optional, Tuple from ConfigSpace.configuration_space import Configuration, ConfigurationSpace -from ConfigSpace.forbidden import ForbiddenEqualsClause, ForbiddenAndConjunction +from ConfigSpace.forbidden import ForbiddenAndConjunction, ForbiddenEqualsClause import numpy as np @@ -64,15 +64,15 @@ class TabularClassificationPipeline(ClassifierMixin, BasePipeline): """ def __init__( - self, - config: Optional[Configuration] = None, - steps: Optional[List[Tuple[str, autoPyTorchChoice]]] = None, - dataset_properties: Optional[Dict[str, Any]] = None, - include: Optional[Dict[str, Any]] = None, - exclude: Optional[Dict[str, Any]] = None, - random_state: Optional[np.random.RandomState] = None, - init_params: Optional[Dict[str, Any]] = None, - search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None + self, + config: Optional[Configuration] = None, + steps: Optional[List[Tuple[str, autoPyTorchChoice]]] = None, + dataset_properties: Optional[Dict[str, Any]] = None, + include: Optional[Dict[str, Any]] = None, + exclude: Optional[Dict[str, Any]] = None, + random_state: Optional[np.random.RandomState] = None, + init_params: Optional[Dict[str, Any]] = None, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None ): super().__init__( config, steps, dataset_properties, include, exclude, @@ -145,10 +145,10 @@ def predict_proba(self, X: np.ndarray, batch_size: Optional[int] = None) -> np.n return y def _get_hyperparameter_search_space( - self, - dataset_properties: Dict[str, Any], - include: Optional[Dict[str, Any]] = None, - exclude: Optional[Dict[str, Any]] = None, + self, + dataset_properties: Dict[str, Any], + include: Optional[Dict[str, Any]] = None, + exclude: Optional[Dict[str, Any]] = None, ) -> ConfigurationSpace: """Create the hyperparameter configuration space. @@ -192,31 +192,32 @@ def _get_hyperparameter_search_space( # Here we add custom code, like this with this # is not a valid configuration # Learned Entity Embedding is only valid when encoder is one hot encoder - embeddings = cs.get_hyperparameter('network_embedding:__choice__').choices - encoders = cs.get_hyperparameter('encoder:__choice__').choices - default = cs.get_hyperparameter('network_embedding:__choice__').default_value - possible_default_embeddings = copy.copy(list(embeddings)) - del possible_default_embeddings[possible_default_embeddings.index(default)] if 'network_embedding' in self.named_steps.keys() and 'encoder' in self.named_steps.keys(): - for encoder in encoders: - if encoder == 'OneHotEncoder': - continue - while True: - try: - cs.add_forbidden_clause(ForbiddenAndConjunction( - ForbiddenEqualsClause(cs.get_hyperparameter( - 'network_embedding:__choice__'), 'LearnedEntityEmbedding'), - ForbiddenEqualsClause(cs.get_hyperparameter('encoder:__choice__') - , encoder) - )) - break - except ValueError: - # change the default and try again + embeddings = cs.get_hyperparameter('network_embedding:__choice__').choices + if 'LearnedEntityEmbedding' in embeddings: + encoders = cs.get_hyperparameter('encoder:__choice__').choices + default = cs.get_hyperparameter('network_embedding:__choice__').default_value + possible_default_embeddings = copy.copy(list(embeddings)) + del possible_default_embeddings[possible_default_embeddings.index(default)] + + for encoder in encoders: + if encoder == 'OneHotEncoder': + continue + while True: try: - default = possible_default_embeddings.pop() - except IndexError: - raise ValueError("Cannot find a legal default configuration") - cs.get_hyperparameter('network_embedding:__choice__').default_value = default + cs.add_forbidden_clause(ForbiddenAndConjunction( + ForbiddenEqualsClause(cs.get_hyperparameter( + 'network_embedding:__choice__'), 'LearnedEntityEmbedding'), + ForbiddenEqualsClause(cs.get_hyperparameter('encoder:__choice__'), encoder) + )) + break + except ValueError: + # change the default and try again + try: + default = possible_default_embeddings.pop() + except IndexError: + raise ValueError("Cannot find a legal default configuration") + cs.get_hyperparameter('network_embedding:__choice__').default_value = default self.configuration_space = cs self.dataset_properties = dataset_properties diff --git a/autoPyTorch/pipeline/tabular_regression.py b/autoPyTorch/pipeline/tabular_regression.py index 52a65272f..db83a19f3 100644 --- a/autoPyTorch/pipeline/tabular_regression.py +++ b/autoPyTorch/pipeline/tabular_regression.py @@ -1,3 +1,4 @@ +import copy import warnings from typing import Any, Dict, List, Optional, Tuple @@ -93,10 +94,10 @@ def score(self, X: np.ndarray, y: np.ndarray, batch_size: Optional[int] = None) return r2 def _get_hyperparameter_search_space( - self, - dataset_properties: Dict[str, Any], - include: Optional[Dict[str, Any]] = None, - exclude: Optional[Dict[str, Any]] = None, + self, + dataset_properties: Dict[str, Any], + include: Optional[Dict[str, Any]] = None, + exclude: Optional[Dict[str, Any]] = None, ) -> ConfigurationSpace: """Create the hyperparameter configuration space. @@ -140,31 +141,32 @@ def _get_hyperparameter_search_space( # Here we add custom code, like this with this # is not a valid configuration # Learned Entity Embedding is only valid when encoder is one hot encoder - embeddings = cs.get_hyperparameter('network_embedding:__choice__').choices - encoders = cs.get_hyperparameter('encoder:__choice__').choices - default = cs.get_hyperparameter('network_embedding:__choice__').default_value - possible_default_embeddings = copy.copy(list(embeddings)) - del possible_default_embeddings[possible_default_embeddings.index(default)] if 'network_embedding' in self.named_steps.keys() and 'encoder' in self.named_steps.keys(): - for encoder in encoders: - if encoder == 'OneHotEncoder': - continue - while True: - try: - cs.add_forbidden_clause(ForbiddenAndConjunction( - ForbiddenEqualsClause(cs.get_hyperparameter( - 'network_embedding:__choice__'), 'LearnedEntityEmbedding'), - ForbiddenEqualsClause(cs.get_hyperparameter('encoder:__choice__') - , encoder) - )) - break - except ValueError: - # change the default and try again + embeddings = cs.get_hyperparameter('network_embedding:__choice__').choices + if 'LearnedEntityEmbedding' in embeddings: + encoders = cs.get_hyperparameter('encoder:__choice__').choices + default = cs.get_hyperparameter('network_embedding:__choice__').default_value + possible_default_embeddings = copy.copy(list(embeddings)) + del possible_default_embeddings[possible_default_embeddings.index(default)] + + for encoder in encoders: + if encoder == 'OneHotEncoder': + continue + while True: try: - default = possible_default_embeddings.pop() - except IndexError: - raise ValueError("Cannot find a legal default configuration") - cs.get_hyperparameter('network_embedding:__choice__').default_value = default + cs.add_forbidden_clause(ForbiddenAndConjunction( + ForbiddenEqualsClause(cs.get_hyperparameter( + 'network_embedding:__choice__'), 'LearnedEntityEmbedding'), + ForbiddenEqualsClause(cs.get_hyperparameter('encoder:__choice__'), encoder) + )) + break + except ValueError: + # change the default and try again + try: + default = possible_default_embeddings.pop() + except IndexError: + raise ValueError("Cannot find a legal default configuration") + cs.get_hyperparameter('network_embedding:__choice__').default_value = default self.configuration_space = cs self.dataset_properties = dataset_properties diff --git a/test/test_pipeline/components/setup/test_setup_networks.py b/test/test_pipeline/components/setup/test_setup_networks.py index 46debb0c5..9566e82cb 100644 --- a/test/test_pipeline/components/setup/test_setup_networks.py +++ b/test/test_pipeline/components/setup/test_setup_networks.py @@ -17,21 +17,33 @@ def head(request): return request.param +@pytest.fixture(params=['LearnedEntityEmbedding', 'NoEmbedding']) +def embedding(request): + return request.param + + @flaky.flaky(max_runs=3) @pytest.mark.parametrize("fit_dictionary", ['fit_dictionary_numerical_only', 'fit_dictionary_categorical_only', 'fit_dictionary_num_and_categorical'], indirect=True) class TestNetworks: - def test_pipeline_fit(self, fit_dictionary, backbone, head): + def test_pipeline_fit(self, fit_dictionary, embedding, backbone, head): """This test makes sure that the pipeline is able to fit - given random combinations of hyperparameters across the pipeline""" + every combination of network embedding, backbone, head""" + include = {'network_backbone': [backbone], 'network_head': [head], 'network_embedding': [embedding]} + + if len(fit_dictionary['dataset_properties'] + ['categorical_columns']) == 0 and embedding == 'LearnedEntityEmbedding': + pytest.skip("Learned Entity Embedding is not used with numerical only data") pipeline = TabularClassificationPipeline( dataset_properties=fit_dictionary['dataset_properties'], - include={'network_backbone': [backbone], 'network_head': [head]}) + include=include) + cs = pipeline.get_hyperparameter_search_space() config = cs.get_default_configuration() + assert embedding == config.get('network_embedding:__choice__', None) assert backbone == config.get('network_backbone:__choice__', None) assert head == config.get('network_head:__choice__', None) pipeline.set_hyperparameters(config) diff --git a/test/test_pipeline/test_tabular_classification.py b/test/test_pipeline/test_tabular_classification.py index 0da963c75..340f719ff 100644 --- a/test/test_pipeline/test_tabular_classification.py +++ b/test/test_pipeline/test_tabular_classification.py @@ -20,7 +20,8 @@ parse_hyperparameter_search_space_updates -@pytest.mark.parametrize("fit_dictionary", ['fit_dictionary_categorical_only', +@pytest.mark.parametrize("fit_dictionary", ['fit_dictionary_numerical_only', + 'fit_dictionary_categorical_only', 'fit_dictionary_num_and_categorical'], indirect=True) class TestTabularClassification: def _assert_pipeline_search_space(self, pipeline, search_space_updates): @@ -33,7 +34,13 @@ def _assert_pipeline_search_space(self, pipeline, search_space_updates): assert any(update.node_name + ':' + update.hyperparameter in name for name in config_space.get_hyperparameter_names()), \ "Can't find hyperparameter: {}".format(update.hyperparameter) - hyperparameter = config_space.get_hyperparameter(update.node_name + ':' + update.hyperparameter + '_1') + # dimension reduction in embedding starts from 0 + if 'embedding' in update.node_name: + hyperparameter = config_space.get_hyperparameter( + update.node_name + ':' + update.hyperparameter + '_0') + else: + hyperparameter = config_space.get_hyperparameter( + update.node_name + ':' + update.hyperparameter + '_1') assert update.default_value == hyperparameter.default_value if isinstance(hyperparameter, (UniformIntegerHyperparameter, UniformFloatHyperparameter)): assert update.value_range[0] == hyperparameter.lower @@ -169,6 +176,7 @@ def test_network_optimizer_lr_handshake(self, fit_dictionary): # Make sure that fitting a network adds a "network" to X assert 'network' in pipeline.named_steps.keys() + fit_dictionary['network_embedding'] = torch.nn.Linear(3, 3) fit_dictionary['network_backbone'] = torch.nn.Linear(3, 4) fit_dictionary['network_head'] = torch.nn.Linear(4, 1) X = pipeline.named_steps['network'].fit( From 1adc9a450269858f7b30ab23030faf8dd2f66c9e Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 15 Feb 2021 22:24:28 +0100 Subject: [PATCH 05/68] Removed ordinal encoder --- .../components/preprocessing/base.py | 2 +- .../components/preprocessing/test_encoders.py | 43 ------------------- 2 files changed, 1 insertion(+), 44 deletions(-) diff --git a/test/test_pipeline/components/preprocessing/base.py b/test/test_pipeline/components/preprocessing/base.py index 7bb4fee70..875ed399c 100644 --- a/test/test_pipeline/components/preprocessing/base.py +++ b/test/test_pipeline/components/preprocessing/base.py @@ -33,4 +33,4 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]], ("scaler", ScalerChoice(default_dataset_properties)), ("tabular_transformer", TabularColumnTransformer()), ]) - return steps \ No newline at end of file + return steps diff --git a/test/test_pipeline/components/preprocessing/test_encoders.py b/test/test_pipeline/components/preprocessing/test_encoders.py index 1f210936f..a901823ba 100644 --- a/test/test_pipeline/components/preprocessing/test_encoders.py +++ b/test/test_pipeline/components/preprocessing/test_encoders.py @@ -8,7 +8,6 @@ from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding.NoEncoder import NoEncoder from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding.OneHotEncoder import OneHotEncoder -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding.OrdinalEncoder import OrdinalEncoder class TestEncoders(unittest.TestCase): @@ -53,48 +52,6 @@ def test_one_hot_encoder_no_unknown(self): # check if the transform is correct assert_array_equal(transformed, [['1.0', '0.0', 1], ['1.0', '0.0', 2]]) - def test_ordinal_encoder(self): - - data = np.array([[1, 'male'], - [1, 'female'], - [3, 'male'], - [2, 'female'], - [2, 'male']]) - - categorical_columns = [1] - numerical_columns = [0] - train_indices = np.array([0, 2, 3]) - test_indices = np.array([1, 4]) - - dataset_properties = { - 'categorical_columns': categorical_columns, - 'numerical_columns': numerical_columns, - 'categories': [['female', 'male', 'unknown']] - } - X = { - 'X_train': data[train_indices], - 'dataset_properties': dataset_properties - } - encoder_component = OrdinalEncoder() - encoder_component.fit(X) - X = encoder_component.transform(X) - - encoder = X['encoder']['categorical'] - - # check if the fit dictionary X is modified as expected - self.assertIsInstance(X['encoder'], dict) - self.assertIsInstance(encoder, BaseEstimator) - self.assertIsNone(X['encoder']['numerical']) - - # make column transformer with returned encoder to fit on data - column_transformer = make_column_transformer((encoder, X['dataset_properties']['categorical_columns']), - remainder='passthrough') - column_transformer = column_transformer.fit(X['X_train']) - transformed = column_transformer.transform(data[test_indices]) - - # check if we got the expected transformed array - assert_array_equal(transformed, [['0.0', 1], ['1.0', 2]]) - def test_none_encoder(self): data = np.array([[1, 'male'], From ae6bb44fb2216da53f7ce3c80108cfb5269bda29 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 15 Feb 2021 22:24:41 +0100 Subject: [PATCH 06/68] Removed ordinal encoder --- .../tabular_preprocessing/encoding/base_encoder_choice.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder_choice.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder_choice.py index 7be7c94a2..df71ff209 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder_choice.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder_choice.py @@ -65,7 +65,7 @@ def get_hyperparameter_search_space(self, raise ValueError("no encoders found, please add a encoder") if default is None: - defaults = ['OneHotEncoder', 'OrdinalEncoder', 'NoEncoder'] + defaults = ['OneHotEncoder', 'NoEncoder'] for default_ in defaults: if default_ in available_preprocessors: if include is not None and default_ not in include: From 8783240e2ef818d4cbb3447ae89a1319e14408c4 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 15 Feb 2021 23:45:38 +0100 Subject: [PATCH 07/68] Add seed for test_losses for reproducibility --- test/test_pipeline/test_losses.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_pipeline/test_losses.py b/test/test_pipeline/test_losses.py index ca3438d58..0a3303c9c 100644 --- a/test/test_pipeline/test_losses.py +++ b/test/test_pipeline/test_losses.py @@ -37,6 +37,7 @@ def test_get_name_error(): @pytest.mark.parametrize('weighted', [True, False]) def test_losses(weighted): + torch.manual_seed(1) list_properties = [{'task_type': 'tabular_classification', 'output_type': 'multiclass'}, {'task_type': 'tabular_classification', 'output_type': 'binary'}, {'task_type': 'tabular_regression', 'output_type': 'continuous'}] From 761fb75ff06000f7e89ea5286721c2497c6b1ec8 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Wed, 17 Feb 2021 13:36:52 +0100 Subject: [PATCH 08/68] Addressed comments --- autoPyTorch/pipeline/tabular_regression.py | 2 ++ .../preprocessing/test_tabular_column_transformer.py | 10 ++++++++++ 2 files changed, 12 insertions(+) diff --git a/autoPyTorch/pipeline/tabular_regression.py b/autoPyTorch/pipeline/tabular_regression.py index db83a19f3..4fe943f98 100644 --- a/autoPyTorch/pipeline/tabular_regression.py +++ b/autoPyTorch/pipeline/tabular_regression.py @@ -24,6 +24,7 @@ from autoPyTorch.pipeline.components.setup.lr_scheduler.base_scheduler_choice import SchedulerChoice from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent from autoPyTorch.pipeline.components.setup.network_backbone.base_network_backbone_choice import NetworkBackboneChoice +from autoPyTorch.pipeline.components.setup.network_embedding.base_network_embedding_choice import NetworkEmbeddingChoice from autoPyTorch.pipeline.components.setup.network_head.base_network_head_choice import NetworkHeadChoice from autoPyTorch.pipeline.components.setup.network_initializer.base_network_init_choice import ( NetworkInitializerChoice @@ -194,6 +195,7 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]], ("scaler", ScalerChoice(default_dataset_properties)), ("tabular_transformer", TabularColumnTransformer()), ("preprocessing", EarlyPreprocessing()), + ("network_embedding", NetworkEmbeddingChoice(default_dataset_properties)), ("network_backbone", NetworkBackboneChoice(default_dataset_properties)), ("network_head", NetworkHeadChoice(default_dataset_properties)), ("network", NetworkComponent(default_dataset_properties)), diff --git a/test/test_pipeline/components/preprocessing/test_tabular_column_transformer.py b/test/test_pipeline/components/preprocessing/test_tabular_column_transformer.py index c0b4d94d3..ff74e068f 100644 --- a/test/test_pipeline/components/preprocessing/test_tabular_column_transformer.py +++ b/test/test_pipeline/components/preprocessing/test_tabular_column_transformer.py @@ -34,6 +34,16 @@ def test_tabular_preprocess(self, fit_dictionary): data = column_transformer.preprocessor.fit_transform(X['X_train']) assert isinstance(data, np.ndarray) + # Make sure no columns are unintentionally dropped after preprocessing + if len(fit_dictionary['dataset_properties']["numerical_columns"]) == 0: + categorical_pipeline = column_transformer.preprocessor.named_transformers_['categorical_pipeline'] + categorical_data = categorical_pipeline.transform(X['X_train']) + assert data.shape[1] == categorical_data.shape[1] + elif len(fit_dictionary['dataset_properties']["categorical_columns"]) == 0: + numerical_pipeline = column_transformer.preprocessor.named_transformers_['numerical_pipeline'] + numerical_data = numerical_pipeline.transform(X['X_train']) + assert data.shape[1] == numerical_data.shape[1] + def test_sparse_data(self, fit_dictionary): X = np.random.binomial(1, 0.1, (100, 2000)) sparse_X = csr_matrix(X) From 4bcbd8889039135a6c92dce0fc858ec1d4d086a5 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 22 Feb 2021 13:58:36 +0100 Subject: [PATCH 09/68] fix flake --- test/test_pipeline/components/training/base.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/test/test_pipeline/components/training/base.py b/test/test_pipeline/components/training/base.py index 25cb6f58e..ce095e7e5 100644 --- a/test/test_pipeline/components/training/base.py +++ b/test/test_pipeline/components/training/base.py @@ -5,7 +5,14 @@ import torch -from autoPyTorch.constants import CLASSIFICATION_TASKS, REGRESSION_TASKS, OUTPUT_TYPES_TO_STRING, CONTINUOUS, BINARY, TASK_TYPES_TO_STRING +from autoPyTorch.constants import ( + BINARY, + CLASSIFICATION_TASKS, + CONTINUOUS, + OUTPUT_TYPES_TO_STRING, + REGRESSION_TASKS, + TASK_TYPES_TO_STRING +) from autoPyTorch.pipeline.components.training.metrics.utils import get_metrics from autoPyTorch.pipeline.components.training.trainer.base_trainer import BaseTrainerComponent, BudgetTracker From aa83d6d43b9307d7e0293a127fdb7416d10d6ed4 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 23 Feb 2021 19:12:59 +0100 Subject: [PATCH 10/68] fix test import training --- test/test_pipeline/components/training/test_training.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_pipeline/components/training/test_training.py b/test/test_pipeline/components/training/test_training.py index 9005d1ad2..d6964fa14 100644 --- a/test/test_pipeline/components/training/test_training.py +++ b/test/test_pipeline/components/training/test_training.py @@ -27,7 +27,7 @@ ) sys.path.append(os.path.dirname(__file__)) -from test.test_pipeline.components.base import BaseTraining # noqa (E402: module level import not at top of file) +from test.test_pipeline.components.training.base import BaseTraining # noqa (E402: module level import not at top of file) class BaseDataLoaderTest(unittest.TestCase): From cbc7e09f00cb0ea595946a5a2ecdde48f23d1ab6 Mon Sep 17 00:00:00 2001 From: chico Date: Tue, 23 Feb 2021 19:36:46 +0100 Subject: [PATCH 11/68] ADD_109 --- test/conftest.py | 51 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/test/conftest.py b/test/conftest.py index f05f573a7..b13a067bd 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -8,6 +8,8 @@ import numpy as np +import openml + import pandas as pd import pytest @@ -23,6 +25,55 @@ from autoPyTorch.utils.pipeline import get_dataset_requirements +@pytest.fixture(scope="session", autouse=True) +def callattr_ahead_of_alltests(request): + """ + This procedure will run at the start of the pytest session. + It will prefetch several task that are going to be used by + the testing face, and it does so in a robust way, until the openml + API provides the desired resources + """ + start_time = time.time() + + tasks_used = [ + 146818, # Australian + 2295, # cholesterol + 2075, # abalone + 2071, # adult + 3, # kr-vs-kp + 9981, # cnae-9 + 146821, # car + 146822, # Segment + 2, # anneal + 53, # vehicle + 5136, # tecator + 4871, # sensory + 4857, # boston + 3916, # kc1 + ] + + # Try to populate the tests 5 times + patience = 5 + for i in range(patience): + try: + # Populate the cache + openml.populate_cache(task_ids=tasks_used) + # Also the bunch + for task in tasks_used: + fetch_openml(data_id=openml.tasks.get_task(task).dataset_id, + return_X_y=True) + break + except Exception as e: + if i == patience - 1: + print("Failed to preload openml dataset for testing after {} iters.".format( + patience + )) + raise e + + print(f"Pre-Fetch of {len(tasks_used)} tasks took {time.time() - start_time} seconds...") + return + + def slugify(text): return re.sub(r'[\[\]]+', '-', text.lower()) From 9dd447f0a75bc85de6bc704c4c4c227396c84983 Mon Sep 17 00:00:00 2001 From: chico Date: Tue, 23 Feb 2021 19:47:51 +0100 Subject: [PATCH 12/68] No print allow --- setup.py | 1 + test/conftest.py | 7 ------- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/setup.py b/setup.py index c496a48c1..1d8e47ba5 100755 --- a/setup.py +++ b/setup.py @@ -47,6 +47,7 @@ "codecov", "pep8", "mypy", + "openml" ], "examples": [ "matplotlib", diff --git a/test/conftest.py b/test/conftest.py index b13a067bd..e18219a80 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -33,8 +33,6 @@ def callattr_ahead_of_alltests(request): the testing face, and it does so in a robust way, until the openml API provides the desired resources """ - start_time = time.time() - tasks_used = [ 146818, # Australian 2295, # cholesterol @@ -65,12 +63,7 @@ def callattr_ahead_of_alltests(request): break except Exception as e: if i == patience - 1: - print("Failed to preload openml dataset for testing after {} iters.".format( - patience - )) raise e - - print(f"Pre-Fetch of {len(tasks_used)} tasks took {time.time() - start_time} seconds...") return From 20a874f7380be3c85ed0bad5b13248afdec51764 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 23 Feb 2021 20:07:53 +0100 Subject: [PATCH 13/68] Fix tests and move to boston --- test/conftest.py | 4 ++-- test/test_api/test_api.py | 2 +- test/test_pipeline/test_tabular_regression.py | 9 ++++++++- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/test/conftest.py b/test/conftest.py index f05f573a7..31059326b 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -189,7 +189,7 @@ def get_tabular_data(task): validator = TabularInputValidator(is_classification=False).fit(X.copy(), y.copy()) elif task == "regression_categorical_only": - X, y = fetch_openml("cholesterol", return_X_y=True, as_frame=True) + X, y = fetch_openml("boston", return_X_y=True, as_frame=True) categorical_columns = [column for column in X.columns if X[column].dtype.name == 'category'] X = X[categorical_columns] @@ -207,7 +207,7 @@ def get_tabular_data(task): validator = TabularInputValidator(is_classification=False).fit(X.copy(), y.copy()) elif task == "regression_numerical_and_categorical": - X, y = fetch_openml("cholesterol", return_X_y=True, as_frame=True) + X, y = fetch_openml("boston", return_X_y=True, as_frame=True) # fill nan values for now since they are not handled properly yet for column in X.columns: diff --git a/test/test_api/test_api.py b/test/test_api/test_api.py index ea7cccd72..4ac194968 100644 --- a/test/test_api/test_api.py +++ b/test/test_api/test_api.py @@ -178,7 +178,7 @@ def test_tabular_classification(openml_id, resampling_strategy, backend): restored_estimator.predict(X_test) -@pytest.mark.parametrize('openml_name', ("cholesterol", )) +@pytest.mark.parametrize('openml_name', ("boston", )) @pytest.mark.parametrize('resampling_strategy', (HoldoutValTypes.holdout_validation, CrossValTypes.k_fold_cross_validation, )) diff --git a/test/test_pipeline/test_tabular_regression.py b/test/test_pipeline/test_tabular_regression.py index 15b8351f9..74de19405 100644 --- a/test/test_pipeline/test_tabular_regression.py +++ b/test/test_pipeline/test_tabular_regression.py @@ -39,7 +39,13 @@ def _assert_pipeline_search_space(self, pipeline, search_space_updates): assert any(update.node_name + ':' + update.hyperparameter in name for name in config_space.get_hyperparameter_names()), \ "Can't find hyperparameter: {}".format(update.hyperparameter) - hyperparameter = config_space.get_hyperparameter(update.node_name + ':' + update.hyperparameter + '_1') + # dimension reduction in embedding starts from 0 + if 'embedding' in update.node_name: + hyperparameter = config_space.get_hyperparameter( + update.node_name + ':' + update.hyperparameter + '_0') + else: + hyperparameter = config_space.get_hyperparameter( + update.node_name + ':' + update.hyperparameter + '_1') assert update.default_value == hyperparameter.default_value if isinstance(hyperparameter, (UniformIntegerHyperparameter, UniformFloatHyperparameter)): assert update.value_range[0] == hyperparameter.lower @@ -199,6 +205,7 @@ def test_network_optimizer_lr_handshake(self, fit_dictionary_tabular): # Make sure that fitting a network adds a "network" to X assert 'network' in pipeline.named_steps.keys() + fit_dictionary_tabular['network_embedding'] = torch.nn.Linear(3, 3) fit_dictionary_tabular['network_backbone'] = torch.nn.Linear(3, 4) fit_dictionary_tabular['network_head'] = torch.nn.Linear(4, 1) X = pipeline.named_steps['network'].fit( From 7eed312c0112820f8f44a3f66355471dc7fd73c0 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Thu, 25 Feb 2021 18:04:43 +0100 Subject: [PATCH 14/68] Debug issue with python 3.6 --- examples/example_tmp_for_debug.py | 168 ++++++++++++++++++++++++++++++ 1 file changed, 168 insertions(+) create mode 100644 examples/example_tmp_for_debug.py diff --git a/examples/example_tmp_for_debug.py b/examples/example_tmp_for_debug.py new file mode 100644 index 000000000..ca31c8f4b --- /dev/null +++ b/examples/example_tmp_for_debug.py @@ -0,0 +1,168 @@ +import os + +import sklearn.datasets +import time +import shutil + +from autoPyTorch.utils.backend import create + +from autoPyTorch.api.tabular_classification import TabularClassificationTask +from autoPyTorch.datasets.resampling_strategy import ( + CrossValTypes, +) +import re + + +from pathlib import Path + + +class DisplayablePath(object): + display_filename_prefix_middle = '├──' + display_filename_prefix_last = '└──' + display_parent_prefix_middle = ' ' + display_parent_prefix_last = '│ ' + + def __init__(self, path, parent_path, is_last): + self.path = Path(str(path)) + self.parent = parent_path + self.is_last = is_last + if self.parent: + self.depth = self.parent.depth + 1 + else: + self.depth = 0 + + @property + def displayname(self): + if self.path.is_dir(): + return self.path.name + '/' + return self.path.name + + @classmethod + def make_tree(cls, root, parent=None, is_last=False, criteria=None): + root = Path(str(root)) + criteria = criteria or cls._default_criteria + + displayable_root = cls(root, parent, is_last) + yield displayable_root + + children = sorted(list(path + for path in root.iterdir() + if criteria(path)), + key=lambda s: str(s).lower()) + count = 1 + for path in children: + is_last = count == len(children) + if path.is_dir(): + yield from cls.make_tree(path, + parent=displayable_root, + is_last=is_last, + criteria=criteria) + else: + yield cls(path, displayable_root, is_last) + count += 1 + + @classmethod + def _default_criteria(cls, path): + return True + + @property + def displayname(self): + if self.path.is_dir(): + return self.path.name + '/' + return self.path.name + + def displayable(self): + if self.parent is None: + return self.displayname + + _filename_prefix = (self.display_filename_prefix_last + if self.is_last + else self.display_filename_prefix_middle) + + parts = ['{!s} {!s}'.format(_filename_prefix, + self.displayname)] + + parent = self.parent + while parent and parent.parent is not None: + parts.append(self.display_parent_prefix_middle + if parent.is_last + else self.display_parent_prefix_last) + parent = parent.parent + + return ''.join(reversed(parts)) + + +def slugify(text): + return re.sub(r'[\[\]]+', '-', text.lower()) + + +test_dir = os.path.dirname(__file__) +tmp = slugify(os.path.join( + test_dir, '.tmp__%s' % __file__)) +output = slugify(os.path.join( + test_dir, '.output__%s' % __file__)) + +for dir in (tmp, output): + for i in range(10): + if os.path.exists(dir): + try: + shutil.rmtree(dir) + break + except OSError: + time.sleep(1) + +# Make sure the folders we wanna create do not already exist. +backend = create( + tmp, + output, + delete_tmp_folder_after_terminate=True, + delete_output_folder_after_terminate=True, +) + +openml_id = 40981 +resampling_strategy = CrossValTypes.k_fold_cross_validation +X, y = sklearn.datasets.fetch_openml( + data_id=int(openml_id), + return_X_y=True, as_frame=True +) +X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, y, random_state=1) + +# Search for a good configuration +estimator = TabularClassificationTask( + backend=backend, + resampling_strategy=resampling_strategy, +) + +estimator.search( + X_train=X_train, y_train=y_train, + X_test=X_test, y_test=y_test, + optimize_metric='accuracy', + total_walltime_limit=150, + func_eval_time_limit=50, + traditional_per_total_budget=0 +) + +# Search for an existing run key in disc. A individual model might have +# a timeout and hence was not written to disc +for i, (run_key, value) in enumerate(estimator.run_history.data.items()): + if i == 0: + # Ignore dummy run + continue + if 'SUCCESS' not in str(value.status): + continue + + run_key_model_run_dir = estimator._backend.get_numrun_directory( + estimator.seed, run_key.config_id, run_key.budget) + if os.path.exists(run_key_model_run_dir): + break + + +model_file = os.path.join( + run_key_model_run_dir, + f"{estimator.seed}.{run_key.config_id}.{run_key.budget}.cv_model" +) +if not os.path.exists(model_file): + paths = DisplayablePath.make_tree(run_key_model_run_dir) + for path in paths: + print(path.displayable()) From cb3b398eaee28ea4ec6b75e662437672c4d43bc5 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Fri, 26 Feb 2021 14:31:17 +0100 Subject: [PATCH 15/68] Debug for python3.6 --- .github/workflows/examples.yml | 2 +- examples/example_tmp_for_debug.py | 183 +++++++----------------------- test/test_api/test_api.py | 10 +- test/utils.py | 77 +++++++++++++ 4 files changed, 127 insertions(+), 145 deletions(-) create mode 100644 test/utils.py diff --git a/.github/workflows/examples.yml b/.github/workflows/examples.yml index b278a8563..764a4815a 100644 --- a/.github/workflows/examples.yml +++ b/.github/workflows/examples.yml @@ -8,7 +8,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.8] + python-version: [3.6] fail-fast: false max-parallel: 2 diff --git a/examples/example_tmp_for_debug.py b/examples/example_tmp_for_debug.py index ca31c8f4b..664eac866 100644 --- a/examples/example_tmp_for_debug.py +++ b/examples/example_tmp_for_debug.py @@ -1,123 +1,17 @@ +""" +Example file to be deleted +""" import os import sklearn.datasets -import time -import shutil - -from autoPyTorch.utils.backend import create from autoPyTorch.api.tabular_classification import TabularClassificationTask from autoPyTorch.datasets.resampling_strategy import ( CrossValTypes, ) -import re - - -from pathlib import Path - - -class DisplayablePath(object): - display_filename_prefix_middle = '├──' - display_filename_prefix_last = '└──' - display_parent_prefix_middle = ' ' - display_parent_prefix_last = '│ ' - - def __init__(self, path, parent_path, is_last): - self.path = Path(str(path)) - self.parent = parent_path - self.is_last = is_last - if self.parent: - self.depth = self.parent.depth + 1 - else: - self.depth = 0 - - @property - def displayname(self): - if self.path.is_dir(): - return self.path.name + '/' - return self.path.name - - @classmethod - def make_tree(cls, root, parent=None, is_last=False, criteria=None): - root = Path(str(root)) - criteria = criteria or cls._default_criteria - - displayable_root = cls(root, parent, is_last) - yield displayable_root - - children = sorted(list(path - for path in root.iterdir() - if criteria(path)), - key=lambda s: str(s).lower()) - count = 1 - for path in children: - is_last = count == len(children) - if path.is_dir(): - yield from cls.make_tree(path, - parent=displayable_root, - is_last=is_last, - criteria=criteria) - else: - yield cls(path, displayable_root, is_last) - count += 1 - - @classmethod - def _default_criteria(cls, path): - return True - - @property - def displayname(self): - if self.path.is_dir(): - return self.path.name + '/' - return self.path.name - - def displayable(self): - if self.parent is None: - return self.displayname - - _filename_prefix = (self.display_filename_prefix_last - if self.is_last - else self.display_filename_prefix_middle) +from test.utils import DisplayablePath - parts = ['{!s} {!s}'.format(_filename_prefix, - self.displayname)] - parent = self.parent - while parent and parent.parent is not None: - parts.append(self.display_parent_prefix_middle - if parent.is_last - else self.display_parent_prefix_last) - parent = parent.parent - - return ''.join(reversed(parts)) - - -def slugify(text): - return re.sub(r'[\[\]]+', '-', text.lower()) - - -test_dir = os.path.dirname(__file__) -tmp = slugify(os.path.join( - test_dir, '.tmp__%s' % __file__)) -output = slugify(os.path.join( - test_dir, '.output__%s' % __file__)) - -for dir in (tmp, output): - for i in range(10): - if os.path.exists(dir): - try: - shutil.rmtree(dir) - break - except OSError: - time.sleep(1) - -# Make sure the folders we wanna create do not already exist. -backend = create( - tmp, - output, - delete_tmp_folder_after_terminate=True, - delete_output_folder_after_terminate=True, -) openml_id = 40981 resampling_strategy = CrossValTypes.k_fold_cross_validation @@ -128,41 +22,44 @@ def slugify(text): X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( X, y, random_state=1) -# Search for a good configuration -estimator = TabularClassificationTask( - backend=backend, - resampling_strategy=resampling_strategy, -) - -estimator.search( - X_train=X_train, y_train=y_train, - X_test=X_test, y_test=y_test, - optimize_metric='accuracy', - total_walltime_limit=150, - func_eval_time_limit=50, - traditional_per_total_budget=0 -) -# Search for an existing run key in disc. A individual model might have -# a timeout and hence was not written to disc -for i, (run_key, value) in enumerate(estimator.run_history.data.items()): - if i == 0: - # Ignore dummy run - continue - if 'SUCCESS' not in str(value.status): - continue +if __name__ == '__main__': + # Search for a good configuration + estimator = TabularClassificationTask( + temporary_directory='./tmp', + delete_tmp_folder_after_terminate=False, + resampling_strategy=resampling_strategy, + ) + + estimator.search( + X_train=X_train, y_train=y_train, + X_test=X_test, y_test=y_test, + optimize_metric='accuracy', + total_walltime_limit=150, + func_eval_time_limit=50, + traditional_per_total_budget=0 + ) + + # Search for an existing run key in disc. A individual model might have + # a timeout and hence was not written to disc + for i, (run_key, value) in enumerate(estimator.run_history.data.items()): + if i == 0: + # Ignore dummy run + continue + if 'SUCCESS' not in str(value.status): + continue + + run_key_model_run_dir = estimator._backend.get_numrun_directory( + estimator.seed, run_key.config_id, run_key.budget) + if os.path.exists(run_key_model_run_dir): + break + + + model_file = os.path.join( + run_key_model_run_dir, + f"{estimator.seed}.{run_key.config_id}.{run_key.budget}.cv_model" + ) - run_key_model_run_dir = estimator._backend.get_numrun_directory( - estimator.seed, run_key.config_id, run_key.budget) - if os.path.exists(run_key_model_run_dir): - break - - -model_file = os.path.join( - run_key_model_run_dir, - f"{estimator.seed}.{run_key.config_id}.{run_key.budget}.cv_model" -) -if not os.path.exists(model_file): paths = DisplayablePath.make_tree(run_key_model_run_dir) for path in paths: print(path.displayable()) diff --git a/test/test_api/test_api.py b/test/test_api/test_api.py index 4ac194968..4cf615e97 100644 --- a/test/test_api/test_api.py +++ b/test/test_api/test_api.py @@ -22,6 +22,7 @@ HoldoutValTypes, ) +from test.utils import DisplayablePath # Fixtures # ======== @@ -120,7 +121,14 @@ def test_tabular_classification(openml_id, resampling_strategy, backend): run_key_model_run_dir, f"{estimator.seed}.{run_key.config_id}.{run_key.budget}.cv_model" ) - assert os.path.exists(model_file), model_file + try: + assert os.path.exists(model_file), model_file + except AssertionError: + paths = DisplayablePath.make_tree(run_key_model_run_dir) + for path in paths: + print(path.displayable()) + raise AssertionError(model_file) + model = estimator._backend.load_cv_model_by_seed_and_id_and_budget( estimator.seed, run_key.config_id, run_key.budget) assert isinstance(model, VotingClassifier) diff --git a/test/utils.py b/test/utils.py new file mode 100644 index 000000000..b1c919c4c --- /dev/null +++ b/test/utils.py @@ -0,0 +1,77 @@ +from pathlib import Path + + +class DisplayablePath(object): + display_filename_prefix_middle = '├──' + display_filename_prefix_last = '└──' + display_parent_prefix_middle = ' ' + display_parent_prefix_last = '│ ' + + def __init__(self, path, parent_path, is_last): + self.path = Path(str(path)) + self.parent = parent_path + self.is_last = is_last + if self.parent: + self.depth = self.parent.depth + 1 + else: + self.depth = 0 + + @property + def displayname(self): + if self.path.is_dir(): + return self.path.name + '/' + return self.path.name + + @classmethod + def make_tree(cls, root, parent=None, is_last=False, criteria=None): + root = Path(str(root)) + criteria = criteria or cls._default_criteria + + displayable_root = cls(root, parent, is_last) + yield displayable_root + + children = sorted(list(path + for path in root.iterdir() + if criteria(path)), + key=lambda s: str(s).lower()) + count = 1 + for path in children: + is_last = count == len(children) + if path.is_dir(): + yield from cls.make_tree(path, + parent=displayable_root, + is_last=is_last, + criteria=criteria) + else: + yield cls(path, displayable_root, is_last) + count += 1 + + @classmethod + def _default_criteria(cls, path): + return True + + @property + def displayname(self): + if self.path.is_dir(): + return self.path.name + '/' + return self.path.name + + def displayable(self): + if self.parent is None: + return self.displayname + + _filename_prefix = (self.display_filename_prefix_last + if self.is_last + else self.display_filename_prefix_middle) + + parts = ['{!s} {!s}'.format(_filename_prefix, + self.displayname)] + + parent = self.parent + while parent and parent.parent is not None: + parts.append(self.display_parent_prefix_middle + if parent.is_last + else self.display_parent_prefix_last) + parent = parent.parent + + return ''.join(reversed(parts)) From 007be7d29df5ec4bf92630af8dac24a11a86d0cb Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Fri, 26 Feb 2021 15:00:25 +0100 Subject: [PATCH 16/68] Run only debug file --- .github/workflows/examples.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/examples.yml b/.github/workflows/examples.yml index b278a8563..53222930b 100644 --- a/.github/workflows/examples.yml +++ b/.github/workflows/examples.yml @@ -30,5 +30,7 @@ jobs: echo "::set-output name=BEFORE::$(git status --porcelain -b)" - name: Run tests run: | - python examples/example_tabular_classification.py - python examples/example_image_classification.py +# python examples/example_tabular_classification.py +# python examples/example_image_classification.py +# python examples/example_tabular_regression.py + python examples/example_tmp_for_debug.py \ No newline at end of file From b1e25d211afbde47304ccbf768f3610e47aa677c Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 8 Feb 2021 14:22:26 +0100 Subject: [PATCH 17/68] work in progress --- .../network/network_embedding/__init__.py | 0 .../base_network_embedding.py | 16 +++++ .../network_embedding/modules/__init__.py | 0 .../modules/learned_entity_embedding.py | 69 +++++++++++++++++++ .../network_embedding/modules/no_embedding.py | 12 ++++ 5 files changed, 97 insertions(+) create mode 100644 autoPyTorch/pipeline/components/setup/network/network_embedding/__init__.py create mode 100644 autoPyTorch/pipeline/components/setup/network/network_embedding/base_network_embedding.py create mode 100644 autoPyTorch/pipeline/components/setup/network/network_embedding/modules/__init__.py create mode 100644 autoPyTorch/pipeline/components/setup/network/network_embedding/modules/learned_entity_embedding.py create mode 100644 autoPyTorch/pipeline/components/setup/network/network_embedding/modules/no_embedding.py diff --git a/autoPyTorch/pipeline/components/setup/network/network_embedding/__init__.py b/autoPyTorch/pipeline/components/setup/network/network_embedding/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/autoPyTorch/pipeline/components/setup/network/network_embedding/base_network_embedding.py b/autoPyTorch/pipeline/components/setup/network/network_embedding/base_network_embedding.py new file mode 100644 index 000000000..5615062d4 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network/network_embedding/base_network_embedding.py @@ -0,0 +1,16 @@ +from typing import Optional, Any + +from sklearn.base import BaseEstimator +from torch import nn + +from autoPyTorch.pipeline.components.setup.base_setup import autoPyTorchSetupComponent + + +class NetworkEmbeddingComponent(autoPyTorchSetupComponent): + def __init__(self, + **kwargs): + super().__init__() + self.config = kwargs + self.embedding: Optional[nn.Module] = None + + # def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/setup/network/network_embedding/modules/__init__.py b/autoPyTorch/pipeline/components/setup/network/network_embedding/modules/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/autoPyTorch/pipeline/components/setup/network/network_embedding/modules/learned_entity_embedding.py b/autoPyTorch/pipeline/components/setup/network/network_embedding/modules/learned_entity_embedding.py new file mode 100644 index 000000000..e6eb88f19 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network/network_embedding/modules/learned_entity_embedding.py @@ -0,0 +1,69 @@ +""" +Class to learn an embedding for categorical hyperparameters. +""" + +import torch +import torch.nn as nn +import numpy as np + + +class LearnedEntityEmbedding(nn.Module): + """ Parent class for MlpNet, ResNet, ... Can use entity embedding for categorical features""" + + def __init__(self, config, in_features, num_numerical_features): + """ + Initialize the BaseFeatureNet. + Arguments: + config: The configuration sampled by the hyperparameter optimizer + in_features: the number of features of the dataset + one_hot_encoder: OneHot encoder, that is used to encode X + """ + super(LearnedEntityEmbedding, self).__init__() + self.config = config + + # self.num_numerical = len([f for f in one_hot_encoder.categorical_features if not f]) + # self.num_input_features = [len(c) for c in one_hot_encoder.categories_] + self.num_numerical = num_numerical_features + self.embed_features = [num_in >= config["min_unique_values_for_embedding"] for num_in in + self.num_input_features] + self.num_output_dimensions = [config["dimension_reduction_" + str(i)] * num_in for i, num_in in + enumerate(self.num_input_features)] + self.num_output_dimensions = [int(np.clip(num_out, 1, num_in - 1)) for num_out, num_in in + zip(self.num_output_dimensions, self.num_input_features)] + self.num_output_dimensions = [num_out if embed else num_in for num_out, embed, num_in in + zip(self.num_output_dimensions, self.embed_features, self.num_input_features)] + self.num_out_feats = self.num_numerical + sum(self.num_output_dimensions) + + self.ee_layers = self._create_ee_layers(in_features) + + def forward(self, x): + # pass the columns of each categorical feature through entity embedding layer + # before passing it through the model + concat_seq = [] + last_concat = 0 + x_pointer = 0 + layer_pointer = 0 + for num_in, embed in zip(self.num_input_features, self.embed_features): + if not embed: + x_pointer += 1 + continue + if x_pointer > last_concat: + concat_seq.append(x[:, last_concat: x_pointer]) + categorical_feature_slice = x[:, x_pointer: x_pointer + num_in] + concat_seq.append(self.ee_layers[layer_pointer](categorical_feature_slice)) + layer_pointer += 1 + x_pointer += num_in + last_concat = x_pointer + + concat_seq.append(x[:, last_concat:]) + return torch.cat(concat_seq, dim=1) + + def _create_ee_layers(self, in_features): + # entity embeding layers are Linear Layers + layers = nn.ModuleList() + for i, (num_in, embed, num_out) in enumerate( + zip(self.num_input_features, self.embed_features, self.num_output_dimensions)): + if not embed: + continue + layers.append(nn.Linear(num_in, num_out)) + return layers \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/setup/network/network_embedding/modules/no_embedding.py b/autoPyTorch/pipeline/components/setup/network/network_embedding/modules/no_embedding.py new file mode 100644 index 000000000..cab1e9b7f --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network/network_embedding/modules/no_embedding.py @@ -0,0 +1,12 @@ +from torch import nn + + +class NoEmbedding(nn.Module): + def __init__(self, config, in_features, num_numerical_features): + super(NoEmbedding, self).__init__() + self.config = config + self.n_feats = in_features + self.num_numerical = num_numerical_features + + def forward(self, x): + return x \ No newline at end of file From f41eae18ac9530d66ff8365cf222db234dd73e90 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 8 Feb 2021 14:44:57 +0100 Subject: [PATCH 18/68] in progress --- .../base_network_embedding.py | 16 ------------ .../network_embedding/__init__.py | 0 .../base_network_embedding.py | 25 +++++++++++++++++++ .../network_embedding/modules/__init__.py | 0 .../modules/learned_entity_embedding.py | 7 +++--- .../network_embedding/modules/no_embedding.py | 0 6 files changed, 29 insertions(+), 19 deletions(-) delete mode 100644 autoPyTorch/pipeline/components/setup/network/network_embedding/base_network_embedding.py rename autoPyTorch/pipeline/components/setup/{network => }/network_embedding/__init__.py (100%) create mode 100644 autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py rename autoPyTorch/pipeline/components/setup/{network => }/network_embedding/modules/__init__.py (100%) rename autoPyTorch/pipeline/components/setup/{network => }/network_embedding/modules/learned_entity_embedding.py (93%) rename autoPyTorch/pipeline/components/setup/{network => }/network_embedding/modules/no_embedding.py (100%) diff --git a/autoPyTorch/pipeline/components/setup/network/network_embedding/base_network_embedding.py b/autoPyTorch/pipeline/components/setup/network/network_embedding/base_network_embedding.py deleted file mode 100644 index 5615062d4..000000000 --- a/autoPyTorch/pipeline/components/setup/network/network_embedding/base_network_embedding.py +++ /dev/null @@ -1,16 +0,0 @@ -from typing import Optional, Any - -from sklearn.base import BaseEstimator -from torch import nn - -from autoPyTorch.pipeline.components.setup.base_setup import autoPyTorchSetupComponent - - -class NetworkEmbeddingComponent(autoPyTorchSetupComponent): - def __init__(self, - **kwargs): - super().__init__() - self.config = kwargs - self.embedding: Optional[nn.Module] = None - - # def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/setup/network/network_embedding/__init__.py b/autoPyTorch/pipeline/components/setup/network_embedding/__init__.py similarity index 100% rename from autoPyTorch/pipeline/components/setup/network/network_embedding/__init__.py rename to autoPyTorch/pipeline/components/setup/network_embedding/__init__.py diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py new file mode 100644 index 000000000..4c087a89d --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py @@ -0,0 +1,25 @@ +from typing import Any, Dict, Optional + +from sklearn.base import BaseEstimator +from torch import nn + +from autoPyTorch.pipeline.components.setup.base_setup import autoPyTorchSetupComponent + + +class NetworkEmbeddingComponent(autoPyTorchSetupComponent): + def __init__(self, + **kwargs): + super().__init__() + self.config = kwargs + self.embedding: Optional[nn.Module] = None + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + in_features = X['X_train'].shape[1:] + + self.embedding = self.build_embedding( + in_features=in_features, + num_numerical_features=len(X['numerical_features'])) + return self + + def build_embedding(self, in_features, num_numerical_features) -> nn.Module: + raise NotImplementedError \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/setup/network/network_embedding/modules/__init__.py b/autoPyTorch/pipeline/components/setup/network_embedding/modules/__init__.py similarity index 100% rename from autoPyTorch/pipeline/components/setup/network/network_embedding/modules/__init__.py rename to autoPyTorch/pipeline/components/setup/network_embedding/modules/__init__.py diff --git a/autoPyTorch/pipeline/components/setup/network/network_embedding/modules/learned_entity_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/modules/learned_entity_embedding.py similarity index 93% rename from autoPyTorch/pipeline/components/setup/network/network_embedding/modules/learned_entity_embedding.py rename to autoPyTorch/pipeline/components/setup/network_embedding/modules/learned_entity_embedding.py index e6eb88f19..d7d294661 100644 --- a/autoPyTorch/pipeline/components/setup/network/network_embedding/modules/learned_entity_embedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/modules/learned_entity_embedding.py @@ -10,7 +10,7 @@ class LearnedEntityEmbedding(nn.Module): """ Parent class for MlpNet, ResNet, ... Can use entity embedding for categorical features""" - def __init__(self, config, in_features, num_numerical_features): + def __init__(self, config, num_input_features, num_numerical_features): """ Initialize the BaseFeatureNet. Arguments: @@ -24,6 +24,7 @@ def __init__(self, config, in_features, num_numerical_features): # self.num_numerical = len([f for f in one_hot_encoder.categorical_features if not f]) # self.num_input_features = [len(c) for c in one_hot_encoder.categories_] self.num_numerical = num_numerical_features + self.num_input_features = num_input_features self.embed_features = [num_in >= config["min_unique_values_for_embedding"] for num_in in self.num_input_features] self.num_output_dimensions = [config["dimension_reduction_" + str(i)] * num_in for i, num_in in @@ -34,7 +35,7 @@ def __init__(self, config, in_features, num_numerical_features): zip(self.num_output_dimensions, self.embed_features, self.num_input_features)] self.num_out_feats = self.num_numerical + sum(self.num_output_dimensions) - self.ee_layers = self._create_ee_layers(in_features) + self.ee_layers = self._create_ee_layers() def forward(self, x): # pass the columns of each categorical feature through entity embedding layer @@ -58,7 +59,7 @@ def forward(self, x): concat_seq.append(x[:, last_concat:]) return torch.cat(concat_seq, dim=1) - def _create_ee_layers(self, in_features): + def _create_ee_layers(self): # entity embeding layers are Linear Layers layers = nn.ModuleList() for i, (num_in, embed, num_out) in enumerate( diff --git a/autoPyTorch/pipeline/components/setup/network/network_embedding/modules/no_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/modules/no_embedding.py similarity index 100% rename from autoPyTorch/pipeline/components/setup/network/network_embedding/modules/no_embedding.py rename to autoPyTorch/pipeline/components/setup/network_embedding/modules/no_embedding.py From 6222399c4d25629d6d870bb30f42a46f3b5ca1ec Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 15 Feb 2021 19:57:00 +0100 Subject: [PATCH 19/68] Working network embedding --- .../TabularColumnTransformer.py | 8 +- .../components/setup/network/base_network.py | 3 +- .../network_backbone/base_network_backbone.py | 6 +- .../LearnedEntityEmbedding.py | 133 +++++++++++++ .../setup/network_embedding/NoEmbedding.py | 49 +++++ .../base_network_embedding.py | 45 ++++- .../base_network_embedding_choice.py | 188 ++++++++++++++++++ .../network_embedding/modules/__init__.py | 0 .../modules/learned_entity_embedding.py | 70 ------- .../network_embedding/modules/no_embedding.py | 12 -- .../pipeline/tabular_classification.py | 30 +++ autoPyTorch/pipeline/tabular_regression.py | 27 +++ .../components/preprocessing/base.py | 36 ++++ .../test_tabular_column_transformer.py | 2 +- .../components/{ => training}/base.py | 37 +--- 15 files changed, 511 insertions(+), 135 deletions(-) create mode 100644 autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py create mode 100644 autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py create mode 100644 autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding_choice.py delete mode 100644 autoPyTorch/pipeline/components/setup/network_embedding/modules/__init__.py delete mode 100644 autoPyTorch/pipeline/components/setup/network_embedding/modules/learned_entity_embedding.py delete mode 100644 autoPyTorch/pipeline/components/setup/network_embedding/modules/no_embedding.py create mode 100644 test/test_pipeline/components/preprocessing/base.py rename test/test_pipeline/components/{ => training}/base.py (69%) diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py index e90f35ed1..e1e08e94e 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py @@ -2,7 +2,7 @@ import numpy as np -from sklearn.compose import ColumnTransformer, make_column_transformer +from sklearn.compose import ColumnTransformer from sklearn.pipeline import make_pipeline import torch @@ -57,9 +57,9 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer": if len(X['dataset_properties']['categorical_columns']): categorical_pipeline = make_pipeline(*preprocessors['categorical']) - self.preprocessor = make_column_transformer( - (numerical_pipeline, X['dataset_properties']['numerical_columns']), - (categorical_pipeline, X['dataset_properties']['categorical_columns']), + self.preprocessor = ColumnTransformer([ + ('numerical_pipeline', numerical_pipeline, X['dataset_properties']['numerical_columns']), + ('categorical_pipeline', categorical_pipeline, X['dataset_properties']['categorical_columns'])], remainder='passthrough' ) diff --git a/autoPyTorch/pipeline/components/setup/network/base_network.py b/autoPyTorch/pipeline/components/setup/network/base_network.py index 4f7c18b7c..81fd8e5f4 100644 --- a/autoPyTorch/pipeline/components/setup/network/base_network.py +++ b/autoPyTorch/pipeline/components/setup/network/base_network.py @@ -29,6 +29,7 @@ def __init__( self.add_fit_requirements([ FitRequirement("network_head", (torch.nn.Module,), user_defined=False, dataset_property=False), FitRequirement("network_backbone", (torch.nn.Module,), user_defined=False, dataset_property=False), + FitRequirement("network_embedding", (torch.nn.Module,), user_defined=False, dataset_property=False), ]) self.final_activation = None @@ -47,7 +48,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchTrainingComponent: # information to fit this stage self.check_requirements(X, y) - self.network = torch.nn.Sequential(X['network_backbone'], X['network_head']) + self.network = torch.nn.Sequential(X['network_embedding'], X['network_backbone'], X['network_head']) # Properly set the network training device if self.device is None: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py index 2557e92b8..241fcb51b 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py @@ -14,6 +14,7 @@ from autoPyTorch.pipeline.components.base_component import ( autoPyTorchComponent, ) +from autoPyTorch.pipeline.components.setup.network_backbone.utils import get_output_shape from autoPyTorch.utils.common import FitRequirement @@ -31,7 +32,9 @@ def __init__(self, FitRequirement('X_train', (np.ndarray, pd.DataFrame, csr_matrix), user_defined=True, dataset_property=False), FitRequirement('input_shape', (Iterable,), user_defined=True, dataset_property=True), - FitRequirement('tabular_transformer', (BaseEstimator,), user_defined=False, dataset_property=False)]) + FitRequirement('tabular_transformer', (BaseEstimator,), user_defined=False, dataset_property=False), + FitRequirement('network_embedding', (nn.Module,), user_defined=False, dataset_property=False) + ]) self.backbone: nn.Module = None self.config = kwargs self.input_shape: Optional[Iterable] = None @@ -56,6 +59,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: column_transformer = X['tabular_transformer'].preprocessor input_shape = column_transformer.transform(X_train[:1]).shape[1:] + input_shape = get_output_shape(X['network_embedding'], input_shape=input_shape) self.input_shape = input_shape self.backbone = self.build_backbone( diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py new file mode 100644 index 000000000..41d2da581 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py @@ -0,0 +1,133 @@ +from typing import Any, Dict, Optional, Union + +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + UniformFloatHyperparameter, + UniformIntegerHyperparameter +) + +import numpy as np + +import torch +from torch import nn + +from autoPyTorch.pipeline.components.setup.network_embedding.base_network_embedding import NetworkEmbeddingComponent + + +class _LearnedEntityEmbedding(nn.Module): + """ Parent class for MlpNet, ResNet, ... Can use entity embedding for categorical features""" + + def __init__(self, config, num_input_features, num_numerical_features): + """ + Initialize the BaseFeatureNet. + Arguments: + config: The configuration sampled by the hyperparameter optimizer + # TODO: fix this + num_input_features: the number of features of the dataset + num_numerical_features: OneHot encoder, that is used to encode X + """ + super().__init__() + self.config = config + + self.num_numerical = num_numerical_features + # list of number of categories of categorical data + # or 0 for numerical data + self.num_input_features = num_input_features + categorical_features = self.num_input_features > 0 + + self.num_categorical_features = self.num_input_features[categorical_features] + + self.embed_features = [num_in >= config["min_unique_values_for_embedding"] for num_in in + self.num_input_features] + self.num_output_dimensions = [0] * num_numerical_features + self.num_output_dimensions.extend([config["dimension_reduction_" + str(i)] * num_in for i, num_in in + enumerate(self.num_categorical_features)]) + self.num_output_dimensions = [int(np.clip(num_out, 1, num_in - 1)) for num_out, num_in in + zip(self.num_output_dimensions, self.num_input_features)] + self.num_output_dimensions = [num_out if embed else num_in for num_out, embed, num_in in + zip(self.num_output_dimensions, self.embed_features, + self.num_input_features)] + self.num_out_feats = self.num_numerical + sum(self.num_output_dimensions) + + self.ee_layers = self._create_ee_layers() + + def forward(self, x): + # pass the columns of each categorical feature through entity embedding layer + # before passing it through the model + concat_seq = [] + last_concat = 0 + x_pointer = 0 + layer_pointer = 0 + for num_in, embed in zip(self.num_input_features, self.embed_features): + if not embed: + x_pointer += 1 + continue + if x_pointer > last_concat: + concat_seq.append(x[:, last_concat: x_pointer]) + categorical_feature_slice = x[:, x_pointer: x_pointer + num_in] + concat_seq.append(self.ee_layers[layer_pointer](categorical_feature_slice)) + layer_pointer += 1 + x_pointer += num_in + last_concat = x_pointer + + concat_seq.append(x[:, last_concat:]) + return torch.cat(concat_seq, dim=1) + + def _create_ee_layers(self): + # entity embeding layers are Linear Layers + layers = nn.ModuleList() + for i, (num_in, embed, num_out) in enumerate( + zip(self.num_input_features, self.embed_features, self.num_output_dimensions)): + if not embed: + continue + layers.append(nn.Linear(num_in, num_out)) + return layers + + +class LearnedEntityEmbedding(NetworkEmbeddingComponent): + """ + Class to learn an embedding for categorical hyperparameters. + """ + + def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = None, **kwargs: Any): + super().__init__(random_state=random_state) + self.config = kwargs + + def build_embedding(self, num_input_features, num_numerical_features) -> nn.Module: + return _LearnedEntityEmbedding(config=self.config, + num_input_features=num_input_features, + num_numerical_features=num_numerical_features) + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, str]] = None, + min_unique_values_for_embedding=((3, 7), 5, True), + dimension_reduction=((0, 1), 0.5), + ) -> ConfigurationSpace: + cs = ConfigurationSpace() + min_hp = UniformIntegerHyperparameter("min_unique_values_for_embedding", + lower=min_unique_values_for_embedding[0][0], + upper=min_unique_values_for_embedding[0][1], + default_value=min_unique_values_for_embedding[1], + log=min_unique_values_for_embedding[2] + ) + cs.add_hyperparameter(min_hp) + if dataset_properties is not None: + for i in range(len(dataset_properties['categorical_columns'])): + ee_dimensions_hp = UniformFloatHyperparameter("dimension_reduction_" + str(i), + lower=dimension_reduction[0][0], + upper=dimension_reduction[0][1], + default_value=dimension_reduction[1] + ) + cs.add_hyperparameter(ee_dimensions_hp) + return cs + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'embedding', + 'name': 'LearnedEntityEmbedding', + 'handles_tabular': True, + 'handles_image': False, + 'handles_time_series': False, + } diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py new file mode 100644 index 000000000..0f18b5ed6 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py @@ -0,0 +1,49 @@ +from typing import Any, Dict, Optional, Union + +from ConfigSpace.configuration_space import ConfigurationSpace + +import numpy as np + +from torch import nn + +from autoPyTorch.pipeline.components.setup.network_embedding.base_network_embedding import NetworkEmbeddingComponent + + +class _NoEmbedding(nn.Module): + def __init__(self, num_input_features, num_numerical_features): + super().__init__() + self.n_feats = num_input_features + self.num_numerical = num_numerical_features + + def forward(self, x): + return x + + +class NoEmbedding(NetworkEmbeddingComponent): + """ + Class to learn an embedding for categorical hyperparameters. + """ + + def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = None): + super().__init__(random_state=random_state) + + def build_embedding(self, num_input_features, num_numerical_features) -> nn.Module: + return _NoEmbedding(num_input_features=num_input_features, + num_numerical_features=num_numerical_features) + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, str]] = None, + ) -> ConfigurationSpace: + cs = ConfigurationSpace() + return cs + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'no embedding', + 'name': 'NoEmbedding', + 'handles_tabular': True, + 'handles_image': False, + 'handles_time_series': False, + } \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py index 4c087a89d..e27cac3c0 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py @@ -1,25 +1,50 @@ -from typing import Any, Dict, Optional +import copy +from typing import Any, Dict, Optional, Union + +import numpy as np from sklearn.base import BaseEstimator + from torch import nn from autoPyTorch.pipeline.components.setup.base_setup import autoPyTorchSetupComponent - +from autoPyTorch.utils.common import subsampler class NetworkEmbeddingComponent(autoPyTorchSetupComponent): - def __init__(self, - **kwargs): + def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = None): super().__init__() - self.config = kwargs self.embedding: Optional[nn.Module] = None def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: - in_features = X['X_train'].shape[1:] + + num_numerical_columns, num_input_features = self._get_args(X) self.embedding = self.build_embedding( - in_features=in_features, - num_numerical_features=len(X['numerical_features'])) + num_input_features=num_input_features, + num_numerical_features=num_numerical_columns) return self - def build_embedding(self, in_features, num_numerical_features) -> nn.Module: - raise NotImplementedError \ No newline at end of file + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + X.update({'network_embedding': self.embedding}) + return X + + def build_embedding(self, num_input_features, num_numerical_features) -> nn.Module: + raise NotImplementedError + + def _get_args(self, X: Dict[str, Any]) -> Union[int, np.ndarray]: + # Feature preprocessors can alter numerical columns + if len(X['dataset_properties']['numerical_columns']) == 0: + num_numerical_columns = 0 + else: + X_train = copy.deepcopy(X['backend'].load_datamanager().train_tensors[0][:2]) + # as numerical pipeline will always be the first pipeline + numerical_column_transformer = X['tabular_transformer'].preprocessor.named_transformers_['numerical_pipeline'] + num_numerical_columns = numerical_column_transformer.transform( + X_train.iloc[:, X['dataset_properties']['numerical_columns']]).shape[1] + num_input_features = np.zeros((num_numerical_columns + + len(X['dataset_properties']['categorical_columns'])), dtype=int) + categories = X['dataset_properties']['categories'] + + for i, category in enumerate(categories): + num_input_features[num_numerical_columns + i, ] = len(category) + return num_numerical_columns, num_input_features diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding_choice.py b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding_choice.py new file mode 100644 index 000000000..c08b156ce --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding_choice.py @@ -0,0 +1,188 @@ +import os +from collections import OrderedDict +from typing import Dict, List, Optional + +import ConfigSpace.hyperparameters as CSH +from ConfigSpace.configuration_space import ConfigurationSpace + +import numpy as np + +from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice +from autoPyTorch.pipeline.components.base_component import ( + ThirdPartyComponents, + autoPyTorchComponent, + find_components, +) +from autoPyTorch.pipeline.components.setup.network_embedding.base_network_embedding import ( + NetworkEmbeddingComponent, +) + +directory = os.path.split(__file__)[0] +_embeddings = find_components(__package__, + directory, + NetworkEmbeddingComponent) +_addons = ThirdPartyComponents(NetworkEmbeddingComponent) + + +def add_embedding(embedding: NetworkEmbeddingComponent) -> None: + _addons.add_component(embedding) + + +class NetworkEmbeddingChoice(autoPyTorchChoice): + + def get_components(self) -> Dict[str, autoPyTorchComponent]: + """Returns the available embedding components + + Args: + None + + Returns: + Dict[str, autoPyTorchComponent]: all baseembedding components available + as choices for learning rate scheduling + """ + components = OrderedDict() + components.update(_embeddings) + components.update(_addons.components) + return components + + def get_available_components( + self, + dataset_properties: Optional[Dict[str, str]] = None, + include: List[str] = None, + exclude: List[str] = None, + ) -> Dict[str, autoPyTorchComponent]: + """Filters out components based on user provided + include/exclude directives, as well as the dataset properties + + Args: + include (Optional[Dict[str, Any]]): what hyper-parameter configurations + to honor when creating the configuration space + exclude (Optional[Dict[str, Any]]): what hyper-parameter configurations + to remove from the configuration space + dataset_properties (Optional[Dict[str, Union[str, int]]]): Caracteristics + of the dataset to guide the pipeline choices of components + + Returns: + Dict[str, autoPyTorchComponent]: A filtered dict of learning + rate embeddings + + """ + if dataset_properties is None: + dataset_properties = {} + + if include is not None and exclude is not None: + raise ValueError( + "The argument include and exclude cannot be used together.") + + available_comp = self.get_components() + + if include is not None: + for incl in include: + if incl not in available_comp: + raise ValueError("Trying to include unknown component: " + "%s" % incl) + + components_dict = OrderedDict() + for name in available_comp: + if include is not None and name not in include: + continue + elif exclude is not None and name in exclude: + continue + + entry = available_comp[name] + + # Exclude itself to avoid infinite loop + if entry == NetworkEmbeddingChoice or hasattr(entry, 'get_components'): + continue + + task_type = dataset_properties['task_type'] + properties = entry.get_properties() + if 'tabular' in task_type and not properties['handles_tabular']: + continue + elif 'image' in task_type and not properties['handles_image']: + continue + elif 'time_series' in task_type and not properties['handles_time_series']: + continue + + components_dict[name] = entry + + return components_dict + + def get_hyperparameter_search_space( + self, + dataset_properties: Optional[Dict[str, str]] = None, + default: Optional[str] = None, + include: Optional[List[str]] = None, + exclude: Optional[List[str]] = None, + ) -> ConfigurationSpace: + """Returns the configuration space of the current chosen components + + Args: + dataset_properties (Optional[Dict[str, str]]): Describes the dataset to work on + default (Optional[str]): Default embedding to use + include: Optional[Dict[str, Any]]: what components to include. It is an exhaustive + list, and will exclusively use this components. + exclude: Optional[Dict[str, Any]]: which components to skip + + Returns: + ConfigurationSpace: the configuration space of the hyper-parameters of the + chosen component + """ + cs = ConfigurationSpace() + + if dataset_properties is None: + dataset_properties = {} + + # Compile a list of legal preprocessors for this problem + available_embedding = self.get_available_components( + dataset_properties=dataset_properties, + include=include, exclude=exclude) + + if len(available_embedding) == 0 and 'tabular' in dataset_properties['task_type']: + raise ValueError("No embedding found") + + if available_embedding == 0: + return cs + + if default is None: + defaults = [ + 'LearnedEntityEmbedding', + 'NoEmbedding' + ] + for default_ in defaults: + if default_ in available_embedding: + default = default_ + break + + if len(dataset_properties['categorical_columns']) == 0: + default = 'NoEmbedding' + if include is not None and default not in include: + raise ValueError("Provided {} in include, however, the dataset " + "is incompatible with it".format(include)) + embedding = CSH.CategoricalHyperparameter('__choice__', + ['NoEmbedding'], + default_value=default) + else: + embedding = CSH.CategoricalHyperparameter('__choice__', + list(available_embedding.keys()), + default_value=default) + + cs.add_hyperparameter(embedding) + for name in embedding.choices: + updates = self._get_search_space_updates(prefix=name) + config_space = available_embedding[name].get_hyperparameter_search_space(dataset_properties, # type: ignore + **updates) + parent_hyperparameter = {'parent': embedding, 'value': name} + cs.add_configuration_space( + name, + config_space, + parent_hyperparameter=parent_hyperparameter + ) + + self.configuration_space_ = cs + self.dataset_properties_ = dataset_properties + return cs + + def transform(self, X: np.ndarray) -> np.ndarray: + assert self.choice is not None, "Cannot call transform before the object is initialized" + return self.choice.transform(X) diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/modules/__init__.py b/autoPyTorch/pipeline/components/setup/network_embedding/modules/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/modules/learned_entity_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/modules/learned_entity_embedding.py deleted file mode 100644 index d7d294661..000000000 --- a/autoPyTorch/pipeline/components/setup/network_embedding/modules/learned_entity_embedding.py +++ /dev/null @@ -1,70 +0,0 @@ -""" -Class to learn an embedding for categorical hyperparameters. -""" - -import torch -import torch.nn as nn -import numpy as np - - -class LearnedEntityEmbedding(nn.Module): - """ Parent class for MlpNet, ResNet, ... Can use entity embedding for categorical features""" - - def __init__(self, config, num_input_features, num_numerical_features): - """ - Initialize the BaseFeatureNet. - Arguments: - config: The configuration sampled by the hyperparameter optimizer - in_features: the number of features of the dataset - one_hot_encoder: OneHot encoder, that is used to encode X - """ - super(LearnedEntityEmbedding, self).__init__() - self.config = config - - # self.num_numerical = len([f for f in one_hot_encoder.categorical_features if not f]) - # self.num_input_features = [len(c) for c in one_hot_encoder.categories_] - self.num_numerical = num_numerical_features - self.num_input_features = num_input_features - self.embed_features = [num_in >= config["min_unique_values_for_embedding"] for num_in in - self.num_input_features] - self.num_output_dimensions = [config["dimension_reduction_" + str(i)] * num_in for i, num_in in - enumerate(self.num_input_features)] - self.num_output_dimensions = [int(np.clip(num_out, 1, num_in - 1)) for num_out, num_in in - zip(self.num_output_dimensions, self.num_input_features)] - self.num_output_dimensions = [num_out if embed else num_in for num_out, embed, num_in in - zip(self.num_output_dimensions, self.embed_features, self.num_input_features)] - self.num_out_feats = self.num_numerical + sum(self.num_output_dimensions) - - self.ee_layers = self._create_ee_layers() - - def forward(self, x): - # pass the columns of each categorical feature through entity embedding layer - # before passing it through the model - concat_seq = [] - last_concat = 0 - x_pointer = 0 - layer_pointer = 0 - for num_in, embed in zip(self.num_input_features, self.embed_features): - if not embed: - x_pointer += 1 - continue - if x_pointer > last_concat: - concat_seq.append(x[:, last_concat: x_pointer]) - categorical_feature_slice = x[:, x_pointer: x_pointer + num_in] - concat_seq.append(self.ee_layers[layer_pointer](categorical_feature_slice)) - layer_pointer += 1 - x_pointer += num_in - last_concat = x_pointer - - concat_seq.append(x[:, last_concat:]) - return torch.cat(concat_seq, dim=1) - - def _create_ee_layers(self): - # entity embeding layers are Linear Layers - layers = nn.ModuleList() - for i, (num_in, embed, num_out) in enumerate( - zip(self.num_input_features, self.embed_features, self.num_output_dimensions)): - if not embed: - continue - layers.append(nn.Linear(num_in, num_out)) - return layers \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/modules/no_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/modules/no_embedding.py deleted file mode 100644 index cab1e9b7f..000000000 --- a/autoPyTorch/pipeline/components/setup/network_embedding/modules/no_embedding.py +++ /dev/null @@ -1,12 +0,0 @@ -from torch import nn - - -class NoEmbedding(nn.Module): - def __init__(self, config, in_features, num_numerical_features): - super(NoEmbedding, self).__init__() - self.config = config - self.n_feats = in_features - self.num_numerical = num_numerical_features - - def forward(self, x): - return x \ No newline at end of file diff --git a/autoPyTorch/pipeline/tabular_classification.py b/autoPyTorch/pipeline/tabular_classification.py index e3abad9cc..f5b668a88 100644 --- a/autoPyTorch/pipeline/tabular_classification.py +++ b/autoPyTorch/pipeline/tabular_classification.py @@ -1,7 +1,9 @@ +import copy import warnings from typing import Any, Dict, List, Optional, Tuple from ConfigSpace.configuration_space import Configuration, ConfigurationSpace +from ConfigSpace.forbidden import ForbiddenEqualsClause, ForbiddenAndConjunction import numpy as np @@ -25,6 +27,7 @@ from autoPyTorch.pipeline.components.setup.lr_scheduler.base_scheduler_choice import SchedulerChoice from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent from autoPyTorch.pipeline.components.setup.network_backbone.base_network_backbone_choice import NetworkBackboneChoice +from autoPyTorch.pipeline.components.setup.network_embedding.base_network_embedding_choice import NetworkEmbeddingChoice from autoPyTorch.pipeline.components.setup.network_head.base_network_head_choice import NetworkHeadChoice from autoPyTorch.pipeline.components.setup.network_initializer.base_network_init_choice import ( NetworkInitializerChoice @@ -188,6 +191,32 @@ def _get_hyperparameter_search_space(self, # Here we add custom code, like this with this # is not a valid configuration + # Learned Entity Embedding is only valid when encoder is one hot encoder + embeddings = cs.get_hyperparameter('network_embedding:__choice__').choices + encoders = cs.get_hyperparameter('encoder:__choice__').choices + default = cs.get_hyperparameter('network_embedding:__choice__').default_value + possible_default_embeddings = copy.copy(list(embeddings)) + del possible_default_embeddings[possible_default_embeddings.index(default)] + if 'network_embedding' in self.named_steps.keys() and 'encoder' in self.named_steps.keys(): + for encoder in encoders: + if encoder == 'OneHotEncoder': + continue + while True: + try: + cs.add_forbidden_clause(ForbiddenAndConjunction( + ForbiddenEqualsClause(cs.get_hyperparameter( + 'network_embedding:__choice__'), 'LearnedEntityEmbedding'), + ForbiddenEqualsClause(cs.get_hyperparameter('encoder:__choice__') + , encoder) + )) + break + except ValueError: + # change the default and try again + try: + default = possible_default_embeddings.pop() + except IndexError: + raise ValueError("Cannot find a legal default configuration") + cs.get_hyperparameter('network_embedding:__choice__').default_value = default self.configuration_space = cs self.dataset_properties = dataset_properties @@ -216,6 +245,7 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]], ("feature_preprocessor", FeatureProprocessorChoice(default_dataset_properties)), ("tabular_transformer", TabularColumnTransformer()), ("preprocessing", EarlyPreprocessing()), + ("network_embedding", NetworkEmbeddingChoice(default_dataset_properties)), ("network_backbone", NetworkBackboneChoice(default_dataset_properties)), ("network_head", NetworkHeadChoice(default_dataset_properties)), ("network", NetworkComponent()), diff --git a/autoPyTorch/pipeline/tabular_regression.py b/autoPyTorch/pipeline/tabular_regression.py index 174e41dee..0c6463c31 100644 --- a/autoPyTorch/pipeline/tabular_regression.py +++ b/autoPyTorch/pipeline/tabular_regression.py @@ -2,6 +2,7 @@ from typing import Any, Dict, List, Optional, Tuple from ConfigSpace.configuration_space import Configuration, ConfigurationSpace +from ConfigSpace.forbidden import ForbiddenAndConjunction, ForbiddenEqualsClause import numpy as np @@ -136,6 +137,32 @@ def _get_hyperparameter_search_space(self, # Here we add custom code, like this with this # is not a valid configuration + # Learned Entity Embedding is only valid when encoder is one hot encoder + embeddings = cs.get_hyperparameter('network_embedding:__choice__').choices + encoders = cs.get_hyperparameter('encoder:__choice__').choices + default = cs.get_hyperparameter('network_embedding:__choice__').default_value + possible_default_embeddings = copy.copy(list(embeddings)) + del possible_default_embeddings[possible_default_embeddings.index(default)] + if 'network_embedding' in self.named_steps.keys() and 'encoder' in self.named_steps.keys(): + for encoder in encoders: + if encoder == 'OneHotEncoder': + continue + while True: + try: + cs.add_forbidden_clause(ForbiddenAndConjunction( + ForbiddenEqualsClause(cs.get_hyperparameter( + 'network_embedding:__choice__'), 'LearnedEntityEmbedding'), + ForbiddenEqualsClause(cs.get_hyperparameter('encoder:__choice__') + , encoder) + )) + break + except ValueError: + # change the default and try again + try: + default = possible_default_embeddings.pop() + except IndexError: + raise ValueError("Cannot find a legal default configuration") + cs.get_hyperparameter('network_embedding:__choice__').default_value = default self.configuration_space = cs self.dataset_properties = dataset_properties diff --git a/test/test_pipeline/components/preprocessing/base.py b/test/test_pipeline/components/preprocessing/base.py new file mode 100644 index 000000000..7bb4fee70 --- /dev/null +++ b/test/test_pipeline/components/preprocessing/base.py @@ -0,0 +1,36 @@ +from typing import Any, Dict, List, Optional, Tuple + +from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.TabularColumnTransformer import \ + TabularColumnTransformer +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding.base_encoder_choice import \ + EncoderChoice +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler_choice import ScalerChoice +from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline + + +class TabularPipeline(TabularClassificationPipeline): + def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]], + ) -> List[Tuple[str, autoPyTorchChoice]]: + """ + Defines what steps a pipeline should follow. + The step itself has choices given via autoPyTorchChoice. + + Returns: + List[Tuple[str, autoPyTorchChoice]]: list of steps sequentially exercised + by the pipeline. + """ + steps = [] # type: List[Tuple[str, autoPyTorchChoice]] + + default_dataset_properties = {'target_type': 'tabular_classification'} + if dataset_properties is not None: + default_dataset_properties.update(dataset_properties) + + steps.extend([ + ("imputer", SimpleImputer()), + ("encoder", EncoderChoice(default_dataset_properties)), + ("scaler", ScalerChoice(default_dataset_properties)), + ("tabular_transformer", TabularColumnTransformer()), + ]) + return steps \ No newline at end of file diff --git a/test/test_pipeline/components/preprocessing/test_tabular_column_transformer.py b/test/test_pipeline/components/preprocessing/test_tabular_column_transformer.py index ef113c5eb..8e31bad05 100644 --- a/test/test_pipeline/components/preprocessing/test_tabular_column_transformer.py +++ b/test/test_pipeline/components/preprocessing/test_tabular_column_transformer.py @@ -1,4 +1,4 @@ -from test.test_pipeline.components.base import TabularPipeline +from test.test_pipeline.components.preprocessing.base import TabularPipeline import numpy as np diff --git a/test/test_pipeline/components/base.py b/test/test_pipeline/components/training/base.py similarity index 69% rename from test/test_pipeline/components/base.py rename to test/test_pipeline/components/training/base.py index 8211172e7..eaa80cf88 100644 --- a/test/test_pipeline/components/base.py +++ b/test/test_pipeline/components/training/base.py @@ -1,6 +1,5 @@ import logging import unittest -from typing import Any, Dict, List, Optional, Tuple from sklearn.datasets import make_classification, make_regression @@ -8,16 +7,8 @@ from autoPyTorch.constants import BINARY, CLASSIFICATION_TASKS, CONTINUOUS, OUTPUT_TYPES_TO_STRING, REGRESSION_TASKS, \ TASK_TYPES_TO_STRING -from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.TabularColumnTransformer import \ - TabularColumnTransformer -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding.base_encoder_choice import \ - EncoderChoice -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler_choice import ScalerChoice -from autoPyTorch.pipeline.components.training.metrics.utils import get_metrics from autoPyTorch.pipeline.components.training.trainer.base_trainer import BaseTrainerComponent, BudgetTracker -from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline +from autoPyTorch.pipeline.components.training.metrics.utils import get_metrics class BaseTraining(unittest.TestCase): @@ -121,29 +112,3 @@ def train_model(self, # Backward pass loss.backward() optimizer.step() - - -class TabularPipeline(TabularClassificationPipeline): - def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]], - ) -> List[Tuple[str, autoPyTorchChoice]]: - """ - Defines what steps a pipeline should follow. - The step itself has choices given via autoPyTorchChoice. - - Returns: - List[Tuple[str, autoPyTorchChoice]]: list of steps sequentially exercised - by the pipeline. - """ - steps = [] # type: List[Tuple[str, autoPyTorchChoice]] - - default_dataset_properties = {'target_type': 'tabular_classification'} - if dataset_properties is not None: - default_dataset_properties.update(dataset_properties) - - steps.extend([ - ("imputer", SimpleImputer()), - ("encoder", EncoderChoice(default_dataset_properties)), - ("scaler", ScalerChoice(default_dataset_properties)), - ("tabular_transformer", TabularColumnTransformer()), - ]) - return steps From b96e32a363c0cebf0c201eedec3d218da0315c19 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 15 Feb 2021 22:24:05 +0100 Subject: [PATCH 20/68] ADD tests for network embedding --- .../encoding/OrdinalEncoder.py | 33 --------- .../LearnedEntityEmbedding.py | 29 ++++---- .../setup/network_embedding/NoEmbedding.py | 15 ++--- .../base_network_embedding.py | 20 +++--- .../pipeline/tabular_classification.py | 67 ++++++++++--------- autoPyTorch/pipeline/tabular_regression.py | 48 ++++++------- .../components/setup/test_setup_networks.py | 23 ++++++- .../test_tabular_classification.py | 9 ++- 8 files changed, 119 insertions(+), 125 deletions(-) delete mode 100644 autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/OrdinalEncoder.py diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/OrdinalEncoder.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/OrdinalEncoder.py deleted file mode 100644 index c65726327..000000000 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/OrdinalEncoder.py +++ /dev/null @@ -1,33 +0,0 @@ -from typing import Any, Dict, Optional, Union - -import numpy as np - -from sklearn.preprocessing import OrdinalEncoder as OE - -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding.base_encoder import BaseEncoder - - -class OrdinalEncoder(BaseEncoder): - """ - Encode categorical features as a one-hot numerical array - """ - def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = None): - super().__init__() - self.random_state = random_state - - def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEncoder: - - self.check_requirements(X, y) - - self.preprocessor['categorical'] = OE(handle_unknown='use_encoded_value', - unknown_value=-1, - ) - return self - - @staticmethod - def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]: - return { - 'shortname': 'OrdinalEncoder', - 'name': 'Ordinal Encoder', - 'handles_sparse': False - } diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py index 41d2da581..3910afc37 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, Optional, Union +from typing import Any, Dict, Optional, Tuple, Union from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import ( @@ -15,16 +15,15 @@ class _LearnedEntityEmbedding(nn.Module): - """ Parent class for MlpNet, ResNet, ... Can use entity embedding for categorical features""" + """ Learned entity embedding module for categorical features""" - def __init__(self, config, num_input_features, num_numerical_features): + def __init__(self, config: Dict[str, Any], num_input_features: np.ndarray, num_numerical_features: int): """ - Initialize the BaseFeatureNet. Arguments: - config: The configuration sampled by the hyperparameter optimizer - # TODO: fix this - num_input_features: the number of features of the dataset - num_numerical_features: OneHot encoder, that is used to encode X + config (Dict[str, Any]): The configuration sampled by the hyperparameter optimizer + num_input_features (np.ndarray): column wise information of number of output columns after transformation + for each categorical column and 0 for numerical columns + num_numerical_features (int): number of numerical features in X """ super().__init__() self.config = config @@ -51,7 +50,7 @@ def __init__(self, config, num_input_features, num_numerical_features): self.ee_layers = self._create_ee_layers() - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: # pass the columns of each categorical feature through entity embedding layer # before passing it through the model concat_seq = [] @@ -73,11 +72,11 @@ def forward(self, x): concat_seq.append(x[:, last_concat:]) return torch.cat(concat_seq, dim=1) - def _create_ee_layers(self): + def _create_ee_layers(self) -> nn.ModuleList: # entity embeding layers are Linear Layers layers = nn.ModuleList() - for i, (num_in, embed, num_out) in enumerate( - zip(self.num_input_features, self.embed_features, self.num_output_dimensions)): + for i, (num_in, embed, num_out) in enumerate(zip(self.num_input_features, self.embed_features, + self.num_output_dimensions)): if not embed: continue layers.append(nn.Linear(num_in, num_out)) @@ -93,7 +92,7 @@ def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = N super().__init__(random_state=random_state) self.config = kwargs - def build_embedding(self, num_input_features, num_numerical_features) -> nn.Module: + def build_embedding(self, num_input_features: np.ndarray, num_numerical_features: int) -> nn.Module: return _LearnedEntityEmbedding(config=self.config, num_input_features=num_input_features, num_numerical_features=num_numerical_features) @@ -101,8 +100,8 @@ def build_embedding(self, num_input_features, num_numerical_features) -> nn.Modu @staticmethod def get_hyperparameter_search_space( dataset_properties: Optional[Dict[str, str]] = None, - min_unique_values_for_embedding=((3, 7), 5, True), - dimension_reduction=((0, 1), 0.5), + min_unique_values_for_embedding: Tuple[Tuple, int, bool] = ((3, 7), 5, True), + dimension_reduction: Tuple[Tuple, float] = ((0, 1), 0.5), ) -> ConfigurationSpace: cs = ConfigurationSpace() min_hp = UniformIntegerHyperparameter("min_unique_values_for_embedding", diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py index 0f18b5ed6..a8b81af2f 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py @@ -4,18 +4,14 @@ import numpy as np +import torch from torch import nn from autoPyTorch.pipeline.components.setup.network_embedding.base_network_embedding import NetworkEmbeddingComponent class _NoEmbedding(nn.Module): - def __init__(self, num_input_features, num_numerical_features): - super().__init__() - self.n_feats = num_input_features - self.num_numerical = num_numerical_features - - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: return x @@ -27,9 +23,8 @@ class NoEmbedding(NetworkEmbeddingComponent): def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = None): super().__init__(random_state=random_state) - def build_embedding(self, num_input_features, num_numerical_features) -> nn.Module: - return _NoEmbedding(num_input_features=num_input_features, - num_numerical_features=num_numerical_features) + def build_embedding(self, num_input_features: np.ndarray, num_numerical_features: int) -> nn.Module: + return _NoEmbedding() @staticmethod def get_hyperparameter_search_space( @@ -46,4 +41,4 @@ def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[ 'handles_tabular': True, 'handles_image': False, 'handles_time_series': False, - } \ No newline at end of file + } diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py index e27cac3c0..8652c347c 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py @@ -1,5 +1,5 @@ import copy -from typing import Any, Dict, Optional, Union +from typing import Any, Dict, Optional, Tuple, Union import numpy as np @@ -8,12 +8,13 @@ from torch import nn from autoPyTorch.pipeline.components.setup.base_setup import autoPyTorchSetupComponent -from autoPyTorch.utils.common import subsampler + class NetworkEmbeddingComponent(autoPyTorchSetupComponent): def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = None): super().__init__() self.embedding: Optional[nn.Module] = None + self.random_state = random_state def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: @@ -28,21 +29,22 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: X.update({'network_embedding': self.embedding}) return X - def build_embedding(self, num_input_features, num_numerical_features) -> nn.Module: + def build_embedding(self, num_input_features: np.ndarray, num_numerical_features: int) -> nn.Module: raise NotImplementedError - def _get_args(self, X: Dict[str, Any]) -> Union[int, np.ndarray]: + def _get_args(self, X: Dict[str, Any]) -> Tuple[int, np.ndarray]: # Feature preprocessors can alter numerical columns if len(X['dataset_properties']['numerical_columns']) == 0: num_numerical_columns = 0 else: X_train = copy.deepcopy(X['backend'].load_datamanager().train_tensors[0][:2]) - # as numerical pipeline will always be the first pipeline - numerical_column_transformer = X['tabular_transformer'].preprocessor.named_transformers_['numerical_pipeline'] + + numerical_column_transformer = X['tabular_transformer'].preprocessor. \ + named_transformers_['numerical_pipeline'] num_numerical_columns = numerical_column_transformer.transform( - X_train.iloc[:, X['dataset_properties']['numerical_columns']]).shape[1] - num_input_features = np.zeros((num_numerical_columns + - len(X['dataset_properties']['categorical_columns'])), dtype=int) + X_train[:, X['dataset_properties']['numerical_columns']]).shape[1] + num_input_features = np.zeros((num_numerical_columns + len(X['dataset_properties']['categorical_columns'])), + dtype=int) categories = X['dataset_properties']['categories'] for i, category in enumerate(categories): diff --git a/autoPyTorch/pipeline/tabular_classification.py b/autoPyTorch/pipeline/tabular_classification.py index f5b668a88..73dca2878 100644 --- a/autoPyTorch/pipeline/tabular_classification.py +++ b/autoPyTorch/pipeline/tabular_classification.py @@ -3,7 +3,7 @@ from typing import Any, Dict, List, Optional, Tuple from ConfigSpace.configuration_space import Configuration, ConfigurationSpace -from ConfigSpace.forbidden import ForbiddenEqualsClause, ForbiddenAndConjunction +from ConfigSpace.forbidden import ForbiddenAndConjunction, ForbiddenEqualsClause import numpy as np @@ -65,15 +65,15 @@ class TabularClassificationPipeline(ClassifierMixin, BasePipeline): """ def __init__( - self, - config: Optional[Configuration] = None, - steps: Optional[List[Tuple[str, autoPyTorchChoice]]] = None, - dataset_properties: Optional[Dict[str, Any]] = None, - include: Optional[Dict[str, Any]] = None, - exclude: Optional[Dict[str, Any]] = None, - random_state: Optional[np.random.RandomState] = None, - init_params: Optional[Dict[str, Any]] = None, - search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None + self, + config: Optional[Configuration] = None, + steps: Optional[List[Tuple[str, autoPyTorchChoice]]] = None, + dataset_properties: Optional[Dict[str, Any]] = None, + include: Optional[Dict[str, Any]] = None, + exclude: Optional[Dict[str, Any]] = None, + random_state: Optional[np.random.RandomState] = None, + init_params: Optional[Dict[str, Any]] = None, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None ): super().__init__( config, steps, dataset_properties, include, exclude, @@ -192,31 +192,32 @@ def _get_hyperparameter_search_space(self, # Here we add custom code, like this with this # is not a valid configuration # Learned Entity Embedding is only valid when encoder is one hot encoder - embeddings = cs.get_hyperparameter('network_embedding:__choice__').choices - encoders = cs.get_hyperparameter('encoder:__choice__').choices - default = cs.get_hyperparameter('network_embedding:__choice__').default_value - possible_default_embeddings = copy.copy(list(embeddings)) - del possible_default_embeddings[possible_default_embeddings.index(default)] if 'network_embedding' in self.named_steps.keys() and 'encoder' in self.named_steps.keys(): - for encoder in encoders: - if encoder == 'OneHotEncoder': - continue - while True: - try: - cs.add_forbidden_clause(ForbiddenAndConjunction( - ForbiddenEqualsClause(cs.get_hyperparameter( - 'network_embedding:__choice__'), 'LearnedEntityEmbedding'), - ForbiddenEqualsClause(cs.get_hyperparameter('encoder:__choice__') - , encoder) - )) - break - except ValueError: - # change the default and try again + embeddings = cs.get_hyperparameter('network_embedding:__choice__').choices + if 'LearnedEntityEmbedding' in embeddings: + encoders = cs.get_hyperparameter('encoder:__choice__').choices + default = cs.get_hyperparameter('network_embedding:__choice__').default_value + possible_default_embeddings = copy.copy(list(embeddings)) + del possible_default_embeddings[possible_default_embeddings.index(default)] + + for encoder in encoders: + if encoder == 'OneHotEncoder': + continue + while True: try: - default = possible_default_embeddings.pop() - except IndexError: - raise ValueError("Cannot find a legal default configuration") - cs.get_hyperparameter('network_embedding:__choice__').default_value = default + cs.add_forbidden_clause(ForbiddenAndConjunction( + ForbiddenEqualsClause(cs.get_hyperparameter( + 'network_embedding:__choice__'), 'LearnedEntityEmbedding'), + ForbiddenEqualsClause(cs.get_hyperparameter('encoder:__choice__'), encoder) + )) + break + except ValueError: + # change the default and try again + try: + default = possible_default_embeddings.pop() + except IndexError: + raise ValueError("Cannot find a legal default configuration") + cs.get_hyperparameter('network_embedding:__choice__').default_value = default self.configuration_space = cs self.dataset_properties = dataset_properties diff --git a/autoPyTorch/pipeline/tabular_regression.py b/autoPyTorch/pipeline/tabular_regression.py index 0c6463c31..3220f0fff 100644 --- a/autoPyTorch/pipeline/tabular_regression.py +++ b/autoPyTorch/pipeline/tabular_regression.py @@ -1,3 +1,4 @@ +import copy import warnings from typing import Any, Dict, List, Optional, Tuple @@ -138,31 +139,32 @@ def _get_hyperparameter_search_space(self, # Here we add custom code, like this with this # is not a valid configuration # Learned Entity Embedding is only valid when encoder is one hot encoder - embeddings = cs.get_hyperparameter('network_embedding:__choice__').choices - encoders = cs.get_hyperparameter('encoder:__choice__').choices - default = cs.get_hyperparameter('network_embedding:__choice__').default_value - possible_default_embeddings = copy.copy(list(embeddings)) - del possible_default_embeddings[possible_default_embeddings.index(default)] if 'network_embedding' in self.named_steps.keys() and 'encoder' in self.named_steps.keys(): - for encoder in encoders: - if encoder == 'OneHotEncoder': - continue - while True: - try: - cs.add_forbidden_clause(ForbiddenAndConjunction( - ForbiddenEqualsClause(cs.get_hyperparameter( - 'network_embedding:__choice__'), 'LearnedEntityEmbedding'), - ForbiddenEqualsClause(cs.get_hyperparameter('encoder:__choice__') - , encoder) - )) - break - except ValueError: - # change the default and try again + embeddings = cs.get_hyperparameter('network_embedding:__choice__').choices + if 'LearnedEntityEmbedding' in embeddings: + encoders = cs.get_hyperparameter('encoder:__choice__').choices + default = cs.get_hyperparameter('network_embedding:__choice__').default_value + possible_default_embeddings = copy.copy(list(embeddings)) + del possible_default_embeddings[possible_default_embeddings.index(default)] + + for encoder in encoders: + if encoder == 'OneHotEncoder': + continue + while True: try: - default = possible_default_embeddings.pop() - except IndexError: - raise ValueError("Cannot find a legal default configuration") - cs.get_hyperparameter('network_embedding:__choice__').default_value = default + cs.add_forbidden_clause(ForbiddenAndConjunction( + ForbiddenEqualsClause(cs.get_hyperparameter( + 'network_embedding:__choice__'), 'LearnedEntityEmbedding'), + ForbiddenEqualsClause(cs.get_hyperparameter('encoder:__choice__'), encoder) + )) + break + except ValueError: + # change the default and try again + try: + default = possible_default_embeddings.pop() + except IndexError: + raise ValueError("Cannot find a legal default configuration") + cs.get_hyperparameter('network_embedding:__choice__').default_value = default self.configuration_space = cs self.dataset_properties = dataset_properties diff --git a/test/test_pipeline/components/setup/test_setup_networks.py b/test/test_pipeline/components/setup/test_setup_networks.py index be8af94c5..df5ad4cfd 100644 --- a/test/test_pipeline/components/setup/test_setup_networks.py +++ b/test/test_pipeline/components/setup/test_setup_networks.py @@ -17,21 +17,42 @@ def head(request): return request.param +@pytest.fixture(params=['LearnedEntityEmbedding', 'NoEmbedding']) +def embedding(request): + return request.param + + @flaky.flaky(max_runs=3) @pytest.mark.parametrize("fit_dictionary_tabular", ['classification_numerical_only', 'classification_categorical_only', 'classification_numerical_and_categorical'], indirect=True) class TestNetworks: +<<<<<<< HEAD def test_pipeline_fit(self, fit_dictionary_tabular, backbone, head): +======= + def test_pipeline_fit(self, fit_dictionary, embedding, backbone, head): +>>>>>>> ADD tests for network embedding """This test makes sure that the pipeline is able to fit - given random combinations of hyperparameters across the pipeline""" + every combination of network embedding, backbone, head""" + include = {'network_backbone': [backbone], 'network_head': [head], 'network_embedding': [embedding]} + + if len(fit_dictionary['dataset_properties'] + ['categorical_columns']) == 0 and embedding == 'LearnedEntityEmbedding': + pytest.skip("Learned Entity Embedding is not used with numerical only data") pipeline = TabularClassificationPipeline( +<<<<<<< HEAD dataset_properties=fit_dictionary_tabular['dataset_properties'], include={'network_backbone': [backbone], 'network_head': [head]}) +======= + dataset_properties=fit_dictionary['dataset_properties'], + include=include) + +>>>>>>> ADD tests for network embedding cs = pipeline.get_hyperparameter_search_space() config = cs.get_default_configuration() + assert embedding == config.get('network_embedding:__choice__', None) assert backbone == config.get('network_backbone:__choice__', None) assert head == config.get('network_head:__choice__', None) pipeline.set_hyperparameters(config) diff --git a/test/test_pipeline/test_tabular_classification.py b/test/test_pipeline/test_tabular_classification.py index 260587adb..fc6eea0e4 100644 --- a/test/test_pipeline/test_tabular_classification.py +++ b/test/test_pipeline/test_tabular_classification.py @@ -35,7 +35,13 @@ def _assert_pipeline_search_space(self, pipeline, search_space_updates): assert any(update.node_name + ':' + update.hyperparameter in name for name in config_space.get_hyperparameter_names()), \ "Can't find hyperparameter: {}".format(update.hyperparameter) - hyperparameter = config_space.get_hyperparameter(update.node_name + ':' + update.hyperparameter + '_1') + # dimension reduction in embedding starts from 0 + if 'embedding' in update.node_name: + hyperparameter = config_space.get_hyperparameter( + update.node_name + ':' + update.hyperparameter + '_0') + else: + hyperparameter = config_space.get_hyperparameter( + update.node_name + ':' + update.hyperparameter + '_1') assert update.default_value == hyperparameter.default_value if isinstance(hyperparameter, (UniformIntegerHyperparameter, UniformFloatHyperparameter)): assert update.value_range[0] == hyperparameter.lower @@ -208,6 +214,7 @@ def test_network_optimizer_lr_handshake(self, fit_dictionary_tabular): # Make sure that fitting a network adds a "network" to X assert 'network' in pipeline.named_steps.keys() + fit_dictionary_tabular['network_embedding'] = torch.nn.Linear(3, 3) fit_dictionary_tabular['network_backbone'] = torch.nn.Linear(3, 4) fit_dictionary_tabular['network_head'] = torch.nn.Linear(4, 1) X = pipeline.named_steps['network'].fit( From ece63536f733227093fb906ea03b3fa9dc15257f Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 15 Feb 2021 22:24:28 +0100 Subject: [PATCH 21/68] Removed ordinal encoder --- .../components/preprocessing/base.py | 2 +- .../components/preprocessing/test_encoders.py | 43 ------------------- 2 files changed, 1 insertion(+), 44 deletions(-) diff --git a/test/test_pipeline/components/preprocessing/base.py b/test/test_pipeline/components/preprocessing/base.py index 7bb4fee70..875ed399c 100644 --- a/test/test_pipeline/components/preprocessing/base.py +++ b/test/test_pipeline/components/preprocessing/base.py @@ -33,4 +33,4 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]], ("scaler", ScalerChoice(default_dataset_properties)), ("tabular_transformer", TabularColumnTransformer()), ]) - return steps \ No newline at end of file + return steps diff --git a/test/test_pipeline/components/preprocessing/test_encoders.py b/test/test_pipeline/components/preprocessing/test_encoders.py index 1f210936f..a901823ba 100644 --- a/test/test_pipeline/components/preprocessing/test_encoders.py +++ b/test/test_pipeline/components/preprocessing/test_encoders.py @@ -8,7 +8,6 @@ from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding.NoEncoder import NoEncoder from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding.OneHotEncoder import OneHotEncoder -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding.OrdinalEncoder import OrdinalEncoder class TestEncoders(unittest.TestCase): @@ -53,48 +52,6 @@ def test_one_hot_encoder_no_unknown(self): # check if the transform is correct assert_array_equal(transformed, [['1.0', '0.0', 1], ['1.0', '0.0', 2]]) - def test_ordinal_encoder(self): - - data = np.array([[1, 'male'], - [1, 'female'], - [3, 'male'], - [2, 'female'], - [2, 'male']]) - - categorical_columns = [1] - numerical_columns = [0] - train_indices = np.array([0, 2, 3]) - test_indices = np.array([1, 4]) - - dataset_properties = { - 'categorical_columns': categorical_columns, - 'numerical_columns': numerical_columns, - 'categories': [['female', 'male', 'unknown']] - } - X = { - 'X_train': data[train_indices], - 'dataset_properties': dataset_properties - } - encoder_component = OrdinalEncoder() - encoder_component.fit(X) - X = encoder_component.transform(X) - - encoder = X['encoder']['categorical'] - - # check if the fit dictionary X is modified as expected - self.assertIsInstance(X['encoder'], dict) - self.assertIsInstance(encoder, BaseEstimator) - self.assertIsNone(X['encoder']['numerical']) - - # make column transformer with returned encoder to fit on data - column_transformer = make_column_transformer((encoder, X['dataset_properties']['categorical_columns']), - remainder='passthrough') - column_transformer = column_transformer.fit(X['X_train']) - transformed = column_transformer.transform(data[test_indices]) - - # check if we got the expected transformed array - assert_array_equal(transformed, [['0.0', 1], ['1.0', 2]]) - def test_none_encoder(self): data = np.array([[1, 'male'], From 70b0a7930568f4da177a2d450d712806fc781c35 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 15 Feb 2021 22:24:41 +0100 Subject: [PATCH 22/68] Removed ordinal encoder --- .../tabular_preprocessing/encoding/base_encoder_choice.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder_choice.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder_choice.py index 7be7c94a2..df71ff209 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder_choice.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder_choice.py @@ -65,7 +65,7 @@ def get_hyperparameter_search_space(self, raise ValueError("no encoders found, please add a encoder") if default is None: - defaults = ['OneHotEncoder', 'OrdinalEncoder', 'NoEncoder'] + defaults = ['OneHotEncoder', 'NoEncoder'] for default_ in defaults: if default_ in available_preprocessors: if include is not None and default_ not in include: From 48d7a85315d58e9533248aef634b1702a45e0d1e Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Wed, 17 Feb 2021 13:36:52 +0100 Subject: [PATCH 23/68] Addressed comments --- autoPyTorch/pipeline/tabular_regression.py | 2 ++ .../preprocessing/test_tabular_column_transformer.py | 11 +++++++++++ 2 files changed, 13 insertions(+) diff --git a/autoPyTorch/pipeline/tabular_regression.py b/autoPyTorch/pipeline/tabular_regression.py index 3220f0fff..855a025e8 100644 --- a/autoPyTorch/pipeline/tabular_regression.py +++ b/autoPyTorch/pipeline/tabular_regression.py @@ -25,6 +25,7 @@ from autoPyTorch.pipeline.components.setup.lr_scheduler.base_scheduler_choice import SchedulerChoice from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent from autoPyTorch.pipeline.components.setup.network_backbone.base_network_backbone_choice import NetworkBackboneChoice +from autoPyTorch.pipeline.components.setup.network_embedding.base_network_embedding_choice import NetworkEmbeddingChoice from autoPyTorch.pipeline.components.setup.network_head.base_network_head_choice import NetworkHeadChoice from autoPyTorch.pipeline.components.setup.network_initializer.base_network_init_choice import ( NetworkInitializerChoice @@ -191,6 +192,7 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]]) -> L ("scaler", ScalerChoice(default_dataset_properties)), ("tabular_transformer", TabularColumnTransformer()), ("preprocessing", EarlyPreprocessing()), + ("network_embedding", NetworkEmbeddingChoice(default_dataset_properties)), ("network_backbone", NetworkBackboneChoice(default_dataset_properties)), ("network_head", NetworkHeadChoice(default_dataset_properties)), ("network", NetworkComponent()), diff --git a/test/test_pipeline/components/preprocessing/test_tabular_column_transformer.py b/test/test_pipeline/components/preprocessing/test_tabular_column_transformer.py index 8e31bad05..66a96f27f 100644 --- a/test/test_pipeline/components/preprocessing/test_tabular_column_transformer.py +++ b/test/test_pipeline/components/preprocessing/test_tabular_column_transformer.py @@ -33,7 +33,18 @@ def test_tabular_preprocess(self, fit_dictionary_tabular): data = column_transformer.preprocessor.fit_transform(X['X_train']) assert isinstance(data, np.ndarray) + # Make sure no columns are unintentionally dropped after preprocessing + if len(fit_dictionary_tabular['dataset_properties']["numerical_columns"]) == 0: + categorical_pipeline = column_transformer.preprocessor.named_transformers_['categorical_pipeline'] + categorical_data = categorical_pipeline.transform(X['X_train']) + assert data.shape[1] == categorical_data.shape[1] + elif len(fit_dictionary_tabular['dataset_properties']["categorical_columns"]) == 0: + numerical_pipeline = column_transformer.preprocessor.named_transformers_['numerical_pipeline'] + numerical_data = numerical_pipeline.transform(X['X_train']) + assert data.shape[1] == numerical_data.shape[1] + def test_sparse_data(self, fit_dictionary_tabular): + X = np.random.binomial(1, 0.1, (100, 2000)) sparse_X = csr_matrix(X) numerical_columns = list(range(2000)) From a92fcafa2858880b8015222c3bd7d44a184c0395 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 22 Feb 2021 13:58:36 +0100 Subject: [PATCH 24/68] fix flake --- test/test_pipeline/components/training/base.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/test/test_pipeline/components/training/base.py b/test/test_pipeline/components/training/base.py index eaa80cf88..c13a47aa8 100644 --- a/test/test_pipeline/components/training/base.py +++ b/test/test_pipeline/components/training/base.py @@ -5,8 +5,14 @@ import torch -from autoPyTorch.constants import BINARY, CLASSIFICATION_TASKS, CONTINUOUS, OUTPUT_TYPES_TO_STRING, REGRESSION_TASKS, \ +from autoPyTorch.constants import ( + BINARY, + CLASSIFICATION_TASKS, + CONTINUOUS, + OUTPUT_TYPES_TO_STRING, + REGRESSION_TASKS, TASK_TYPES_TO_STRING +) from autoPyTorch.pipeline.components.training.trainer.base_trainer import BaseTrainerComponent, BudgetTracker from autoPyTorch.pipeline.components.training.metrics.utils import get_metrics From a11ee8e42c79a7b65103720d7993dcd191622ce7 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 23 Feb 2021 19:12:59 +0100 Subject: [PATCH 25/68] fix test import training --- test/test_pipeline/components/training/test_training.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_pipeline/components/training/test_training.py b/test/test_pipeline/components/training/test_training.py index 9005d1ad2..d6964fa14 100644 --- a/test/test_pipeline/components/training/test_training.py +++ b/test/test_pipeline/components/training/test_training.py @@ -27,7 +27,7 @@ ) sys.path.append(os.path.dirname(__file__)) -from test.test_pipeline.components.base import BaseTraining # noqa (E402: module level import not at top of file) +from test.test_pipeline.components.training.base import BaseTraining # noqa (E402: module level import not at top of file) class BaseDataLoaderTest(unittest.TestCase): From 789bd8d88754ac0c999800d823dfb75ebeb85875 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 23 Feb 2021 20:07:53 +0100 Subject: [PATCH 26/68] Fix tests and move to boston --- test/conftest.py | 4 ++-- test/test_api/test_api.py | 2 +- test/test_pipeline/test_tabular_regression.py | 9 ++++++++- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/test/conftest.py b/test/conftest.py index f05f573a7..31059326b 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -189,7 +189,7 @@ def get_tabular_data(task): validator = TabularInputValidator(is_classification=False).fit(X.copy(), y.copy()) elif task == "regression_categorical_only": - X, y = fetch_openml("cholesterol", return_X_y=True, as_frame=True) + X, y = fetch_openml("boston", return_X_y=True, as_frame=True) categorical_columns = [column for column in X.columns if X[column].dtype.name == 'category'] X = X[categorical_columns] @@ -207,7 +207,7 @@ def get_tabular_data(task): validator = TabularInputValidator(is_classification=False).fit(X.copy(), y.copy()) elif task == "regression_numerical_and_categorical": - X, y = fetch_openml("cholesterol", return_X_y=True, as_frame=True) + X, y = fetch_openml("boston", return_X_y=True, as_frame=True) # fill nan values for now since they are not handled properly yet for column in X.columns: diff --git a/test/test_api/test_api.py b/test/test_api/test_api.py index ea7cccd72..4ac194968 100644 --- a/test/test_api/test_api.py +++ b/test/test_api/test_api.py @@ -178,7 +178,7 @@ def test_tabular_classification(openml_id, resampling_strategy, backend): restored_estimator.predict(X_test) -@pytest.mark.parametrize('openml_name', ("cholesterol", )) +@pytest.mark.parametrize('openml_name', ("boston", )) @pytest.mark.parametrize('resampling_strategy', (HoldoutValTypes.holdout_validation, CrossValTypes.k_fold_cross_validation, )) diff --git a/test/test_pipeline/test_tabular_regression.py b/test/test_pipeline/test_tabular_regression.py index 15b8351f9..74de19405 100644 --- a/test/test_pipeline/test_tabular_regression.py +++ b/test/test_pipeline/test_tabular_regression.py @@ -39,7 +39,13 @@ def _assert_pipeline_search_space(self, pipeline, search_space_updates): assert any(update.node_name + ':' + update.hyperparameter in name for name in config_space.get_hyperparameter_names()), \ "Can't find hyperparameter: {}".format(update.hyperparameter) - hyperparameter = config_space.get_hyperparameter(update.node_name + ':' + update.hyperparameter + '_1') + # dimension reduction in embedding starts from 0 + if 'embedding' in update.node_name: + hyperparameter = config_space.get_hyperparameter( + update.node_name + ':' + update.hyperparameter + '_0') + else: + hyperparameter = config_space.get_hyperparameter( + update.node_name + ':' + update.hyperparameter + '_1') assert update.default_value == hyperparameter.default_value if isinstance(hyperparameter, (UniformIntegerHyperparameter, UniformFloatHyperparameter)): assert update.value_range[0] == hyperparameter.lower @@ -199,6 +205,7 @@ def test_network_optimizer_lr_handshake(self, fit_dictionary_tabular): # Make sure that fitting a network adds a "network" to X assert 'network' in pipeline.named_steps.keys() + fit_dictionary_tabular['network_embedding'] = torch.nn.Linear(3, 3) fit_dictionary_tabular['network_backbone'] = torch.nn.Linear(3, 4) fit_dictionary_tabular['network_head'] = torch.nn.Linear(4, 1) X = pipeline.named_steps['network'].fit( From 85e178fa8f7a2c77479df52a5834e0ccca0a967a Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Thu, 25 Feb 2021 18:04:43 +0100 Subject: [PATCH 27/68] Debug issue with python 3.6 --- examples/example_tmp_for_debug.py | 168 ++++++++++++++++++++++++++++++ 1 file changed, 168 insertions(+) create mode 100644 examples/example_tmp_for_debug.py diff --git a/examples/example_tmp_for_debug.py b/examples/example_tmp_for_debug.py new file mode 100644 index 000000000..ca31c8f4b --- /dev/null +++ b/examples/example_tmp_for_debug.py @@ -0,0 +1,168 @@ +import os + +import sklearn.datasets +import time +import shutil + +from autoPyTorch.utils.backend import create + +from autoPyTorch.api.tabular_classification import TabularClassificationTask +from autoPyTorch.datasets.resampling_strategy import ( + CrossValTypes, +) +import re + + +from pathlib import Path + + +class DisplayablePath(object): + display_filename_prefix_middle = '├──' + display_filename_prefix_last = '└──' + display_parent_prefix_middle = ' ' + display_parent_prefix_last = '│ ' + + def __init__(self, path, parent_path, is_last): + self.path = Path(str(path)) + self.parent = parent_path + self.is_last = is_last + if self.parent: + self.depth = self.parent.depth + 1 + else: + self.depth = 0 + + @property + def displayname(self): + if self.path.is_dir(): + return self.path.name + '/' + return self.path.name + + @classmethod + def make_tree(cls, root, parent=None, is_last=False, criteria=None): + root = Path(str(root)) + criteria = criteria or cls._default_criteria + + displayable_root = cls(root, parent, is_last) + yield displayable_root + + children = sorted(list(path + for path in root.iterdir() + if criteria(path)), + key=lambda s: str(s).lower()) + count = 1 + for path in children: + is_last = count == len(children) + if path.is_dir(): + yield from cls.make_tree(path, + parent=displayable_root, + is_last=is_last, + criteria=criteria) + else: + yield cls(path, displayable_root, is_last) + count += 1 + + @classmethod + def _default_criteria(cls, path): + return True + + @property + def displayname(self): + if self.path.is_dir(): + return self.path.name + '/' + return self.path.name + + def displayable(self): + if self.parent is None: + return self.displayname + + _filename_prefix = (self.display_filename_prefix_last + if self.is_last + else self.display_filename_prefix_middle) + + parts = ['{!s} {!s}'.format(_filename_prefix, + self.displayname)] + + parent = self.parent + while parent and parent.parent is not None: + parts.append(self.display_parent_prefix_middle + if parent.is_last + else self.display_parent_prefix_last) + parent = parent.parent + + return ''.join(reversed(parts)) + + +def slugify(text): + return re.sub(r'[\[\]]+', '-', text.lower()) + + +test_dir = os.path.dirname(__file__) +tmp = slugify(os.path.join( + test_dir, '.tmp__%s' % __file__)) +output = slugify(os.path.join( + test_dir, '.output__%s' % __file__)) + +for dir in (tmp, output): + for i in range(10): + if os.path.exists(dir): + try: + shutil.rmtree(dir) + break + except OSError: + time.sleep(1) + +# Make sure the folders we wanna create do not already exist. +backend = create( + tmp, + output, + delete_tmp_folder_after_terminate=True, + delete_output_folder_after_terminate=True, +) + +openml_id = 40981 +resampling_strategy = CrossValTypes.k_fold_cross_validation +X, y = sklearn.datasets.fetch_openml( + data_id=int(openml_id), + return_X_y=True, as_frame=True +) +X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, y, random_state=1) + +# Search for a good configuration +estimator = TabularClassificationTask( + backend=backend, + resampling_strategy=resampling_strategy, +) + +estimator.search( + X_train=X_train, y_train=y_train, + X_test=X_test, y_test=y_test, + optimize_metric='accuracy', + total_walltime_limit=150, + func_eval_time_limit=50, + traditional_per_total_budget=0 +) + +# Search for an existing run key in disc. A individual model might have +# a timeout and hence was not written to disc +for i, (run_key, value) in enumerate(estimator.run_history.data.items()): + if i == 0: + # Ignore dummy run + continue + if 'SUCCESS' not in str(value.status): + continue + + run_key_model_run_dir = estimator._backend.get_numrun_directory( + estimator.seed, run_key.config_id, run_key.budget) + if os.path.exists(run_key_model_run_dir): + break + + +model_file = os.path.join( + run_key_model_run_dir, + f"{estimator.seed}.{run_key.config_id}.{run_key.budget}.cv_model" +) +if not os.path.exists(model_file): + paths = DisplayablePath.make_tree(run_key_model_run_dir) + for path in paths: + print(path.displayable()) From ddf198bcb85d6953cc9de1531ec7212ef5d70a48 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Fri, 26 Feb 2021 15:00:25 +0100 Subject: [PATCH 28/68] Run only debug file --- .github/workflows/examples.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/examples.yml b/.github/workflows/examples.yml index b278a8563..53222930b 100644 --- a/.github/workflows/examples.yml +++ b/.github/workflows/examples.yml @@ -30,5 +30,7 @@ jobs: echo "::set-output name=BEFORE::$(git status --porcelain -b)" - name: Run tests run: | - python examples/example_tabular_classification.py - python examples/example_image_classification.py +# python examples/example_tabular_classification.py +# python examples/example_image_classification.py +# python examples/example_tabular_regression.py + python examples/example_tmp_for_debug.py \ No newline at end of file From a073f066152268f7441f053462d9dfae0d17563e Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Fri, 26 Feb 2021 14:31:17 +0100 Subject: [PATCH 29/68] Debug for python3.6 --- .github/workflows/examples.yml | 2 +- examples/example_tmp_for_debug.py | 183 +++++++----------------------- test/test_api/test_api.py | 10 +- test/utils.py | 77 +++++++++++++ 4 files changed, 127 insertions(+), 145 deletions(-) create mode 100644 test/utils.py diff --git a/.github/workflows/examples.yml b/.github/workflows/examples.yml index 53222930b..09b52f4e6 100644 --- a/.github/workflows/examples.yml +++ b/.github/workflows/examples.yml @@ -8,7 +8,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.8] + python-version: [3.6] fail-fast: false max-parallel: 2 diff --git a/examples/example_tmp_for_debug.py b/examples/example_tmp_for_debug.py index ca31c8f4b..664eac866 100644 --- a/examples/example_tmp_for_debug.py +++ b/examples/example_tmp_for_debug.py @@ -1,123 +1,17 @@ +""" +Example file to be deleted +""" import os import sklearn.datasets -import time -import shutil - -from autoPyTorch.utils.backend import create from autoPyTorch.api.tabular_classification import TabularClassificationTask from autoPyTorch.datasets.resampling_strategy import ( CrossValTypes, ) -import re - - -from pathlib import Path - - -class DisplayablePath(object): - display_filename_prefix_middle = '├──' - display_filename_prefix_last = '└──' - display_parent_prefix_middle = ' ' - display_parent_prefix_last = '│ ' - - def __init__(self, path, parent_path, is_last): - self.path = Path(str(path)) - self.parent = parent_path - self.is_last = is_last - if self.parent: - self.depth = self.parent.depth + 1 - else: - self.depth = 0 - - @property - def displayname(self): - if self.path.is_dir(): - return self.path.name + '/' - return self.path.name - - @classmethod - def make_tree(cls, root, parent=None, is_last=False, criteria=None): - root = Path(str(root)) - criteria = criteria or cls._default_criteria - - displayable_root = cls(root, parent, is_last) - yield displayable_root - - children = sorted(list(path - for path in root.iterdir() - if criteria(path)), - key=lambda s: str(s).lower()) - count = 1 - for path in children: - is_last = count == len(children) - if path.is_dir(): - yield from cls.make_tree(path, - parent=displayable_root, - is_last=is_last, - criteria=criteria) - else: - yield cls(path, displayable_root, is_last) - count += 1 - - @classmethod - def _default_criteria(cls, path): - return True - - @property - def displayname(self): - if self.path.is_dir(): - return self.path.name + '/' - return self.path.name - - def displayable(self): - if self.parent is None: - return self.displayname - - _filename_prefix = (self.display_filename_prefix_last - if self.is_last - else self.display_filename_prefix_middle) +from test.utils import DisplayablePath - parts = ['{!s} {!s}'.format(_filename_prefix, - self.displayname)] - parent = self.parent - while parent and parent.parent is not None: - parts.append(self.display_parent_prefix_middle - if parent.is_last - else self.display_parent_prefix_last) - parent = parent.parent - - return ''.join(reversed(parts)) - - -def slugify(text): - return re.sub(r'[\[\]]+', '-', text.lower()) - - -test_dir = os.path.dirname(__file__) -tmp = slugify(os.path.join( - test_dir, '.tmp__%s' % __file__)) -output = slugify(os.path.join( - test_dir, '.output__%s' % __file__)) - -for dir in (tmp, output): - for i in range(10): - if os.path.exists(dir): - try: - shutil.rmtree(dir) - break - except OSError: - time.sleep(1) - -# Make sure the folders we wanna create do not already exist. -backend = create( - tmp, - output, - delete_tmp_folder_after_terminate=True, - delete_output_folder_after_terminate=True, -) openml_id = 40981 resampling_strategy = CrossValTypes.k_fold_cross_validation @@ -128,41 +22,44 @@ def slugify(text): X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( X, y, random_state=1) -# Search for a good configuration -estimator = TabularClassificationTask( - backend=backend, - resampling_strategy=resampling_strategy, -) - -estimator.search( - X_train=X_train, y_train=y_train, - X_test=X_test, y_test=y_test, - optimize_metric='accuracy', - total_walltime_limit=150, - func_eval_time_limit=50, - traditional_per_total_budget=0 -) -# Search for an existing run key in disc. A individual model might have -# a timeout and hence was not written to disc -for i, (run_key, value) in enumerate(estimator.run_history.data.items()): - if i == 0: - # Ignore dummy run - continue - if 'SUCCESS' not in str(value.status): - continue +if __name__ == '__main__': + # Search for a good configuration + estimator = TabularClassificationTask( + temporary_directory='./tmp', + delete_tmp_folder_after_terminate=False, + resampling_strategy=resampling_strategy, + ) + + estimator.search( + X_train=X_train, y_train=y_train, + X_test=X_test, y_test=y_test, + optimize_metric='accuracy', + total_walltime_limit=150, + func_eval_time_limit=50, + traditional_per_total_budget=0 + ) + + # Search for an existing run key in disc. A individual model might have + # a timeout and hence was not written to disc + for i, (run_key, value) in enumerate(estimator.run_history.data.items()): + if i == 0: + # Ignore dummy run + continue + if 'SUCCESS' not in str(value.status): + continue + + run_key_model_run_dir = estimator._backend.get_numrun_directory( + estimator.seed, run_key.config_id, run_key.budget) + if os.path.exists(run_key_model_run_dir): + break + + + model_file = os.path.join( + run_key_model_run_dir, + f"{estimator.seed}.{run_key.config_id}.{run_key.budget}.cv_model" + ) - run_key_model_run_dir = estimator._backend.get_numrun_directory( - estimator.seed, run_key.config_id, run_key.budget) - if os.path.exists(run_key_model_run_dir): - break - - -model_file = os.path.join( - run_key_model_run_dir, - f"{estimator.seed}.{run_key.config_id}.{run_key.budget}.cv_model" -) -if not os.path.exists(model_file): paths = DisplayablePath.make_tree(run_key_model_run_dir) for path in paths: print(path.displayable()) diff --git a/test/test_api/test_api.py b/test/test_api/test_api.py index 4ac194968..4cf615e97 100644 --- a/test/test_api/test_api.py +++ b/test/test_api/test_api.py @@ -22,6 +22,7 @@ HoldoutValTypes, ) +from test.utils import DisplayablePath # Fixtures # ======== @@ -120,7 +121,14 @@ def test_tabular_classification(openml_id, resampling_strategy, backend): run_key_model_run_dir, f"{estimator.seed}.{run_key.config_id}.{run_key.budget}.cv_model" ) - assert os.path.exists(model_file), model_file + try: + assert os.path.exists(model_file), model_file + except AssertionError: + paths = DisplayablePath.make_tree(run_key_model_run_dir) + for path in paths: + print(path.displayable()) + raise AssertionError(model_file) + model = estimator._backend.load_cv_model_by_seed_and_id_and_budget( estimator.seed, run_key.config_id, run_key.budget) assert isinstance(model, VotingClassifier) diff --git a/test/utils.py b/test/utils.py new file mode 100644 index 000000000..b1c919c4c --- /dev/null +++ b/test/utils.py @@ -0,0 +1,77 @@ +from pathlib import Path + + +class DisplayablePath(object): + display_filename_prefix_middle = '├──' + display_filename_prefix_last = '└──' + display_parent_prefix_middle = ' ' + display_parent_prefix_last = '│ ' + + def __init__(self, path, parent_path, is_last): + self.path = Path(str(path)) + self.parent = parent_path + self.is_last = is_last + if self.parent: + self.depth = self.parent.depth + 1 + else: + self.depth = 0 + + @property + def displayname(self): + if self.path.is_dir(): + return self.path.name + '/' + return self.path.name + + @classmethod + def make_tree(cls, root, parent=None, is_last=False, criteria=None): + root = Path(str(root)) + criteria = criteria or cls._default_criteria + + displayable_root = cls(root, parent, is_last) + yield displayable_root + + children = sorted(list(path + for path in root.iterdir() + if criteria(path)), + key=lambda s: str(s).lower()) + count = 1 + for path in children: + is_last = count == len(children) + if path.is_dir(): + yield from cls.make_tree(path, + parent=displayable_root, + is_last=is_last, + criteria=criteria) + else: + yield cls(path, displayable_root, is_last) + count += 1 + + @classmethod + def _default_criteria(cls, path): + return True + + @property + def displayname(self): + if self.path.is_dir(): + return self.path.name + '/' + return self.path.name + + def displayable(self): + if self.parent is None: + return self.displayname + + _filename_prefix = (self.display_filename_prefix_last + if self.is_last + else self.display_filename_prefix_middle) + + parts = ['{!s} {!s}'.format(_filename_prefix, + self.displayname)] + + parent = self.parent + while parent and parent.parent is not None: + parts.append(self.display_parent_prefix_middle + if parent.is_last + else self.display_parent_prefix_last) + parent = parent.parent + + return ''.join(reversed(parts)) From e625ee7bd214f6c7b5f8eeb4d93bf851d97dd003 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Fri, 26 Feb 2021 15:57:41 +0100 Subject: [PATCH 30/68] print paths of parent dir --- examples/example_tmp_for_debug.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/example_tmp_for_debug.py b/examples/example_tmp_for_debug.py index 664eac866..9ecaca32e 100644 --- a/examples/example_tmp_for_debug.py +++ b/examples/example_tmp_for_debug.py @@ -60,6 +60,6 @@ f"{estimator.seed}.{run_key.config_id}.{run_key.budget}.cv_model" ) - paths = DisplayablePath.make_tree(run_key_model_run_dir) + paths = DisplayablePath.make_tree(os.path.dirname(run_key_model_run_dir)) for path in paths: print(path.displayable()) From 9164bc2a49dcd3509223613e67ed5f365a17f079 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Fri, 26 Feb 2021 16:33:17 +0100 Subject: [PATCH 31/68] Trying to run examples --- .github/workflows/examples.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/examples.yml b/.github/workflows/examples.yml index 09b52f4e6..680bee134 100644 --- a/.github/workflows/examples.yml +++ b/.github/workflows/examples.yml @@ -30,7 +30,4 @@ jobs: echo "::set-output name=BEFORE::$(git status --porcelain -b)" - name: Run tests run: | -# python examples/example_tabular_classification.py -# python examples/example_image_classification.py -# python examples/example_tabular_regression.py python examples/example_tmp_for_debug.py \ No newline at end of file From f1beb14b7db35c0948967d1a5572515752748e51 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Sat, 27 Feb 2021 13:27:17 +0100 Subject: [PATCH 32/68] Trying to run examples --- examples/example_tmp_for_debug.py | 78 ++++++++++++++++++++++++++++++- test/utils.py | 6 --- 2 files changed, 77 insertions(+), 7 deletions(-) diff --git a/examples/example_tmp_for_debug.py b/examples/example_tmp_for_debug.py index 9ecaca32e..15cc283da 100644 --- a/examples/example_tmp_for_debug.py +++ b/examples/example_tmp_for_debug.py @@ -9,8 +9,84 @@ from autoPyTorch.datasets.resampling_strategy import ( CrossValTypes, ) -from test.utils import DisplayablePath +from pathlib import Path + + +class DisplayablePath(object): + display_filename_prefix_middle = '├──' + display_filename_prefix_last = '└──' + display_parent_prefix_middle = ' ' + display_parent_prefix_last = '│ ' + + def __init__(self, path, parent_path, is_last): + self.path = Path(str(path)) + self.parent = parent_path + self.is_last = is_last + if self.parent: + self.depth = self.parent.depth + 1 + else: + self.depth = 0 + + @property + def displayname(self): + if self.path.is_dir(): + return self.path.name + '/' + return self.path.name + + @classmethod + def make_tree(cls, root, parent=None, is_last=False, criteria=None): + root = Path(str(root)) + criteria = criteria or cls._default_criteria + + displayable_root = cls(root, parent, is_last) + yield displayable_root + + children = sorted(list(path + for path in root.iterdir() + if criteria(path)), + key=lambda s: str(s).lower()) + count = 1 + for path in children: + is_last = count == len(children) + if path.is_dir(): + yield from cls.make_tree(path, + parent=displayable_root, + is_last=is_last, + criteria=criteria) + else: + yield cls(path, displayable_root, is_last) + count += 1 + + @classmethod + def _default_criteria(cls, path): + return True + + @property + def displayname(self): + if self.path.is_dir(): + return self.path.name + '/' + return self.path.name + + def displayable(self): + if self.parent is None: + return self.displayname + + _filename_prefix = (self.display_filename_prefix_last + if self.is_last + else self.display_filename_prefix_middle) + + parts = ['{!s} {!s}'.format(_filename_prefix, + self.displayname)] + + parent = self.parent + while parent and parent.parent is not None: + parts.append(self.display_parent_prefix_middle + if parent.is_last + else self.display_parent_prefix_last) + parent = parent.parent + + return ''.join(reversed(parts)) openml_id = 40981 diff --git a/test/utils.py b/test/utils.py index b1c919c4c..171d4d052 100644 --- a/test/utils.py +++ b/test/utils.py @@ -50,12 +50,6 @@ def make_tree(cls, root, parent=None, is_last=False, criteria=None): def _default_criteria(cls, path): return True - @property - def displayname(self): - if self.path.is_dir(): - return self.path.name + '/' - return self.path.name - def displayable(self): if self.parent is None: return self.displayname From af17afcebc2c794e5c296002baa82749fcd40f0e Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 1 Mar 2021 12:12:37 +0100 Subject: [PATCH 33/68] Add success model --- examples/example_tmp_for_debug.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/example_tmp_for_debug.py b/examples/example_tmp_for_debug.py index 15cc283da..ae6a4ca92 100644 --- a/examples/example_tmp_for_debug.py +++ b/examples/example_tmp_for_debug.py @@ -130,12 +130,13 @@ def displayable(self): if os.path.exists(run_key_model_run_dir): break - model_file = os.path.join( run_key_model_run_dir, f"{estimator.seed}.{run_key.config_id}.{run_key.budget}.cv_model" ) + print(model_file) + paths = DisplayablePath.make_tree(os.path.dirname(run_key_model_run_dir)) for path in paths: print(path.displayable()) From d64e4fd6011f148cfb2438d54449a77af1395fdb Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 1 Mar 2021 12:18:06 +0100 Subject: [PATCH 34/68] Added parent directory for printing paths --- test/test_api/test_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_api/test_api.py b/test/test_api/test_api.py index 4cf615e97..30ddaeed6 100644 --- a/test/test_api/test_api.py +++ b/test/test_api/test_api.py @@ -124,7 +124,7 @@ def test_tabular_classification(openml_id, resampling_strategy, backend): try: assert os.path.exists(model_file), model_file except AssertionError: - paths = DisplayablePath.make_tree(run_key_model_run_dir) + paths = DisplayablePath.make_tree(os.path.dirname(run_key_model_run_dir)) for path in paths: print(path.displayable()) raise AssertionError(model_file) From 1602933a50d30c1b746b79ddb28fd4c40ff1ab68 Mon Sep 17 00:00:00 2001 From: chico Date: Mon, 1 Mar 2021 12:30:36 +0100 Subject: [PATCH 35/68] Try no autouse --- test/conftest.py | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/test/conftest.py b/test/conftest.py index e18219a80..c8ff6529e 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -25,7 +25,7 @@ from autoPyTorch.utils.pipeline import get_dataset_requirements -@pytest.fixture(scope="session", autouse=True) +@pytest.fixture(scope="session") def callattr_ahead_of_alltests(request): """ This procedure will run at the start of the pytest session. @@ -50,20 +50,14 @@ def callattr_ahead_of_alltests(request): 3916, # kc1 ] - # Try to populate the tests 5 times - patience = 5 - for i in range(patience): - try: - # Populate the cache - openml.populate_cache(task_ids=tasks_used) - # Also the bunch - for task in tasks_used: - fetch_openml(data_id=openml.tasks.get_task(task).dataset_id, - return_X_y=True) - break - except Exception as e: - if i == patience - 1: - raise e + # Populate the cache + # This will make the test fail immediately rather than + # Waiting for a openml fetch timeout + openml.populate_cache(task_ids=tasks_used) + # Also the bunch + for task in tasks_used: + fetch_openml(data_id=openml.tasks.get_task(task).dataset_id, + return_X_y=True) return From c8d98babcaf4f9d9e7a5da0129cfcc80129453d1 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 1 Mar 2021 13:12:17 +0100 Subject: [PATCH 36/68] print log file to see if backend is saving num run --- autoPyTorch/evaluation/abstract_evaluator.py | 1 + autoPyTorch/utils/backend.py | 3 +++ examples/example_tmp_for_debug.py | 8 ++++++++ 3 files changed, 12 insertions(+) diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py index c1f7da60d..fcc3507cc 100644 --- a/autoPyTorch/evaluation/abstract_evaluator.py +++ b/autoPyTorch/evaluation/abstract_evaluator.py @@ -538,6 +538,7 @@ def file_output( else: pipeline = None + self.logger.debug("Saving directory {}, {}, {}".format(self.seed, self.num_run, self.budget)) self.backend.save_numrun_to_dir( seed=int(self.seed), idx=int(self.num_run), diff --git a/autoPyTorch/utils/backend.py b/autoPyTorch/utils/backend.py index dd24c2340..5111c116f 100644 --- a/autoPyTorch/utils/backend.py +++ b/autoPyTorch/utils/backend.py @@ -392,6 +392,7 @@ def save_numrun_to_dir( cv_model: Optional[BasePipeline], ensemble_predictions: Optional[np.ndarray], valid_predictions: Optional[np.ndarray], test_predictions: Optional[np.ndarray], ) -> None: + assert self._logger is not None runs_directory = self.get_runs_directory() tmpdir = tempfile.mkdtemp(dir=runs_directory) if model is not None: @@ -417,6 +418,8 @@ def save_numrun_to_dir( with open(file_path, 'wb') as fh: pickle.dump(preds.astype(np.float32), fh, -1) try: + self._logger.debug("Renaming {} to {}".format(tmpdir, + self.get_numrun_directory(seed, idx, budget))) os.rename(tmpdir, self.get_numrun_directory(seed, idx, budget)) except OSError: if os.path.exists(self.get_numrun_directory(seed, idx, budget)): diff --git a/examples/example_tmp_for_debug.py b/examples/example_tmp_for_debug.py index ae6a4ca92..52fc1d76c 100644 --- a/examples/example_tmp_for_debug.py +++ b/examples/example_tmp_for_debug.py @@ -140,3 +140,11 @@ def displayable(self): paths = DisplayablePath.make_tree(os.path.dirname(run_key_model_run_dir)) for path in paths: print(path.displayable()) + + # printing log file + tmp_dir = estimator._backend.temporary_directory + log_file = os.path.join(tmp_dir, "AutoPyTorch:{}:{}.log".format(estimator.dataset_name, estimator.seed)) + f = open(log_file, 'r') + lines = f.readlines() + for line in lines: + print(line) From 76fcd766efe63539b0e949de713fcd6ed4ece2af Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 1 Mar 2021 15:06:00 +0100 Subject: [PATCH 37/68] Setup logger in backend --- autoPyTorch/evaluation/abstract_evaluator.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py index fcc3507cc..29075841a 100644 --- a/autoPyTorch/evaluation/abstract_evaluator.py +++ b/autoPyTorch/evaluation/abstract_evaluator.py @@ -331,6 +331,8 @@ def __init__(self, backend: Backend, name=logger_name, port=logger_port, ) + self.backend.setup_logger(name=logger_name, port=logger_port) + self.Y_optimization: Optional[np.ndarray] = None self.Y_actual_train: Optional[np.ndarray] = None self.pipelines: Optional[List[BaseEstimator]] = None From ffc162031a6ca87f65ab52615c25a4ca4228c918 Mon Sep 17 00:00:00 2001 From: Ravin Kohli <13005107+ravinkohli@users.noreply.github.com> Date: Mon, 1 Mar 2021 17:23:57 +0100 Subject: [PATCH 38/68] handle nans in categorical columns (#118) * handle nans in categorical columns * Fixed error in self dtypes * Addressed comments from francisco * Forgot to commit * Fix flake --- autoPyTorch/data/tabular_feature_validator.py | 79 ++++++++++--------- test/test_data/test_feature_validator.py | 13 ++- 2 files changed, 51 insertions(+), 41 deletions(-) diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index fb9a72082..e73b66bb1 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -11,7 +11,7 @@ import sklearn.utils from sklearn import preprocessing from sklearn.base import BaseEstimator -from sklearn.compose import make_column_transformer +from sklearn.compose import ColumnTransformer from sklearn.exceptions import NotFittedError from autoPyTorch.data.base_feature_validator import BaseFeatureValidator, SUPPORTED_FEAT_TYPES @@ -53,16 +53,34 @@ def _fit( for column in X.columns: if X[column].isna().all(): X[column] = pd.to_numeric(X[column]) + # Also note this change in self.dtypes + if len(self.dtypes) != 0: + self.dtypes[list(X.columns).index(column)] = X[column].dtype self.enc_columns, self.feat_type = self._get_columns_to_encode(X) if len(self.enc_columns) > 0: - - self.encoder = make_column_transformer( - (preprocessing.OrdinalEncoder( - handle_unknown='use_encoded_value', - unknown_value=-1, - ), self.enc_columns), + # impute missing values before encoding, + # remove once sklearn natively supports + # it in ordinal encoding. Sklearn issue: + # "https://github.com/scikit-learn/scikit-learn/issues/17123)" + for column in self.enc_columns: + if X[column].isna().any(): + missing_value: typing.Union[int, str] = -1 + # make sure for a string column we give + # string missing value else we give numeric + if type(X[column][0]) == str: + missing_value = str(missing_value) + X[column] = X[column].cat.add_categories([missing_value]) + X[column] = X[column].fillna(missing_value) + + self.encoder = ColumnTransformer( + [ + ("encoder", + preprocessing.OrdinalEncoder( + handle_unknown='use_encoded_value', + unknown_value=-1, + ), self.enc_columns)], remainder="passthrough" ) @@ -85,6 +103,7 @@ def comparator(cmp1: str, cmp2: str) -> int: return 1 else: raise ValueError((cmp1, cmp2)) + self.feat_type = sorted( self.feat_type, key=functools.cmp_to_key(comparator) @@ -182,9 +201,8 @@ def _check_data( if not isinstance(X, (np.ndarray, pd.DataFrame)) and not scipy.sparse.issparse(X): raise ValueError("AutoPyTorch only supports Numpy arrays, Pandas DataFrames," " scipy sparse and Python Lists, yet, the provided input is" - " of type {}".format( - type(X) - )) + " of type {}".format(type(X)) + ) if self.data_type is None: self.data_type = type(X) @@ -217,28 +235,14 @@ def _check_data( # per estimator enc_columns, _ = self._get_columns_to_encode(X) - if len(enc_columns) > 0: - if np.any(pd.isnull( - X[enc_columns].dropna( # type: ignore[call-overload] - axis='columns', how='all') - )): - # Ignore all NaN columns, and if still a NaN - # Error out - raise ValueError("Categorical features in a dataframe cannot contain " - "missing/NaN values. The OrdinalEncoder used by " - "AutoPyTorch cannot handle this yet (due to a " - "limitation on scikit-learn being addressed via: " - "https://github.com/scikit-learn/scikit-learn/issues/17123)" - ) column_order = [column for column in X.columns] if len(self.column_order) > 0: if self.column_order != column_order: raise ValueError("Changing the column order of the features after fit() is " "not supported. Fit() method was called with " - "{} whereas the new features have {} as type".format( - self.column_order, - column_order, - )) + "{} whereas the new features have {} as type".format(self.column_order, + column_order,) + ) else: self.column_order = column_order dtypes = [dtype.name for dtype in X.dtypes] @@ -246,10 +250,10 @@ def _check_data( if self.dtypes != dtypes: raise ValueError("Changing the dtype of the features after fit() is " "not supported. Fit() method was called with " - "{} whereas the new features have {} as type".format( - self.dtypes, - dtypes, - )) + "{} whereas the new features have {} as type".format(self.dtypes, + dtypes, + ) + ) else: self.dtypes = dtypes @@ -294,7 +298,8 @@ def _get_columns_to_encode( "pandas.Series.astype ." "If working with string objects, the following " "tutorial illustrates how to work with text data: " - "https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html".format( # noqa: E501 + "https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html".format( + # noqa: E501 column, ) ) @@ -349,15 +354,13 @@ def list_to_dataframe( # If a list was provided, it will be converted to pandas X_train = pd.DataFrame(data=X_train).infer_objects() self.logger.warning("The provided feature types to AutoPyTorch are of type list." - "Features have been interpreted as: {}".format( - [(col, t) for col, t in zip(X_train.columns, X_train.dtypes)] - )) + "Features have been interpreted as: {}".format([(col, t) for col, t in + zip(X_train.columns, X_train.dtypes)])) if X_test is not None: if not isinstance(X_test, list): self.logger.warning("Train features are a list while the provided test data" - "is {}. X_test will be casted as DataFrame.".format( - type(X_test) - )) + "is {}. X_test will be casted as DataFrame.".format(type(X_test)) + ) X_test = pd.DataFrame(data=X_test).infer_objects() return X_train, X_test diff --git a/test/test_data/test_feature_validator.py b/test/test_data/test_feature_validator.py index afa2b43e1..6d90ef2f9 100644 --- a/test/test_data/test_feature_validator.py +++ b/test/test_data/test_feature_validator.py @@ -231,10 +231,17 @@ def test_featurevalidator_unsupported_numpy(input_data_featuretest): ), indirect=True ) -def test_featurevalidator_unsupported_pandas(input_data_featuretest): +def test_featurevalidator_categorical_nan(input_data_featuretest): validator = TabularFeatureValidator() - with pytest.raises(ValueError, match=r"Categorical features in a dataframe.*missing/NaN"): - validator.fit(input_data_featuretest) + validator.fit(input_data_featuretest) + transformed_X = validator.transform(input_data_featuretest) + assert any(pd.isna(input_data_featuretest)) + assert any((-1 in categories) or ('-1' in categories) for categories in + validator.encoder.named_transformers_['encoder'].categories_) + assert np.shape(input_data_featuretest) == np.shape(transformed_X) + assert np.issubdtype(transformed_X.dtype, np.number) + assert validator._is_fitted + assert isinstance(transformed_X, np.ndarray) @pytest.mark.parametrize( From 3f39f58b4901dda71ad596aabc0c224d0741d415 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 1 Mar 2021 21:11:28 +0100 Subject: [PATCH 39/68] try without embeddings --- test/test_api/test_api.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_api/test_api.py b/test/test_api/test_api.py index 30ddaeed6..0b392ceae 100644 --- a/test/test_api/test_api.py +++ b/test/test_api/test_api.py @@ -48,6 +48,7 @@ def test_tabular_classification(openml_id, resampling_strategy, backend): estimator = TabularClassificationTask( backend=backend, resampling_strategy=resampling_strategy, + include_components={'network_embedding': ['NoEmbedding']} ) estimator.search( From 715d277981beff48c3903bf5885c50eee0fd5904 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 8 Feb 2021 14:22:26 +0100 Subject: [PATCH 40/68] work in progress --- .../network/network_embedding/__init__.py | 0 .../base_network_embedding.py | 16 +++++ .../network_embedding/modules/__init__.py | 0 .../modules/learned_entity_embedding.py | 69 +++++++++++++++++++ .../network_embedding/modules/no_embedding.py | 12 ++++ 5 files changed, 97 insertions(+) create mode 100644 autoPyTorch/pipeline/components/setup/network/network_embedding/__init__.py create mode 100644 autoPyTorch/pipeline/components/setup/network/network_embedding/base_network_embedding.py create mode 100644 autoPyTorch/pipeline/components/setup/network/network_embedding/modules/__init__.py create mode 100644 autoPyTorch/pipeline/components/setup/network/network_embedding/modules/learned_entity_embedding.py create mode 100644 autoPyTorch/pipeline/components/setup/network/network_embedding/modules/no_embedding.py diff --git a/autoPyTorch/pipeline/components/setup/network/network_embedding/__init__.py b/autoPyTorch/pipeline/components/setup/network/network_embedding/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/autoPyTorch/pipeline/components/setup/network/network_embedding/base_network_embedding.py b/autoPyTorch/pipeline/components/setup/network/network_embedding/base_network_embedding.py new file mode 100644 index 000000000..5615062d4 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network/network_embedding/base_network_embedding.py @@ -0,0 +1,16 @@ +from typing import Optional, Any + +from sklearn.base import BaseEstimator +from torch import nn + +from autoPyTorch.pipeline.components.setup.base_setup import autoPyTorchSetupComponent + + +class NetworkEmbeddingComponent(autoPyTorchSetupComponent): + def __init__(self, + **kwargs): + super().__init__() + self.config = kwargs + self.embedding: Optional[nn.Module] = None + + # def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/setup/network/network_embedding/modules/__init__.py b/autoPyTorch/pipeline/components/setup/network/network_embedding/modules/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/autoPyTorch/pipeline/components/setup/network/network_embedding/modules/learned_entity_embedding.py b/autoPyTorch/pipeline/components/setup/network/network_embedding/modules/learned_entity_embedding.py new file mode 100644 index 000000000..e6eb88f19 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network/network_embedding/modules/learned_entity_embedding.py @@ -0,0 +1,69 @@ +""" +Class to learn an embedding for categorical hyperparameters. +""" + +import torch +import torch.nn as nn +import numpy as np + + +class LearnedEntityEmbedding(nn.Module): + """ Parent class for MlpNet, ResNet, ... Can use entity embedding for categorical features""" + + def __init__(self, config, in_features, num_numerical_features): + """ + Initialize the BaseFeatureNet. + Arguments: + config: The configuration sampled by the hyperparameter optimizer + in_features: the number of features of the dataset + one_hot_encoder: OneHot encoder, that is used to encode X + """ + super(LearnedEntityEmbedding, self).__init__() + self.config = config + + # self.num_numerical = len([f for f in one_hot_encoder.categorical_features if not f]) + # self.num_input_features = [len(c) for c in one_hot_encoder.categories_] + self.num_numerical = num_numerical_features + self.embed_features = [num_in >= config["min_unique_values_for_embedding"] for num_in in + self.num_input_features] + self.num_output_dimensions = [config["dimension_reduction_" + str(i)] * num_in for i, num_in in + enumerate(self.num_input_features)] + self.num_output_dimensions = [int(np.clip(num_out, 1, num_in - 1)) for num_out, num_in in + zip(self.num_output_dimensions, self.num_input_features)] + self.num_output_dimensions = [num_out if embed else num_in for num_out, embed, num_in in + zip(self.num_output_dimensions, self.embed_features, self.num_input_features)] + self.num_out_feats = self.num_numerical + sum(self.num_output_dimensions) + + self.ee_layers = self._create_ee_layers(in_features) + + def forward(self, x): + # pass the columns of each categorical feature through entity embedding layer + # before passing it through the model + concat_seq = [] + last_concat = 0 + x_pointer = 0 + layer_pointer = 0 + for num_in, embed in zip(self.num_input_features, self.embed_features): + if not embed: + x_pointer += 1 + continue + if x_pointer > last_concat: + concat_seq.append(x[:, last_concat: x_pointer]) + categorical_feature_slice = x[:, x_pointer: x_pointer + num_in] + concat_seq.append(self.ee_layers[layer_pointer](categorical_feature_slice)) + layer_pointer += 1 + x_pointer += num_in + last_concat = x_pointer + + concat_seq.append(x[:, last_concat:]) + return torch.cat(concat_seq, dim=1) + + def _create_ee_layers(self, in_features): + # entity embeding layers are Linear Layers + layers = nn.ModuleList() + for i, (num_in, embed, num_out) in enumerate( + zip(self.num_input_features, self.embed_features, self.num_output_dimensions)): + if not embed: + continue + layers.append(nn.Linear(num_in, num_out)) + return layers \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/setup/network/network_embedding/modules/no_embedding.py b/autoPyTorch/pipeline/components/setup/network/network_embedding/modules/no_embedding.py new file mode 100644 index 000000000..cab1e9b7f --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network/network_embedding/modules/no_embedding.py @@ -0,0 +1,12 @@ +from torch import nn + + +class NoEmbedding(nn.Module): + def __init__(self, config, in_features, num_numerical_features): + super(NoEmbedding, self).__init__() + self.config = config + self.n_feats = in_features + self.num_numerical = num_numerical_features + + def forward(self, x): + return x \ No newline at end of file From d68a3912f1e461cf066ff2100820fc2c5727b088 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 8 Feb 2021 14:44:57 +0100 Subject: [PATCH 41/68] in progress --- .../base_network_embedding.py | 16 ------------ .../network_embedding/__init__.py | 0 .../base_network_embedding.py | 25 +++++++++++++++++++ .../network_embedding/modules/__init__.py | 0 .../modules/learned_entity_embedding.py | 7 +++--- .../network_embedding/modules/no_embedding.py | 0 6 files changed, 29 insertions(+), 19 deletions(-) delete mode 100644 autoPyTorch/pipeline/components/setup/network/network_embedding/base_network_embedding.py rename autoPyTorch/pipeline/components/setup/{network => }/network_embedding/__init__.py (100%) create mode 100644 autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py rename autoPyTorch/pipeline/components/setup/{network => }/network_embedding/modules/__init__.py (100%) rename autoPyTorch/pipeline/components/setup/{network => }/network_embedding/modules/learned_entity_embedding.py (93%) rename autoPyTorch/pipeline/components/setup/{network => }/network_embedding/modules/no_embedding.py (100%) diff --git a/autoPyTorch/pipeline/components/setup/network/network_embedding/base_network_embedding.py b/autoPyTorch/pipeline/components/setup/network/network_embedding/base_network_embedding.py deleted file mode 100644 index 5615062d4..000000000 --- a/autoPyTorch/pipeline/components/setup/network/network_embedding/base_network_embedding.py +++ /dev/null @@ -1,16 +0,0 @@ -from typing import Optional, Any - -from sklearn.base import BaseEstimator -from torch import nn - -from autoPyTorch.pipeline.components.setup.base_setup import autoPyTorchSetupComponent - - -class NetworkEmbeddingComponent(autoPyTorchSetupComponent): - def __init__(self, - **kwargs): - super().__init__() - self.config = kwargs - self.embedding: Optional[nn.Module] = None - - # def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/setup/network/network_embedding/__init__.py b/autoPyTorch/pipeline/components/setup/network_embedding/__init__.py similarity index 100% rename from autoPyTorch/pipeline/components/setup/network/network_embedding/__init__.py rename to autoPyTorch/pipeline/components/setup/network_embedding/__init__.py diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py new file mode 100644 index 000000000..4c087a89d --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py @@ -0,0 +1,25 @@ +from typing import Any, Dict, Optional + +from sklearn.base import BaseEstimator +from torch import nn + +from autoPyTorch.pipeline.components.setup.base_setup import autoPyTorchSetupComponent + + +class NetworkEmbeddingComponent(autoPyTorchSetupComponent): + def __init__(self, + **kwargs): + super().__init__() + self.config = kwargs + self.embedding: Optional[nn.Module] = None + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + in_features = X['X_train'].shape[1:] + + self.embedding = self.build_embedding( + in_features=in_features, + num_numerical_features=len(X['numerical_features'])) + return self + + def build_embedding(self, in_features, num_numerical_features) -> nn.Module: + raise NotImplementedError \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/setup/network/network_embedding/modules/__init__.py b/autoPyTorch/pipeline/components/setup/network_embedding/modules/__init__.py similarity index 100% rename from autoPyTorch/pipeline/components/setup/network/network_embedding/modules/__init__.py rename to autoPyTorch/pipeline/components/setup/network_embedding/modules/__init__.py diff --git a/autoPyTorch/pipeline/components/setup/network/network_embedding/modules/learned_entity_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/modules/learned_entity_embedding.py similarity index 93% rename from autoPyTorch/pipeline/components/setup/network/network_embedding/modules/learned_entity_embedding.py rename to autoPyTorch/pipeline/components/setup/network_embedding/modules/learned_entity_embedding.py index e6eb88f19..d7d294661 100644 --- a/autoPyTorch/pipeline/components/setup/network/network_embedding/modules/learned_entity_embedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/modules/learned_entity_embedding.py @@ -10,7 +10,7 @@ class LearnedEntityEmbedding(nn.Module): """ Parent class for MlpNet, ResNet, ... Can use entity embedding for categorical features""" - def __init__(self, config, in_features, num_numerical_features): + def __init__(self, config, num_input_features, num_numerical_features): """ Initialize the BaseFeatureNet. Arguments: @@ -24,6 +24,7 @@ def __init__(self, config, in_features, num_numerical_features): # self.num_numerical = len([f for f in one_hot_encoder.categorical_features if not f]) # self.num_input_features = [len(c) for c in one_hot_encoder.categories_] self.num_numerical = num_numerical_features + self.num_input_features = num_input_features self.embed_features = [num_in >= config["min_unique_values_for_embedding"] for num_in in self.num_input_features] self.num_output_dimensions = [config["dimension_reduction_" + str(i)] * num_in for i, num_in in @@ -34,7 +35,7 @@ def __init__(self, config, in_features, num_numerical_features): zip(self.num_output_dimensions, self.embed_features, self.num_input_features)] self.num_out_feats = self.num_numerical + sum(self.num_output_dimensions) - self.ee_layers = self._create_ee_layers(in_features) + self.ee_layers = self._create_ee_layers() def forward(self, x): # pass the columns of each categorical feature through entity embedding layer @@ -58,7 +59,7 @@ def forward(self, x): concat_seq.append(x[:, last_concat:]) return torch.cat(concat_seq, dim=1) - def _create_ee_layers(self, in_features): + def _create_ee_layers(self): # entity embeding layers are Linear Layers layers = nn.ModuleList() for i, (num_in, embed, num_out) in enumerate( diff --git a/autoPyTorch/pipeline/components/setup/network/network_embedding/modules/no_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/modules/no_embedding.py similarity index 100% rename from autoPyTorch/pipeline/components/setup/network/network_embedding/modules/no_embedding.py rename to autoPyTorch/pipeline/components/setup/network_embedding/modules/no_embedding.py From 02dc06455b89bf725b74078c9b25280b2349dc00 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 15 Feb 2021 19:57:00 +0100 Subject: [PATCH 42/68] Working network embedding --- .../TabularColumnTransformer.py | 8 +- .../components/setup/network/base_network.py | 3 +- .../network_backbone/base_network_backbone.py | 6 +- .../LearnedEntityEmbedding.py | 133 +++++++++++++ .../setup/network_embedding/NoEmbedding.py | 49 +++++ .../base_network_embedding.py | 45 ++++- .../base_network_embedding_choice.py | 188 ++++++++++++++++++ .../network_embedding/modules/__init__.py | 0 .../modules/learned_entity_embedding.py | 70 ------- .../network_embedding/modules/no_embedding.py | 12 -- .../pipeline/tabular_classification.py | 30 +++ autoPyTorch/pipeline/tabular_regression.py | 27 +++ .../components/preprocessing/base.py | 36 ++++ .../test_tabular_column_transformer.py | 2 +- .../components/{ => training}/base.py | 37 +--- 15 files changed, 511 insertions(+), 135 deletions(-) create mode 100644 autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py create mode 100644 autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py create mode 100644 autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding_choice.py delete mode 100644 autoPyTorch/pipeline/components/setup/network_embedding/modules/__init__.py delete mode 100644 autoPyTorch/pipeline/components/setup/network_embedding/modules/learned_entity_embedding.py delete mode 100644 autoPyTorch/pipeline/components/setup/network_embedding/modules/no_embedding.py create mode 100644 test/test_pipeline/components/preprocessing/base.py rename test/test_pipeline/components/{ => training}/base.py (69%) diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py index e90f35ed1..e1e08e94e 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py @@ -2,7 +2,7 @@ import numpy as np -from sklearn.compose import ColumnTransformer, make_column_transformer +from sklearn.compose import ColumnTransformer from sklearn.pipeline import make_pipeline import torch @@ -57,9 +57,9 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer": if len(X['dataset_properties']['categorical_columns']): categorical_pipeline = make_pipeline(*preprocessors['categorical']) - self.preprocessor = make_column_transformer( - (numerical_pipeline, X['dataset_properties']['numerical_columns']), - (categorical_pipeline, X['dataset_properties']['categorical_columns']), + self.preprocessor = ColumnTransformer([ + ('numerical_pipeline', numerical_pipeline, X['dataset_properties']['numerical_columns']), + ('categorical_pipeline', categorical_pipeline, X['dataset_properties']['categorical_columns'])], remainder='passthrough' ) diff --git a/autoPyTorch/pipeline/components/setup/network/base_network.py b/autoPyTorch/pipeline/components/setup/network/base_network.py index 4f7c18b7c..81fd8e5f4 100644 --- a/autoPyTorch/pipeline/components/setup/network/base_network.py +++ b/autoPyTorch/pipeline/components/setup/network/base_network.py @@ -29,6 +29,7 @@ def __init__( self.add_fit_requirements([ FitRequirement("network_head", (torch.nn.Module,), user_defined=False, dataset_property=False), FitRequirement("network_backbone", (torch.nn.Module,), user_defined=False, dataset_property=False), + FitRequirement("network_embedding", (torch.nn.Module,), user_defined=False, dataset_property=False), ]) self.final_activation = None @@ -47,7 +48,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchTrainingComponent: # information to fit this stage self.check_requirements(X, y) - self.network = torch.nn.Sequential(X['network_backbone'], X['network_head']) + self.network = torch.nn.Sequential(X['network_embedding'], X['network_backbone'], X['network_head']) # Properly set the network training device if self.device is None: diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py index 2557e92b8..241fcb51b 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py @@ -14,6 +14,7 @@ from autoPyTorch.pipeline.components.base_component import ( autoPyTorchComponent, ) +from autoPyTorch.pipeline.components.setup.network_backbone.utils import get_output_shape from autoPyTorch.utils.common import FitRequirement @@ -31,7 +32,9 @@ def __init__(self, FitRequirement('X_train', (np.ndarray, pd.DataFrame, csr_matrix), user_defined=True, dataset_property=False), FitRequirement('input_shape', (Iterable,), user_defined=True, dataset_property=True), - FitRequirement('tabular_transformer', (BaseEstimator,), user_defined=False, dataset_property=False)]) + FitRequirement('tabular_transformer', (BaseEstimator,), user_defined=False, dataset_property=False), + FitRequirement('network_embedding', (nn.Module,), user_defined=False, dataset_property=False) + ]) self.backbone: nn.Module = None self.config = kwargs self.input_shape: Optional[Iterable] = None @@ -56,6 +59,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: column_transformer = X['tabular_transformer'].preprocessor input_shape = column_transformer.transform(X_train[:1]).shape[1:] + input_shape = get_output_shape(X['network_embedding'], input_shape=input_shape) self.input_shape = input_shape self.backbone = self.build_backbone( diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py new file mode 100644 index 000000000..41d2da581 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py @@ -0,0 +1,133 @@ +from typing import Any, Dict, Optional, Union + +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + UniformFloatHyperparameter, + UniformIntegerHyperparameter +) + +import numpy as np + +import torch +from torch import nn + +from autoPyTorch.pipeline.components.setup.network_embedding.base_network_embedding import NetworkEmbeddingComponent + + +class _LearnedEntityEmbedding(nn.Module): + """ Parent class for MlpNet, ResNet, ... Can use entity embedding for categorical features""" + + def __init__(self, config, num_input_features, num_numerical_features): + """ + Initialize the BaseFeatureNet. + Arguments: + config: The configuration sampled by the hyperparameter optimizer + # TODO: fix this + num_input_features: the number of features of the dataset + num_numerical_features: OneHot encoder, that is used to encode X + """ + super().__init__() + self.config = config + + self.num_numerical = num_numerical_features + # list of number of categories of categorical data + # or 0 for numerical data + self.num_input_features = num_input_features + categorical_features = self.num_input_features > 0 + + self.num_categorical_features = self.num_input_features[categorical_features] + + self.embed_features = [num_in >= config["min_unique_values_for_embedding"] for num_in in + self.num_input_features] + self.num_output_dimensions = [0] * num_numerical_features + self.num_output_dimensions.extend([config["dimension_reduction_" + str(i)] * num_in for i, num_in in + enumerate(self.num_categorical_features)]) + self.num_output_dimensions = [int(np.clip(num_out, 1, num_in - 1)) for num_out, num_in in + zip(self.num_output_dimensions, self.num_input_features)] + self.num_output_dimensions = [num_out if embed else num_in for num_out, embed, num_in in + zip(self.num_output_dimensions, self.embed_features, + self.num_input_features)] + self.num_out_feats = self.num_numerical + sum(self.num_output_dimensions) + + self.ee_layers = self._create_ee_layers() + + def forward(self, x): + # pass the columns of each categorical feature through entity embedding layer + # before passing it through the model + concat_seq = [] + last_concat = 0 + x_pointer = 0 + layer_pointer = 0 + for num_in, embed in zip(self.num_input_features, self.embed_features): + if not embed: + x_pointer += 1 + continue + if x_pointer > last_concat: + concat_seq.append(x[:, last_concat: x_pointer]) + categorical_feature_slice = x[:, x_pointer: x_pointer + num_in] + concat_seq.append(self.ee_layers[layer_pointer](categorical_feature_slice)) + layer_pointer += 1 + x_pointer += num_in + last_concat = x_pointer + + concat_seq.append(x[:, last_concat:]) + return torch.cat(concat_seq, dim=1) + + def _create_ee_layers(self): + # entity embeding layers are Linear Layers + layers = nn.ModuleList() + for i, (num_in, embed, num_out) in enumerate( + zip(self.num_input_features, self.embed_features, self.num_output_dimensions)): + if not embed: + continue + layers.append(nn.Linear(num_in, num_out)) + return layers + + +class LearnedEntityEmbedding(NetworkEmbeddingComponent): + """ + Class to learn an embedding for categorical hyperparameters. + """ + + def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = None, **kwargs: Any): + super().__init__(random_state=random_state) + self.config = kwargs + + def build_embedding(self, num_input_features, num_numerical_features) -> nn.Module: + return _LearnedEntityEmbedding(config=self.config, + num_input_features=num_input_features, + num_numerical_features=num_numerical_features) + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, str]] = None, + min_unique_values_for_embedding=((3, 7), 5, True), + dimension_reduction=((0, 1), 0.5), + ) -> ConfigurationSpace: + cs = ConfigurationSpace() + min_hp = UniformIntegerHyperparameter("min_unique_values_for_embedding", + lower=min_unique_values_for_embedding[0][0], + upper=min_unique_values_for_embedding[0][1], + default_value=min_unique_values_for_embedding[1], + log=min_unique_values_for_embedding[2] + ) + cs.add_hyperparameter(min_hp) + if dataset_properties is not None: + for i in range(len(dataset_properties['categorical_columns'])): + ee_dimensions_hp = UniformFloatHyperparameter("dimension_reduction_" + str(i), + lower=dimension_reduction[0][0], + upper=dimension_reduction[0][1], + default_value=dimension_reduction[1] + ) + cs.add_hyperparameter(ee_dimensions_hp) + return cs + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'embedding', + 'name': 'LearnedEntityEmbedding', + 'handles_tabular': True, + 'handles_image': False, + 'handles_time_series': False, + } diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py new file mode 100644 index 000000000..0f18b5ed6 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py @@ -0,0 +1,49 @@ +from typing import Any, Dict, Optional, Union + +from ConfigSpace.configuration_space import ConfigurationSpace + +import numpy as np + +from torch import nn + +from autoPyTorch.pipeline.components.setup.network_embedding.base_network_embedding import NetworkEmbeddingComponent + + +class _NoEmbedding(nn.Module): + def __init__(self, num_input_features, num_numerical_features): + super().__init__() + self.n_feats = num_input_features + self.num_numerical = num_numerical_features + + def forward(self, x): + return x + + +class NoEmbedding(NetworkEmbeddingComponent): + """ + Class to learn an embedding for categorical hyperparameters. + """ + + def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = None): + super().__init__(random_state=random_state) + + def build_embedding(self, num_input_features, num_numerical_features) -> nn.Module: + return _NoEmbedding(num_input_features=num_input_features, + num_numerical_features=num_numerical_features) + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, str]] = None, + ) -> ConfigurationSpace: + cs = ConfigurationSpace() + return cs + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'no embedding', + 'name': 'NoEmbedding', + 'handles_tabular': True, + 'handles_image': False, + 'handles_time_series': False, + } \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py index 4c087a89d..e27cac3c0 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py @@ -1,25 +1,50 @@ -from typing import Any, Dict, Optional +import copy +from typing import Any, Dict, Optional, Union + +import numpy as np from sklearn.base import BaseEstimator + from torch import nn from autoPyTorch.pipeline.components.setup.base_setup import autoPyTorchSetupComponent - +from autoPyTorch.utils.common import subsampler class NetworkEmbeddingComponent(autoPyTorchSetupComponent): - def __init__(self, - **kwargs): + def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = None): super().__init__() - self.config = kwargs self.embedding: Optional[nn.Module] = None def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: - in_features = X['X_train'].shape[1:] + + num_numerical_columns, num_input_features = self._get_args(X) self.embedding = self.build_embedding( - in_features=in_features, - num_numerical_features=len(X['numerical_features'])) + num_input_features=num_input_features, + num_numerical_features=num_numerical_columns) return self - def build_embedding(self, in_features, num_numerical_features) -> nn.Module: - raise NotImplementedError \ No newline at end of file + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + X.update({'network_embedding': self.embedding}) + return X + + def build_embedding(self, num_input_features, num_numerical_features) -> nn.Module: + raise NotImplementedError + + def _get_args(self, X: Dict[str, Any]) -> Union[int, np.ndarray]: + # Feature preprocessors can alter numerical columns + if len(X['dataset_properties']['numerical_columns']) == 0: + num_numerical_columns = 0 + else: + X_train = copy.deepcopy(X['backend'].load_datamanager().train_tensors[0][:2]) + # as numerical pipeline will always be the first pipeline + numerical_column_transformer = X['tabular_transformer'].preprocessor.named_transformers_['numerical_pipeline'] + num_numerical_columns = numerical_column_transformer.transform( + X_train.iloc[:, X['dataset_properties']['numerical_columns']]).shape[1] + num_input_features = np.zeros((num_numerical_columns + + len(X['dataset_properties']['categorical_columns'])), dtype=int) + categories = X['dataset_properties']['categories'] + + for i, category in enumerate(categories): + num_input_features[num_numerical_columns + i, ] = len(category) + return num_numerical_columns, num_input_features diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding_choice.py b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding_choice.py new file mode 100644 index 000000000..c08b156ce --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding_choice.py @@ -0,0 +1,188 @@ +import os +from collections import OrderedDict +from typing import Dict, List, Optional + +import ConfigSpace.hyperparameters as CSH +from ConfigSpace.configuration_space import ConfigurationSpace + +import numpy as np + +from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice +from autoPyTorch.pipeline.components.base_component import ( + ThirdPartyComponents, + autoPyTorchComponent, + find_components, +) +from autoPyTorch.pipeline.components.setup.network_embedding.base_network_embedding import ( + NetworkEmbeddingComponent, +) + +directory = os.path.split(__file__)[0] +_embeddings = find_components(__package__, + directory, + NetworkEmbeddingComponent) +_addons = ThirdPartyComponents(NetworkEmbeddingComponent) + + +def add_embedding(embedding: NetworkEmbeddingComponent) -> None: + _addons.add_component(embedding) + + +class NetworkEmbeddingChoice(autoPyTorchChoice): + + def get_components(self) -> Dict[str, autoPyTorchComponent]: + """Returns the available embedding components + + Args: + None + + Returns: + Dict[str, autoPyTorchComponent]: all baseembedding components available + as choices for learning rate scheduling + """ + components = OrderedDict() + components.update(_embeddings) + components.update(_addons.components) + return components + + def get_available_components( + self, + dataset_properties: Optional[Dict[str, str]] = None, + include: List[str] = None, + exclude: List[str] = None, + ) -> Dict[str, autoPyTorchComponent]: + """Filters out components based on user provided + include/exclude directives, as well as the dataset properties + + Args: + include (Optional[Dict[str, Any]]): what hyper-parameter configurations + to honor when creating the configuration space + exclude (Optional[Dict[str, Any]]): what hyper-parameter configurations + to remove from the configuration space + dataset_properties (Optional[Dict[str, Union[str, int]]]): Caracteristics + of the dataset to guide the pipeline choices of components + + Returns: + Dict[str, autoPyTorchComponent]: A filtered dict of learning + rate embeddings + + """ + if dataset_properties is None: + dataset_properties = {} + + if include is not None and exclude is not None: + raise ValueError( + "The argument include and exclude cannot be used together.") + + available_comp = self.get_components() + + if include is not None: + for incl in include: + if incl not in available_comp: + raise ValueError("Trying to include unknown component: " + "%s" % incl) + + components_dict = OrderedDict() + for name in available_comp: + if include is not None and name not in include: + continue + elif exclude is not None and name in exclude: + continue + + entry = available_comp[name] + + # Exclude itself to avoid infinite loop + if entry == NetworkEmbeddingChoice or hasattr(entry, 'get_components'): + continue + + task_type = dataset_properties['task_type'] + properties = entry.get_properties() + if 'tabular' in task_type and not properties['handles_tabular']: + continue + elif 'image' in task_type and not properties['handles_image']: + continue + elif 'time_series' in task_type and not properties['handles_time_series']: + continue + + components_dict[name] = entry + + return components_dict + + def get_hyperparameter_search_space( + self, + dataset_properties: Optional[Dict[str, str]] = None, + default: Optional[str] = None, + include: Optional[List[str]] = None, + exclude: Optional[List[str]] = None, + ) -> ConfigurationSpace: + """Returns the configuration space of the current chosen components + + Args: + dataset_properties (Optional[Dict[str, str]]): Describes the dataset to work on + default (Optional[str]): Default embedding to use + include: Optional[Dict[str, Any]]: what components to include. It is an exhaustive + list, and will exclusively use this components. + exclude: Optional[Dict[str, Any]]: which components to skip + + Returns: + ConfigurationSpace: the configuration space of the hyper-parameters of the + chosen component + """ + cs = ConfigurationSpace() + + if dataset_properties is None: + dataset_properties = {} + + # Compile a list of legal preprocessors for this problem + available_embedding = self.get_available_components( + dataset_properties=dataset_properties, + include=include, exclude=exclude) + + if len(available_embedding) == 0 and 'tabular' in dataset_properties['task_type']: + raise ValueError("No embedding found") + + if available_embedding == 0: + return cs + + if default is None: + defaults = [ + 'LearnedEntityEmbedding', + 'NoEmbedding' + ] + for default_ in defaults: + if default_ in available_embedding: + default = default_ + break + + if len(dataset_properties['categorical_columns']) == 0: + default = 'NoEmbedding' + if include is not None and default not in include: + raise ValueError("Provided {} in include, however, the dataset " + "is incompatible with it".format(include)) + embedding = CSH.CategoricalHyperparameter('__choice__', + ['NoEmbedding'], + default_value=default) + else: + embedding = CSH.CategoricalHyperparameter('__choice__', + list(available_embedding.keys()), + default_value=default) + + cs.add_hyperparameter(embedding) + for name in embedding.choices: + updates = self._get_search_space_updates(prefix=name) + config_space = available_embedding[name].get_hyperparameter_search_space(dataset_properties, # type: ignore + **updates) + parent_hyperparameter = {'parent': embedding, 'value': name} + cs.add_configuration_space( + name, + config_space, + parent_hyperparameter=parent_hyperparameter + ) + + self.configuration_space_ = cs + self.dataset_properties_ = dataset_properties + return cs + + def transform(self, X: np.ndarray) -> np.ndarray: + assert self.choice is not None, "Cannot call transform before the object is initialized" + return self.choice.transform(X) diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/modules/__init__.py b/autoPyTorch/pipeline/components/setup/network_embedding/modules/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/modules/learned_entity_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/modules/learned_entity_embedding.py deleted file mode 100644 index d7d294661..000000000 --- a/autoPyTorch/pipeline/components/setup/network_embedding/modules/learned_entity_embedding.py +++ /dev/null @@ -1,70 +0,0 @@ -""" -Class to learn an embedding for categorical hyperparameters. -""" - -import torch -import torch.nn as nn -import numpy as np - - -class LearnedEntityEmbedding(nn.Module): - """ Parent class for MlpNet, ResNet, ... Can use entity embedding for categorical features""" - - def __init__(self, config, num_input_features, num_numerical_features): - """ - Initialize the BaseFeatureNet. - Arguments: - config: The configuration sampled by the hyperparameter optimizer - in_features: the number of features of the dataset - one_hot_encoder: OneHot encoder, that is used to encode X - """ - super(LearnedEntityEmbedding, self).__init__() - self.config = config - - # self.num_numerical = len([f for f in one_hot_encoder.categorical_features if not f]) - # self.num_input_features = [len(c) for c in one_hot_encoder.categories_] - self.num_numerical = num_numerical_features - self.num_input_features = num_input_features - self.embed_features = [num_in >= config["min_unique_values_for_embedding"] for num_in in - self.num_input_features] - self.num_output_dimensions = [config["dimension_reduction_" + str(i)] * num_in for i, num_in in - enumerate(self.num_input_features)] - self.num_output_dimensions = [int(np.clip(num_out, 1, num_in - 1)) for num_out, num_in in - zip(self.num_output_dimensions, self.num_input_features)] - self.num_output_dimensions = [num_out if embed else num_in for num_out, embed, num_in in - zip(self.num_output_dimensions, self.embed_features, self.num_input_features)] - self.num_out_feats = self.num_numerical + sum(self.num_output_dimensions) - - self.ee_layers = self._create_ee_layers() - - def forward(self, x): - # pass the columns of each categorical feature through entity embedding layer - # before passing it through the model - concat_seq = [] - last_concat = 0 - x_pointer = 0 - layer_pointer = 0 - for num_in, embed in zip(self.num_input_features, self.embed_features): - if not embed: - x_pointer += 1 - continue - if x_pointer > last_concat: - concat_seq.append(x[:, last_concat: x_pointer]) - categorical_feature_slice = x[:, x_pointer: x_pointer + num_in] - concat_seq.append(self.ee_layers[layer_pointer](categorical_feature_slice)) - layer_pointer += 1 - x_pointer += num_in - last_concat = x_pointer - - concat_seq.append(x[:, last_concat:]) - return torch.cat(concat_seq, dim=1) - - def _create_ee_layers(self): - # entity embeding layers are Linear Layers - layers = nn.ModuleList() - for i, (num_in, embed, num_out) in enumerate( - zip(self.num_input_features, self.embed_features, self.num_output_dimensions)): - if not embed: - continue - layers.append(nn.Linear(num_in, num_out)) - return layers \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/modules/no_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/modules/no_embedding.py deleted file mode 100644 index cab1e9b7f..000000000 --- a/autoPyTorch/pipeline/components/setup/network_embedding/modules/no_embedding.py +++ /dev/null @@ -1,12 +0,0 @@ -from torch import nn - - -class NoEmbedding(nn.Module): - def __init__(self, config, in_features, num_numerical_features): - super(NoEmbedding, self).__init__() - self.config = config - self.n_feats = in_features - self.num_numerical = num_numerical_features - - def forward(self, x): - return x \ No newline at end of file diff --git a/autoPyTorch/pipeline/tabular_classification.py b/autoPyTorch/pipeline/tabular_classification.py index e3abad9cc..f5b668a88 100644 --- a/autoPyTorch/pipeline/tabular_classification.py +++ b/autoPyTorch/pipeline/tabular_classification.py @@ -1,7 +1,9 @@ +import copy import warnings from typing import Any, Dict, List, Optional, Tuple from ConfigSpace.configuration_space import Configuration, ConfigurationSpace +from ConfigSpace.forbidden import ForbiddenEqualsClause, ForbiddenAndConjunction import numpy as np @@ -25,6 +27,7 @@ from autoPyTorch.pipeline.components.setup.lr_scheduler.base_scheduler_choice import SchedulerChoice from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent from autoPyTorch.pipeline.components.setup.network_backbone.base_network_backbone_choice import NetworkBackboneChoice +from autoPyTorch.pipeline.components.setup.network_embedding.base_network_embedding_choice import NetworkEmbeddingChoice from autoPyTorch.pipeline.components.setup.network_head.base_network_head_choice import NetworkHeadChoice from autoPyTorch.pipeline.components.setup.network_initializer.base_network_init_choice import ( NetworkInitializerChoice @@ -188,6 +191,32 @@ def _get_hyperparameter_search_space(self, # Here we add custom code, like this with this # is not a valid configuration + # Learned Entity Embedding is only valid when encoder is one hot encoder + embeddings = cs.get_hyperparameter('network_embedding:__choice__').choices + encoders = cs.get_hyperparameter('encoder:__choice__').choices + default = cs.get_hyperparameter('network_embedding:__choice__').default_value + possible_default_embeddings = copy.copy(list(embeddings)) + del possible_default_embeddings[possible_default_embeddings.index(default)] + if 'network_embedding' in self.named_steps.keys() and 'encoder' in self.named_steps.keys(): + for encoder in encoders: + if encoder == 'OneHotEncoder': + continue + while True: + try: + cs.add_forbidden_clause(ForbiddenAndConjunction( + ForbiddenEqualsClause(cs.get_hyperparameter( + 'network_embedding:__choice__'), 'LearnedEntityEmbedding'), + ForbiddenEqualsClause(cs.get_hyperparameter('encoder:__choice__') + , encoder) + )) + break + except ValueError: + # change the default and try again + try: + default = possible_default_embeddings.pop() + except IndexError: + raise ValueError("Cannot find a legal default configuration") + cs.get_hyperparameter('network_embedding:__choice__').default_value = default self.configuration_space = cs self.dataset_properties = dataset_properties @@ -216,6 +245,7 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]], ("feature_preprocessor", FeatureProprocessorChoice(default_dataset_properties)), ("tabular_transformer", TabularColumnTransformer()), ("preprocessing", EarlyPreprocessing()), + ("network_embedding", NetworkEmbeddingChoice(default_dataset_properties)), ("network_backbone", NetworkBackboneChoice(default_dataset_properties)), ("network_head", NetworkHeadChoice(default_dataset_properties)), ("network", NetworkComponent()), diff --git a/autoPyTorch/pipeline/tabular_regression.py b/autoPyTorch/pipeline/tabular_regression.py index 174e41dee..0c6463c31 100644 --- a/autoPyTorch/pipeline/tabular_regression.py +++ b/autoPyTorch/pipeline/tabular_regression.py @@ -2,6 +2,7 @@ from typing import Any, Dict, List, Optional, Tuple from ConfigSpace.configuration_space import Configuration, ConfigurationSpace +from ConfigSpace.forbidden import ForbiddenAndConjunction, ForbiddenEqualsClause import numpy as np @@ -136,6 +137,32 @@ def _get_hyperparameter_search_space(self, # Here we add custom code, like this with this # is not a valid configuration + # Learned Entity Embedding is only valid when encoder is one hot encoder + embeddings = cs.get_hyperparameter('network_embedding:__choice__').choices + encoders = cs.get_hyperparameter('encoder:__choice__').choices + default = cs.get_hyperparameter('network_embedding:__choice__').default_value + possible_default_embeddings = copy.copy(list(embeddings)) + del possible_default_embeddings[possible_default_embeddings.index(default)] + if 'network_embedding' in self.named_steps.keys() and 'encoder' in self.named_steps.keys(): + for encoder in encoders: + if encoder == 'OneHotEncoder': + continue + while True: + try: + cs.add_forbidden_clause(ForbiddenAndConjunction( + ForbiddenEqualsClause(cs.get_hyperparameter( + 'network_embedding:__choice__'), 'LearnedEntityEmbedding'), + ForbiddenEqualsClause(cs.get_hyperparameter('encoder:__choice__') + , encoder) + )) + break + except ValueError: + # change the default and try again + try: + default = possible_default_embeddings.pop() + except IndexError: + raise ValueError("Cannot find a legal default configuration") + cs.get_hyperparameter('network_embedding:__choice__').default_value = default self.configuration_space = cs self.dataset_properties = dataset_properties diff --git a/test/test_pipeline/components/preprocessing/base.py b/test/test_pipeline/components/preprocessing/base.py new file mode 100644 index 000000000..7bb4fee70 --- /dev/null +++ b/test/test_pipeline/components/preprocessing/base.py @@ -0,0 +1,36 @@ +from typing import Any, Dict, List, Optional, Tuple + +from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.TabularColumnTransformer import \ + TabularColumnTransformer +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding.base_encoder_choice import \ + EncoderChoice +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler_choice import ScalerChoice +from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline + + +class TabularPipeline(TabularClassificationPipeline): + def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]], + ) -> List[Tuple[str, autoPyTorchChoice]]: + """ + Defines what steps a pipeline should follow. + The step itself has choices given via autoPyTorchChoice. + + Returns: + List[Tuple[str, autoPyTorchChoice]]: list of steps sequentially exercised + by the pipeline. + """ + steps = [] # type: List[Tuple[str, autoPyTorchChoice]] + + default_dataset_properties = {'target_type': 'tabular_classification'} + if dataset_properties is not None: + default_dataset_properties.update(dataset_properties) + + steps.extend([ + ("imputer", SimpleImputer()), + ("encoder", EncoderChoice(default_dataset_properties)), + ("scaler", ScalerChoice(default_dataset_properties)), + ("tabular_transformer", TabularColumnTransformer()), + ]) + return steps \ No newline at end of file diff --git a/test/test_pipeline/components/preprocessing/test_tabular_column_transformer.py b/test/test_pipeline/components/preprocessing/test_tabular_column_transformer.py index ef113c5eb..8e31bad05 100644 --- a/test/test_pipeline/components/preprocessing/test_tabular_column_transformer.py +++ b/test/test_pipeline/components/preprocessing/test_tabular_column_transformer.py @@ -1,4 +1,4 @@ -from test.test_pipeline.components.base import TabularPipeline +from test.test_pipeline.components.preprocessing.base import TabularPipeline import numpy as np diff --git a/test/test_pipeline/components/base.py b/test/test_pipeline/components/training/base.py similarity index 69% rename from test/test_pipeline/components/base.py rename to test/test_pipeline/components/training/base.py index 8211172e7..eaa80cf88 100644 --- a/test/test_pipeline/components/base.py +++ b/test/test_pipeline/components/training/base.py @@ -1,6 +1,5 @@ import logging import unittest -from typing import Any, Dict, List, Optional, Tuple from sklearn.datasets import make_classification, make_regression @@ -8,16 +7,8 @@ from autoPyTorch.constants import BINARY, CLASSIFICATION_TASKS, CONTINUOUS, OUTPUT_TYPES_TO_STRING, REGRESSION_TASKS, \ TASK_TYPES_TO_STRING -from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.TabularColumnTransformer import \ - TabularColumnTransformer -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding.base_encoder_choice import \ - EncoderChoice -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler_choice import ScalerChoice -from autoPyTorch.pipeline.components.training.metrics.utils import get_metrics from autoPyTorch.pipeline.components.training.trainer.base_trainer import BaseTrainerComponent, BudgetTracker -from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline +from autoPyTorch.pipeline.components.training.metrics.utils import get_metrics class BaseTraining(unittest.TestCase): @@ -121,29 +112,3 @@ def train_model(self, # Backward pass loss.backward() optimizer.step() - - -class TabularPipeline(TabularClassificationPipeline): - def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]], - ) -> List[Tuple[str, autoPyTorchChoice]]: - """ - Defines what steps a pipeline should follow. - The step itself has choices given via autoPyTorchChoice. - - Returns: - List[Tuple[str, autoPyTorchChoice]]: list of steps sequentially exercised - by the pipeline. - """ - steps = [] # type: List[Tuple[str, autoPyTorchChoice]] - - default_dataset_properties = {'target_type': 'tabular_classification'} - if dataset_properties is not None: - default_dataset_properties.update(dataset_properties) - - steps.extend([ - ("imputer", SimpleImputer()), - ("encoder", EncoderChoice(default_dataset_properties)), - ("scaler", ScalerChoice(default_dataset_properties)), - ("tabular_transformer", TabularColumnTransformer()), - ]) - return steps From 37cd8c506f1cb0583b6cc07b2ce3336edf5486cf Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 15 Feb 2021 22:24:05 +0100 Subject: [PATCH 43/68] ADD tests for network embedding --- .../encoding/OrdinalEncoder.py | 33 --------- .../LearnedEntityEmbedding.py | 29 ++++---- .../setup/network_embedding/NoEmbedding.py | 15 ++--- .../base_network_embedding.py | 20 +++--- .../pipeline/tabular_classification.py | 67 ++++++++++--------- autoPyTorch/pipeline/tabular_regression.py | 48 ++++++------- .../components/setup/test_setup_networks.py | 23 ++++++- .../test_tabular_classification.py | 9 ++- 8 files changed, 119 insertions(+), 125 deletions(-) delete mode 100644 autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/OrdinalEncoder.py diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/OrdinalEncoder.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/OrdinalEncoder.py deleted file mode 100644 index c65726327..000000000 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/OrdinalEncoder.py +++ /dev/null @@ -1,33 +0,0 @@ -from typing import Any, Dict, Optional, Union - -import numpy as np - -from sklearn.preprocessing import OrdinalEncoder as OE - -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding.base_encoder import BaseEncoder - - -class OrdinalEncoder(BaseEncoder): - """ - Encode categorical features as a one-hot numerical array - """ - def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = None): - super().__init__() - self.random_state = random_state - - def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEncoder: - - self.check_requirements(X, y) - - self.preprocessor['categorical'] = OE(handle_unknown='use_encoded_value', - unknown_value=-1, - ) - return self - - @staticmethod - def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]: - return { - 'shortname': 'OrdinalEncoder', - 'name': 'Ordinal Encoder', - 'handles_sparse': False - } diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py index 41d2da581..3910afc37 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, Optional, Union +from typing import Any, Dict, Optional, Tuple, Union from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import ( @@ -15,16 +15,15 @@ class _LearnedEntityEmbedding(nn.Module): - """ Parent class for MlpNet, ResNet, ... Can use entity embedding for categorical features""" + """ Learned entity embedding module for categorical features""" - def __init__(self, config, num_input_features, num_numerical_features): + def __init__(self, config: Dict[str, Any], num_input_features: np.ndarray, num_numerical_features: int): """ - Initialize the BaseFeatureNet. Arguments: - config: The configuration sampled by the hyperparameter optimizer - # TODO: fix this - num_input_features: the number of features of the dataset - num_numerical_features: OneHot encoder, that is used to encode X + config (Dict[str, Any]): The configuration sampled by the hyperparameter optimizer + num_input_features (np.ndarray): column wise information of number of output columns after transformation + for each categorical column and 0 for numerical columns + num_numerical_features (int): number of numerical features in X """ super().__init__() self.config = config @@ -51,7 +50,7 @@ def __init__(self, config, num_input_features, num_numerical_features): self.ee_layers = self._create_ee_layers() - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: # pass the columns of each categorical feature through entity embedding layer # before passing it through the model concat_seq = [] @@ -73,11 +72,11 @@ def forward(self, x): concat_seq.append(x[:, last_concat:]) return torch.cat(concat_seq, dim=1) - def _create_ee_layers(self): + def _create_ee_layers(self) -> nn.ModuleList: # entity embeding layers are Linear Layers layers = nn.ModuleList() - for i, (num_in, embed, num_out) in enumerate( - zip(self.num_input_features, self.embed_features, self.num_output_dimensions)): + for i, (num_in, embed, num_out) in enumerate(zip(self.num_input_features, self.embed_features, + self.num_output_dimensions)): if not embed: continue layers.append(nn.Linear(num_in, num_out)) @@ -93,7 +92,7 @@ def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = N super().__init__(random_state=random_state) self.config = kwargs - def build_embedding(self, num_input_features, num_numerical_features) -> nn.Module: + def build_embedding(self, num_input_features: np.ndarray, num_numerical_features: int) -> nn.Module: return _LearnedEntityEmbedding(config=self.config, num_input_features=num_input_features, num_numerical_features=num_numerical_features) @@ -101,8 +100,8 @@ def build_embedding(self, num_input_features, num_numerical_features) -> nn.Modu @staticmethod def get_hyperparameter_search_space( dataset_properties: Optional[Dict[str, str]] = None, - min_unique_values_for_embedding=((3, 7), 5, True), - dimension_reduction=((0, 1), 0.5), + min_unique_values_for_embedding: Tuple[Tuple, int, bool] = ((3, 7), 5, True), + dimension_reduction: Tuple[Tuple, float] = ((0, 1), 0.5), ) -> ConfigurationSpace: cs = ConfigurationSpace() min_hp = UniformIntegerHyperparameter("min_unique_values_for_embedding", diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py index 0f18b5ed6..a8b81af2f 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py @@ -4,18 +4,14 @@ import numpy as np +import torch from torch import nn from autoPyTorch.pipeline.components.setup.network_embedding.base_network_embedding import NetworkEmbeddingComponent class _NoEmbedding(nn.Module): - def __init__(self, num_input_features, num_numerical_features): - super().__init__() - self.n_feats = num_input_features - self.num_numerical = num_numerical_features - - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: return x @@ -27,9 +23,8 @@ class NoEmbedding(NetworkEmbeddingComponent): def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = None): super().__init__(random_state=random_state) - def build_embedding(self, num_input_features, num_numerical_features) -> nn.Module: - return _NoEmbedding(num_input_features=num_input_features, - num_numerical_features=num_numerical_features) + def build_embedding(self, num_input_features: np.ndarray, num_numerical_features: int) -> nn.Module: + return _NoEmbedding() @staticmethod def get_hyperparameter_search_space( @@ -46,4 +41,4 @@ def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[ 'handles_tabular': True, 'handles_image': False, 'handles_time_series': False, - } \ No newline at end of file + } diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py index e27cac3c0..8652c347c 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py @@ -1,5 +1,5 @@ import copy -from typing import Any, Dict, Optional, Union +from typing import Any, Dict, Optional, Tuple, Union import numpy as np @@ -8,12 +8,13 @@ from torch import nn from autoPyTorch.pipeline.components.setup.base_setup import autoPyTorchSetupComponent -from autoPyTorch.utils.common import subsampler + class NetworkEmbeddingComponent(autoPyTorchSetupComponent): def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = None): super().__init__() self.embedding: Optional[nn.Module] = None + self.random_state = random_state def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: @@ -28,21 +29,22 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: X.update({'network_embedding': self.embedding}) return X - def build_embedding(self, num_input_features, num_numerical_features) -> nn.Module: + def build_embedding(self, num_input_features: np.ndarray, num_numerical_features: int) -> nn.Module: raise NotImplementedError - def _get_args(self, X: Dict[str, Any]) -> Union[int, np.ndarray]: + def _get_args(self, X: Dict[str, Any]) -> Tuple[int, np.ndarray]: # Feature preprocessors can alter numerical columns if len(X['dataset_properties']['numerical_columns']) == 0: num_numerical_columns = 0 else: X_train = copy.deepcopy(X['backend'].load_datamanager().train_tensors[0][:2]) - # as numerical pipeline will always be the first pipeline - numerical_column_transformer = X['tabular_transformer'].preprocessor.named_transformers_['numerical_pipeline'] + + numerical_column_transformer = X['tabular_transformer'].preprocessor. \ + named_transformers_['numerical_pipeline'] num_numerical_columns = numerical_column_transformer.transform( - X_train.iloc[:, X['dataset_properties']['numerical_columns']]).shape[1] - num_input_features = np.zeros((num_numerical_columns + - len(X['dataset_properties']['categorical_columns'])), dtype=int) + X_train[:, X['dataset_properties']['numerical_columns']]).shape[1] + num_input_features = np.zeros((num_numerical_columns + len(X['dataset_properties']['categorical_columns'])), + dtype=int) categories = X['dataset_properties']['categories'] for i, category in enumerate(categories): diff --git a/autoPyTorch/pipeline/tabular_classification.py b/autoPyTorch/pipeline/tabular_classification.py index f5b668a88..73dca2878 100644 --- a/autoPyTorch/pipeline/tabular_classification.py +++ b/autoPyTorch/pipeline/tabular_classification.py @@ -3,7 +3,7 @@ from typing import Any, Dict, List, Optional, Tuple from ConfigSpace.configuration_space import Configuration, ConfigurationSpace -from ConfigSpace.forbidden import ForbiddenEqualsClause, ForbiddenAndConjunction +from ConfigSpace.forbidden import ForbiddenAndConjunction, ForbiddenEqualsClause import numpy as np @@ -65,15 +65,15 @@ class TabularClassificationPipeline(ClassifierMixin, BasePipeline): """ def __init__( - self, - config: Optional[Configuration] = None, - steps: Optional[List[Tuple[str, autoPyTorchChoice]]] = None, - dataset_properties: Optional[Dict[str, Any]] = None, - include: Optional[Dict[str, Any]] = None, - exclude: Optional[Dict[str, Any]] = None, - random_state: Optional[np.random.RandomState] = None, - init_params: Optional[Dict[str, Any]] = None, - search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None + self, + config: Optional[Configuration] = None, + steps: Optional[List[Tuple[str, autoPyTorchChoice]]] = None, + dataset_properties: Optional[Dict[str, Any]] = None, + include: Optional[Dict[str, Any]] = None, + exclude: Optional[Dict[str, Any]] = None, + random_state: Optional[np.random.RandomState] = None, + init_params: Optional[Dict[str, Any]] = None, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None ): super().__init__( config, steps, dataset_properties, include, exclude, @@ -192,31 +192,32 @@ def _get_hyperparameter_search_space(self, # Here we add custom code, like this with this # is not a valid configuration # Learned Entity Embedding is only valid when encoder is one hot encoder - embeddings = cs.get_hyperparameter('network_embedding:__choice__').choices - encoders = cs.get_hyperparameter('encoder:__choice__').choices - default = cs.get_hyperparameter('network_embedding:__choice__').default_value - possible_default_embeddings = copy.copy(list(embeddings)) - del possible_default_embeddings[possible_default_embeddings.index(default)] if 'network_embedding' in self.named_steps.keys() and 'encoder' in self.named_steps.keys(): - for encoder in encoders: - if encoder == 'OneHotEncoder': - continue - while True: - try: - cs.add_forbidden_clause(ForbiddenAndConjunction( - ForbiddenEqualsClause(cs.get_hyperparameter( - 'network_embedding:__choice__'), 'LearnedEntityEmbedding'), - ForbiddenEqualsClause(cs.get_hyperparameter('encoder:__choice__') - , encoder) - )) - break - except ValueError: - # change the default and try again + embeddings = cs.get_hyperparameter('network_embedding:__choice__').choices + if 'LearnedEntityEmbedding' in embeddings: + encoders = cs.get_hyperparameter('encoder:__choice__').choices + default = cs.get_hyperparameter('network_embedding:__choice__').default_value + possible_default_embeddings = copy.copy(list(embeddings)) + del possible_default_embeddings[possible_default_embeddings.index(default)] + + for encoder in encoders: + if encoder == 'OneHotEncoder': + continue + while True: try: - default = possible_default_embeddings.pop() - except IndexError: - raise ValueError("Cannot find a legal default configuration") - cs.get_hyperparameter('network_embedding:__choice__').default_value = default + cs.add_forbidden_clause(ForbiddenAndConjunction( + ForbiddenEqualsClause(cs.get_hyperparameter( + 'network_embedding:__choice__'), 'LearnedEntityEmbedding'), + ForbiddenEqualsClause(cs.get_hyperparameter('encoder:__choice__'), encoder) + )) + break + except ValueError: + # change the default and try again + try: + default = possible_default_embeddings.pop() + except IndexError: + raise ValueError("Cannot find a legal default configuration") + cs.get_hyperparameter('network_embedding:__choice__').default_value = default self.configuration_space = cs self.dataset_properties = dataset_properties diff --git a/autoPyTorch/pipeline/tabular_regression.py b/autoPyTorch/pipeline/tabular_regression.py index 0c6463c31..3220f0fff 100644 --- a/autoPyTorch/pipeline/tabular_regression.py +++ b/autoPyTorch/pipeline/tabular_regression.py @@ -1,3 +1,4 @@ +import copy import warnings from typing import Any, Dict, List, Optional, Tuple @@ -138,31 +139,32 @@ def _get_hyperparameter_search_space(self, # Here we add custom code, like this with this # is not a valid configuration # Learned Entity Embedding is only valid when encoder is one hot encoder - embeddings = cs.get_hyperparameter('network_embedding:__choice__').choices - encoders = cs.get_hyperparameter('encoder:__choice__').choices - default = cs.get_hyperparameter('network_embedding:__choice__').default_value - possible_default_embeddings = copy.copy(list(embeddings)) - del possible_default_embeddings[possible_default_embeddings.index(default)] if 'network_embedding' in self.named_steps.keys() and 'encoder' in self.named_steps.keys(): - for encoder in encoders: - if encoder == 'OneHotEncoder': - continue - while True: - try: - cs.add_forbidden_clause(ForbiddenAndConjunction( - ForbiddenEqualsClause(cs.get_hyperparameter( - 'network_embedding:__choice__'), 'LearnedEntityEmbedding'), - ForbiddenEqualsClause(cs.get_hyperparameter('encoder:__choice__') - , encoder) - )) - break - except ValueError: - # change the default and try again + embeddings = cs.get_hyperparameter('network_embedding:__choice__').choices + if 'LearnedEntityEmbedding' in embeddings: + encoders = cs.get_hyperparameter('encoder:__choice__').choices + default = cs.get_hyperparameter('network_embedding:__choice__').default_value + possible_default_embeddings = copy.copy(list(embeddings)) + del possible_default_embeddings[possible_default_embeddings.index(default)] + + for encoder in encoders: + if encoder == 'OneHotEncoder': + continue + while True: try: - default = possible_default_embeddings.pop() - except IndexError: - raise ValueError("Cannot find a legal default configuration") - cs.get_hyperparameter('network_embedding:__choice__').default_value = default + cs.add_forbidden_clause(ForbiddenAndConjunction( + ForbiddenEqualsClause(cs.get_hyperparameter( + 'network_embedding:__choice__'), 'LearnedEntityEmbedding'), + ForbiddenEqualsClause(cs.get_hyperparameter('encoder:__choice__'), encoder) + )) + break + except ValueError: + # change the default and try again + try: + default = possible_default_embeddings.pop() + except IndexError: + raise ValueError("Cannot find a legal default configuration") + cs.get_hyperparameter('network_embedding:__choice__').default_value = default self.configuration_space = cs self.dataset_properties = dataset_properties diff --git a/test/test_pipeline/components/setup/test_setup_networks.py b/test/test_pipeline/components/setup/test_setup_networks.py index be8af94c5..df5ad4cfd 100644 --- a/test/test_pipeline/components/setup/test_setup_networks.py +++ b/test/test_pipeline/components/setup/test_setup_networks.py @@ -17,21 +17,42 @@ def head(request): return request.param +@pytest.fixture(params=['LearnedEntityEmbedding', 'NoEmbedding']) +def embedding(request): + return request.param + + @flaky.flaky(max_runs=3) @pytest.mark.parametrize("fit_dictionary_tabular", ['classification_numerical_only', 'classification_categorical_only', 'classification_numerical_and_categorical'], indirect=True) class TestNetworks: +<<<<<<< HEAD def test_pipeline_fit(self, fit_dictionary_tabular, backbone, head): +======= + def test_pipeline_fit(self, fit_dictionary, embedding, backbone, head): +>>>>>>> ADD tests for network embedding """This test makes sure that the pipeline is able to fit - given random combinations of hyperparameters across the pipeline""" + every combination of network embedding, backbone, head""" + include = {'network_backbone': [backbone], 'network_head': [head], 'network_embedding': [embedding]} + + if len(fit_dictionary['dataset_properties'] + ['categorical_columns']) == 0 and embedding == 'LearnedEntityEmbedding': + pytest.skip("Learned Entity Embedding is not used with numerical only data") pipeline = TabularClassificationPipeline( +<<<<<<< HEAD dataset_properties=fit_dictionary_tabular['dataset_properties'], include={'network_backbone': [backbone], 'network_head': [head]}) +======= + dataset_properties=fit_dictionary['dataset_properties'], + include=include) + +>>>>>>> ADD tests for network embedding cs = pipeline.get_hyperparameter_search_space() config = cs.get_default_configuration() + assert embedding == config.get('network_embedding:__choice__', None) assert backbone == config.get('network_backbone:__choice__', None) assert head == config.get('network_head:__choice__', None) pipeline.set_hyperparameters(config) diff --git a/test/test_pipeline/test_tabular_classification.py b/test/test_pipeline/test_tabular_classification.py index 260587adb..fc6eea0e4 100644 --- a/test/test_pipeline/test_tabular_classification.py +++ b/test/test_pipeline/test_tabular_classification.py @@ -35,7 +35,13 @@ def _assert_pipeline_search_space(self, pipeline, search_space_updates): assert any(update.node_name + ':' + update.hyperparameter in name for name in config_space.get_hyperparameter_names()), \ "Can't find hyperparameter: {}".format(update.hyperparameter) - hyperparameter = config_space.get_hyperparameter(update.node_name + ':' + update.hyperparameter + '_1') + # dimension reduction in embedding starts from 0 + if 'embedding' in update.node_name: + hyperparameter = config_space.get_hyperparameter( + update.node_name + ':' + update.hyperparameter + '_0') + else: + hyperparameter = config_space.get_hyperparameter( + update.node_name + ':' + update.hyperparameter + '_1') assert update.default_value == hyperparameter.default_value if isinstance(hyperparameter, (UniformIntegerHyperparameter, UniformFloatHyperparameter)): assert update.value_range[0] == hyperparameter.lower @@ -208,6 +214,7 @@ def test_network_optimizer_lr_handshake(self, fit_dictionary_tabular): # Make sure that fitting a network adds a "network" to X assert 'network' in pipeline.named_steps.keys() + fit_dictionary_tabular['network_embedding'] = torch.nn.Linear(3, 3) fit_dictionary_tabular['network_backbone'] = torch.nn.Linear(3, 4) fit_dictionary_tabular['network_head'] = torch.nn.Linear(4, 1) X = pipeline.named_steps['network'].fit( From a3c1625e5f75e6a3e76fed2cdca1a46c745a751a Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 15 Feb 2021 22:24:28 +0100 Subject: [PATCH 44/68] Removed ordinal encoder --- .../components/preprocessing/base.py | 2 +- .../components/preprocessing/test_encoders.py | 43 ------------------- 2 files changed, 1 insertion(+), 44 deletions(-) diff --git a/test/test_pipeline/components/preprocessing/base.py b/test/test_pipeline/components/preprocessing/base.py index 7bb4fee70..875ed399c 100644 --- a/test/test_pipeline/components/preprocessing/base.py +++ b/test/test_pipeline/components/preprocessing/base.py @@ -33,4 +33,4 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]], ("scaler", ScalerChoice(default_dataset_properties)), ("tabular_transformer", TabularColumnTransformer()), ]) - return steps \ No newline at end of file + return steps diff --git a/test/test_pipeline/components/preprocessing/test_encoders.py b/test/test_pipeline/components/preprocessing/test_encoders.py index 1f210936f..a901823ba 100644 --- a/test/test_pipeline/components/preprocessing/test_encoders.py +++ b/test/test_pipeline/components/preprocessing/test_encoders.py @@ -8,7 +8,6 @@ from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding.NoEncoder import NoEncoder from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding.OneHotEncoder import OneHotEncoder -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding.OrdinalEncoder import OrdinalEncoder class TestEncoders(unittest.TestCase): @@ -53,48 +52,6 @@ def test_one_hot_encoder_no_unknown(self): # check if the transform is correct assert_array_equal(transformed, [['1.0', '0.0', 1], ['1.0', '0.0', 2]]) - def test_ordinal_encoder(self): - - data = np.array([[1, 'male'], - [1, 'female'], - [3, 'male'], - [2, 'female'], - [2, 'male']]) - - categorical_columns = [1] - numerical_columns = [0] - train_indices = np.array([0, 2, 3]) - test_indices = np.array([1, 4]) - - dataset_properties = { - 'categorical_columns': categorical_columns, - 'numerical_columns': numerical_columns, - 'categories': [['female', 'male', 'unknown']] - } - X = { - 'X_train': data[train_indices], - 'dataset_properties': dataset_properties - } - encoder_component = OrdinalEncoder() - encoder_component.fit(X) - X = encoder_component.transform(X) - - encoder = X['encoder']['categorical'] - - # check if the fit dictionary X is modified as expected - self.assertIsInstance(X['encoder'], dict) - self.assertIsInstance(encoder, BaseEstimator) - self.assertIsNone(X['encoder']['numerical']) - - # make column transformer with returned encoder to fit on data - column_transformer = make_column_transformer((encoder, X['dataset_properties']['categorical_columns']), - remainder='passthrough') - column_transformer = column_transformer.fit(X['X_train']) - transformed = column_transformer.transform(data[test_indices]) - - # check if we got the expected transformed array - assert_array_equal(transformed, [['0.0', 1], ['1.0', 2]]) - def test_none_encoder(self): data = np.array([[1, 'male'], From b8896ad1ab5eb42267717976d826b0e3666899e9 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 15 Feb 2021 22:24:41 +0100 Subject: [PATCH 45/68] Removed ordinal encoder --- .../tabular_preprocessing/encoding/base_encoder_choice.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder_choice.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder_choice.py index 7be7c94a2..df71ff209 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder_choice.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder_choice.py @@ -65,7 +65,7 @@ def get_hyperparameter_search_space(self, raise ValueError("no encoders found, please add a encoder") if default is None: - defaults = ['OneHotEncoder', 'OrdinalEncoder', 'NoEncoder'] + defaults = ['OneHotEncoder', 'NoEncoder'] for default_ in defaults: if default_ in available_preprocessors: if include is not None and default_ not in include: From e0bfb0bc93827e8976e297334a774ade0fa7c96c Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Wed, 17 Feb 2021 13:36:52 +0100 Subject: [PATCH 46/68] Addressed comments --- autoPyTorch/pipeline/tabular_regression.py | 2 ++ .../preprocessing/test_tabular_column_transformer.py | 11 +++++++++++ 2 files changed, 13 insertions(+) diff --git a/autoPyTorch/pipeline/tabular_regression.py b/autoPyTorch/pipeline/tabular_regression.py index 3220f0fff..855a025e8 100644 --- a/autoPyTorch/pipeline/tabular_regression.py +++ b/autoPyTorch/pipeline/tabular_regression.py @@ -25,6 +25,7 @@ from autoPyTorch.pipeline.components.setup.lr_scheduler.base_scheduler_choice import SchedulerChoice from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent from autoPyTorch.pipeline.components.setup.network_backbone.base_network_backbone_choice import NetworkBackboneChoice +from autoPyTorch.pipeline.components.setup.network_embedding.base_network_embedding_choice import NetworkEmbeddingChoice from autoPyTorch.pipeline.components.setup.network_head.base_network_head_choice import NetworkHeadChoice from autoPyTorch.pipeline.components.setup.network_initializer.base_network_init_choice import ( NetworkInitializerChoice @@ -191,6 +192,7 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]]) -> L ("scaler", ScalerChoice(default_dataset_properties)), ("tabular_transformer", TabularColumnTransformer()), ("preprocessing", EarlyPreprocessing()), + ("network_embedding", NetworkEmbeddingChoice(default_dataset_properties)), ("network_backbone", NetworkBackboneChoice(default_dataset_properties)), ("network_head", NetworkHeadChoice(default_dataset_properties)), ("network", NetworkComponent()), diff --git a/test/test_pipeline/components/preprocessing/test_tabular_column_transformer.py b/test/test_pipeline/components/preprocessing/test_tabular_column_transformer.py index 8e31bad05..66a96f27f 100644 --- a/test/test_pipeline/components/preprocessing/test_tabular_column_transformer.py +++ b/test/test_pipeline/components/preprocessing/test_tabular_column_transformer.py @@ -33,7 +33,18 @@ def test_tabular_preprocess(self, fit_dictionary_tabular): data = column_transformer.preprocessor.fit_transform(X['X_train']) assert isinstance(data, np.ndarray) + # Make sure no columns are unintentionally dropped after preprocessing + if len(fit_dictionary_tabular['dataset_properties']["numerical_columns"]) == 0: + categorical_pipeline = column_transformer.preprocessor.named_transformers_['categorical_pipeline'] + categorical_data = categorical_pipeline.transform(X['X_train']) + assert data.shape[1] == categorical_data.shape[1] + elif len(fit_dictionary_tabular['dataset_properties']["categorical_columns"]) == 0: + numerical_pipeline = column_transformer.preprocessor.named_transformers_['numerical_pipeline'] + numerical_data = numerical_pipeline.transform(X['X_train']) + assert data.shape[1] == numerical_data.shape[1] + def test_sparse_data(self, fit_dictionary_tabular): + X = np.random.binomial(1, 0.1, (100, 2000)) sparse_X = csr_matrix(X) numerical_columns = list(range(2000)) From 23f677710c2d3850285cde716201a6930cf2ea0b Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 22 Feb 2021 13:58:36 +0100 Subject: [PATCH 47/68] fix flake --- test/test_pipeline/components/training/base.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/test/test_pipeline/components/training/base.py b/test/test_pipeline/components/training/base.py index eaa80cf88..c13a47aa8 100644 --- a/test/test_pipeline/components/training/base.py +++ b/test/test_pipeline/components/training/base.py @@ -5,8 +5,14 @@ import torch -from autoPyTorch.constants import BINARY, CLASSIFICATION_TASKS, CONTINUOUS, OUTPUT_TYPES_TO_STRING, REGRESSION_TASKS, \ +from autoPyTorch.constants import ( + BINARY, + CLASSIFICATION_TASKS, + CONTINUOUS, + OUTPUT_TYPES_TO_STRING, + REGRESSION_TASKS, TASK_TYPES_TO_STRING +) from autoPyTorch.pipeline.components.training.trainer.base_trainer import BaseTrainerComponent, BudgetTracker from autoPyTorch.pipeline.components.training.metrics.utils import get_metrics From 4c1f33f3484090d6664022b11a53e8b3809fb055 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 23 Feb 2021 19:12:59 +0100 Subject: [PATCH 48/68] fix test import training --- test/test_pipeline/components/training/test_training.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_pipeline/components/training/test_training.py b/test/test_pipeline/components/training/test_training.py index 9005d1ad2..d6964fa14 100644 --- a/test/test_pipeline/components/training/test_training.py +++ b/test/test_pipeline/components/training/test_training.py @@ -27,7 +27,7 @@ ) sys.path.append(os.path.dirname(__file__)) -from test.test_pipeline.components.base import BaseTraining # noqa (E402: module level import not at top of file) +from test.test_pipeline.components.training.base import BaseTraining # noqa (E402: module level import not at top of file) class BaseDataLoaderTest(unittest.TestCase): From 8c4233cbcd7b1a917462118307f8797b584aed24 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 23 Feb 2021 20:07:53 +0100 Subject: [PATCH 49/68] Fix tests and move to boston --- test/conftest.py | 4 ++-- test/test_api/test_api.py | 2 +- test/test_pipeline/test_tabular_regression.py | 9 ++++++++- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/test/conftest.py b/test/conftest.py index c8ff6529e..e658b7e37 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -227,7 +227,7 @@ def get_tabular_data(task): validator = TabularInputValidator(is_classification=False).fit(X.copy(), y.copy()) elif task == "regression_categorical_only": - X, y = fetch_openml("cholesterol", return_X_y=True, as_frame=True) + X, y = fetch_openml("boston", return_X_y=True, as_frame=True) categorical_columns = [column for column in X.columns if X[column].dtype.name == 'category'] X = X[categorical_columns] @@ -245,7 +245,7 @@ def get_tabular_data(task): validator = TabularInputValidator(is_classification=False).fit(X.copy(), y.copy()) elif task == "regression_numerical_and_categorical": - X, y = fetch_openml("cholesterol", return_X_y=True, as_frame=True) + X, y = fetch_openml("boston", return_X_y=True, as_frame=True) # fill nan values for now since they are not handled properly yet for column in X.columns: diff --git a/test/test_api/test_api.py b/test/test_api/test_api.py index ea7cccd72..4ac194968 100644 --- a/test/test_api/test_api.py +++ b/test/test_api/test_api.py @@ -178,7 +178,7 @@ def test_tabular_classification(openml_id, resampling_strategy, backend): restored_estimator.predict(X_test) -@pytest.mark.parametrize('openml_name', ("cholesterol", )) +@pytest.mark.parametrize('openml_name', ("boston", )) @pytest.mark.parametrize('resampling_strategy', (HoldoutValTypes.holdout_validation, CrossValTypes.k_fold_cross_validation, )) diff --git a/test/test_pipeline/test_tabular_regression.py b/test/test_pipeline/test_tabular_regression.py index 15b8351f9..74de19405 100644 --- a/test/test_pipeline/test_tabular_regression.py +++ b/test/test_pipeline/test_tabular_regression.py @@ -39,7 +39,13 @@ def _assert_pipeline_search_space(self, pipeline, search_space_updates): assert any(update.node_name + ':' + update.hyperparameter in name for name in config_space.get_hyperparameter_names()), \ "Can't find hyperparameter: {}".format(update.hyperparameter) - hyperparameter = config_space.get_hyperparameter(update.node_name + ':' + update.hyperparameter + '_1') + # dimension reduction in embedding starts from 0 + if 'embedding' in update.node_name: + hyperparameter = config_space.get_hyperparameter( + update.node_name + ':' + update.hyperparameter + '_0') + else: + hyperparameter = config_space.get_hyperparameter( + update.node_name + ':' + update.hyperparameter + '_1') assert update.default_value == hyperparameter.default_value if isinstance(hyperparameter, (UniformIntegerHyperparameter, UniformFloatHyperparameter)): assert update.value_range[0] == hyperparameter.lower @@ -199,6 +205,7 @@ def test_network_optimizer_lr_handshake(self, fit_dictionary_tabular): # Make sure that fitting a network adds a "network" to X assert 'network' in pipeline.named_steps.keys() + fit_dictionary_tabular['network_embedding'] = torch.nn.Linear(3, 3) fit_dictionary_tabular['network_backbone'] = torch.nn.Linear(3, 4) fit_dictionary_tabular['network_head'] = torch.nn.Linear(4, 1) X = pipeline.named_steps['network'].fit( From 18b5771f39cd5ca006586eabb10e96c75a5e8b9e Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Thu, 25 Feb 2021 18:04:43 +0100 Subject: [PATCH 50/68] Debug issue with python 3.6 --- examples/example_tmp_for_debug.py | 168 ++++++++++++++++++++++++++++++ 1 file changed, 168 insertions(+) create mode 100644 examples/example_tmp_for_debug.py diff --git a/examples/example_tmp_for_debug.py b/examples/example_tmp_for_debug.py new file mode 100644 index 000000000..ca31c8f4b --- /dev/null +++ b/examples/example_tmp_for_debug.py @@ -0,0 +1,168 @@ +import os + +import sklearn.datasets +import time +import shutil + +from autoPyTorch.utils.backend import create + +from autoPyTorch.api.tabular_classification import TabularClassificationTask +from autoPyTorch.datasets.resampling_strategy import ( + CrossValTypes, +) +import re + + +from pathlib import Path + + +class DisplayablePath(object): + display_filename_prefix_middle = '├──' + display_filename_prefix_last = '└──' + display_parent_prefix_middle = ' ' + display_parent_prefix_last = '│ ' + + def __init__(self, path, parent_path, is_last): + self.path = Path(str(path)) + self.parent = parent_path + self.is_last = is_last + if self.parent: + self.depth = self.parent.depth + 1 + else: + self.depth = 0 + + @property + def displayname(self): + if self.path.is_dir(): + return self.path.name + '/' + return self.path.name + + @classmethod + def make_tree(cls, root, parent=None, is_last=False, criteria=None): + root = Path(str(root)) + criteria = criteria or cls._default_criteria + + displayable_root = cls(root, parent, is_last) + yield displayable_root + + children = sorted(list(path + for path in root.iterdir() + if criteria(path)), + key=lambda s: str(s).lower()) + count = 1 + for path in children: + is_last = count == len(children) + if path.is_dir(): + yield from cls.make_tree(path, + parent=displayable_root, + is_last=is_last, + criteria=criteria) + else: + yield cls(path, displayable_root, is_last) + count += 1 + + @classmethod + def _default_criteria(cls, path): + return True + + @property + def displayname(self): + if self.path.is_dir(): + return self.path.name + '/' + return self.path.name + + def displayable(self): + if self.parent is None: + return self.displayname + + _filename_prefix = (self.display_filename_prefix_last + if self.is_last + else self.display_filename_prefix_middle) + + parts = ['{!s} {!s}'.format(_filename_prefix, + self.displayname)] + + parent = self.parent + while parent and parent.parent is not None: + parts.append(self.display_parent_prefix_middle + if parent.is_last + else self.display_parent_prefix_last) + parent = parent.parent + + return ''.join(reversed(parts)) + + +def slugify(text): + return re.sub(r'[\[\]]+', '-', text.lower()) + + +test_dir = os.path.dirname(__file__) +tmp = slugify(os.path.join( + test_dir, '.tmp__%s' % __file__)) +output = slugify(os.path.join( + test_dir, '.output__%s' % __file__)) + +for dir in (tmp, output): + for i in range(10): + if os.path.exists(dir): + try: + shutil.rmtree(dir) + break + except OSError: + time.sleep(1) + +# Make sure the folders we wanna create do not already exist. +backend = create( + tmp, + output, + delete_tmp_folder_after_terminate=True, + delete_output_folder_after_terminate=True, +) + +openml_id = 40981 +resampling_strategy = CrossValTypes.k_fold_cross_validation +X, y = sklearn.datasets.fetch_openml( + data_id=int(openml_id), + return_X_y=True, as_frame=True +) +X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, y, random_state=1) + +# Search for a good configuration +estimator = TabularClassificationTask( + backend=backend, + resampling_strategy=resampling_strategy, +) + +estimator.search( + X_train=X_train, y_train=y_train, + X_test=X_test, y_test=y_test, + optimize_metric='accuracy', + total_walltime_limit=150, + func_eval_time_limit=50, + traditional_per_total_budget=0 +) + +# Search for an existing run key in disc. A individual model might have +# a timeout and hence was not written to disc +for i, (run_key, value) in enumerate(estimator.run_history.data.items()): + if i == 0: + # Ignore dummy run + continue + if 'SUCCESS' not in str(value.status): + continue + + run_key_model_run_dir = estimator._backend.get_numrun_directory( + estimator.seed, run_key.config_id, run_key.budget) + if os.path.exists(run_key_model_run_dir): + break + + +model_file = os.path.join( + run_key_model_run_dir, + f"{estimator.seed}.{run_key.config_id}.{run_key.budget}.cv_model" +) +if not os.path.exists(model_file): + paths = DisplayablePath.make_tree(run_key_model_run_dir) + for path in paths: + print(path.displayable()) From d839b5d39e18bba86e4bd949c6691b02ddd353af Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Fri, 26 Feb 2021 15:00:25 +0100 Subject: [PATCH 51/68] Run only debug file --- .github/workflows/examples.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/examples.yml b/.github/workflows/examples.yml index b278a8563..53222930b 100644 --- a/.github/workflows/examples.yml +++ b/.github/workflows/examples.yml @@ -30,5 +30,7 @@ jobs: echo "::set-output name=BEFORE::$(git status --porcelain -b)" - name: Run tests run: | - python examples/example_tabular_classification.py - python examples/example_image_classification.py +# python examples/example_tabular_classification.py +# python examples/example_image_classification.py +# python examples/example_tabular_regression.py + python examples/example_tmp_for_debug.py \ No newline at end of file From de1d4c333ff7dd533f58f68f65ba07d4955e4945 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Fri, 26 Feb 2021 14:31:17 +0100 Subject: [PATCH 52/68] Debug for python3.6 --- .github/workflows/examples.yml | 2 +- examples/example_tmp_for_debug.py | 183 +++++++----------------------- test/test_api/test_api.py | 10 +- test/utils.py | 77 +++++++++++++ 4 files changed, 127 insertions(+), 145 deletions(-) create mode 100644 test/utils.py diff --git a/.github/workflows/examples.yml b/.github/workflows/examples.yml index 53222930b..09b52f4e6 100644 --- a/.github/workflows/examples.yml +++ b/.github/workflows/examples.yml @@ -8,7 +8,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.8] + python-version: [3.6] fail-fast: false max-parallel: 2 diff --git a/examples/example_tmp_for_debug.py b/examples/example_tmp_for_debug.py index ca31c8f4b..664eac866 100644 --- a/examples/example_tmp_for_debug.py +++ b/examples/example_tmp_for_debug.py @@ -1,123 +1,17 @@ +""" +Example file to be deleted +""" import os import sklearn.datasets -import time -import shutil - -from autoPyTorch.utils.backend import create from autoPyTorch.api.tabular_classification import TabularClassificationTask from autoPyTorch.datasets.resampling_strategy import ( CrossValTypes, ) -import re - - -from pathlib import Path - - -class DisplayablePath(object): - display_filename_prefix_middle = '├──' - display_filename_prefix_last = '└──' - display_parent_prefix_middle = ' ' - display_parent_prefix_last = '│ ' - - def __init__(self, path, parent_path, is_last): - self.path = Path(str(path)) - self.parent = parent_path - self.is_last = is_last - if self.parent: - self.depth = self.parent.depth + 1 - else: - self.depth = 0 - - @property - def displayname(self): - if self.path.is_dir(): - return self.path.name + '/' - return self.path.name - - @classmethod - def make_tree(cls, root, parent=None, is_last=False, criteria=None): - root = Path(str(root)) - criteria = criteria or cls._default_criteria - - displayable_root = cls(root, parent, is_last) - yield displayable_root - - children = sorted(list(path - for path in root.iterdir() - if criteria(path)), - key=lambda s: str(s).lower()) - count = 1 - for path in children: - is_last = count == len(children) - if path.is_dir(): - yield from cls.make_tree(path, - parent=displayable_root, - is_last=is_last, - criteria=criteria) - else: - yield cls(path, displayable_root, is_last) - count += 1 - - @classmethod - def _default_criteria(cls, path): - return True - - @property - def displayname(self): - if self.path.is_dir(): - return self.path.name + '/' - return self.path.name - - def displayable(self): - if self.parent is None: - return self.displayname - - _filename_prefix = (self.display_filename_prefix_last - if self.is_last - else self.display_filename_prefix_middle) +from test.utils import DisplayablePath - parts = ['{!s} {!s}'.format(_filename_prefix, - self.displayname)] - parent = self.parent - while parent and parent.parent is not None: - parts.append(self.display_parent_prefix_middle - if parent.is_last - else self.display_parent_prefix_last) - parent = parent.parent - - return ''.join(reversed(parts)) - - -def slugify(text): - return re.sub(r'[\[\]]+', '-', text.lower()) - - -test_dir = os.path.dirname(__file__) -tmp = slugify(os.path.join( - test_dir, '.tmp__%s' % __file__)) -output = slugify(os.path.join( - test_dir, '.output__%s' % __file__)) - -for dir in (tmp, output): - for i in range(10): - if os.path.exists(dir): - try: - shutil.rmtree(dir) - break - except OSError: - time.sleep(1) - -# Make sure the folders we wanna create do not already exist. -backend = create( - tmp, - output, - delete_tmp_folder_after_terminate=True, - delete_output_folder_after_terminate=True, -) openml_id = 40981 resampling_strategy = CrossValTypes.k_fold_cross_validation @@ -128,41 +22,44 @@ def slugify(text): X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( X, y, random_state=1) -# Search for a good configuration -estimator = TabularClassificationTask( - backend=backend, - resampling_strategy=resampling_strategy, -) - -estimator.search( - X_train=X_train, y_train=y_train, - X_test=X_test, y_test=y_test, - optimize_metric='accuracy', - total_walltime_limit=150, - func_eval_time_limit=50, - traditional_per_total_budget=0 -) -# Search for an existing run key in disc. A individual model might have -# a timeout and hence was not written to disc -for i, (run_key, value) in enumerate(estimator.run_history.data.items()): - if i == 0: - # Ignore dummy run - continue - if 'SUCCESS' not in str(value.status): - continue +if __name__ == '__main__': + # Search for a good configuration + estimator = TabularClassificationTask( + temporary_directory='./tmp', + delete_tmp_folder_after_terminate=False, + resampling_strategy=resampling_strategy, + ) + + estimator.search( + X_train=X_train, y_train=y_train, + X_test=X_test, y_test=y_test, + optimize_metric='accuracy', + total_walltime_limit=150, + func_eval_time_limit=50, + traditional_per_total_budget=0 + ) + + # Search for an existing run key in disc. A individual model might have + # a timeout and hence was not written to disc + for i, (run_key, value) in enumerate(estimator.run_history.data.items()): + if i == 0: + # Ignore dummy run + continue + if 'SUCCESS' not in str(value.status): + continue + + run_key_model_run_dir = estimator._backend.get_numrun_directory( + estimator.seed, run_key.config_id, run_key.budget) + if os.path.exists(run_key_model_run_dir): + break + + + model_file = os.path.join( + run_key_model_run_dir, + f"{estimator.seed}.{run_key.config_id}.{run_key.budget}.cv_model" + ) - run_key_model_run_dir = estimator._backend.get_numrun_directory( - estimator.seed, run_key.config_id, run_key.budget) - if os.path.exists(run_key_model_run_dir): - break - - -model_file = os.path.join( - run_key_model_run_dir, - f"{estimator.seed}.{run_key.config_id}.{run_key.budget}.cv_model" -) -if not os.path.exists(model_file): paths = DisplayablePath.make_tree(run_key_model_run_dir) for path in paths: print(path.displayable()) diff --git a/test/test_api/test_api.py b/test/test_api/test_api.py index 4ac194968..4cf615e97 100644 --- a/test/test_api/test_api.py +++ b/test/test_api/test_api.py @@ -22,6 +22,7 @@ HoldoutValTypes, ) +from test.utils import DisplayablePath # Fixtures # ======== @@ -120,7 +121,14 @@ def test_tabular_classification(openml_id, resampling_strategy, backend): run_key_model_run_dir, f"{estimator.seed}.{run_key.config_id}.{run_key.budget}.cv_model" ) - assert os.path.exists(model_file), model_file + try: + assert os.path.exists(model_file), model_file + except AssertionError: + paths = DisplayablePath.make_tree(run_key_model_run_dir) + for path in paths: + print(path.displayable()) + raise AssertionError(model_file) + model = estimator._backend.load_cv_model_by_seed_and_id_and_budget( estimator.seed, run_key.config_id, run_key.budget) assert isinstance(model, VotingClassifier) diff --git a/test/utils.py b/test/utils.py new file mode 100644 index 000000000..b1c919c4c --- /dev/null +++ b/test/utils.py @@ -0,0 +1,77 @@ +from pathlib import Path + + +class DisplayablePath(object): + display_filename_prefix_middle = '├──' + display_filename_prefix_last = '└──' + display_parent_prefix_middle = ' ' + display_parent_prefix_last = '│ ' + + def __init__(self, path, parent_path, is_last): + self.path = Path(str(path)) + self.parent = parent_path + self.is_last = is_last + if self.parent: + self.depth = self.parent.depth + 1 + else: + self.depth = 0 + + @property + def displayname(self): + if self.path.is_dir(): + return self.path.name + '/' + return self.path.name + + @classmethod + def make_tree(cls, root, parent=None, is_last=False, criteria=None): + root = Path(str(root)) + criteria = criteria or cls._default_criteria + + displayable_root = cls(root, parent, is_last) + yield displayable_root + + children = sorted(list(path + for path in root.iterdir() + if criteria(path)), + key=lambda s: str(s).lower()) + count = 1 + for path in children: + is_last = count == len(children) + if path.is_dir(): + yield from cls.make_tree(path, + parent=displayable_root, + is_last=is_last, + criteria=criteria) + else: + yield cls(path, displayable_root, is_last) + count += 1 + + @classmethod + def _default_criteria(cls, path): + return True + + @property + def displayname(self): + if self.path.is_dir(): + return self.path.name + '/' + return self.path.name + + def displayable(self): + if self.parent is None: + return self.displayname + + _filename_prefix = (self.display_filename_prefix_last + if self.is_last + else self.display_filename_prefix_middle) + + parts = ['{!s} {!s}'.format(_filename_prefix, + self.displayname)] + + parent = self.parent + while parent and parent.parent is not None: + parts.append(self.display_parent_prefix_middle + if parent.is_last + else self.display_parent_prefix_last) + parent = parent.parent + + return ''.join(reversed(parts)) From e0a488a7c8db7b5a29d1358c85b2fea7d368c1ae Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 8 Feb 2021 14:22:26 +0100 Subject: [PATCH 53/68] work in progress --- .../network/network_embedding/__init__.py | 0 .../base_network_embedding.py | 16 +++++ .../network_embedding/modules/__init__.py | 0 .../modules/learned_entity_embedding.py | 69 +++++++++++++++++++ .../network_embedding/modules/no_embedding.py | 12 ++++ 5 files changed, 97 insertions(+) create mode 100644 autoPyTorch/pipeline/components/setup/network/network_embedding/__init__.py create mode 100644 autoPyTorch/pipeline/components/setup/network/network_embedding/base_network_embedding.py create mode 100644 autoPyTorch/pipeline/components/setup/network/network_embedding/modules/__init__.py create mode 100644 autoPyTorch/pipeline/components/setup/network/network_embedding/modules/learned_entity_embedding.py create mode 100644 autoPyTorch/pipeline/components/setup/network/network_embedding/modules/no_embedding.py diff --git a/autoPyTorch/pipeline/components/setup/network/network_embedding/__init__.py b/autoPyTorch/pipeline/components/setup/network/network_embedding/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/autoPyTorch/pipeline/components/setup/network/network_embedding/base_network_embedding.py b/autoPyTorch/pipeline/components/setup/network/network_embedding/base_network_embedding.py new file mode 100644 index 000000000..5615062d4 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network/network_embedding/base_network_embedding.py @@ -0,0 +1,16 @@ +from typing import Optional, Any + +from sklearn.base import BaseEstimator +from torch import nn + +from autoPyTorch.pipeline.components.setup.base_setup import autoPyTorchSetupComponent + + +class NetworkEmbeddingComponent(autoPyTorchSetupComponent): + def __init__(self, + **kwargs): + super().__init__() + self.config = kwargs + self.embedding: Optional[nn.Module] = None + + # def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/setup/network/network_embedding/modules/__init__.py b/autoPyTorch/pipeline/components/setup/network/network_embedding/modules/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/autoPyTorch/pipeline/components/setup/network/network_embedding/modules/learned_entity_embedding.py b/autoPyTorch/pipeline/components/setup/network/network_embedding/modules/learned_entity_embedding.py new file mode 100644 index 000000000..e6eb88f19 --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network/network_embedding/modules/learned_entity_embedding.py @@ -0,0 +1,69 @@ +""" +Class to learn an embedding for categorical hyperparameters. +""" + +import torch +import torch.nn as nn +import numpy as np + + +class LearnedEntityEmbedding(nn.Module): + """ Parent class for MlpNet, ResNet, ... Can use entity embedding for categorical features""" + + def __init__(self, config, in_features, num_numerical_features): + """ + Initialize the BaseFeatureNet. + Arguments: + config: The configuration sampled by the hyperparameter optimizer + in_features: the number of features of the dataset + one_hot_encoder: OneHot encoder, that is used to encode X + """ + super(LearnedEntityEmbedding, self).__init__() + self.config = config + + # self.num_numerical = len([f for f in one_hot_encoder.categorical_features if not f]) + # self.num_input_features = [len(c) for c in one_hot_encoder.categories_] + self.num_numerical = num_numerical_features + self.embed_features = [num_in >= config["min_unique_values_for_embedding"] for num_in in + self.num_input_features] + self.num_output_dimensions = [config["dimension_reduction_" + str(i)] * num_in for i, num_in in + enumerate(self.num_input_features)] + self.num_output_dimensions = [int(np.clip(num_out, 1, num_in - 1)) for num_out, num_in in + zip(self.num_output_dimensions, self.num_input_features)] + self.num_output_dimensions = [num_out if embed else num_in for num_out, embed, num_in in + zip(self.num_output_dimensions, self.embed_features, self.num_input_features)] + self.num_out_feats = self.num_numerical + sum(self.num_output_dimensions) + + self.ee_layers = self._create_ee_layers(in_features) + + def forward(self, x): + # pass the columns of each categorical feature through entity embedding layer + # before passing it through the model + concat_seq = [] + last_concat = 0 + x_pointer = 0 + layer_pointer = 0 + for num_in, embed in zip(self.num_input_features, self.embed_features): + if not embed: + x_pointer += 1 + continue + if x_pointer > last_concat: + concat_seq.append(x[:, last_concat: x_pointer]) + categorical_feature_slice = x[:, x_pointer: x_pointer + num_in] + concat_seq.append(self.ee_layers[layer_pointer](categorical_feature_slice)) + layer_pointer += 1 + x_pointer += num_in + last_concat = x_pointer + + concat_seq.append(x[:, last_concat:]) + return torch.cat(concat_seq, dim=1) + + def _create_ee_layers(self, in_features): + # entity embeding layers are Linear Layers + layers = nn.ModuleList() + for i, (num_in, embed, num_out) in enumerate( + zip(self.num_input_features, self.embed_features, self.num_output_dimensions)): + if not embed: + continue + layers.append(nn.Linear(num_in, num_out)) + return layers \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/setup/network/network_embedding/modules/no_embedding.py b/autoPyTorch/pipeline/components/setup/network/network_embedding/modules/no_embedding.py new file mode 100644 index 000000000..cab1e9b7f --- /dev/null +++ b/autoPyTorch/pipeline/components/setup/network/network_embedding/modules/no_embedding.py @@ -0,0 +1,12 @@ +from torch import nn + + +class NoEmbedding(nn.Module): + def __init__(self, config, in_features, num_numerical_features): + super(NoEmbedding, self).__init__() + self.config = config + self.n_feats = in_features + self.num_numerical = num_numerical_features + + def forward(self, x): + return x \ No newline at end of file From 736890866297c572182b48b465adf5f9b372e767 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 8 Feb 2021 14:44:57 +0100 Subject: [PATCH 54/68] in progress --- .../network_embedding/base_network_embedding.py | 16 ---------------- .../network_embedding/modules/__init__.py | 0 .../modules}/__init__.py | 0 .../modules/learned_entity_embedding.py | 7 ++++--- .../network_embedding/modules/no_embedding.py | 0 5 files changed, 4 insertions(+), 19 deletions(-) delete mode 100644 autoPyTorch/pipeline/components/setup/network/network_embedding/base_network_embedding.py delete mode 100644 autoPyTorch/pipeline/components/setup/network/network_embedding/modules/__init__.py rename autoPyTorch/pipeline/components/setup/{network/network_embedding => network_embedding/modules}/__init__.py (100%) rename autoPyTorch/pipeline/components/setup/{network => }/network_embedding/modules/learned_entity_embedding.py (93%) rename autoPyTorch/pipeline/components/setup/{network => }/network_embedding/modules/no_embedding.py (100%) diff --git a/autoPyTorch/pipeline/components/setup/network/network_embedding/base_network_embedding.py b/autoPyTorch/pipeline/components/setup/network/network_embedding/base_network_embedding.py deleted file mode 100644 index 5615062d4..000000000 --- a/autoPyTorch/pipeline/components/setup/network/network_embedding/base_network_embedding.py +++ /dev/null @@ -1,16 +0,0 @@ -from typing import Optional, Any - -from sklearn.base import BaseEstimator -from torch import nn - -from autoPyTorch.pipeline.components.setup.base_setup import autoPyTorchSetupComponent - - -class NetworkEmbeddingComponent(autoPyTorchSetupComponent): - def __init__(self, - **kwargs): - super().__init__() - self.config = kwargs - self.embedding: Optional[nn.Module] = None - - # def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/setup/network/network_embedding/modules/__init__.py b/autoPyTorch/pipeline/components/setup/network/network_embedding/modules/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/autoPyTorch/pipeline/components/setup/network/network_embedding/__init__.py b/autoPyTorch/pipeline/components/setup/network_embedding/modules/__init__.py similarity index 100% rename from autoPyTorch/pipeline/components/setup/network/network_embedding/__init__.py rename to autoPyTorch/pipeline/components/setup/network_embedding/modules/__init__.py diff --git a/autoPyTorch/pipeline/components/setup/network/network_embedding/modules/learned_entity_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/modules/learned_entity_embedding.py similarity index 93% rename from autoPyTorch/pipeline/components/setup/network/network_embedding/modules/learned_entity_embedding.py rename to autoPyTorch/pipeline/components/setup/network_embedding/modules/learned_entity_embedding.py index e6eb88f19..d7d294661 100644 --- a/autoPyTorch/pipeline/components/setup/network/network_embedding/modules/learned_entity_embedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/modules/learned_entity_embedding.py @@ -10,7 +10,7 @@ class LearnedEntityEmbedding(nn.Module): """ Parent class for MlpNet, ResNet, ... Can use entity embedding for categorical features""" - def __init__(self, config, in_features, num_numerical_features): + def __init__(self, config, num_input_features, num_numerical_features): """ Initialize the BaseFeatureNet. Arguments: @@ -24,6 +24,7 @@ def __init__(self, config, in_features, num_numerical_features): # self.num_numerical = len([f for f in one_hot_encoder.categorical_features if not f]) # self.num_input_features = [len(c) for c in one_hot_encoder.categories_] self.num_numerical = num_numerical_features + self.num_input_features = num_input_features self.embed_features = [num_in >= config["min_unique_values_for_embedding"] for num_in in self.num_input_features] self.num_output_dimensions = [config["dimension_reduction_" + str(i)] * num_in for i, num_in in @@ -34,7 +35,7 @@ def __init__(self, config, in_features, num_numerical_features): zip(self.num_output_dimensions, self.embed_features, self.num_input_features)] self.num_out_feats = self.num_numerical + sum(self.num_output_dimensions) - self.ee_layers = self._create_ee_layers(in_features) + self.ee_layers = self._create_ee_layers() def forward(self, x): # pass the columns of each categorical feature through entity embedding layer @@ -58,7 +59,7 @@ def forward(self, x): concat_seq.append(x[:, last_concat:]) return torch.cat(concat_seq, dim=1) - def _create_ee_layers(self, in_features): + def _create_ee_layers(self): # entity embeding layers are Linear Layers layers = nn.ModuleList() for i, (num_in, embed, num_out) in enumerate( diff --git a/autoPyTorch/pipeline/components/setup/network/network_embedding/modules/no_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/modules/no_embedding.py similarity index 100% rename from autoPyTorch/pipeline/components/setup/network/network_embedding/modules/no_embedding.py rename to autoPyTorch/pipeline/components/setup/network_embedding/modules/no_embedding.py From 00789ac27b504c0127585785a2f4ac4a61a4fc87 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 15 Feb 2021 19:57:00 +0100 Subject: [PATCH 55/68] Working network embedding --- .../network_embedding/modules/__init__.py | 0 .../modules/learned_entity_embedding.py | 70 ------------------- .../network_embedding/modules/no_embedding.py | 12 ---- .../test_tabular_classification.py | 5 ++ 4 files changed, 5 insertions(+), 82 deletions(-) delete mode 100644 autoPyTorch/pipeline/components/setup/network_embedding/modules/__init__.py delete mode 100644 autoPyTorch/pipeline/components/setup/network_embedding/modules/learned_entity_embedding.py delete mode 100644 autoPyTorch/pipeline/components/setup/network_embedding/modules/no_embedding.py diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/modules/__init__.py b/autoPyTorch/pipeline/components/setup/network_embedding/modules/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/modules/learned_entity_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/modules/learned_entity_embedding.py deleted file mode 100644 index d7d294661..000000000 --- a/autoPyTorch/pipeline/components/setup/network_embedding/modules/learned_entity_embedding.py +++ /dev/null @@ -1,70 +0,0 @@ -""" -Class to learn an embedding for categorical hyperparameters. -""" - -import torch -import torch.nn as nn -import numpy as np - - -class LearnedEntityEmbedding(nn.Module): - """ Parent class for MlpNet, ResNet, ... Can use entity embedding for categorical features""" - - def __init__(self, config, num_input_features, num_numerical_features): - """ - Initialize the BaseFeatureNet. - Arguments: - config: The configuration sampled by the hyperparameter optimizer - in_features: the number of features of the dataset - one_hot_encoder: OneHot encoder, that is used to encode X - """ - super(LearnedEntityEmbedding, self).__init__() - self.config = config - - # self.num_numerical = len([f for f in one_hot_encoder.categorical_features if not f]) - # self.num_input_features = [len(c) for c in one_hot_encoder.categories_] - self.num_numerical = num_numerical_features - self.num_input_features = num_input_features - self.embed_features = [num_in >= config["min_unique_values_for_embedding"] for num_in in - self.num_input_features] - self.num_output_dimensions = [config["dimension_reduction_" + str(i)] * num_in for i, num_in in - enumerate(self.num_input_features)] - self.num_output_dimensions = [int(np.clip(num_out, 1, num_in - 1)) for num_out, num_in in - zip(self.num_output_dimensions, self.num_input_features)] - self.num_output_dimensions = [num_out if embed else num_in for num_out, embed, num_in in - zip(self.num_output_dimensions, self.embed_features, self.num_input_features)] - self.num_out_feats = self.num_numerical + sum(self.num_output_dimensions) - - self.ee_layers = self._create_ee_layers() - - def forward(self, x): - # pass the columns of each categorical feature through entity embedding layer - # before passing it through the model - concat_seq = [] - last_concat = 0 - x_pointer = 0 - layer_pointer = 0 - for num_in, embed in zip(self.num_input_features, self.embed_features): - if not embed: - x_pointer += 1 - continue - if x_pointer > last_concat: - concat_seq.append(x[:, last_concat: x_pointer]) - categorical_feature_slice = x[:, x_pointer: x_pointer + num_in] - concat_seq.append(self.ee_layers[layer_pointer](categorical_feature_slice)) - layer_pointer += 1 - x_pointer += num_in - last_concat = x_pointer - - concat_seq.append(x[:, last_concat:]) - return torch.cat(concat_seq, dim=1) - - def _create_ee_layers(self): - # entity embeding layers are Linear Layers - layers = nn.ModuleList() - for i, (num_in, embed, num_out) in enumerate( - zip(self.num_input_features, self.embed_features, self.num_output_dimensions)): - if not embed: - continue - layers.append(nn.Linear(num_in, num_out)) - return layers \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/modules/no_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/modules/no_embedding.py deleted file mode 100644 index cab1e9b7f..000000000 --- a/autoPyTorch/pipeline/components/setup/network_embedding/modules/no_embedding.py +++ /dev/null @@ -1,12 +0,0 @@ -from torch import nn - - -class NoEmbedding(nn.Module): - def __init__(self, config, in_features, num_numerical_features): - super(NoEmbedding, self).__init__() - self.config = config - self.n_feats = in_features - self.num_numerical = num_numerical_features - - def forward(self, x): - return x \ No newline at end of file diff --git a/test/test_pipeline/test_tabular_classification.py b/test/test_pipeline/test_tabular_classification.py index fc6eea0e4..f3fc039c3 100644 --- a/test/test_pipeline/test_tabular_classification.py +++ b/test/test_pipeline/test_tabular_classification.py @@ -21,9 +21,14 @@ parse_hyperparameter_search_space_updates +<<<<<<< HEAD @pytest.mark.parametrize("fit_dictionary_tabular", ['classification_categorical_only', 'classification_numerical_only', 'classification_numerical_and_categorical'], indirect=True) +======= +@pytest.mark.parametrize("fit_dictionary", ['fit_dictionary_categorical_only', + 'fit_dictionary_num_and_categorical'], indirect=True) +>>>>>>> Working network embedding class TestTabularClassification: def _assert_pipeline_search_space(self, pipeline, search_space_updates): config_space = pipeline.get_hyperparameter_search_space() From 6a02fe448088314987cfe20b35dff96d04ab45fd Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 15 Feb 2021 22:24:05 +0100 Subject: [PATCH 56/68] ADD tests for network embedding --- .../components/setup/test_setup_networks.py | 11 +---------- test/test_pipeline/test_tabular_classification.py | 5 ----- 2 files changed, 1 insertion(+), 15 deletions(-) diff --git a/test/test_pipeline/components/setup/test_setup_networks.py b/test/test_pipeline/components/setup/test_setup_networks.py index df5ad4cfd..732c2dd7b 100644 --- a/test/test_pipeline/components/setup/test_setup_networks.py +++ b/test/test_pipeline/components/setup/test_setup_networks.py @@ -27,28 +27,19 @@ def embedding(request): 'classification_categorical_only', 'classification_numerical_and_categorical'], indirect=True) class TestNetworks: -<<<<<<< HEAD def test_pipeline_fit(self, fit_dictionary_tabular, backbone, head): -======= - def test_pipeline_fit(self, fit_dictionary, embedding, backbone, head): ->>>>>>> ADD tests for network embedding """This test makes sure that the pipeline is able to fit every combination of network embedding, backbone, head""" include = {'network_backbone': [backbone], 'network_head': [head], 'network_embedding': [embedding]} - if len(fit_dictionary['dataset_properties'] + if len(fit_dictionary_tabular['dataset_properties'] ['categorical_columns']) == 0 and embedding == 'LearnedEntityEmbedding': pytest.skip("Learned Entity Embedding is not used with numerical only data") pipeline = TabularClassificationPipeline( -<<<<<<< HEAD dataset_properties=fit_dictionary_tabular['dataset_properties'], - include={'network_backbone': [backbone], 'network_head': [head]}) -======= - dataset_properties=fit_dictionary['dataset_properties'], include=include) ->>>>>>> ADD tests for network embedding cs = pipeline.get_hyperparameter_search_space() config = cs.get_default_configuration() diff --git a/test/test_pipeline/test_tabular_classification.py b/test/test_pipeline/test_tabular_classification.py index f3fc039c3..fc6eea0e4 100644 --- a/test/test_pipeline/test_tabular_classification.py +++ b/test/test_pipeline/test_tabular_classification.py @@ -21,14 +21,9 @@ parse_hyperparameter_search_space_updates -<<<<<<< HEAD @pytest.mark.parametrize("fit_dictionary_tabular", ['classification_categorical_only', 'classification_numerical_only', 'classification_numerical_and_categorical'], indirect=True) -======= -@pytest.mark.parametrize("fit_dictionary", ['fit_dictionary_categorical_only', - 'fit_dictionary_num_and_categorical'], indirect=True) ->>>>>>> Working network embedding class TestTabularClassification: def _assert_pipeline_search_space(self, pipeline, search_space_updates): config_space = pipeline.get_hyperparameter_search_space() From 3f7c2ccefe3a5b8737d6c3440d1f8d297128a843 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Fri, 26 Feb 2021 15:57:41 +0100 Subject: [PATCH 57/68] print paths of parent dir --- examples/example_tmp_for_debug.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/example_tmp_for_debug.py b/examples/example_tmp_for_debug.py index 664eac866..9ecaca32e 100644 --- a/examples/example_tmp_for_debug.py +++ b/examples/example_tmp_for_debug.py @@ -60,6 +60,6 @@ f"{estimator.seed}.{run_key.config_id}.{run_key.budget}.cv_model" ) - paths = DisplayablePath.make_tree(run_key_model_run_dir) + paths = DisplayablePath.make_tree(os.path.dirname(run_key_model_run_dir)) for path in paths: print(path.displayable()) From f7653470d77283b6ba48cfb906ee74e9f18bf9d1 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Fri, 26 Feb 2021 16:33:17 +0100 Subject: [PATCH 58/68] Trying to run examples --- .github/workflows/examples.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/examples.yml b/.github/workflows/examples.yml index 09b52f4e6..680bee134 100644 --- a/.github/workflows/examples.yml +++ b/.github/workflows/examples.yml @@ -30,7 +30,4 @@ jobs: echo "::set-output name=BEFORE::$(git status --porcelain -b)" - name: Run tests run: | -# python examples/example_tabular_classification.py -# python examples/example_image_classification.py -# python examples/example_tabular_regression.py python examples/example_tmp_for_debug.py \ No newline at end of file From 6ad8550f81c1672e3827c52bac6d3765a21e548f Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Sat, 27 Feb 2021 13:27:17 +0100 Subject: [PATCH 59/68] Trying to run examples --- examples/example_tmp_for_debug.py | 78 ++++++++++++++++++++++++++++++- test/utils.py | 6 --- 2 files changed, 77 insertions(+), 7 deletions(-) diff --git a/examples/example_tmp_for_debug.py b/examples/example_tmp_for_debug.py index 9ecaca32e..15cc283da 100644 --- a/examples/example_tmp_for_debug.py +++ b/examples/example_tmp_for_debug.py @@ -9,8 +9,84 @@ from autoPyTorch.datasets.resampling_strategy import ( CrossValTypes, ) -from test.utils import DisplayablePath +from pathlib import Path + + +class DisplayablePath(object): + display_filename_prefix_middle = '├──' + display_filename_prefix_last = '└──' + display_parent_prefix_middle = ' ' + display_parent_prefix_last = '│ ' + + def __init__(self, path, parent_path, is_last): + self.path = Path(str(path)) + self.parent = parent_path + self.is_last = is_last + if self.parent: + self.depth = self.parent.depth + 1 + else: + self.depth = 0 + + @property + def displayname(self): + if self.path.is_dir(): + return self.path.name + '/' + return self.path.name + + @classmethod + def make_tree(cls, root, parent=None, is_last=False, criteria=None): + root = Path(str(root)) + criteria = criteria or cls._default_criteria + + displayable_root = cls(root, parent, is_last) + yield displayable_root + + children = sorted(list(path + for path in root.iterdir() + if criteria(path)), + key=lambda s: str(s).lower()) + count = 1 + for path in children: + is_last = count == len(children) + if path.is_dir(): + yield from cls.make_tree(path, + parent=displayable_root, + is_last=is_last, + criteria=criteria) + else: + yield cls(path, displayable_root, is_last) + count += 1 + + @classmethod + def _default_criteria(cls, path): + return True + + @property + def displayname(self): + if self.path.is_dir(): + return self.path.name + '/' + return self.path.name + + def displayable(self): + if self.parent is None: + return self.displayname + + _filename_prefix = (self.display_filename_prefix_last + if self.is_last + else self.display_filename_prefix_middle) + + parts = ['{!s} {!s}'.format(_filename_prefix, + self.displayname)] + + parent = self.parent + while parent and parent.parent is not None: + parts.append(self.display_parent_prefix_middle + if parent.is_last + else self.display_parent_prefix_last) + parent = parent.parent + + return ''.join(reversed(parts)) openml_id = 40981 diff --git a/test/utils.py b/test/utils.py index b1c919c4c..171d4d052 100644 --- a/test/utils.py +++ b/test/utils.py @@ -50,12 +50,6 @@ def make_tree(cls, root, parent=None, is_last=False, criteria=None): def _default_criteria(cls, path): return True - @property - def displayname(self): - if self.path.is_dir(): - return self.path.name + '/' - return self.path.name - def displayable(self): if self.parent is None: return self.displayname From 011c0ef9018a4d2d8c37e98a60e021e8908714d2 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 1 Mar 2021 12:12:37 +0100 Subject: [PATCH 60/68] Add success model --- examples/example_tmp_for_debug.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/example_tmp_for_debug.py b/examples/example_tmp_for_debug.py index 15cc283da..ae6a4ca92 100644 --- a/examples/example_tmp_for_debug.py +++ b/examples/example_tmp_for_debug.py @@ -130,12 +130,13 @@ def displayable(self): if os.path.exists(run_key_model_run_dir): break - model_file = os.path.join( run_key_model_run_dir, f"{estimator.seed}.{run_key.config_id}.{run_key.budget}.cv_model" ) + print(model_file) + paths = DisplayablePath.make_tree(os.path.dirname(run_key_model_run_dir)) for path in paths: print(path.displayable()) From 1efc39af0ed4103359407658b6feab7e98be8401 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 1 Mar 2021 12:18:06 +0100 Subject: [PATCH 61/68] Added parent directory for printing paths --- test/test_api/test_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_api/test_api.py b/test/test_api/test_api.py index 4cf615e97..30ddaeed6 100644 --- a/test/test_api/test_api.py +++ b/test/test_api/test_api.py @@ -124,7 +124,7 @@ def test_tabular_classification(openml_id, resampling_strategy, backend): try: assert os.path.exists(model_file), model_file except AssertionError: - paths = DisplayablePath.make_tree(run_key_model_run_dir) + paths = DisplayablePath.make_tree(os.path.dirname(run_key_model_run_dir)) for path in paths: print(path.displayable()) raise AssertionError(model_file) From 3d54db83ca61f75ed88196b90f613d067b86b594 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 1 Mar 2021 13:12:17 +0100 Subject: [PATCH 62/68] print log file to see if backend is saving num run --- autoPyTorch/evaluation/abstract_evaluator.py | 1 + autoPyTorch/utils/backend.py | 3 +++ examples/example_tmp_for_debug.py | 8 ++++++++ 3 files changed, 12 insertions(+) diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py index c1f7da60d..fcc3507cc 100644 --- a/autoPyTorch/evaluation/abstract_evaluator.py +++ b/autoPyTorch/evaluation/abstract_evaluator.py @@ -538,6 +538,7 @@ def file_output( else: pipeline = None + self.logger.debug("Saving directory {}, {}, {}".format(self.seed, self.num_run, self.budget)) self.backend.save_numrun_to_dir( seed=int(self.seed), idx=int(self.num_run), diff --git a/autoPyTorch/utils/backend.py b/autoPyTorch/utils/backend.py index dd24c2340..5111c116f 100644 --- a/autoPyTorch/utils/backend.py +++ b/autoPyTorch/utils/backend.py @@ -392,6 +392,7 @@ def save_numrun_to_dir( cv_model: Optional[BasePipeline], ensemble_predictions: Optional[np.ndarray], valid_predictions: Optional[np.ndarray], test_predictions: Optional[np.ndarray], ) -> None: + assert self._logger is not None runs_directory = self.get_runs_directory() tmpdir = tempfile.mkdtemp(dir=runs_directory) if model is not None: @@ -417,6 +418,8 @@ def save_numrun_to_dir( with open(file_path, 'wb') as fh: pickle.dump(preds.astype(np.float32), fh, -1) try: + self._logger.debug("Renaming {} to {}".format(tmpdir, + self.get_numrun_directory(seed, idx, budget))) os.rename(tmpdir, self.get_numrun_directory(seed, idx, budget)) except OSError: if os.path.exists(self.get_numrun_directory(seed, idx, budget)): diff --git a/examples/example_tmp_for_debug.py b/examples/example_tmp_for_debug.py index ae6a4ca92..52fc1d76c 100644 --- a/examples/example_tmp_for_debug.py +++ b/examples/example_tmp_for_debug.py @@ -140,3 +140,11 @@ def displayable(self): paths = DisplayablePath.make_tree(os.path.dirname(run_key_model_run_dir)) for path in paths: print(path.displayable()) + + # printing log file + tmp_dir = estimator._backend.temporary_directory + log_file = os.path.join(tmp_dir, "AutoPyTorch:{}:{}.log".format(estimator.dataset_name, estimator.seed)) + f = open(log_file, 'r') + lines = f.readlines() + for line in lines: + print(line) From 6c5e8becb3c6d50aaa717cb0794485625515b6e0 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 1 Mar 2021 15:06:00 +0100 Subject: [PATCH 63/68] Setup logger in backend --- autoPyTorch/evaluation/abstract_evaluator.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py index fcc3507cc..29075841a 100644 --- a/autoPyTorch/evaluation/abstract_evaluator.py +++ b/autoPyTorch/evaluation/abstract_evaluator.py @@ -331,6 +331,8 @@ def __init__(self, backend: Backend, name=logger_name, port=logger_port, ) + self.backend.setup_logger(name=logger_name, port=logger_port) + self.Y_optimization: Optional[np.ndarray] = None self.Y_actual_train: Optional[np.ndarray] = None self.pipelines: Optional[List[BaseEstimator]] = None From 873438427eedec45656ac78490485ef5389e0dbf Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 1 Mar 2021 21:11:28 +0100 Subject: [PATCH 64/68] try without embeddings --- test/test_api/test_api.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_api/test_api.py b/test/test_api/test_api.py index 30ddaeed6..0b392ceae 100644 --- a/test/test_api/test_api.py +++ b/test/test_api/test_api.py @@ -48,6 +48,7 @@ def test_tabular_classification(openml_id, resampling_strategy, backend): estimator = TabularClassificationTask( backend=backend, resampling_strategy=resampling_strategy, + include_components={'network_embedding': ['NoEmbedding']} ) estimator.search( From 8941c950c3070af661b6da48a3099224be7ad71e Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 2 Mar 2021 12:48:22 +0100 Subject: [PATCH 65/68] no embedding for python 3.6 --- .github/workflows/examples.yml | 6 +- examples/example_tmp_for_debug.py | 150 ------------------ test/test_api/test_api.py | 16 +- .../test_pipeline/components/training/base.py | 2 +- test/utils.py | 71 --------- 5 files changed, 12 insertions(+), 233 deletions(-) delete mode 100644 examples/example_tmp_for_debug.py delete mode 100644 test/utils.py diff --git a/.github/workflows/examples.yml b/.github/workflows/examples.yml index 680bee134..c4d2e2396 100644 --- a/.github/workflows/examples.yml +++ b/.github/workflows/examples.yml @@ -8,7 +8,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.6] + python-version: [3.8] fail-fast: false max-parallel: 2 @@ -30,4 +30,6 @@ jobs: echo "::set-output name=BEFORE::$(git status --porcelain -b)" - name: Run tests run: | - python examples/example_tmp_for_debug.py \ No newline at end of file + python examples/example_tabular_classification.py + python examples/example_tabular_regression.py + python examples/example_image_classification.py \ No newline at end of file diff --git a/examples/example_tmp_for_debug.py b/examples/example_tmp_for_debug.py deleted file mode 100644 index 52fc1d76c..000000000 --- a/examples/example_tmp_for_debug.py +++ /dev/null @@ -1,150 +0,0 @@ -""" -Example file to be deleted -""" -import os - -import sklearn.datasets - -from autoPyTorch.api.tabular_classification import TabularClassificationTask -from autoPyTorch.datasets.resampling_strategy import ( - CrossValTypes, -) - -from pathlib import Path - - -class DisplayablePath(object): - display_filename_prefix_middle = '├──' - display_filename_prefix_last = '└──' - display_parent_prefix_middle = ' ' - display_parent_prefix_last = '│ ' - - def __init__(self, path, parent_path, is_last): - self.path = Path(str(path)) - self.parent = parent_path - self.is_last = is_last - if self.parent: - self.depth = self.parent.depth + 1 - else: - self.depth = 0 - - @property - def displayname(self): - if self.path.is_dir(): - return self.path.name + '/' - return self.path.name - - @classmethod - def make_tree(cls, root, parent=None, is_last=False, criteria=None): - root = Path(str(root)) - criteria = criteria or cls._default_criteria - - displayable_root = cls(root, parent, is_last) - yield displayable_root - - children = sorted(list(path - for path in root.iterdir() - if criteria(path)), - key=lambda s: str(s).lower()) - count = 1 - for path in children: - is_last = count == len(children) - if path.is_dir(): - yield from cls.make_tree(path, - parent=displayable_root, - is_last=is_last, - criteria=criteria) - else: - yield cls(path, displayable_root, is_last) - count += 1 - - @classmethod - def _default_criteria(cls, path): - return True - - @property - def displayname(self): - if self.path.is_dir(): - return self.path.name + '/' - return self.path.name - - def displayable(self): - if self.parent is None: - return self.displayname - - _filename_prefix = (self.display_filename_prefix_last - if self.is_last - else self.display_filename_prefix_middle) - - parts = ['{!s} {!s}'.format(_filename_prefix, - self.displayname)] - - parent = self.parent - while parent and parent.parent is not None: - parts.append(self.display_parent_prefix_middle - if parent.is_last - else self.display_parent_prefix_last) - parent = parent.parent - - return ''.join(reversed(parts)) - - -openml_id = 40981 -resampling_strategy = CrossValTypes.k_fold_cross_validation -X, y = sklearn.datasets.fetch_openml( - data_id=int(openml_id), - return_X_y=True, as_frame=True -) -X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( - X, y, random_state=1) - - -if __name__ == '__main__': - # Search for a good configuration - estimator = TabularClassificationTask( - temporary_directory='./tmp', - delete_tmp_folder_after_terminate=False, - resampling_strategy=resampling_strategy, - ) - - estimator.search( - X_train=X_train, y_train=y_train, - X_test=X_test, y_test=y_test, - optimize_metric='accuracy', - total_walltime_limit=150, - func_eval_time_limit=50, - traditional_per_total_budget=0 - ) - - # Search for an existing run key in disc. A individual model might have - # a timeout and hence was not written to disc - for i, (run_key, value) in enumerate(estimator.run_history.data.items()): - if i == 0: - # Ignore dummy run - continue - if 'SUCCESS' not in str(value.status): - continue - - run_key_model_run_dir = estimator._backend.get_numrun_directory( - estimator.seed, run_key.config_id, run_key.budget) - if os.path.exists(run_key_model_run_dir): - break - - model_file = os.path.join( - run_key_model_run_dir, - f"{estimator.seed}.{run_key.config_id}.{run_key.budget}.cv_model" - ) - - print(model_file) - - paths = DisplayablePath.make_tree(os.path.dirname(run_key_model_run_dir)) - for path in paths: - print(path.displayable()) - - # printing log file - tmp_dir = estimator._backend.temporary_directory - log_file = os.path.join(tmp_dir, "AutoPyTorch:{}:{}.log".format(estimator.dataset_name, estimator.seed)) - f = open(log_file, 'r') - lines = f.readlines() - for line in lines: - print(line) diff --git a/test/test_api/test_api.py b/test/test_api/test_api.py index 0b392ceae..607448de0 100644 --- a/test/test_api/test_api.py +++ b/test/test_api/test_api.py @@ -22,7 +22,6 @@ HoldoutValTypes, ) -from test.utils import DisplayablePath # Fixtures # ======== @@ -44,11 +43,16 @@ def test_tabular_classification(openml_id, resampling_strategy, backend): X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( X, y, random_state=1) + include = None + # for python less than 3.7, learned entity embedding + # is not able to be stored on disk (only on CI) + if sys.version_info < (3, 7): + include = {'network_embedding': ['NoEmbedding']} # Search for a good configuration estimator = TabularClassificationTask( backend=backend, resampling_strategy=resampling_strategy, - include_components={'network_embedding': ['NoEmbedding']} + include_components=include ) estimator.search( @@ -122,13 +126,7 @@ def test_tabular_classification(openml_id, resampling_strategy, backend): run_key_model_run_dir, f"{estimator.seed}.{run_key.config_id}.{run_key.budget}.cv_model" ) - try: - assert os.path.exists(model_file), model_file - except AssertionError: - paths = DisplayablePath.make_tree(os.path.dirname(run_key_model_run_dir)) - for path in paths: - print(path.displayable()) - raise AssertionError(model_file) + assert os.path.exists(model_file), model_file model = estimator._backend.load_cv_model_by_seed_and_id_and_budget( estimator.seed, run_key.config_id, run_key.budget) diff --git a/test/test_pipeline/components/training/base.py b/test/test_pipeline/components/training/base.py index c13a47aa8..10d9ea416 100644 --- a/test/test_pipeline/components/training/base.py +++ b/test/test_pipeline/components/training/base.py @@ -13,8 +13,8 @@ REGRESSION_TASKS, TASK_TYPES_TO_STRING ) -from autoPyTorch.pipeline.components.training.trainer.base_trainer import BaseTrainerComponent, BudgetTracker from autoPyTorch.pipeline.components.training.metrics.utils import get_metrics +from autoPyTorch.pipeline.components.training.trainer.base_trainer import BaseTrainerComponent, BudgetTracker class BaseTraining(unittest.TestCase): diff --git a/test/utils.py b/test/utils.py deleted file mode 100644 index 171d4d052..000000000 --- a/test/utils.py +++ /dev/null @@ -1,71 +0,0 @@ -from pathlib import Path - - -class DisplayablePath(object): - display_filename_prefix_middle = '├──' - display_filename_prefix_last = '└──' - display_parent_prefix_middle = ' ' - display_parent_prefix_last = '│ ' - - def __init__(self, path, parent_path, is_last): - self.path = Path(str(path)) - self.parent = parent_path - self.is_last = is_last - if self.parent: - self.depth = self.parent.depth + 1 - else: - self.depth = 0 - - @property - def displayname(self): - if self.path.is_dir(): - return self.path.name + '/' - return self.path.name - - @classmethod - def make_tree(cls, root, parent=None, is_last=False, criteria=None): - root = Path(str(root)) - criteria = criteria or cls._default_criteria - - displayable_root = cls(root, parent, is_last) - yield displayable_root - - children = sorted(list(path - for path in root.iterdir() - if criteria(path)), - key=lambda s: str(s).lower()) - count = 1 - for path in children: - is_last = count == len(children) - if path.is_dir(): - yield from cls.make_tree(path, - parent=displayable_root, - is_last=is_last, - criteria=criteria) - else: - yield cls(path, displayable_root, is_last) - count += 1 - - @classmethod - def _default_criteria(cls, path): - return True - - def displayable(self): - if self.parent is None: - return self.displayname - - _filename_prefix = (self.display_filename_prefix_last - if self.is_last - else self.display_filename_prefix_middle) - - parts = ['{!s} {!s}'.format(_filename_prefix, - self.displayname)] - - parent = self.parent - while parent and parent.parent is not None: - parts.append(self.display_parent_prefix_middle - if parent.is_last - else self.display_parent_prefix_last) - parent = parent.parent - - return ''.join(reversed(parts)) From 5aec1e1b1cab423d035293a093cc5425a0c53ce2 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 2 Mar 2021 13:21:05 +0100 Subject: [PATCH 66/68] Deleted debug example --- examples/example_tmp_for_debug.py | 150 ------------------------------ 1 file changed, 150 deletions(-) delete mode 100644 examples/example_tmp_for_debug.py diff --git a/examples/example_tmp_for_debug.py b/examples/example_tmp_for_debug.py deleted file mode 100644 index 52fc1d76c..000000000 --- a/examples/example_tmp_for_debug.py +++ /dev/null @@ -1,150 +0,0 @@ -""" -Example file to be deleted -""" -import os - -import sklearn.datasets - -from autoPyTorch.api.tabular_classification import TabularClassificationTask -from autoPyTorch.datasets.resampling_strategy import ( - CrossValTypes, -) - -from pathlib import Path - - -class DisplayablePath(object): - display_filename_prefix_middle = '├──' - display_filename_prefix_last = '└──' - display_parent_prefix_middle = ' ' - display_parent_prefix_last = '│ ' - - def __init__(self, path, parent_path, is_last): - self.path = Path(str(path)) - self.parent = parent_path - self.is_last = is_last - if self.parent: - self.depth = self.parent.depth + 1 - else: - self.depth = 0 - - @property - def displayname(self): - if self.path.is_dir(): - return self.path.name + '/' - return self.path.name - - @classmethod - def make_tree(cls, root, parent=None, is_last=False, criteria=None): - root = Path(str(root)) - criteria = criteria or cls._default_criteria - - displayable_root = cls(root, parent, is_last) - yield displayable_root - - children = sorted(list(path - for path in root.iterdir() - if criteria(path)), - key=lambda s: str(s).lower()) - count = 1 - for path in children: - is_last = count == len(children) - if path.is_dir(): - yield from cls.make_tree(path, - parent=displayable_root, - is_last=is_last, - criteria=criteria) - else: - yield cls(path, displayable_root, is_last) - count += 1 - - @classmethod - def _default_criteria(cls, path): - return True - - @property - def displayname(self): - if self.path.is_dir(): - return self.path.name + '/' - return self.path.name - - def displayable(self): - if self.parent is None: - return self.displayname - - _filename_prefix = (self.display_filename_prefix_last - if self.is_last - else self.display_filename_prefix_middle) - - parts = ['{!s} {!s}'.format(_filename_prefix, - self.displayname)] - - parent = self.parent - while parent and parent.parent is not None: - parts.append(self.display_parent_prefix_middle - if parent.is_last - else self.display_parent_prefix_last) - parent = parent.parent - - return ''.join(reversed(parts)) - - -openml_id = 40981 -resampling_strategy = CrossValTypes.k_fold_cross_validation -X, y = sklearn.datasets.fetch_openml( - data_id=int(openml_id), - return_X_y=True, as_frame=True -) -X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( - X, y, random_state=1) - - -if __name__ == '__main__': - # Search for a good configuration - estimator = TabularClassificationTask( - temporary_directory='./tmp', - delete_tmp_folder_after_terminate=False, - resampling_strategy=resampling_strategy, - ) - - estimator.search( - X_train=X_train, y_train=y_train, - X_test=X_test, y_test=y_test, - optimize_metric='accuracy', - total_walltime_limit=150, - func_eval_time_limit=50, - traditional_per_total_budget=0 - ) - - # Search for an existing run key in disc. A individual model might have - # a timeout and hence was not written to disc - for i, (run_key, value) in enumerate(estimator.run_history.data.items()): - if i == 0: - # Ignore dummy run - continue - if 'SUCCESS' not in str(value.status): - continue - - run_key_model_run_dir = estimator._backend.get_numrun_directory( - estimator.seed, run_key.config_id, run_key.budget) - if os.path.exists(run_key_model_run_dir): - break - - model_file = os.path.join( - run_key_model_run_dir, - f"{estimator.seed}.{run_key.config_id}.{run_key.budget}.cv_model" - ) - - print(model_file) - - paths = DisplayablePath.make_tree(os.path.dirname(run_key_model_run_dir)) - for path in paths: - print(path.displayable()) - - # printing log file - tmp_dir = estimator._backend.temporary_directory - log_file = os.path.join(tmp_dir, "AutoPyTorch:{}:{}.log".format(estimator.dataset_name, estimator.seed)) - f = open(log_file, 'r') - lines = f.readlines() - for line in lines: - print(line) From 36ae93c6a8bf83c8ecca8601946e473213a184b4 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 2 Mar 2021 13:57:00 +0100 Subject: [PATCH 67/68] Fix test for evaluation --- test/test_evaluation/test_evaluation.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/test_evaluation/test_evaluation.py b/test/test_evaluation/test_evaluation.py index f4345cb40..415dc707f 100644 --- a/test/test_evaluation/test_evaluation.py +++ b/test/test_evaluation/test_evaluation.py @@ -380,6 +380,10 @@ def test_silent_exception_in_target_function(self): """'save_targets_ensemble'",)""", """AttributeError("'BackendMock' object has no attribute """ """'save_targets_ensemble'")""", + """AttributeError("'BackendMock' object has no attribute """ + """'setup_logger'",)""", + """AttributeError("'BackendMock' object has no attribute """ + """'setup_logger'")""", ) ) self.assertNotIn('exitcode', info[1].additional_info) From c9ef56e47e5f68d16a97bec3c6328210613a485d Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Thu, 4 Mar 2021 13:37:03 +0100 Subject: [PATCH 68/68] Deleted utils file --- test/utils.py | 71 --------------------------------------------------- 1 file changed, 71 deletions(-) delete mode 100644 test/utils.py diff --git a/test/utils.py b/test/utils.py deleted file mode 100644 index 171d4d052..000000000 --- a/test/utils.py +++ /dev/null @@ -1,71 +0,0 @@ -from pathlib import Path - - -class DisplayablePath(object): - display_filename_prefix_middle = '├──' - display_filename_prefix_last = '└──' - display_parent_prefix_middle = ' ' - display_parent_prefix_last = '│ ' - - def __init__(self, path, parent_path, is_last): - self.path = Path(str(path)) - self.parent = parent_path - self.is_last = is_last - if self.parent: - self.depth = self.parent.depth + 1 - else: - self.depth = 0 - - @property - def displayname(self): - if self.path.is_dir(): - return self.path.name + '/' - return self.path.name - - @classmethod - def make_tree(cls, root, parent=None, is_last=False, criteria=None): - root = Path(str(root)) - criteria = criteria or cls._default_criteria - - displayable_root = cls(root, parent, is_last) - yield displayable_root - - children = sorted(list(path - for path in root.iterdir() - if criteria(path)), - key=lambda s: str(s).lower()) - count = 1 - for path in children: - is_last = count == len(children) - if path.is_dir(): - yield from cls.make_tree(path, - parent=displayable_root, - is_last=is_last, - criteria=criteria) - else: - yield cls(path, displayable_root, is_last) - count += 1 - - @classmethod - def _default_criteria(cls, path): - return True - - def displayable(self): - if self.parent is None: - return self.displayname - - _filename_prefix = (self.display_filename_prefix_last - if self.is_last - else self.display_filename_prefix_middle) - - parts = ['{!s} {!s}'.format(_filename_prefix, - self.displayname)] - - parent = self.parent - while parent and parent.parent is not None: - parts.append(self.display_parent_prefix_middle - if parent.is_last - else self.display_parent_prefix_last) - parent = parent.parent - - return ''.join(reversed(parts))