From 671edc57216406fc29c7ecd042ab776a9f0b0950 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Thu, 7 Apr 2022 10:43:26 +0200 Subject: [PATCH 1/9] add updates for apt1.0+reg_cocktails --- autoPyTorch/api/base_task.py | 161 +++++++++++++++++++++++++++++++++++ 1 file changed, 161 insertions(+) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index c19ea358d..180b8b882 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -125,6 +125,167 @@ def get_search_updates(categorical_indicator: List[bool]) -> HyperparameterSearc search_space_updates = HyperparameterSearchSpaceUpdates() + # architecture head + search_space_updates.append( + node_name='network_head', + hyperparameter='__choice__', + value_range=['no_head'], + default_value='no_head', + ) + search_space_updates.append( + node_name='network_head', + hyperparameter='no_head:activation', + value_range=['relu'], + default_value='relu', + ) + + # weights initialisation + search_space_updates.append( + node_name='network_init', + hyperparameter='__choice__', + value_range=['NoInit'], + default_value='NoInit', + ) + search_space_updates.append( + node_name='network_init', + hyperparameter='NoInit:bias_strategy', + value_range=['Zero'], + default_value='Zero', + ) + + # backbone architecture choices + search_space_updates.append( + node_name='network_backbone', + hyperparameter='__choice__', + value_range=['ShapedResNetBackbone'], + default_value='ShapedResNetBackbone', + ) + + # resnet backbone + search_space_updates.append( + node_name='network_backbone', + hyperparameter='ShapedResNetBackbone:resnet_shape', + value_range=['funnel'], + default_value='funnel', + ) + search_space_updates.append( + node_name='network_backbone', + hyperparameter='ShapedResNetBackbone:num_groups', + value_range=[1, 4], + default_value=2, + ) + search_space_updates.append( + node_name='network_backbone', + hyperparameter='ShapedResNetBackbone:blocks_per_group', + value_range=[1, 3], + default_value=2, + ) + search_space_updates.append( + node_name='network_backbone', + hyperparameter='ShapedResNetBackbone:output_dim', + value_range=[32, 512], + default_value=64, + log=True + ) + search_space_updates.append( + node_name='network_backbone', + hyperparameter='ShapedResNetBackbone:max_units', + value_range=[32, 512], + default_value=64, + log=True + ) + search_space_updates.append( + node_name='network_backbone', + hyperparameter='ShapedResNetBackbone:activation', + value_range=['relu'], + default_value='relu', + ) + + # training updates + # lr scheduler + search_space_updates.append( + node_name='lr_scheduler', + hyperparameter='__choice__', + value_range=['CosineAnnealingWarmRestarts'], + default_value='CosineAnnealingWarmRestarts', + ) + search_space_updates.append( + node_name='lr_scheduler', + hyperparameter='CosineAnnealingWarmRestarts:n_restarts', + value_range=[1, 3], + default_value=2, + ) + # optimizer + search_space_updates.append( + node_name='optimizer', + hyperparameter='__choice__', + value_range=['AdamWOptimizer'], + default_value='AdamWOptimizer', + ) + # adamw + search_space_updates.append( + node_name='optimizer', + hyperparameter='AdamWOptimizer:lr', + value_range=[1e-4, 1e-1], + default_value=1e-3, + log=True + ) + search_space_updates.append( + node_name='optimizer', + hyperparameter='AdamWOptimizer:beta1', + value_range=[0.9], + default_value=0.9, + ) + search_space_updates.append( + node_name='optimizer', + hyperparameter='AdamWOptimizer:beta2', + value_range=[0.999], + default_value=0.999, + ) + search_space_updates.append( + node_name='data_loader', + hyperparameter='batch_size', + value_range=[16, 512], + default_value=128, + log=True + ) + + # preprocessing + if has_numerical_features: + search_space_updates.append( + node_name='feature_preprocessor', + hyperparameter='__choice__', + value_range=['NoFeaturePreprocessor', 'TruncatedSVD'], + default_value='NoFeaturePreprocessor', + ) + search_space_updates.append( + node_name='feature_preprocessor', + hyperparameter='TruncatedSVD:target_dim', + value_range=[0.1, 0.9], + default_value=0.4, + ) + search_space_updates.append( + node_name='imputer', + hyperparameter='numerical_strategy', + value_range=['mean'], + default_value='mean', + ) + search_space_updates.append( + node_name='scaler', + hyperparameter='__choice__', + value_range=['StandardScaler'], + default_value='StandardScaler', + ) + + if has_cat_features: + search_space_updates.append( + node_name='encoder', + hyperparameter='__choice__', + value_range=['OneHotEncoder'], + default_value='OneHotEncoder', + ) + + return search_space_updates From f2f5f72743cbd659a666c35fe92f253ecff87ccc Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 3 May 2022 17:58:02 +0200 Subject: [PATCH 2/9] debug loggers for checking data and network memory usage --- .../early_preprocessor/EarlyPreprocessing.py | 13 ++++++++++++- .../network_backbone/base_network_backbone.py | 13 ++++++++++++- .../base_network_embedding.py | 18 ++++++++++++++++-- 3 files changed, 40 insertions(+), 4 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py b/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py index 486ce2ef7..959389abc 100644 --- a/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py +++ b/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py @@ -1,4 +1,7 @@ from typing import Any, Dict, Optional, Union +import logging.handlers +import time +import psutil from ConfigSpace.configuration_space import ConfigurationSpace @@ -12,6 +15,7 @@ from autoPyTorch.pipeline.components.setup.base_setup import autoPyTorchSetupComponent from autoPyTorch.pipeline.components.setup.early_preprocessor.utils import get_preprocess_transforms, preprocess from autoPyTorch.utils.common import FitRequirement +from autoPyTorch.utils.logging_ import get_named_client_logger class EarlyPreprocessing(autoPyTorchSetupComponent): @@ -25,7 +29,12 @@ def __init__(self, random_state: Optional[np.random.RandomState] = None) -> None def fit(self, X: Dict[str, Any], y: Any = None) -> "EarlyPreprocessing": self.check_requirements(X, y) - + self.logger = get_named_client_logger( + name=f"{X['num_run']}_{self.__class__.__name__}_{time.time()}", + # Log to a user provided port else to the default logging port + port=X['logger_port' + ] if 'logger_port' in X else logging.handlers.DEFAULT_TCP_LOGGING_PORT, + ) return self def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: @@ -37,7 +46,9 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: # Incorporate the transform to the dataset X_train = X['backend'].load_datamanager().train_tensors[0] + self.logger.debug(f"Available virtual memory: {psutil.virtual_memory().available/1024/1024}, total virtual memroy: {psutil.virtual_memory().total/1024/1024}") X['X_train'] = preprocess(dataset=X_train, transforms=transforms) + self.logger.debug(f"After preprocessing Available virtual memory: {psutil.virtual_memory().available/1024/1024}, total virtual memroy: {psutil.virtual_memory().total/1024/1024}") # We need to also save the preprocess transforms for inference X.update({ diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py index f63ebd578..54f3ce853 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py @@ -1,5 +1,8 @@ from abc import abstractmethod from typing import Any, Dict, Iterable, Optional, Tuple +import logging.handlers +import time +import psutil import numpy as np @@ -16,6 +19,8 @@ ) from autoPyTorch.pipeline.components.setup.network_backbone.utils import get_output_shape from autoPyTorch.utils.common import FitRequirement +from autoPyTorch.utils.logging_ import get_named_client_logger + class NetworkBackboneComponent(autoPyTorchComponent): @@ -48,11 +53,17 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: Self """ self.check_requirements(X, y) - + self.logger = get_named_client_logger( + name=f"{X['num_run']}_{self.__class__.__name__}_{time.time()}", + # Log to a user provided port else to the default logging port + port=X['logger_port' + ] if 'logger_port' in X else logging.handlers.DEFAULT_TCP_LOGGING_PORT, + ) input_shape = X['shape_after_preprocessing'] input_shape = get_output_shape(X['network_embedding'], input_shape=input_shape) self.input_shape = input_shape + self.logger.debug(f"Before building backbone Available virtual memory: {psutil.virtual_memory().available/1024/1024}, total virtual memroy: {psutil.virtual_memory().total/1024/1024}") self.backbone = self.build_backbone( input_shape=input_shape, diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py index 5fa451434..b8e0fdf37 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py @@ -1,4 +1,8 @@ -from typing import Any, Dict, List, Optional, Tuple +import copy +from typing import Any, Dict, Optional, Tuple +import logging.handlers +import time +import psutil import numpy as np @@ -7,6 +11,8 @@ from torch import nn from autoPyTorch.pipeline.components.setup.base_setup import autoPyTorchSetupComponent +from autoPyTorch.utils.logging_ import get_named_client_logger + from autoPyTorch.utils.common import FitRequirement @@ -20,12 +26,20 @@ def __init__(self, random_state: Optional[np.random.RandomState] = None): self.embedding: Optional[nn.Module] = None def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + self.logger = get_named_client_logger( + name=f"{X['num_run']}_{self.__class__.__name__}_{time.time()}", + # Log to a user provided port else to the default logging port + port=X['logger_port' + ] if 'logger_port' in X else logging.handlers.DEFAULT_TCP_LOGGING_PORT, + ) + self.logger.debug(f"Before getting info for embedding Available virtual memory: {psutil.virtual_memory().available/1024/1024}, total virtual memroy: {psutil.virtual_memory().total/1024/1024}") num_features_excl_embed, num_categories_per_col = self._get_required_info_from_data(X) - + self.logger.debug(f"Before building embedding Available virtual memory: {psutil.virtual_memory().available/1024/1024}, total virtual memroy: {psutil.virtual_memory().total/1024/1024}") self.embedding = self.build_embedding( num_categories_per_col=num_categories_per_col, num_features_excl_embed=num_features_excl_embed) + self.logger.debug(f"After building embedding Available virtual memory: {psutil.virtual_memory().available/1024/1024}, total virtual memroy: {psutil.virtual_memory().total/1024/1024}") return self def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: From 47b5c51a9116efd8b65d6158c75afc62e896d2ec Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Wed, 15 Jun 2022 18:47:18 +0200 Subject: [PATCH 3/9] add support for pandas, test for data passing, remove debug loggers --- autoPyTorch/api/base_task.py | 179 ------------------ autoPyTorch/data/tabular_validator.py | 2 + autoPyTorch/data/utils.py | 48 ++++- .../early_preprocessor/EarlyPreprocessing.py | 9 +- .../network_backbone/base_network_backbone.py | 12 +- .../base_network_embedding.py | 20 +- test/test_api/test_api.py | 4 +- test/test_data/test_utils.py | 15 +- test/test_data/test_validation.py | 21 +- 9 files changed, 76 insertions(+), 234 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index 180b8b882..2e15c114e 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -111,184 +111,6 @@ def send_warnings_to_log( return prediction -def get_search_updates(categorical_indicator: List[bool]) -> HyperparameterSearchSpaceUpdates: - """ - These updates mimic the autopytorch tabular paper. - Returns: - ________ - search_space_updates - HyperparameterSearchSpaceUpdates - The search space updates like setting different hps to different values or ranges. - """ - - # has_cat_features = any(categorical_indicator) - # has_numerical_features = not all(categorical_indicator) - - search_space_updates = HyperparameterSearchSpaceUpdates() - - # architecture head - search_space_updates.append( - node_name='network_head', - hyperparameter='__choice__', - value_range=['no_head'], - default_value='no_head', - ) - search_space_updates.append( - node_name='network_head', - hyperparameter='no_head:activation', - value_range=['relu'], - default_value='relu', - ) - - # weights initialisation - search_space_updates.append( - node_name='network_init', - hyperparameter='__choice__', - value_range=['NoInit'], - default_value='NoInit', - ) - search_space_updates.append( - node_name='network_init', - hyperparameter='NoInit:bias_strategy', - value_range=['Zero'], - default_value='Zero', - ) - - # backbone architecture choices - search_space_updates.append( - node_name='network_backbone', - hyperparameter='__choice__', - value_range=['ShapedResNetBackbone'], - default_value='ShapedResNetBackbone', - ) - - # resnet backbone - search_space_updates.append( - node_name='network_backbone', - hyperparameter='ShapedResNetBackbone:resnet_shape', - value_range=['funnel'], - default_value='funnel', - ) - search_space_updates.append( - node_name='network_backbone', - hyperparameter='ShapedResNetBackbone:num_groups', - value_range=[1, 4], - default_value=2, - ) - search_space_updates.append( - node_name='network_backbone', - hyperparameter='ShapedResNetBackbone:blocks_per_group', - value_range=[1, 3], - default_value=2, - ) - search_space_updates.append( - node_name='network_backbone', - hyperparameter='ShapedResNetBackbone:output_dim', - value_range=[32, 512], - default_value=64, - log=True - ) - search_space_updates.append( - node_name='network_backbone', - hyperparameter='ShapedResNetBackbone:max_units', - value_range=[32, 512], - default_value=64, - log=True - ) - search_space_updates.append( - node_name='network_backbone', - hyperparameter='ShapedResNetBackbone:activation', - value_range=['relu'], - default_value='relu', - ) - - # training updates - # lr scheduler - search_space_updates.append( - node_name='lr_scheduler', - hyperparameter='__choice__', - value_range=['CosineAnnealingWarmRestarts'], - default_value='CosineAnnealingWarmRestarts', - ) - search_space_updates.append( - node_name='lr_scheduler', - hyperparameter='CosineAnnealingWarmRestarts:n_restarts', - value_range=[1, 3], - default_value=2, - ) - # optimizer - search_space_updates.append( - node_name='optimizer', - hyperparameter='__choice__', - value_range=['AdamWOptimizer'], - default_value='AdamWOptimizer', - ) - # adamw - search_space_updates.append( - node_name='optimizer', - hyperparameter='AdamWOptimizer:lr', - value_range=[1e-4, 1e-1], - default_value=1e-3, - log=True - ) - search_space_updates.append( - node_name='optimizer', - hyperparameter='AdamWOptimizer:beta1', - value_range=[0.9], - default_value=0.9, - ) - search_space_updates.append( - node_name='optimizer', - hyperparameter='AdamWOptimizer:beta2', - value_range=[0.999], - default_value=0.999, - ) - search_space_updates.append( - node_name='data_loader', - hyperparameter='batch_size', - value_range=[16, 512], - default_value=128, - log=True - ) - - # preprocessing - if has_numerical_features: - search_space_updates.append( - node_name='feature_preprocessor', - hyperparameter='__choice__', - value_range=['NoFeaturePreprocessor', 'TruncatedSVD'], - default_value='NoFeaturePreprocessor', - ) - search_space_updates.append( - node_name='feature_preprocessor', - hyperparameter='TruncatedSVD:target_dim', - value_range=[0.1, 0.9], - default_value=0.4, - ) - search_space_updates.append( - node_name='imputer', - hyperparameter='numerical_strategy', - value_range=['mean'], - default_value='mean', - ) - search_space_updates.append( - node_name='scaler', - hyperparameter='__choice__', - value_range=['StandardScaler'], - default_value='StandardScaler', - ) - - if has_cat_features: - search_space_updates.append( - node_name='encoder', - hyperparameter='__choice__', - value_range=['OneHotEncoder'], - default_value='OneHotEncoder', - ) - - - return search_space_updates - - class BaseTask(ABC): """ Base class for the tasks that serve as API to the pipelines. @@ -361,7 +183,6 @@ def __init__( resampling_strategy_args: Optional[Dict[str, Any]] = None, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, task_type: Optional[str] = None, - categorical_indicator: Optional[List[bool]] = None ) -> None: if isinstance(resampling_strategy, NoResamplingStrategyTypes) and ensemble_size != 0: diff --git a/autoPyTorch/data/tabular_validator.py b/autoPyTorch/data/tabular_validator.py index 492327fbe..a4b366651 100644 --- a/autoPyTorch/data/tabular_validator.py +++ b/autoPyTorch/data/tabular_validator.py @@ -104,6 +104,8 @@ def _compress_dataset( y=y, is_classification=self.is_classification, random_state=self.seed, + categorical_columns=self.feature_validator.categorical_columns, + n_categories_per_cat_column=self.feature_validator.num_categories_per_col, **self.dataset_compression # type: ignore [arg-type] ) self._reduced_dtype = dict(X.dtypes) if is_dataframe else X.dtype diff --git a/autoPyTorch/data/utils.py b/autoPyTorch/data/utils.py index 20ad5612e..333f123e2 100644 --- a/autoPyTorch/data/utils.py +++ b/autoPyTorch/data/utils.py @@ -459,8 +459,7 @@ def _subsample_by_indices( return X, y -def megabytes(arr: DatasetCompressionInputType) -> float: - +def get_raw_memory_usage(arr: DatasetCompressionInputType) -> float: if isinstance(arr, np.ndarray): memory_in_bytes = arr.nbytes elif issparse(arr): @@ -470,8 +469,40 @@ def megabytes(arr: DatasetCompressionInputType) -> float: else: raise ValueError(f"Unrecognised data type of X, expected data type to " f"be in (np.ndarray, spmatrix, pd.DataFrame) but got :{type(arr)}") + return memory_in_bytes + + +def get_approximate_mem_usage_in_mb( + arr: DatasetCompressionInputType, + categorical_columns: List, + n_categories_per_cat_column: Optional[List[int]] = None +) -> float: + + + if ispandas(arr): + arr_dtypes = arr.dtypes.to_dict() + multipliers = [dtype.itemsize for col, dtype in arr_dtypes.items() if col not in categorical_columns] + if len(categorical_columns) > 0: + if n_categories_per_cat_column is None: + raise ValueError("Value number of categories per categorical is required when the data has categorical columns") + for col, num_cat in zip(categorical_columns, n_categories_per_cat_column): + multipliers.append(num_cat * arr_dtypes[col].itemsize) + size_one_row = sum(multipliers) + + elif isinstance(arr, (np.ndarray, spmatrix)): + width = arr.shape[1] - len(categorical_columns) + multiplier = np.zeros(1, dtype=arr.dtype).itemsize + if len(categorical_columns) > 0: + if n_categories_per_cat_column is None: + raise ValueError("Value number of categories per categorical is required when the data has categorical columns") + # multiply num categories with the size of the column to capture memory after one hot encoding + width += sum(n_categories_per_cat_column) + size_one_row = width * multiplier + else: + raise ValueError(f"Unrecognised data type of X, expected data type to " + f"be in {DatasetCompressionInputType}, but got :{type(arr)}") - return float(memory_in_bytes / (2**20)) + return float(arr.shape[0] * size_one_row / (2**20)) def reduce_dataset_size_if_too_large( @@ -479,10 +510,13 @@ def reduce_dataset_size_if_too_large( memory_allocation: Union[int, float], is_classification: bool, random_state: Union[int, np.random.RandomState], + categorical_columns: List, + n_categories_per_cat_column: Optional[List[int]] = None, y: Optional[SupportedTargetTypes] = None, methods: List[str] = ['precision', 'subsample'], ) -> DatasetCompressionInputType: - f""" Reduces the size of the dataset if it's too close to the memory limit. + f""" + Reduces the size of the dataset if it's too close to the memory limit. Follows the order of the operations passed in and retains the type of its input. @@ -513,7 +547,6 @@ def reduce_dataset_size_if_too_large( Reduce the amount of samples of the dataset such that it fits into the allocated memory. Ensures stratification and that unique labels are present - memory_allocation (Union[int, float]): The amount of memory to allocate to the dataset. It should specify an absolute amount. @@ -524,7 +557,7 @@ def reduce_dataset_size_if_too_large( """ for method in methods: - if megabytes(X) <= memory_allocation: + if get_approximate_mem_usage_in_mb(X, categorical_columns, n_categories_per_cat_column) <= memory_allocation: break if method == 'precision': @@ -540,7 +573,8 @@ def reduce_dataset_size_if_too_large( # into the allocated memory, we subsample it so that it does n_samples_before = X.shape[0] - sample_percentage = memory_allocation / megabytes(X) + sample_percentage = memory_allocation / get_approximate_mem_usage_in_mb( + X, categorical_columns, n_categories_per_cat_column) # NOTE: type ignore # diff --git a/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py b/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py index 959389abc..8eb983d40 100644 --- a/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py +++ b/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py @@ -29,12 +29,7 @@ def __init__(self, random_state: Optional[np.random.RandomState] = None) -> None def fit(self, X: Dict[str, Any], y: Any = None) -> "EarlyPreprocessing": self.check_requirements(X, y) - self.logger = get_named_client_logger( - name=f"{X['num_run']}_{self.__class__.__name__}_{time.time()}", - # Log to a user provided port else to the default logging port - port=X['logger_port' - ] if 'logger_port' in X else logging.handlers.DEFAULT_TCP_LOGGING_PORT, - ) + return self def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: @@ -46,9 +41,7 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: # Incorporate the transform to the dataset X_train = X['backend'].load_datamanager().train_tensors[0] - self.logger.debug(f"Available virtual memory: {psutil.virtual_memory().available/1024/1024}, total virtual memroy: {psutil.virtual_memory().total/1024/1024}") X['X_train'] = preprocess(dataset=X_train, transforms=transforms) - self.logger.debug(f"After preprocessing Available virtual memory: {psutil.virtual_memory().available/1024/1024}, total virtual memroy: {psutil.virtual_memory().total/1024/1024}") # We need to also save the preprocess transforms for inference X.update({ diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py index 54f3ce853..ddda9289c 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py @@ -1,8 +1,5 @@ from abc import abstractmethod from typing import Any, Dict, Iterable, Optional, Tuple -import logging.handlers -import time -import psutil import numpy as np @@ -19,7 +16,6 @@ ) from autoPyTorch.pipeline.components.setup.network_backbone.utils import get_output_shape from autoPyTorch.utils.common import FitRequirement -from autoPyTorch.utils.logging_ import get_named_client_logger @@ -53,17 +49,11 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: Self """ self.check_requirements(X, y) - self.logger = get_named_client_logger( - name=f"{X['num_run']}_{self.__class__.__name__}_{time.time()}", - # Log to a user provided port else to the default logging port - port=X['logger_port' - ] if 'logger_port' in X else logging.handlers.DEFAULT_TCP_LOGGING_PORT, - ) + input_shape = X['shape_after_preprocessing'] input_shape = get_output_shape(X['network_embedding'], input_shape=input_shape) self.input_shape = input_shape - self.logger.debug(f"Before building backbone Available virtual memory: {psutil.virtual_memory().available/1024/1024}, total virtual memroy: {psutil.virtual_memory().total/1024/1024}") self.backbone = self.build_backbone( input_shape=input_shape, diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py index b8e0fdf37..6825669ea 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py @@ -1,8 +1,5 @@ -import copy -from typing import Any, Dict, Optional, Tuple -import logging.handlers -import time -import psutil +from typing import Any, Dict, List, Optional, Tuple + import numpy as np @@ -11,8 +8,6 @@ from torch import nn from autoPyTorch.pipeline.components.setup.base_setup import autoPyTorchSetupComponent -from autoPyTorch.utils.logging_ import get_named_client_logger - from autoPyTorch.utils.common import FitRequirement @@ -26,20 +21,13 @@ def __init__(self, random_state: Optional[np.random.RandomState] = None): self.embedding: Optional[nn.Module] = None def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: - self.logger = get_named_client_logger( - name=f"{X['num_run']}_{self.__class__.__name__}_{time.time()}", - # Log to a user provided port else to the default logging port - port=X['logger_port' - ] if 'logger_port' in X else logging.handlers.DEFAULT_TCP_LOGGING_PORT, - ) - self.logger.debug(f"Before getting info for embedding Available virtual memory: {psutil.virtual_memory().available/1024/1024}, total virtual memroy: {psutil.virtual_memory().total/1024/1024}") num_features_excl_embed, num_categories_per_col = self._get_required_info_from_data(X) - self.logger.debug(f"Before building embedding Available virtual memory: {psutil.virtual_memory().available/1024/1024}, total virtual memroy: {psutil.virtual_memory().total/1024/1024}") + self.embedding = self.build_embedding( num_categories_per_col=num_categories_per_col, num_features_excl_embed=num_features_excl_embed) - self.logger.debug(f"After building embedding Available virtual memory: {psutil.virtual_memory().available/1024/1024}, total virtual memroy: {psutil.virtual_memory().total/1024/1024}") + return self def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: diff --git a/test/test_api/test_api.py b/test/test_api/test_api.py index 13cf62ffd..285f0e4bb 100644 --- a/test/test_api/test_api.py +++ b/test/test_api/test_api.py @@ -500,10 +500,10 @@ def test_do_dummy_prediction(dask_client, fit_dictionary_tabular): del estimator -@pytest.skip("Fix with new portfolio PR") @unittest.mock.patch('autoPyTorch.evaluation.tae.eval_train_function', new=dummy_eval_train_function) @pytest.mark.parametrize('openml_id', (40981, )) +@pytest.mark.skip(reason="Fix with new portfolio PR") def test_portfolio_selection(openml_id, backend, n_samples): # Get the data and check that contents of data-manager make sense @@ -543,7 +543,7 @@ def test_portfolio_selection(openml_id, backend, n_samples): assert any(successful_config in portfolio_configs for successful_config in successful_configs) -@pytest.skip("Fix with new portfolio PR") +@pytest.mark.skip(reason="Fix with new portfolio PR") @unittest.mock.patch('autoPyTorch.evaluation.tae.eval_train_function', new=dummy_eval_train_function) @pytest.mark.parametrize('openml_id', (40981, )) diff --git a/test/test_data/test_utils.py b/test/test_data/test_utils.py index 4269c4e5f..44609b1e2 100644 --- a/test/test_data/test_utils.py +++ b/test/test_data/test_utils.py @@ -25,7 +25,8 @@ from autoPyTorch.data.utils import ( default_dataset_compression_arg, get_dataset_compression_mapping, - megabytes, + get_approximate_mem_usage_in_mb, + get_raw_memory_usage, reduce_dataset_size_if_too_large, reduce_precision, subsample, @@ -35,9 +36,8 @@ @pytest.mark.parametrize('openmlid', [2, 40984]) -@pytest.mark.parametrize('as_frame', [True, False]) -def test_reduce_dataset_if_too_large(openmlid, as_frame, n_samples): - X, y = fetch_openml(data_id=openmlid, return_X_y=True, as_frame=as_frame) +def test_reduce_dataset_if_too_large(openmlid, n_samples): + X, y = fetch_openml(data_id=openmlid, return_X_y=True, as_frame=False) X = subsampler(data=X, x=range(n_samples)) y = subsampler(data=y, x=range(n_samples)) @@ -45,13 +45,14 @@ def test_reduce_dataset_if_too_large(openmlid, as_frame, n_samples): X.copy(), y=y.copy(), is_classification=True, + categorical_columns=[], random_state=1, memory_allocation=0.001) assert X_converted.shape[0] < X.shape[0] assert y_converted.shape[0] < y.shape[0] - assert megabytes(X_converted) < megabytes(X) + assert get_raw_memory_usage(X_converted) < get_raw_memory_usage(X) @pytest.mark.parametrize("X", [np.asarray([[1, 1, 1]] * 30)]) @@ -211,8 +212,8 @@ def test_unsupported_errors(): ['a', 'b', 'c', 'a', 'b', 'c'], ['a', 'b', 'd', 'r', 'b', 'c']]) with pytest.raises(ValueError, match=r'X.dtype = .*'): - reduce_dataset_size_if_too_large(X, is_classification=True, random_state=1, memory_allocation=0) + reduce_dataset_size_if_too_large(X, is_classification=True, categorical_columns=[], random_state=1, memory_allocation=0) X = [[1, 2], [2, 3]] with pytest.raises(ValueError, match=r'Unrecognised data type of X, expected data type to be in .*'): - reduce_dataset_size_if_too_large(X, is_classification=True, random_state=1, memory_allocation=0) + reduce_dataset_size_if_too_large(X, is_classification=True, categorical_columns=[], random_state=1, memory_allocation=0) diff --git a/test/test_data/test_validation.py b/test/test_data/test_validation.py index 48a3ccfeb..af05a6c29 100644 --- a/test/test_data/test_validation.py +++ b/test/test_data/test_validation.py @@ -8,7 +8,8 @@ import sklearn.model_selection from autoPyTorch.data.tabular_validator import TabularInputValidator -from autoPyTorch.data.utils import megabytes +from autoPyTorch.data.utils import get_approximate_mem_usage_in_mb, get_raw_memory_usage +from autoPyTorch.utils.common import ispandas @pytest.mark.parametrize('openmlid', [2, 40975, 40984]) @@ -148,16 +149,28 @@ def test_featurevalidator_dataset_compression(input_data_featuretest): X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( input_data_featuretest, input_data_targets, test_size=0.1, random_state=1) validator = TabularInputValidator( - dataset_compression={'memory_allocation': 0.8 * megabytes(X_train), 'methods': ['precision', 'subsample']} + dataset_compression={'memory_allocation': 0.8 * get_approximate_mem_usage_in_mb(X_train, [], None), 'methods': ['precision', 'subsample']} ) validator.fit(X_train=X_train, y_train=y_train) transformed_X_train, _ = validator.transform(X_train.copy(), y_train.copy()) + if ispandas(X_train): + # input validator converts transformed_X_train to numpy and the cat columns are chosen as column indices + columns = X_train.columns + categorical_columns = [columns[col] for col in validator.feature_validator.categorical_columns] + else: + categorical_columns = validator.feature_validator.categorical_columns + assert validator._reduced_dtype is not None - assert megabytes(transformed_X_train) < megabytes(X_train) + assert get_approximate_mem_usage_in_mb( + transformed_X_train, validator.feature_validator.categorical_columns, validator.feature_validator.num_categories_per_col + ) < get_approximate_mem_usage_in_mb(X_train, categorical_columns, validator.feature_validator.num_categories_per_col) transformed_X_test, _ = validator.transform(X_test.copy(), y_test.copy()) - assert megabytes(transformed_X_test) < megabytes(X_test) + assert get_approximate_mem_usage_in_mb( + transformed_X_test, validator.feature_validator.categorical_columns, validator.feature_validator.num_categories_per_col + ) < get_approximate_mem_usage_in_mb(X_test, categorical_columns, validator.feature_validator.num_categories_per_col) + if hasattr(transformed_X_train, 'iloc'): assert all(transformed_X_train.dtypes == transformed_X_test.dtypes) assert all(transformed_X_train.dtypes == validator._precision) From 689fdcbdaf37acaa5381adf8932a7583e4d10fec Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Wed, 15 Jun 2022 18:48:47 +0200 Subject: [PATCH 4/9] remove unwanted changes --- .../components/setup/early_preprocessor/EarlyPreprocessing.py | 4 ---- .../setup/network_backbone/base_network_backbone.py | 1 - .../setup/network_embedding/base_network_embedding.py | 2 -- 3 files changed, 7 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py b/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py index 8eb983d40..486ce2ef7 100644 --- a/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py +++ b/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py @@ -1,7 +1,4 @@ from typing import Any, Dict, Optional, Union -import logging.handlers -import time -import psutil from ConfigSpace.configuration_space import ConfigurationSpace @@ -15,7 +12,6 @@ from autoPyTorch.pipeline.components.setup.base_setup import autoPyTorchSetupComponent from autoPyTorch.pipeline.components.setup.early_preprocessor.utils import get_preprocess_transforms, preprocess from autoPyTorch.utils.common import FitRequirement -from autoPyTorch.utils.logging_ import get_named_client_logger class EarlyPreprocessing(autoPyTorchSetupComponent): diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py index ddda9289c..f63ebd578 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py @@ -18,7 +18,6 @@ from autoPyTorch.utils.common import FitRequirement - class NetworkBackboneComponent(autoPyTorchComponent): """ Base class for network backbones. Holds the backbone module and the config which was used to create it. diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py index 6825669ea..5fa451434 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py @@ -1,6 +1,5 @@ from typing import Any, Dict, List, Optional, Tuple - import numpy as np from sklearn.base import BaseEstimator @@ -27,7 +26,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: self.embedding = self.build_embedding( num_categories_per_col=num_categories_per_col, num_features_excl_embed=num_features_excl_embed) - return self def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: From 09015f368d76048ef7b187e5f0afa82a06280562 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Wed, 15 Jun 2022 19:11:58 +0200 Subject: [PATCH 5/9] : --- autoPyTorch/api/tabular_classification.py | 2 -- autoPyTorch/data/utils.py | 9 +++++---- test/test_data/test_utils.py | 22 ++++++++++++++++------ test/test_data/test_validation.py | 20 ++++++++++++++------ 4 files changed, 35 insertions(+), 18 deletions(-) diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py index ec0237046..b39f47834 100644 --- a/autoPyTorch/api/tabular_classification.py +++ b/autoPyTorch/api/tabular_classification.py @@ -98,7 +98,6 @@ def __init__( resampling_strategy_args: Optional[Dict[str, Any]] = None, backend: Optional[Backend] = None, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, - categorical_indicator: Optional[List[bool]] = None ): super().__init__( seed=seed, @@ -119,7 +118,6 @@ def __init__( resampling_strategy_args=resampling_strategy_args, search_space_updates=search_space_updates, task_type=TASK_TYPES_TO_STRING[TABULAR_CLASSIFICATION], - categorical_indicator=categorical_indicator ) def build_pipeline( diff --git a/autoPyTorch/data/utils.py b/autoPyTorch/data/utils.py index 333f123e2..92ee7f907 100644 --- a/autoPyTorch/data/utils.py +++ b/autoPyTorch/data/utils.py @@ -460,6 +460,7 @@ def _subsample_by_indices( def get_raw_memory_usage(arr: DatasetCompressionInputType) -> float: + memory_in_bytes: float if isinstance(arr, np.ndarray): memory_in_bytes = arr.nbytes elif issparse(arr): @@ -478,13 +479,13 @@ def get_approximate_mem_usage_in_mb( n_categories_per_cat_column: Optional[List[int]] = None ) -> float: - + err_msg = "Value number of categories per categorical is required when the data has categorical columns" if ispandas(arr): arr_dtypes = arr.dtypes.to_dict() multipliers = [dtype.itemsize for col, dtype in arr_dtypes.items() if col not in categorical_columns] if len(categorical_columns) > 0: if n_categories_per_cat_column is None: - raise ValueError("Value number of categories per categorical is required when the data has categorical columns") + raise ValueError(err_msg) for col, num_cat in zip(categorical_columns, n_categories_per_cat_column): multipliers.append(num_cat * arr_dtypes[col].itemsize) size_one_row = sum(multipliers) @@ -494,13 +495,13 @@ def get_approximate_mem_usage_in_mb( multiplier = np.zeros(1, dtype=arr.dtype).itemsize if len(categorical_columns) > 0: if n_categories_per_cat_column is None: - raise ValueError("Value number of categories per categorical is required when the data has categorical columns") + raise ValueError(err_msg) # multiply num categories with the size of the column to capture memory after one hot encoding width += sum(n_categories_per_cat_column) size_one_row = width * multiplier else: raise ValueError(f"Unrecognised data type of X, expected data type to " - f"be in {DatasetCompressionInputType}, but got :{type(arr)}") + f"be in (np.ndarray, spmatrix, pd.DataFrame), but got :{type(arr)}") return float(arr.shape[0] * size_one_row / (2**20)) diff --git a/test/test_data/test_utils.py b/test/test_data/test_utils.py index 44609b1e2..6228740b0 100644 --- a/test/test_data/test_utils.py +++ b/test/test_data/test_utils.py @@ -25,7 +25,6 @@ from autoPyTorch.data.utils import ( default_dataset_compression_arg, get_dataset_compression_mapping, - get_approximate_mem_usage_in_mb, get_raw_memory_usage, reduce_dataset_size_if_too_large, reduce_precision, @@ -36,8 +35,9 @@ @pytest.mark.parametrize('openmlid', [2, 40984]) -def test_reduce_dataset_if_too_large(openmlid, n_samples): - X, y = fetch_openml(data_id=openmlid, return_X_y=True, as_frame=False) +@pytest.mark.parametrize('as_frame', [True, False]) +def test_reduce_dataset_if_too_large(openmlid, as_frame, n_samples): + X, y = fetch_openml(data_id=openmlid, return_X_y=True, as_frame=as_frame) X = subsampler(data=X, x=range(n_samples)) y = subsampler(data=y, x=range(n_samples)) @@ -47,7 +47,7 @@ def test_reduce_dataset_if_too_large(openmlid, n_samples): is_classification=True, categorical_columns=[], random_state=1, - memory_allocation=0.001) + memory_allocation=0.01) assert X_converted.shape[0] < X.shape[0] assert y_converted.shape[0] < y.shape[0] @@ -212,8 +212,18 @@ def test_unsupported_errors(): ['a', 'b', 'c', 'a', 'b', 'c'], ['a', 'b', 'd', 'r', 'b', 'c']]) with pytest.raises(ValueError, match=r'X.dtype = .*'): - reduce_dataset_size_if_too_large(X, is_classification=True, categorical_columns=[], random_state=1, memory_allocation=0) + reduce_dataset_size_if_too_large( + X, + is_classification=True, + categorical_columns=[], + random_state=1, + memory_allocation=0) X = [[1, 2], [2, 3]] with pytest.raises(ValueError, match=r'Unrecognised data type of X, expected data type to be in .*'): - reduce_dataset_size_if_too_large(X, is_classification=True, categorical_columns=[], random_state=1, memory_allocation=0) + reduce_dataset_size_if_too_large( + X, + is_classification=True, + categorical_columns=[], + random_state=1, + memory_allocation=0) diff --git a/test/test_data/test_validation.py b/test/test_data/test_validation.py index af05a6c29..58481d230 100644 --- a/test/test_data/test_validation.py +++ b/test/test_data/test_validation.py @@ -8,7 +8,7 @@ import sklearn.model_selection from autoPyTorch.data.tabular_validator import TabularInputValidator -from autoPyTorch.data.utils import get_approximate_mem_usage_in_mb, get_raw_memory_usage +from autoPyTorch.data.utils import get_approximate_mem_usage_in_mb from autoPyTorch.utils.common import ispandas @@ -149,7 +149,9 @@ def test_featurevalidator_dataset_compression(input_data_featuretest): X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( input_data_featuretest, input_data_targets, test_size=0.1, random_state=1) validator = TabularInputValidator( - dataset_compression={'memory_allocation': 0.8 * get_approximate_mem_usage_in_mb(X_train, [], None), 'methods': ['precision', 'subsample']} + dataset_compression={ + 'memory_allocation': 0.8 * get_approximate_mem_usage_in_mb(X_train, [], None), + 'methods': ['precision', 'subsample']} ) validator.fit(X_train=X_train, y_train=y_train) transformed_X_train, _ = validator.transform(X_train.copy(), y_train.copy()) @@ -163,13 +165,19 @@ def test_featurevalidator_dataset_compression(input_data_featuretest): assert validator._reduced_dtype is not None assert get_approximate_mem_usage_in_mb( - transformed_X_train, validator.feature_validator.categorical_columns, validator.feature_validator.num_categories_per_col - ) < get_approximate_mem_usage_in_mb(X_train, categorical_columns, validator.feature_validator.num_categories_per_col) + transformed_X_train, + validator.feature_validator.categorical_columns, + validator.feature_validator.num_categories_per_col + ) < get_approximate_mem_usage_in_mb( + X_train, categorical_columns, validator.feature_validator.num_categories_per_col) transformed_X_test, _ = validator.transform(X_test.copy(), y_test.copy()) assert get_approximate_mem_usage_in_mb( - transformed_X_test, validator.feature_validator.categorical_columns, validator.feature_validator.num_categories_per_col - ) < get_approximate_mem_usage_in_mb(X_test, categorical_columns, validator.feature_validator.num_categories_per_col) + transformed_X_test, + validator.feature_validator.categorical_columns, + validator.feature_validator.num_categories_per_col + ) < get_approximate_mem_usage_in_mb( + X_test, categorical_columns, validator.feature_validator.num_categories_per_col) if hasattr(transformed_X_train, 'iloc'): assert all(transformed_X_train.dtypes == transformed_X_test.dtypes) From 7a839420a140c24b688b87b1eeb1b15581929451 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 20 Jun 2022 18:02:34 +0200 Subject: [PATCH 6/9] Adjust formula to account for embedding columns --- autoPyTorch/constants.py | 2 ++ autoPyTorch/data/utils.py | 8 ++++++-- .../TabularColumnTransformer.py | 13 +++++++++++-- .../column_splitting/ColumnSplitter.py | 4 ++-- .../tabular_preprocessing/encoding/OneHotEncoder.py | 3 ++- 5 files changed, 23 insertions(+), 7 deletions(-) diff --git a/autoPyTorch/constants.py b/autoPyTorch/constants.py index 652a546b9..d2d23d886 100644 --- a/autoPyTorch/constants.py +++ b/autoPyTorch/constants.py @@ -54,3 +54,5 @@ CLASSIFICATION_OUTPUTS = [BINARY, MULTICLASS, MULTICLASSMULTIOUTPUT] REGRESSION_OUTPUTS = [CONTINUOUS, CONTINUOUSMULTIOUTPUT] + +MIN_CATEGORIES_FOR_EMBEDDING_MAX = 7 \ No newline at end of file diff --git a/autoPyTorch/data/utils.py b/autoPyTorch/data/utils.py index 92ee7f907..a2994d069 100644 --- a/autoPyTorch/data/utils.py +++ b/autoPyTorch/data/utils.py @@ -25,6 +25,7 @@ from sklearn.utils import _approximate_mode, check_random_state from sklearn.utils.validation import _num_samples, check_array +from autoPyTorch.constants import MIN_CATEGORIES_FOR_EMBEDDING_MAX from autoPyTorch.data.base_target_validator import SupportedTargetTypes from autoPyTorch.utils.common import ispandas @@ -487,7 +488,10 @@ def get_approximate_mem_usage_in_mb( if n_categories_per_cat_column is None: raise ValueError(err_msg) for col, num_cat in zip(categorical_columns, n_categories_per_cat_column): - multipliers.append(num_cat * arr_dtypes[col].itemsize) + if num_cat < MIN_CATEGORIES_FOR_EMBEDDING_MAX: + multipliers.append(num_cat * arr_dtypes[col].itemsize) + else: + multipliers.append(arr_dtypes[col].itemsize) size_one_row = sum(multipliers) elif isinstance(arr, (np.ndarray, spmatrix)): @@ -497,7 +501,7 @@ def get_approximate_mem_usage_in_mb( if n_categories_per_cat_column is None: raise ValueError(err_msg) # multiply num categories with the size of the column to capture memory after one hot encoding - width += sum(n_categories_per_cat_column) + width += sum([num_cat if num_cat < MIN_CATEGORIES_FOR_EMBEDDING_MAX else 1 for num_cat in n_categories_per_cat_column]) size_one_row = width * multiplier else: raise ValueError(f"Unrecognised data type of X, expected data type to " diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py index 6b38b4650..7fe88f21c 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py @@ -1,6 +1,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union import numpy as np +import psutil from sklearn.base import BaseEstimator from sklearn.compose import ColumnTransformer @@ -24,6 +25,7 @@ def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = N self.add_fit_requirements([ FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True), FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True)]) + def get_column_transformer(self) -> ColumnTransformer: """ @@ -51,6 +53,13 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer": self.check_requirements(X, y) + self.logger = get_named_client_logger( + name=f"{X['num_run']}_{self.__class__.__name__}_{time.time()}", + # Log to a user provided port else to the default logging port + port=X['logger_port' + ] if 'logger_port' in X else logging.handlers.DEFAULT_TCP_LOGGING_PORT, + ) + preprocessors = get_tabular_preprocessers(X) column_transformers: List[Tuple[str, BaseEstimator, List[int]]] = [] if len(preprocessors['numerical']) > 0: @@ -71,7 +80,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer": column_transformers, remainder='passthrough' ) - + self.logger.debug(f"Available virtual memory: {psutil.virtual_memory().available/1024/1024}, total virtual memory: {psutil.virtual_memory().total/1024/1024}") # Where to get the data -- Prioritize X_train if any else # get from backend if 'X_train' in X: @@ -85,7 +94,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer": y_train = X['backend'].load_datamanager().train_tensors[1] self.preprocessor.fit(X_train, y=y_train) - + self.logger.debug(f"Available virtual memory: {psutil.virtual_memory().available/1024/1024}, total virtual memory: {psutil.virtual_memory().total/1024/1024}") return self def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/column_splitting/ColumnSplitter.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/column_splitting/ColumnSplitter.py index eeca9fdc4..437198d9e 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/column_splitting/ColumnSplitter.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/column_splitting/ColumnSplitter.py @@ -7,7 +7,7 @@ import numpy as np - +from autoPyTorch.constants import MIN_CATEGORIES_FOR_EMBEDDING_MAX from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.base_tabular_preprocessing import \ autoPyTorchTabularPreprocessingComponent @@ -72,7 +72,7 @@ def get_hyperparameter_search_space( dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, min_categories_for_embedding: HyperparameterSearchSpace = HyperparameterSearchSpace( hyperparameter="min_categories_for_embedding", - value_range=(3, 7), + value_range=(3, MIN_CATEGORIES_FOR_EMBEDDING_MAX), default_value=3, log=True), ) -> ConfigurationSpace: diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/OneHotEncoder.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/OneHotEncoder.py index 80cf3f748..4f8878615 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/OneHotEncoder.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/OneHotEncoder.py @@ -24,7 +24,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEncoder: # It is safer to have the OHE produce a 0 array than to crash a good configuration categories='auto', sparse=False, - handle_unknown='ignore') + handle_unknown='ignore', + dtype=np.float32) return self @staticmethod From 58f897e829e55ceda2c0d7766205acd676130d4e Mon Sep 17 00:00:00 2001 From: Ravin Kohli <13005107+ravinkohli@users.noreply.github.com> Date: Fri, 1 Jul 2022 12:46:00 +0200 Subject: [PATCH 7/9] Apply suggestions from code review Co-authored-by: nabenabe0928 <47781922+nabenabe0928@users.noreply.github.com> --- autoPyTorch/data/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/autoPyTorch/data/utils.py b/autoPyTorch/data/utils.py index a2994d069..462720f67 100644 --- a/autoPyTorch/data/utils.py +++ b/autoPyTorch/data/utils.py @@ -495,13 +495,13 @@ def get_approximate_mem_usage_in_mb( size_one_row = sum(multipliers) elif isinstance(arr, (np.ndarray, spmatrix)): - width = arr.shape[1] - len(categorical_columns) - multiplier = np.zeros(1, dtype=arr.dtype).itemsize + n_cols = arr.shape[-1] - len(categorical_columns) + multiplier = arr.dtype.itemsize if len(categorical_columns) > 0: if n_categories_per_cat_column is None: raise ValueError(err_msg) # multiply num categories with the size of the column to capture memory after one hot encoding - width += sum([num_cat if num_cat < MIN_CATEGORIES_FOR_EMBEDDING_MAX else 1 for num_cat in n_categories_per_cat_column]) + width += sum(num_cat if num_cat < MIN_CATEGORIES_FOR_EMBEDDING_MAX else 1 for num_cat in n_categories_per_cat_column) size_one_row = width * multiplier else: raise ValueError(f"Unrecognised data type of X, expected data type to " From bff3edf9c8c3e85d3af0b7391e8feed4cdfc79fd Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Fri, 1 Jul 2022 12:49:45 +0200 Subject: [PATCH 8/9] remove unwanted additions --- autoPyTorch/data/utils.py | 4 ++-- .../tabular_preprocessing/TabularColumnTransformer.py | 11 ++--------- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/autoPyTorch/data/utils.py b/autoPyTorch/data/utils.py index 462720f67..2a44dd5c2 100644 --- a/autoPyTorch/data/utils.py +++ b/autoPyTorch/data/utils.py @@ -501,8 +501,8 @@ def get_approximate_mem_usage_in_mb( if n_categories_per_cat_column is None: raise ValueError(err_msg) # multiply num categories with the size of the column to capture memory after one hot encoding - width += sum(num_cat if num_cat < MIN_CATEGORIES_FOR_EMBEDDING_MAX else 1 for num_cat in n_categories_per_cat_column) - size_one_row = width * multiplier + n_cols += sum(num_cat if num_cat < MIN_CATEGORIES_FOR_EMBEDDING_MAX else 1 for num_cat in n_categories_per_cat_column) + size_one_row = n_cols * multiplier else: raise ValueError(f"Unrecognised data type of X, expected data type to " f"be in (np.ndarray, spmatrix, pd.DataFrame), but got :{type(arr)}") diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py index 7fe88f21c..8106015cf 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py @@ -53,13 +53,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer": self.check_requirements(X, y) - self.logger = get_named_client_logger( - name=f"{X['num_run']}_{self.__class__.__name__}_{time.time()}", - # Log to a user provided port else to the default logging port - port=X['logger_port' - ] if 'logger_port' in X else logging.handlers.DEFAULT_TCP_LOGGING_PORT, - ) - preprocessors = get_tabular_preprocessers(X) column_transformers: List[Tuple[str, BaseEstimator, List[int]]] = [] if len(preprocessors['numerical']) > 0: @@ -80,7 +73,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer": column_transformers, remainder='passthrough' ) - self.logger.debug(f"Available virtual memory: {psutil.virtual_memory().available/1024/1024}, total virtual memory: {psutil.virtual_memory().total/1024/1024}") + # Where to get the data -- Prioritize X_train if any else # get from backend if 'X_train' in X: @@ -94,7 +87,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer": y_train = X['backend'].load_datamanager().train_tensors[1] self.preprocessor.fit(X_train, y=y_train) - self.logger.debug(f"Available virtual memory: {psutil.virtual_memory().available/1024/1024}, total virtual memory: {psutil.virtual_memory().total/1024/1024}") + return self def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: From 72f1c7ca43e5f691d3c91b94e66bbb386a21c1ab Mon Sep 17 00:00:00 2001 From: Ravin Kohli <13005107+ravinkohli@users.noreply.github.com> Date: Sat, 16 Jul 2022 14:16:59 +0200 Subject: [PATCH 9/9] Update autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py --- .../tabular_preprocessing/TabularColumnTransformer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py index 8106015cf..48f40e9fe 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py @@ -1,7 +1,6 @@ from typing import Any, Dict, List, Optional, Tuple, Union import numpy as np -import psutil from sklearn.base import BaseEstimator from sklearn.compose import ColumnTransformer